### 1. model, tokenizer

In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m47.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m30.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m73.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.15.1 tokenizers-0.13.3 transformers-4.29.2


In [2]:
import torch
from transformers import AutoTokenizer, DistilBertForSequenceClassification

# The tokenizer and the classifier are both loaded from the same pretrained checkpoint.
checkpoint = "distilbert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# The checkpoint carries no sequence-classification head, so the classifier layers are
# newly initialised here (this is what the warning in the cell output reports).
model = DistilBertForSequenceClassification.from_pretrained(checkpoint)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier

In [3]:
# Sanity check on a positive-sounding sentence. The classification head has not been
# fine-tuned yet at this point, so the predicted class is not meaningful.
inputs1 = tokenizer("I like it", return_tensors="pt")

# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    logits1 = model(**inputs1).logits

# Index of the largest logit, as a plain Python int.
predicted_class_id = int(torch.argmax(logits1))
predicted_class_id

1

In [4]:
# Display the raw logits the model produced for "I like it".
logits1

tensor([[-0.0500,  0.0438]])

In [5]:
# Look up the label name that the model config associates with the predicted class id.
# NOTE(review): the generic 'LABEL_n' name is presumably a default, since no id2label
# mapping is passed when the model is loaded — confirm.
model.config.id2label[predicted_class_id]

'LABEL_1'

In [6]:
# Same sanity check on a negative-sounding sentence. Before fine-tuning, the model
# returns the same class as for the positive sentence (see the recorded output).
inputs2 = tokenizer("I don't like it", return_tensors="pt")

# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    logits2 = model(**inputs2).logits

# Index of the largest logit, as a plain Python int.
predicted_class_id = int(torch.argmax(logits2))
predicted_class_id

1

### 2. the dataset

In [7]:
# Copy the Kaggle API credentials into ~/.kaggle (presumably where the `kaggle` CLI
# used in the next cell looks for them). kaggle.json must already exist at
# /content/kaggle.json before this cell runs.
!mkdir -p ~/.kaggle
!cp /content/kaggle.json ~/.kaggle/
# Restrict the credentials file to owner read/write only.
!chmod 600 ~/.kaggle/kaggle.json

In [8]:
# Download the SMS spam dataset archive with the Kaggle CLI; per the recorded output
# the zip file lands in /content.
!kaggle datasets download -d team-ai/spam-text-message-classification

Downloading spam-text-message-classification.zip to /content
100% 208k/208k [00:00<00:00, 396kB/s]
100% 208k/208k [00:00<00:00, 396kB/s]


In [9]:
!unzip /content/spam-text-message-classification.zip

Archive:  /content/spam-text-message-classification.zip
  inflating: SPAM text message 20170820 - Data.csv  


In [10]:
import pandas as pd

# Read the extracted CSV into a DataFrame and preview its first rows.
csv_path = 'SPAM text message 20170820 - Data.csv'
data = pd.read_csv(csv_path)

data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [11]:
# Encode the text category as an integer class id.
# Note the orientation used throughout this notebook: ham -> 1, spam -> 0.
label_mapping = {
    'ham': 1,
    'spam': 0,
}
data['Label'] = data['Category'].map(label_mapping)

# Preview the frame with the new Label column.
data.head()

Unnamed: 0,Category,Message,Label
0,ham,"Go until jurong point, crazy.. Available only ...",1
1,ham,Ok lar... Joking wif u oni...,1
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,0
3,ham,U dun say so early hor... U c already then say...,1
4,ham,"Nah I don't think he goes to usf, he lives aro...",1


In [12]:
# True if any cell anywhere in the frame (including the new Label column) is null.
data.isnull().values.any()

False

In [13]:
# Pull the two columns out as plain Python lists, in the same row order:
# the raw message texts and their integer class ids.
Messages = data['Message'].tolist()
Labels = data['Label'].tolist()

In [14]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the messages as a test split; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    Messages,
    Labels,
    test_size=0.2,
    random_state=42,
)

In [15]:
# Sanity check: the training split holds 80% of the messages, to within one sample.
abs(len(X_train) - 0.8*len(Messages)) <= 1

True

### 3. Text Tokenization

In [16]:
# Tokenize each split in a single call and return NumPy arrays.
# padding=True pads every sequence up to the longest one in that split.
# truncation=True caps sequences at the tokenizer's maximum length, so an unusually
# long message cannot exceed the model's 512 position embeddings and crash the
# forward pass. Messages that already fit are unaffected.
tokenized_train = tokenizer(X_train, return_tensors="np", padding=True, truncation=True)
tokenized_test = tokenizer(X_test, return_tensors="np", padding=True, truncation=True)

In [17]:
import numpy as np
# Training labels as a NumPy array, ready to be converted to a tensor later.
train_labels = np.asarray(y_train)

In [18]:
!pip install --upgrade accelerate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting accelerate
  Downloading accelerate-0.20.1-py3-none-any.whl (227 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.5/227.5 kB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.20.1


In [19]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("device=", device)

# Move the model's parameters onto the chosen device (the cell also displays the model).
model.to(device)

device= cuda


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [20]:
# transformers.AdamW is deprecated (and no longer importable in newer transformers
# releases), so use the PyTorch implementation under the same name.
from torch.optim import AdamW

# eps and weight_decay are set explicitly to the defaults of the old transformers
# optimizer (1e-6 and 0.0); torch's own defaults differ (1e-8 and 0.01).
optimizer = AdamW(model.parameters(), lr=3e-5, eps=1e-6, weight_decay=0.0)



### 4. TensorDataset, DataLoader 

In [21]:
import torch

from torch.utils.data import TensorDataset

# Convert the NumPy tokenizer outputs and the label array to tensors.
input_ids = torch.tensor(tokenized_train['input_ids'])
attention_mask = torch.tensor(tokenized_train['attention_mask'])
labels = torch.tensor(train_labels)

# Each dataset item is an (input_ids, attention_mask, label) triple for one message.
dataset = TensorDataset(input_ids, attention_mask, labels)

In [22]:
from torch.utils.data import DataLoader
# Serve the training set in mini-batches of 32, reshuffled on every pass.
dataloader = DataLoader(
    dataset,
    batch_size=32,
    shuffle=True,
)

### 5. Feed the batches to the model (training loop)

In [None]:

# This cell uses tqdm and nn, but the only other imports of them are in a later cell.
# Importing them here keeps the cell runnable on a fresh kernel (no NameError).
from tqdm import tqdm
import torch.nn as nn

# NOTE(review): the following training cell runs another 10 epochs on this same
# model and optimizer, so executing both cells in order trains for 20 epochs in total.

# Cross-entropy on the raw logits. Built once: it does not change between batches.
CE = nn.CrossEntropyLoss()

# Put the model in training mode before optimising.
model.train()

for epoch in range(10):
    total_loss = 0  # running sum of the per-batch losses for this epoch

    for batch in tqdm(dataloader, desc='Epoch {}'.format(epoch + 1)):
        # Move the (input_ids, attention_mask, labels) tensors to the training device.
        batch = tuple(t.to(device) for t in batch)
        input_ids, attention_mask, labels = batch

        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        outputs = model(input_ids, attention_mask).logits

        loss = CE(outputs, labels)
        loss.backward()

        optimizer.step()

        total_loss += loss.item()

    # Mean of the batch losses (each batch counts equally, whatever its size).
    avg_loss = total_loss / len(dataloader)
    print('Epoch {}: Average Loss = {:.4f}'.format(epoch + 1, avg_loss))

Epoch 1: 100%|██████████| 140/140 [01:22<00:00,  1.70it/s]


Epoch 1: Average Loss = 0.0874


Epoch 2: 100%|██████████| 140/140 [01:22<00:00,  1.70it/s]


Epoch 2: Average Loss = 0.0194


Epoch 3: 100%|██████████| 140/140 [01:21<00:00,  1.71it/s]


Epoch 3: Average Loss = 0.0091


Epoch 4: 100%|██████████| 140/140 [01:21<00:00,  1.71it/s]


Epoch 4: Average Loss = 0.0012


Epoch 5: 100%|██████████| 140/140 [01:21<00:00,  1.71it/s]


Epoch 5: Average Loss = 0.0003


Epoch 6: 100%|██████████| 140/140 [01:21<00:00,  1.71it/s]


Epoch 6: Average Loss = 0.0002


Epoch 7: 100%|██████████| 140/140 [01:21<00:00,  1.71it/s]


Epoch 7: Average Loss = 0.0001


Epoch 8: 100%|██████████| 140/140 [01:21<00:00,  1.71it/s]


Epoch 8: Average Loss = 0.0001


Epoch 9: 100%|██████████| 140/140 [01:21<00:00,  1.71it/s]


Epoch 9: Average Loss = 0.0001


Epoch 10: 100%|██████████| 140/140 [01:21<00:00,  1.71it/s]

Epoch 10: Average Loss = 0.0001





In [23]:
from tqdm import tqdm
import torch.nn as nn

# Cross-entropy on the raw logits. Built once instead of once per batch: it does not
# change between batches.
CE = nn.CrossEntropyLoss()

# Put the model in training mode before optimising.
model.train()

# NOTE(review): the accuracy reported below is computed on the training batches while
# the model is in training mode. No cell visible here evaluates the held-out test split.
for epoch in range(10):
    total_loss = 0      # running sum of the per-batch losses
    total_correct = 0   # number of training examples predicted correctly
    total_samples = 0   # number of training examples seen this epoch

    for batch in tqdm(dataloader, desc='Epoch {}'.format(epoch + 1)):
        # Move the (input_ids, attention_mask, labels) tensors to the training device.
        batch = tuple(t.to(device) for t in batch)
        input_ids, attention_mask, labels = batch

        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        outputs = model(input_ids, attention_mask).logits
        # Predicted class = index of the largest logit in each row.
        predicted_labels = torch.argmax(outputs, dim=1)

        loss = CE(outputs, labels)
        loss.backward()

        optimizer.step()

        total_loss += loss.item()
        total_correct += (predicted_labels == labels).sum().item()
        total_samples += labels.size(0)

    # Mean of the batch losses, and the fraction of training examples classified correctly.
    avg_loss = total_loss / len(dataloader)
    accuracy = total_correct / total_samples
    print('Epoch {}: Average Loss = {:.4f}, Accuracy = {:.4f}'.format(epoch + 1, avg_loss, accuracy))


Epoch 1: 100%|██████████| 140/140 [01:19<00:00,  1.75it/s]


Epoch 1: Average Loss = 0.1064, Accuracy = 0.9623


Epoch 2: 100%|██████████| 140/140 [01:19<00:00,  1.76it/s]


Epoch 2: Average Loss = 0.0186, Accuracy = 0.9942


Epoch 3: 100%|██████████| 140/140 [01:21<00:00,  1.71it/s]


Epoch 3: Average Loss = 0.0040, Accuracy = 0.9989


Epoch 4: 100%|██████████| 140/140 [01:22<00:00,  1.69it/s]


Epoch 4: Average Loss = 0.0008, Accuracy = 1.0000


Epoch 5: 100%|██████████| 140/140 [01:23<00:00,  1.68it/s]


Epoch 5: Average Loss = 0.0003, Accuracy = 1.0000


Epoch 6: 100%|██████████| 140/140 [01:23<00:00,  1.68it/s]


Epoch 6: Average Loss = 0.0002, Accuracy = 1.0000


Epoch 7: 100%|██████████| 140/140 [01:23<00:00,  1.67it/s]


Epoch 7: Average Loss = 0.0001, Accuracy = 1.0000


Epoch 8: 100%|██████████| 140/140 [01:23<00:00,  1.67it/s]


Epoch 8: Average Loss = 0.0001, Accuracy = 1.0000


Epoch 9: 100%|██████████| 140/140 [01:23<00:00,  1.67it/s]


Epoch 9: Average Loss = 0.0000, Accuracy = 1.0000


Epoch 10: 100%|██████████| 140/140 [01:23<00:00,  1.67it/s]

Epoch 10: Average Loss = 0.0000, Accuracy = 1.0000



