In [1]:
%load_ext autoreload
%autoreload 2

In [3]:
!pip install transformers==2.4.1


Defaulting to user installation because normal site-packages is not writeable
Collecting transformers==2.4.1
  Downloading transformers-2.4.1-py3-none-any.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.9/40.9 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers==0.0.11 (from transformers==2.4.1)
  Downloading tokenizers-0.0.11.tar.gz (30 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting boto3 (from transformers==2.4.1)
  Downloading boto3-1.34.108-py3-none-any.whl.metadata (6.6 kB)
Collecting sacremoses (from transformers==2.4.1)
  Using cached sacremoses-0.1.1-py3-none-any.whl.metadata (8.3 kB)
Collecting botocore<1.35.0,>=1.34.108 (from boto3->transformers==2.4.1)
  Downloading botocore-1.34.108-py3-none-any.whl.metadata (5.7 kB)
Collecting jmespath<2.0.0,>=0.7.1 (from boto3->transformers==2.4.1

### Import

In [46]:
import sys
sys.path.append('..')
from tqdm import tqdm
import pandas as pd

import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

import pickle

from transformers import DistilBertModel, DistilBertTokenizer
from sklearn.metrics import classification_report

from constants import CATEGORIES

#### GPU Usage

In [47]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')
print(torch.cuda.get_device_name(0))

Using device: cuda
NVIDIA GeForce RTX 3070


### Import Data

In [48]:
df_train = pd.read_parquet('data/df_train_preprocessed.parquet')
df_val = pd.read_parquet('data/df_val_preprocessed.parquet')
df_test = pd.read_parquet('data/df_test_preprocessed.parquet')

In [49]:
preprocess_types = ['baseline',
                    'word_tokenize_simple_normalization',
                    'word_tokenize_full_normalization',
                    'bpe_tokenize_no_dup_no_punc_normalization',
                    'bpe_tokenize_simple_dup_normalization'
                    ]

In [51]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

In [64]:
train_encodings = tokenizer.batch_encode_plus(
    df_train['comment_text_baseline'].tolist(),
    max_length=128,
    truncation=True,
    padding=True,
    return_tensors='pt'
)
val_encodings = tokenizer.batch_encode_plus(
    df_val['comment_text_baseline'].tolist(),
    max_length=128,
    truncation=True,
    padding=True,
    return_tensors='pt'
)
test_encodings = tokenizer.batch_encode_plus(
    df_test['comment_text_baseline'].tolist(),
    max_length=128,
    truncation=True,
    padding=True,
    return_tensors='pt'
)

In [65]:
class CustomDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [66]:
train_dataset = CustomDataset(train_encodings, df_train[CATEGORIES].values)
val_dataset = CustomDataset(val_encodings, df_val[CATEGORIES].values)
test_dataset = CustomDataset(test_encodings, df_test[CATEGORIES].values)


In [67]:
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [71]:
class TransformerModel(nn.Module):
    def __init__(self, model_name, num_labels):
        super(TransformerModel, self).__init__()
        self.bert = DistilBertModel.from_pretrained(model_name)
        self.fc_out = nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        out = outputs[0] 
        out = out.mean(dim=1)
        out = self.fc_out(out)
        out = torch.sigmoid(out)
        return out

In [72]:
torch.cuda.empty_cache()

In [73]:
model = TransformerModel('distilbert-base-uncased', len(CATEGORIES)).to(device)
loss_function = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-5)

In [75]:
for epoch in range(10):
    train_loss = 0
    val_loss = 0

    model.train()
    for batch in tqdm(train_loader, desc=f"Training Epoch {epoch+1}"):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device).float()  # Ensure labels are Float
        outputs = model(input_ids, attention_mask).float()  # Ensure outputs are Float
        loss = loss_function(outputs, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    model.eval()
    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f"Validation Epoch {epoch+1}"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device).float()  # Ensure labels are Float
            outputs = model(input_ids, attention_mask).float()  # Ensure outputs are Float
            loss = loss_function(outputs, labels)
            val_loss += loss.item()

    print(f"Epoch {epoch+1}: Train loss = {train_loss/len(train_loader)}, Val loss = {val_loss/len(val_loader)}")

Training Epoch 1:   0%|          | 0/7979 [00:00<?, ?it/s]

Training Epoch 1: 100%|██████████| 7979/7979 [13:24<00:00,  9.91it/s]
Validation Epoch 1: 100%|██████████| 1995/1995 [00:46<00:00, 42.50it/s]


Epoch 1: Train loss = 0.05172750367182185, Val loss = 0.04855524124309215


Training Epoch 2: 100%|██████████| 7979/7979 [11:09<00:00, 11.92it/s]
Validation Epoch 2: 100%|██████████| 1995/1995 [00:47<00:00, 42.33it/s]


Epoch 2: Train loss = 0.03663804238750803, Val loss = 0.03818007691806815


Training Epoch 3: 100%|██████████| 7979/7979 [11:09<00:00, 11.91it/s]
Validation Epoch 3: 100%|██████████| 1995/1995 [00:47<00:00, 42.34it/s]


Epoch 3: Train loss = 0.030609446337259174, Val loss = 0.03909039616330648


Training Epoch 4: 100%|██████████| 7979/7979 [11:13<00:00, 11.85it/s]
Validation Epoch 4: 100%|██████████| 1995/1995 [00:56<00:00, 35.41it/s]


Epoch 4: Train loss = 0.024891172007476946, Val loss = 0.04371529924217174


Training Epoch 5: 100%|██████████| 7979/7979 [11:11<00:00, 11.88it/s]
Validation Epoch 5: 100%|██████████| 1995/1995 [00:47<00:00, 42.37it/s]


Epoch 5: Train loss = 0.019785032129114317, Val loss = 0.0466820252579456


Training Epoch 6: 100%|██████████| 7979/7979 [11:09<00:00, 11.92it/s]
Validation Epoch 6: 100%|██████████| 1995/1995 [00:47<00:00, 42.31it/s]


Epoch 6: Train loss = 0.015599894846360346, Val loss = 0.054467033658335384


Training Epoch 7:  37%|███▋      | 2926/7979 [04:13<07:18, 11.52it/s]


KeyboardInterrupt: 

In [None]:
for epoch in range(10):
    train_loss = 0
    val_loss = 0

    model.train()
    for batch in tqdm(train_loader, desc=f"Training Epoch {epoch+1}"):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device).float()  # Ensure labels are Float
        outputs = model(input_ids, attention_mask).float()  # Ensure outputs are Float
        loss = loss_function(outputs, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    model.eval()
    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f"Validation Epoch {epoch+1}"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device).float()  # Ensure labels are Float
            outputs = model(input_ids, attention_mask).float()  # Ensure outputs are Float
            loss = loss_function(outputs, labels)
            val_loss += loss.item()

    print(f"Epoch {epoch+1}: Train loss = {train_loss/len(train_loader)}, Val loss = {val_loss/len(val_loader)}")

In [81]:
def evaluate_model(model, test_loader, device, categories):
    model.eval()

    y_pred = []
    y_true = []

    with torch.no_grad():
        for batch in tqdm(test_loader):
            inputs = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)  # Get the attention_mask
            targets = batch['labels'].to(device)
            outputs = model(inputs, attention_mask)  # Pass the attention_mask to the model
            y_pred.extend(outputs.cpu().numpy())
            y_true.extend(targets.cpu().numpy())

    # rest of your code...
    y_true = torch.tensor(y_true)
    y_pred = torch.tensor(y_pred) > 0.5

    y_true = y_true.numpy()
    y_pred = y_pred.numpy()

    overall_non_toxic = np.zeros(y_pred.shape[0])
    overall_non_toxic[y_pred.sum(axis=1) == 0] = 1
    y_pred = np.hstack([y_pred, overall_non_toxic.reshape(-1,1)])

    overall_non_toxic = np.zeros(y_true.shape[0])
    overall_non_toxic[y_true.sum(axis=1) == 0] = 1
    y_true = np.hstack([y_true, overall_non_toxic.reshape(-1,1)])

    print(classification_report(y_true, y_pred, target_names=categories+['overall_non_toxic']))

In [82]:
evaluate_model(model, test_loader, device, CATEGORIES)

  0%|          | 0/3999 [00:00<?, ?it/s]

100%|██████████| 3999/3999 [01:51<00:00, 35.85it/s]
  y_true = torch.tensor(y_true)


                   precision    recall  f1-score   support

            toxic       0.50      0.91      0.65      6090
     severe_toxic       0.26      0.44      0.33       367
          obscene       0.68      0.72      0.70      3691
           threat       0.53      0.57      0.55       211
           insult       0.72      0.59      0.65      3427
    identity_hate       0.56      0.61      0.59       712
overall_non_toxic       0.99      0.91      0.95     57735

        micro avg       0.88      0.88      0.88     72233
        macro avg       0.61      0.68      0.63     72233
     weighted avg       0.91      0.88      0.89     72233
      samples avg       0.90      0.89      0.89     72233

