# **Toxic Comment Classification Challenge**

Identify and classify toxic online comments.

> [**Kaggle Dataset**](https://www.kaggle.com/competitions/jigsaw-toxic-comment-classification-challenge/data)

In [None]:
# Install Kaggle.
!pip install --upgrade --force-reinstall --no-deps kaggle

In [None]:
# Files Upload.
from google.colab import files

files.upload()

In [None]:
# Create a Kaggle Folder.
!mkdir ~/.kaggle

# Copy the kaggle.json to the folder created.
!cp kaggle.json ~/.kaggle/

# Permission for the json file to act.
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Dataset Download.
!kaggle competitions download -c jigsaw-toxic-comment-classification-challenge

In [None]:
# Unzip Dataset.
!unzip jigsaw-toxic-comment-classification-challenge.zip

In [None]:
# Unzip all Files.
!unzip sample_submission.csv.zip
!unzip test.csv.zip
!unzip test_labels.csv.zip
!unzip train.csv.zip

# **Toxic Comment Classification using PyTorch and BERT.**

In [None]:
!pip install transformers

In [8]:
# Import Library.
import pandas as pd
import numpy as np
from typing import Tuple, List
from functools import partial
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, RandomSampler
from torch.nn.utils.rnn import pad_sequence
from transformers import (
    BertTokenizer,
    BertModel,
    AdamW,
    get_linear_schedule_with_warmup,
)
import warnings

warnings.filterwarnings("ignore")

# Load Dataset.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
submission = pd.read_csv("sample_submission.csv")

train_df, valid_df = train_test_split(train, test_size=0.1)

LABELS = train.columns.tolist()[2:]
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


class ToxicDataset(Dataset):
    def __init__(
        self, tokenizer: BertTokenizer, dataframe: pd.DataFrame, lazy: bool = False
    ):
        self.tokenizer = tokenizer
        self.pad_idx = tokenizer.pad_token_id
        self.lazy = lazy
        if not self.lazy:
            self.X = []
            self.Y = []
            for i, (row) in tqdm(dataframe.iterrows()):
                x, y = self.row_to_tensor(self.tokenizer, row)
                self.X.append(x)
                self.Y.append(y)
        else:
            self.df = dataframe

    @staticmethod
    def row_to_tensor(
        tokenizer: BertTokenizer, row: pd.Series
    ) -> Tuple[torch.LongTensor, torch.LongTensor]:
        tokens = tokenizer.encode(row["comment_text"], add_special_tokens=True)
        if len(tokens) > 120:
            tokens = tokens[:119] + [tokens[-1]]
        x = torch.LongTensor(tokens)
        y = torch.FloatTensor(row[LABELS])
        return x, y

    def __len__(self):
        return len(self.df) if self.lazy else len(self.X)

    def __getitem__(self, index: int) -> Tuple[torch.LongTensor, torch.LongTensor]:
        return (
            self.row_to_tensor(self.tokenizer, self.df.iloc[index])
            if self.lazy
            else (self.X[index], self.Y[index])
        )


def collate_fn(
    batch: List[Tuple[torch.LongTensor, torch.LongTensor]], device: torch.device
) -> Tuple[torch.LongTensor, torch.LongTensor]:
    x, y = list(zip(*batch))
    x = pad_sequence(x, batch_first=True, padding_value=0)
    y = torch.stack(y)
    return x.to(device), y.to(device)


train_dataset = ToxicDataset(tokenizer, train_df, lazy=True)
dev_dataset = ToxicDataset(tokenizer, valid_df, lazy=True)
collate_fn = partial(collate_fn, device=DEVICE)
BATCH_SIZE = 32
train_sampler = RandomSampler(train_dataset)
dev_sampler = RandomSampler(dev_dataset)
train_iterator = DataLoader(
    train_dataset, batch_size=BATCH_SIZE, sampler=train_sampler, collate_fn=collate_fn
)
dev_iterator = DataLoader(
    dev_dataset, batch_size=BATCH_SIZE, sampler=dev_sampler, collate_fn=collate_fn
)


class BertClassifier(nn.Module):
    def __init__(self, bert: BertModel, num_classes: int):
        super().__init__()
        self.bert = bert
        self.classifier = nn.Linear(bert.config.hidden_size, num_classes)

    def forward(
        self,
        input_ids,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        labels=None,
    ):
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
        )
        cls_output = outputs[1]
        cls_output = self.classifier(cls_output)
        cls_output = torch.sigmoid(cls_output)
        criterion = nn.BCELoss()
        loss = criterion(cls_output, labels) if labels is not None else 0
        return loss, cls_output


model = BertClassifier(BertModel.from_pretrained("bert-base-uncased"), len(LABELS)).to(
    DEVICE
)


def train(model, iterator, optimizer, scheduler):
    model.train()
    total_loss = 0
    for x, y in tqdm(iterator):
        optimizer.zero_grad()
        mask = (x != 0).float()
        loss, outputs = model(x, attention_mask=mask, labels=y)
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        scheduler.step()
    print(f"Training Loss: {total_loss / len(iterator)}")


def evaluate(model, iterator):
    model.eval()
    pred = []
    true = []
    with torch.no_grad():
        total_loss = 0
        for x, y in tqdm(iterator):
            mask = (x != 0).float()
            loss, outputs = model(x, attention_mask=mask, labels=y)
            total_loss += loss
            true += y.cpu().numpy().tolist()
            pred += outputs.cpu().numpy().tolist()
    true = np.array(true)
    pred = np.array(pred)
    for i, name in enumerate(LABELS):
        print(f"{name} - ROC-AUC Score: {roc_auc_score(true[:, i], pred[:, i])}")
    print(f"Evaluation Loss: {total_loss / len(iterator)}")


no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
    {
        "params": [
            p
            for n, p in model.named_parameters()
            if all(nd not in n for nd in no_decay)
        ],
        "weight_decay": 0.01,
    },
    {
        "params": [
            p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)
        ],
        "weight_decay": 0.0,
    },
]
EPOCH_NUM = 2
warmup_steps = 10**3
total_steps = len(train_iterator) * EPOCH_NUM - warmup_steps
optimizer = AdamW(optimizer_grouped_parameters, lr=2e-5, eps=1e-8)
scheduler = get_linear_schedule_with_warmup(optimizer, warmup_steps, total_steps)


for i in range(EPOCH_NUM):
    print("=" * 50, f"EPOCH {i}", "=" * 50)
    train(model, train_iterator, optimizer, scheduler)
    evaluate(model, dev_iterator)

model.eval()


for i in tqdm(range(len(test) // BATCH_SIZE + 1)):
    batch_df = test.iloc[i * BATCH_SIZE : (i + 1) * BATCH_SIZE]
    assert (
        batch_df["id"] == submission["id"][i * BATCH_SIZE : (i + 1) * BATCH_SIZE]
    ).all(), "Id Mismatch"
    texts = []
    for text in batch_df["comment_text"].tolist():
        text = tokenizer.encode(text, add_special_tokens=True)
        if len(text) > 120:
            text = text[:119] + [tokenizer.sep_token_id]
        texts.append(torch.LongTensor(text))
    x = pad_sequence(texts, batch_first=True, padding_value=tokenizer.pad_token_id).to(
        DEVICE
    )
    mask = (x != tokenizer.pad_token_id).float().to(DEVICE)
    with torch.no_grad():
        _, outputs = model(x, attention_mask=mask)
    outputs = outputs.cpu().numpy()
    submission.iloc[i * BATCH_SIZE : (i + 1) * BATCH_SIZE][LABELS] = outputs

submission.to_csv("submission.csv", index=False)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).




  0%|          | 0/4488 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (856 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 4488/4488 [48:55<00:00,  1.53it/s]


Training Loss: 0.07776854970983609


100%|██████████| 499/499 [02:12<00:00,  3.76it/s]


toxic - ROC-AUC Score: 0.987401977866778
severe_toxic - ROC-AUC Score: 0.9901347262999449
obscene - ROC-AUC Score: 0.9930468907193929
threat - ROC-AUC Score: 0.9917605360077276
insult - ROC-AUC Score: 0.9890616654649362
identity_hate - ROC-AUC Score: 0.9909251284133735
Evaluation Loss: 0.0402463935315609


100%|██████████| 4488/4488 [48:46<00:00,  1.53it/s]


Training Loss: 0.033025456944864126


100%|██████████| 499/499 [02:12<00:00,  3.76it/s]


toxic - ROC-AUC Score: 0.9884520447978917
severe_toxic - ROC-AUC Score: 0.9910497920982431
obscene - ROC-AUC Score: 0.9938146105393852
threat - ROC-AUC Score: 0.9960874411993725
insult - ROC-AUC Score: 0.9896310687389841
identity_hate - ROC-AUC Score: 0.9929323665818888
Evaluation Loss: 0.03730156645178795


100%|██████████| 4787/4787 [19:18<00:00,  4.13it/s]
