# 🤗 Toxic Comment Classification

### Import Libraries

In [1]:
# data manipulation
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# torch
import torch

# huggingface
from datasets import load_from_disk, Dataset, DatasetDict
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer, Trainer, TrainingArguments
from transformers import DataCollatorWithPadding

#### Some Default Configs

In [2]:
MODEL_CKPT = 'distilbert-base-uncased'
LR = 2e-5
BATCH_SIZE = 8
WEIGHT_DECAY = 0.01

### Data Exploration

In [17]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv').merge(pd.read_csv('test_labels.csv'),
                                     on='id',
                                     how='inner')

In [23]:
train['tokens'] = train['comment_text'].map(lambda x: len(x.split()))
test['tokens'] = test['comment_text'].map(lambda x: len(x.split()))

In [28]:
display(train.tokens.describe())
display(test.tokens.describe())

count    159571.000000
mean         67.273527
std          99.230702
min           1.000000
25%          17.000000
50%          36.000000
75%          75.000000
max        1411.000000
Name: tokens, dtype: float64

count    153164.000000
mean         61.610751
std          98.959698
min           0.000000
25%          14.000000
50%          31.000000
75%          67.000000
max        2321.000000
Name: tokens, dtype: float64

### Dataset Preparation

In [45]:
toxics = DatasetDict({
        'train': Dataset.from_pandas(train.drop('tokens', axis=1)),
        'test': Dataset.from_pandas(test.drop('tokens', axis=1))
    })

toxics['test'] = toxics['test'].remove_columns('__index_level_0__')
toxics

DatasetDict({
    train: Dataset({
        features: ['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'],
        num_rows: 159571
    })
    test: Dataset({
        features: ['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'],
        num_rows: 153164
    })
})

In [46]:
# examine a little bit
toxics['train'][0]

{'id': '0000997932d777bf',
 'comment_text': "Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27",
 'toxic': 0,
 'severe_toxic': 0,
 'obscene': 0,
 'threat': 0,
 'insult': 0,
 'identity_hate': 0}

#### Create Validation Split

In [49]:
# makes split
toxics_split = toxics['train'].train_test_split(0.25, seed=42)
# assign split to main Dataset
toxics['train'] = toxics_split['train']
toxics['validation'] = toxics_split['test']

toxics

DatasetDict({
    train: Dataset({
        features: ['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'],
        num_rows: 119678
    })
    test: Dataset({
        features: ['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'],
        num_rows: 153164
    })
    validation: Dataset({
        features: ['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'],
        num_rows: 39893
    })
})

In [50]:
toxics.save_to_disk('jigsaw-toxic-comments')

Flattening the indices:   0%|          | 0/120 [00:00<?, ?ba/s]

Flattening the indices:   0%|          | 0/40 [00:00<?, ?ba/s]

### Tokenization

In [3]:
###################################################
# RUN THIS CELL FOR DIRECT LOAD DATASET FROM DISK #
###################################################

toxics = load_from_disk('jigsaw-toxic-comments')
toxics

DatasetDict({
    train: Dataset({
        features: ['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'],
        num_rows: 119678
    })
    test: Dataset({
        features: ['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'],
        num_rows: 153164
    })
    validation: Dataset({
        features: ['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'],
        num_rows: 39893
    })
})

In [8]:
# label map
labels = [label for label in toxics['train'].features.keys() if label not in ['id', 'comment_text']]
id2label = {idx:label for idx, label in enumerate(labels)}
label2id = {label:idx for idx, label in enumerate(labels)}
labels

['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [9]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_CKPT)

In [10]:
# get to know tokenizer properties
print(tokenizer.vocab_size)
print(tokenizer.model_max_length)
print(tokenizer.model_input_names)

30522
512
['input_ids', 'attention_mask']


In [11]:
def tokenize_data(batch):    
    tokenized = tokenizer(
        batch['comment_text'],
        truncation=True,
        max_length=256,
        return_overflowing_tokens=True
    )

    # extract mapping between new and old indices
    ### this operation simply copy the original data attribute and map it
    ### to the tokenized one based on the sample_mapping, as we have truncated
    ### tokenization result
    sample_map = tokenized.pop('overflow_to_sample_mapping')
    for k, v in batch.items():
        tokenized[k] = [v[i] for i in sample_map]

    # preprocess labels
    labels_batch = {k: batch[k] for k in batch.keys() if k in labels}
    labels_matrix = np.zeros((len(tokenized), len(labels)))
    for idx, label in enumerate(labels):
        labels_matrix[:, idx] = labels_batch[label]

    tokenized['labels'] = labels_matrix.tolist()

    return tokenized

In [12]:
toxics_tokenized = toxics.map(tokenize_data, batched=True)
toxics_tokenized



  0%|          | 0/120 [00:00<?, ?ba/s]

ValueError: could not broadcast input array from shape (1000,) into shape (8,)

In [17]:
toxic_tokenized = toxics_tokenized.remove_columns('comment_text')

#### Dynamic Padding with DataCollator

In [14]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)