## Import the needed libraries and data

In [4]:
import pandas as pd
import numpy as np
import os
import random
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizer

import warnings
warnings.filterwarnings('ignore')

from utils.preprocess import *

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [5]:
file_loc = "data/"

In [6]:
train_data = pd.read_csv(file_loc+"train.csv")
train_data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [7]:
test_data = pd.read_csv(file_loc+"/test.csv")
test_data.head()

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.


## Data preprocessing

In [8]:
bal_data = balance_class(train_data.copy())

Before Undersampling
--------------------
Train data: (159571, 8)
Toxic data: (16225, 8)
Clean data: (143346, 8)

After Undersampling
-------------------
Balance data: (33225, 8)
Balance toxic data: (16225, 8)
Balance clean data: (17000, 8)


In [9]:
clean_data = clean(bal_data.copy())

-------------
Cleaning data
-------------
Data cleaned!


In [10]:
text, label = generate_random(clean_data, 'comment_text', 3)
print(f"Original sample comment:\n------------------------\n{text}")
print()
text, label = generate_random(clean_data, 'clean', 3)
print(f"Clean sample comment:\n---------------------\n{text}")

Original sample comment:
------------------------
Yup! Of course I'm blocked! Because you bot assholes can't take the fact that you have no lives. You spend every minute of your life editing WP, and for what?! Organization? More like ORDER.

Clean sample comment:
---------------------
yup of course i'm blocked because bot assholes can't take fact lives you spend every minute life editing wp organization more like order


In [11]:
clean_data = decontract(clean_data)

------------------
Decontracting text
------------------
Texts decontracted!


In [12]:
text, label = generate_random(clean_data, 'clean', 3)
print(f"Clean sample comment:\n---------------------\n{text}")
print()
text, label = generate_random(clean_data, 'decontracted', 3)
print(f"Decontracted sample comment:\n----------------------------\n{text}")

Clean sample comment:
---------------------
yup of course i'm blocked because bot assholes can't take fact lives you spend every minute life editing wp organization more like order

Decontracted sample comment:
----------------------------
yup of course i am blocked because bot assholes cannot take fact lives you spend every minute life editing wp organization more like order


In [13]:
clean_data = lemmatize(clean_data)

----------------
Lemmatizing text
----------------
Text lemmatized!


In [14]:
text, label = generate_random(clean_data, 'decontracted', 3)
print(f"Decontracted sample comment:\n----------------------------\n{text}")
print()
text, label = generate_random(clean_data, 'lemmatized', 3)
print(f"Lemmatized sample comment:\n--------------------------\n{text}")

Decontracted sample comment:
----------------------------
yup of course i am blocked because bot assholes cannot take fact lives you spend every minute life editing wp organization more like order

Lemmatized sample comment:
--------------------------
 yup of course i be block because bot asshole cannot take fact life you spend every minute life edit wp organization more like order


In [15]:
#Splitting the data in train and validation

label_columns = clean_data.iloc[:,2:8].columns
clean_data['labels'] = clean_data[label_columns].apply(lambda x: list(x), axis=1)
clean_data.drop(label_columns, inplace=True, axis=1)

train_size = 0.8

train_df = clean_data.sample(frac=train_size, random_state=42)
val_df = clean_data.drop(train_df.index).reset_index(drop=True)
train_df.reset_index(inplace=True, drop=True)

train_df.head()

Unnamed: 0,id,comment_text,clean,decontracted,lemmatized,labels
0,38dd995c105311e6,She's insane and a zealot.,she's insane zealot,she is insane zealot,she be insane zealot,"[1, 0, 0, 0, 1, 0]"
1,7862e3e66d0351b1,REDIRECT User talk:Deepak/India-Pakistan relat...,redirect user talkdeepakindiapakistan relations,redirect user talkdeepakindiapakistan relations,redirect user talkdeepakindiapakistan relation,"[0, 0, 0, 0, 0, 0]"
2,2fff09fe62dd7395,SANDBOX?? \n\nI DID YOUR MADRE DID IN THE SANDBOX,sandbox i did your madre did in the sandbox,sandbox i did your madre did in the sandbox,sandbox i do your madre do in the sandbox,"[1, 0, 0, 0, 0, 0]"
3,fce1412ca06c50fc,"Ok, I wont do it anymore. But I have to say, t...",ok i wont anymore but i say snob factor i'm de...,ok i will not anymore but i say snob factor i ...,ok i will not anymore but i say snob factor i...,"[1, 0, 0, 0, 0, 0]"
4,6a6b16e9d691ee36,They are called Syed \nYou dirty chump.Muhajir...,they called syed you dirty chumpmuhajirs worst...,they called syed you dirty chumpmuhajirs worst...,they call syed you dirty chumpmuhajirs bad ra...,"[1, 0, 1, 1, 1, 1]"


In [16]:
train_df.shape, val_df.shape

((26580, 6), (6645, 6))

## Toxic comment classification using DistilBERT

In [31]:
class MultiLabelDataset(Dataset):
    
    def __init__(self, df, text, tokenizer, max_len, new_data=False):
        self.tokenizer = tokenizer
        self.data = df
        self.text = df[text]
        self.new_data = new_data
        self.max_len = max_len
        
        if not new_data:
            self.targets = self.data.labels
            
    def __len__(self):
        return len(self.text)
    
    def __getitem__(self, index):
        text = str(self.text[index])
        
        inputs = self.tokenizer.encode_plus(text,None,
                                           add_special_tokens=True,
                                           max_length=self.max_len,
                                           pad_to_max_length=True,
                                           return_token_type_ids=True)
        out = {
            "input_ids": torch.tensor(inputs['input_ids'], dtype=torch.long),
            "attention_mask": torch.tensor(inputs['attention_mask'], dtype=torch.long),
            "token_type_ids": torch.tensor(inputs['token_type_ids'], dtype=torch.long)
        }
        if not self.new_data:
            out["targets"] = torch.tensor(self.targets[index], dtype=torch.float)
            
        return out

In [18]:
#Defining the configurations

MAX_LEN = 200
EPOCHS = 5
LEARNING_RATE = 1e-05
DEVICE = 'cuda:0' if torch.cuda.is_available() else 'cpu'
DEVICE

'cuda:0'

In [19]:
#setting seed
def seed_everything(seed):
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seed_everything(42)

In [20]:
#Tokenizing and creating dataset then loading it to the DataLoader
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', truncation=True, do_lower_case=True)
train_set = MultiLabelDataset(train_df, 'lemmatized', tokenizer, MAX_LEN)
val_set = MultiLabelDataset(val_df, 'lemmatized', tokenizer, MAX_LEN)

train_loader = DataLoader(train_set, batch_size=64, shuffle=True, num_workers=8)
val_loader = DataLoader(val_set, batch_size=64, shuffle=False, num_workers=8)

In [21]:
#building the Model

from transformers import DistilBertModel

class DistilBertClass(torch.nn.Module):
    def __init__(self):
        super(DistilBertClass, self).__init__()
        
        self.bert = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.classifier = torch.nn.Sequential(torch.nn.Linear(768, 768),
                                             torch.nn.ReLU(),
                                             torch.nn.Dropout(0.1),
                                             torch.nn.Linear(768, 6))
        
    def forward(self, input_ids, attention_mask, token_type_ids):
        output_1 = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        out = hidden_state[:,0]
        out = self.classifier(out)
        return out

In [22]:
#Initializing the model and moving it to the DEVICE i.e GPU

model = DistilBertClass()
model.to(DEVICE)
print(f"Model on {DEVICE}")

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Model on cuda:0


In [23]:
#Defining the optimizer and loss function

optimizer = torch.optim.RMSprop(params=model.parameters(), lr=LEARNING_RATE)

def loss_fn(outputs, targets):
    return torch.nn.functional.binary_cross_entropy_with_logits(outputs,targets)

In [24]:
#Defining the training and validation loop

from tqdm.auto import tqdm

def train(epoch):
    model.train()
    
    for _, data in tqdm(enumerate(train_loader)):
        input_ids = data['input_ids'].to(DEVICE, dtype=torch.long)
        attention_mask = data['attention_mask'].to(DEVICE, dtype=torch.long)
        token_type_ids = data['token_type_ids'].to(DEVICE, dtype=torch.long)
        targets = data['targets'].to(DEVICE, dtype=torch.float)
        
        outputs = model(input_ids, attention_mask, token_type_ids)
        
        loss = loss_fn(outputs, targets)
        
        if _ % 3000 == 0:
            print(f"Epoch: {epoch}, Loss: {loss.item()}")
            
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

In [25]:
from sklearn import metrics

def validate():
  model.eval()

  fin_targets = []
  fin_outputs = []

  with torch.inference_mode():
    for _, data in tqdm(enumerate(val_loader, 0)):
      ids = data['input_ids'].to(DEVICE, dtype=torch.long)
      mask = data['attention_mask'].to(DEVICE, dtype=torch.long)
      token_type_ids = data['token_type_ids'].to(DEVICE, dtype=torch.long)
      targets = data['targets'].to(DEVICE, dtype=torch.float)

      outputs = model(ids, mask, token_type_ids)
      fin_targets.extend(targets.cpu().detach().numpy().tolist())
      fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
      
  fin_outputs = np.array(fin_outputs) >= 0.5
  accuracy = metrics.accuracy_score(fin_targets, fin_outputs)
  precision = metrics.precision_score(fin_targets, fin_outputs, average='macro')
  recall = metrics.recall_score(fin_targets, fin_outputs, average='macro')
  roc_auc = metrics.roc_auc_score(fin_targets, fin_outputs, average='macro')
  f1_score_micro = metrics.f1_score(fin_targets, fin_outputs, average='micro')
  f1_score_macro = metrics.f1_score(fin_targets, fin_outputs, average='macro') 
  

  return {
      "Accuracy Score": accuracy,
      "Precision Score": precision,
      "Recall Score": recall,
      "ROC-AUC Score": roc_auc,
      "F1 score(micro)": f1_score_micro,
      "F1 score(macro)": f1_score_macro
  }

In [26]:
#Executing the training and validation process

for epoch in range(EPOCHS):
  train(epoch)
  print(validate())

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-s

0it [00:00, ?it/s]

Epoch: 0, Loss: 0.6833714842796326


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-s

0it [00:00, ?it/s]

{'Accuracy Score': 0.6948081264108352, 'Precision Score': 0.5555846096793374, 'Recall Score': 0.3984705745782478, 'ROC-AUC Score': 0.6845780766766308, 'F1 score(micro)': 0.7876576505116205, 'F1 score(macro)': 0.4333074621109672}


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-s

0it [00:00, ?it/s]

Epoch: 1, Loss: 0.12008650600910187


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-s

0it [00:00, ?it/s]

{'Accuracy Score': 0.7136192626034612, 'Precision Score': 0.6245910951052481, 'Recall Score': 0.5362851809895492, 'ROC-AUC Score': 0.7480824853105851, 'F1 score(micro)': 0.8212339861581504, 'F1 score(macro)': 0.5640003787221127}


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-s

0it [00:00, ?it/s]

Epoch: 2, Loss: 0.1259920299053192


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-s

0it [00:00, ?it/s]

{'Accuracy Score': 0.6845748683220466, 'Precision Score': 0.7111020523088752, 'Recall Score': 0.5731940640776112, 'ROC-AUC Score': 0.7551651321188634, 'F1 score(micro)': 0.8173660960795139, 'F1 score(macro)': 0.5847680095620511}


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-s

0it [00:00, ?it/s]

Epoch: 3, Loss: 0.1450495421886444


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-s

0it [00:00, ?it/s]

{'Accuracy Score': 0.7047404063205418, 'Precision Score': 0.680561916683485, 'Recall Score': 0.6923178456708924, 'ROC-AUC Score': 0.8176293810594202, 'F1 score(micro)': 0.828132534270166, 'F1 score(macro)': 0.6787301661588258}


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-s

0it [00:00, ?it/s]

Epoch: 4, Loss: 0.11832541227340698


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-s

0it [00:00, ?it/s]

{'Accuracy Score': 0.7169300225733635, 'Precision Score': 0.7494594775847857, 'Recall Score': 0.5744570372910968, 'ROC-AUC Score': 0.7710888928153952, 'F1 score(micro)': 0.8173754832840573, 'F1 score(macro)': 0.6354833822170289}


In [32]:
#testing the model

# test_data['comment_text'] = test_data['comment_text'].apply(lambda x: clean_text(x))
test_set = MultiLabelDataset(test_data, 'comment_text', tokenizer, MAX_LEN, new_data=True)
test_loader = DataLoader(test_set, batch_size=64, shuffle=False, num_workers=8)

In [33]:
#Using the trained model for prediction

all_test_pred = []

def prediction():
    model.eval()
    with torch.inference_mode():
        for _, data in tqdm(enumerate(test_loader, 0)):
            ids = data['input_ids'].to(DEVICE, dtype=torch.long)
            mask = data['attention_mask'].to(DEVICE, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(DEVICE, dtype=torch.long)
            outputs = model(ids, mask, token_type_ids)
            probas = torch.sigmoid(outputs)

            all_test_pred.append(probas)
    return probas

In [34]:
probabilities = prediction()

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-s

0it [00:00, ?it/s]

In [35]:
all_test_pred = torch.cat(all_test_pred)

In [36]:
submit_df = test_data.copy()

In [37]:
label_columns = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

for i,name in enumerate(label_columns):

    submit_df[name] = all_test_pred[:, i].cpu()
    # submit_df.head()

In [38]:
submit_df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...,0.994536,0.489863,0.979685,0.103699,0.945051,0.417002
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...,0.687455,0.002561,0.269121,0.002858,0.044465,0.000899
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap...",0.006334,0.000899,0.004343,0.00172,0.003503,0.002039
3,00017563c3f7919a,":If you have a look back at the source, the in...",0.012694,0.000399,0.001606,0.001404,0.002453,0.000677
4,00017695ad8997eb,I don't anonymously edit articles at all.,0.042304,0.000329,0.002042,0.001103,0.002694,0.000933


In [41]:
submit_df.to_csv('distilBERT_prediction.csv', index=False)

In [39]:
text, label = generate_random(submit_df, 'comment_text', 50)
print(f"Sample comment:\n----------------\n{text}")
print()
print(f"Predicted labels:\n--------------------\n{label}")

Sample comment:
----------------
== Wikipedia:Articles_for_deletion/Vedic_mathematics_(book) == 

 Please participate in the debate.

Predicted labels:
--------------------
        toxic  severe_toxic   obscene  threat    insult  identity_hate
14000  0.0067      0.000367  0.001496  0.0009  0.001607       0.000924


In [44]:
text, label = generate_random(submit_df, 'comment_text', 3)
print(f"Sample comment:\n----------------\n{text}")
print()
print(f"Predicted labels:\n--------------------\n{label}")

Sample comment:
----------------
this is a load of BULLSHIT

Predicted labels:
--------------------
         toxic  severe_toxic   obscene    threat    insult  identity_hate
5994  0.974378      0.038962  0.918394  0.002555  0.276083       0.001613
