### Toxicity Detection With BERT

This code is largely adapted from [this tutorial](https://github.com/icmpnorequest/Pytorch_BERT_Text_Classification?tab=readme-ov-file) and tailored to our own dataset.

In [71]:
import csv
import datetime
import numpy as np
import pandas as pd
import random
from sklearn.metrics import f1_score
import time
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import transformers
from transformers import BertTokenizer

In [72]:
# Select the compute device: 'cuda' when torch reports a GPU is available, otherwise 'cpu'.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

### Reading the data

In [73]:
# Paths of the three dataset splits. Each path is defined once and reused by the
# loaders below, so the variable names and the files actually read cannot drift apart.
train_filename = "train_2024.csv"
test_filename = "test_2024.csv"
val_filename = "dev_2024.csv"

# csv.QUOTE_NONE makes the parser treat quote characters as ordinary text
# instead of as field delimiters.
df_train = pd.read_csv(train_filename, quoting=csv.QUOTE_NONE)
df_test = pd.read_csv(test_filename, quoting=csv.QUOTE_NONE)
df_val = pd.read_csv(val_filename, quoting=csv.QUOTE_NONE)

In [74]:
# Preview the first rows of the training frame to sanity-check the parsed columns.
df_train.head()

Unnamed: 0,id,text,label
0,0,Except that Desmond played first base last nig...,0
1,1,What i find funny is the loyalty and blindness...,0
2,2,Read the article not just the headline & you ...,0
3,3,Speaking of a horses backside is that where y...,1
4,4,Michael Barone- gee are you dumb. No other wo...,1


In [75]:
# Pull the 'text' and 'label' columns of every split out into plain Python lists.
train_text, train_labels = df_train['text'].tolist(), df_train['label'].tolist()
test_text, test_labels = df_test['text'].tolist(), df_test['label'].tolist()
val_text, val_labels = df_val['text'].tolist(), df_val['label'].tolist()

# Report how many examples the train and test splits contain.
print("len(train_text) = {}, len(test_text) = {}".format(len(train_text), len(test_text)))

len(train_text) = 99000, len(test_text) = 12001


 ### BERT Tokenization

In [76]:
# Load the pretrained tokenizer of the 'bert-base-uncased' checkpoint (the same
# checkpoint the classification model below is loaded from), lower-casing the input.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [77]:
# Walk the first training sentence through the tokenizer step by step:
# raw text -> word pieces -> vocabulary ids.
sample_sentence = train_text[0]
sample_tokens = tokenizer.tokenize(sample_sentence)

print('Original: ', sample_sentence)
print("\n")

print('Tokenized: ', sample_tokens)
print("\n")

print('Token IDs: ', tokenizer.convert_tokens_to_ids(sample_tokens))
print("\n")

Original:  Except that Desmond played first base last night. Tapia was in LF  and Reynolds had a night off.


Tokenized:  ['except', 'that', 'desmond', 'played', 'first', 'base', 'last', 'night', '.', 'tap', '##ia', 'was', 'in', 'l', '##f', 'and', 'reynolds', 'had', 'a', 'night', 'off', '.']


Token IDs:  [3272, 2008, 16192, 2209, 2034, 2918, 2197, 2305, 1012, 11112, 2401, 2001, 1999, 1048, 2546, 1998, 9579, 2018, 1037, 2305, 2125, 1012]


### BERT Input Formatting

In [78]:
import logging

# Raise the threshold of the "transformers.tokenization_utils" logger to ERROR so
# that lower-severity messages emitted through it are dropped.
# NOTE(review): the "Token indices sequence length is longer than ..." warning still
# shows up in the output of the encoding cell below, so this logger name may not
# match the installed transformers version -- confirm.
logging.getLogger("transformers.tokenization_utils").setLevel(logging.ERROR)

def text_to_id(tokenizer, text_list):
    """
    Convert every text in `text_list` into a list of token ids.

    Each text is handed to `tokenizer.encode(..., add_special_tokens=True)`, which:
      (1) tokenizes the sentence,
      (2) prepends the `[CLS]` token,
      (3) appends the `[SEP]` token,
      (4) maps the tokens to their ids.

    @param tokenizer: object exposing `encode(text, add_special_tokens=...)`
    @param text_list: iterable of texts to encode
    @return: <List> one id sequence per input text, in input order
    """
    return [tokenizer.encode(text, add_special_tokens=True) for text in text_list]

In [79]:
# Encode the three splits (train, test, validation -- in that order) into
# lists of token-id sequences.
train_text_ids, test_text_ids, val_text_ids = (
    text_to_id(tokenizer, split_texts)
    for split_texts in (train_text, test_text, val_text)
)

# Show sentence 0 next to its id sequence, followed by the split sizes.
print('Original: {}\n'.format(train_text[0]))
print('Token IDs: {}\n'.format(train_text_ids[0]))
print("len(train_text_ids) = {}\n".format(len(train_text_ids)))
print("len(test_text_ids) = {}".format(len(test_text_ids)))

Token indices sequence length is longer than the specified maximum sequence length for this model (554 > 512). Running this sequence through the model will result in indexing errors


Original: Except that Desmond played first base last night. Tapia was in LF  and Reynolds had a night off.

Token IDs: [101, 3272, 2008, 16192, 2209, 2034, 2918, 2197, 2305, 1012, 11112, 2401, 2001, 1999, 1048, 2546, 1998, 9579, 2018, 1037, 2305, 2125, 1012, 102]

len(train_text_ids) = 99000

len(test_text_ids) = 12001


In [80]:
def padding_truncating(input_ids_list, max_length, pad_id=0):
    """
    Pad or truncate every id sequence to exactly `max_length` entries.

    Sequences shorter than `max_length` are right-padded with `pad_id`; longer
    ones keep only their first `max_length` ids, so whatever sat at the end of a
    cut sequence (e.g. a trailing special token) is dropped. The input
    sequences themselves are never modified.

    @param input_ids_list: <List> text_ids, one sequence of ids per text
    @param max_length: <Integer> length every returned sequence is padded or truncated to (>= 0)
    @param pad_id: <Integer> id appended as padding; the default 0 is the value
                   `get_attention_masks` treats as padding
    @return: processed input_ids_list (a new list holding new lists)
    @raise ValueError: if `max_length` is negative
    """
    if max_length < 0:
        # A negative bound would make the slice below count from the end and
        # silently produce sequences of the wrong length.
        raise ValueError("max_length must be non-negative, got {}".format(max_length))

    processed_input_ids_list = []
    for item in input_ids_list:
        # Slicing copies, so the caller's sequence is left untouched.
        trimmed = list(item[:max_length])
        # The multiplier is 0 for sequences that already fill max_length.
        trimmed.extend([pad_id] * (max_length - len(trimmed)))
        processed_input_ids_list.append(trimmed)

    return processed_input_ids_list

In [81]:
# Sequence length shared by all three splits, defined once so the splits cannot
# end up padded to different lengths. The model printed further down has a
# position-embedding table with 512 rows, so this value must not exceed 512.
MAX_SEQ_LENGTH = 50

train_padding_list = padding_truncating(train_text_ids, max_length=MAX_SEQ_LENGTH)
test_padding_list = padding_truncating(test_text_ids, max_length=MAX_SEQ_LENGTH)
val_padding_list = padding_truncating(val_text_ids, max_length=MAX_SEQ_LENGTH)

### Attention Masks

In [83]:
def get_attention_masks(pad_input_ids_list):
    """
    Build one attention mask per padded id sequence.

    Every position is mapped independently:
    - token id greater than 0 -> real token, mask value 1
    - anything else (id 0 is the padding id) -> mask value 0

    @param pad_input_ids_list: <List> padded id sequences
    @return: <List> masks with the same nesting and lengths as the input
    """
    return [
        [1 if token_id > 0 else 0 for token_id in sequence]
        for sequence in pad_input_ids_list
    ]

In [84]:
train_attention_masks = get_attention_masks(train_padding_list)
test_attention_masks = get_attention_masks(test_padding_list)
validation_attention_masks = get_attention_masks(val_padding_list)

# Every split must hold exactly one label, one mask and one padded id sequence
# per text. The validation split is checked as well, since it feeds the
# validation dataloader just like the other two feed theirs.
assert len(train_text) == len(train_labels) == len(train_attention_masks) == len(train_padding_list)
assert len(test_text) == len(test_labels) == len(test_attention_masks) == len(test_padding_list)
assert len(val_text) == len(val_labels) == len(validation_attention_masks) == len(val_padding_list)

### Converting inputs and labels into tensors

In [85]:
# Training split: padded ids, attention masks and labels as tensors.
train_inputs = torch.tensor(train_padding_list)
train_masks = torch.tensor(train_attention_masks)
train_labels = torch.tensor(train_labels)

# Validation split: same three tensors.
validation_inputs = torch.tensor(val_padding_list)
validation_masks = torch.tensor(validation_attention_masks)
validation_labels = torch.tensor(val_labels)

# Test split: only the ids and the masks are converted in this cell.
test_inputs = torch.tensor(test_padding_list)
test_masks = torch.tensor(test_attention_masks)

### Creating dataloaders

In [None]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# Number of examples per batch, shared by the training and validation loaders.
batch_size = 100

# Training batches are drawn in random order.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Validation examples are visited in their stored order. Shuffling brings nothing
# during evaluation, and a fixed order keeps the batch composition -- and therefore
# any per-batch metric -- identical from run to run.
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

### Training BERT Classification Model

In [87]:
from transformers import BertForSequenceClassification, AdamW, BertConfig
import torch

# Pretrained BERT with a classification head on top. The head's weights are not
# part of the checkpoint (see the loader message printed below this cell).
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",          # 12-layer BERT model with an uncased vocab
    num_labels=2,                 # two output labels: binary classification
    output_attentions=False,      # do not return attention weights
    output_hidden_states=False,   # do not return all hidden states
)

# Move the parameters to the selected device; as the last expression of the
# cell, the returned module is also displayed.
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [88]:
# Use PyTorch's own AdamW: the `AdamW` class exported by transformers is deprecated
# in favour of torch.optim.AdamW. weight_decay=0.0 is passed explicitly because
# torch's default is 0.01, whereas the transformers class defaulted to 0.0, so
# weight decay stays disabled as before.
optimizer = optim.AdamW(model.parameters(), lr=5e-5, eps=1e-8, weight_decay=0.0)



In [89]:
from transformers import get_linear_schedule_with_warmup

# Number of passes over the training data (the BERT authors recommend 2 to 4).
epochs = 3

# The scheduler is stepped once per training batch, so the schedule spans
# (batches per epoch) x (epochs) steps in total.
total_steps = epochs * len(train_dataloader)
print("total_steps = {}".format(total_steps))

# Linear learning-rate schedule without any warm-up steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,  # default value in run_glue.py
    num_training_steps=total_steps,
)

total_steps = 2970


### Training

In [90]:
# Function to calculate the accuracy (fraction of correctly predicted labels)
def flat_accuracy(preds, labels):
    """
    Return the share of rows whose highest-scoring class equals the label.

    @param preds: 2-D numpy array of per-class scores, one row per example
    @param labels: numpy array of true class indices (flattened before comparing)
    @return: number of matching predictions divided by the number of labels
    """
    predicted = np.argmax(preds, axis=1).flatten()
    expected = labels.flatten()
    matches = predicted == expected
    return np.sum(matches) / len(expected)

In [91]:
def format_time(elapsed):
    '''
    Render a duration given in seconds as the string form of a
    `datetime.timedelta` (e.g. "0:07:11"), after rounding to whole seconds.
    '''
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [92]:
# Set the seed value all over the place to make this reproducible.
# NOTE(review): the model and the dataloaders were created in earlier cells,
# i.e. before these seeds are set -- confirm that this is the intended order.
seed_val = 12345

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)

# Store the average loss after each epoch so we can plot them.
loss_values = []

# For each epoch...
for epoch_i in range(epochs):

    ##########################################
    #               Training                 #
    ##########################################

    # Perform one full pass over the training set.

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Measure how long the training epoch takes.
    t0 = time.time()

    # Reset the total loss for this epoch.
    total_loss = 0

    # Put the model into training mode. `train` only changes the *mode*, it does
    # not *perform* the training: dropout layers behave differently during
    # training vs. evaluation.
    model.train()

    # For each batch of training data...
    for step, batch in enumerate(train_dataloader):

        # Progress update every 10 batches.
        if step % 10 == 0 and not step == 0:
            # Elapsed time since the start of the epoch, formatted for display.
            elapsed = format_time(time.time() - t0)

            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # Unpack this training batch from our dataloader.
        # `batch` contains three pytorch tensors:
        #   [0]: input ids
        #   [1]: attention masks
        #   [2]: labels
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Clear the gradients left over from the previous step.
        model.zero_grad()

        # Forward pass. Because `labels` is provided, the first element of the
        # output is the loss.
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels)
        loss = outputs[0]

        # Accumulate the training loss over all batches so the epoch average can
        # be computed at the end; `.item()` extracts the Python number.
        total_loss += loss.item()

        # Backward pass to calculate the gradients.
        loss.backward()

        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update the parameters from the computed gradients.
        optimizer.step()

        # Update the learning rate.
        scheduler.step()

    # Calculate the average loss over the training data.
    avg_train_loss = total_loss / len(train_dataloader)

    # Store the loss value for plotting the learning curve.
    loss_values.append(avg_train_loss)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(format_time(time.time() - t0)))

    ##########################################
    #               Validation               #
    ##########################################
    # After the completion of each training epoch, measure our performance on our validation set.

    print("")
    print("Running Validation...")

    t0 = time.time()

    # Put the model in evaluation mode--the dropout layers behave differently during evaluation.
    model.eval()

    # Predictions and labels of every validation batch. F1 is not additive over
    # batches (the mean of per-batch F1 scores is generally not the F1 of the
    # whole set), so everything is collected and scored once per epoch.
    epoch_preds, epoch_labels = [], []

    # Evaluate data for one epoch
    for batch in validation_dataloader:

        # Add batch to device
        batch = tuple(t.to(device) for t in batch)

        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_labels = batch

        # No gradients are needed for validation; skipping them saves memory and time.
        with torch.no_grad():
            # No labels are passed, so the first element of the output is the logits.
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask)

        # The "logits" are the output values prior to applying an activation
        # function like the softmax.
        logits = outputs[0]

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Predicted class = index of the largest logit.
        epoch_preds.append(np.argmax(logits, axis=1))
        epoch_labels.append(label_ids)

    # f1_score takes the true labels first and the predictions second.
    eval_f1 = f1_score(np.concatenate(epoch_labels), np.concatenate(epoch_preds))

    # Report the validation F1 score for this epoch.
    print("  F1 score: {0:.2f}".format(eval_f1))
    print("  Validation took: {:}".format(format_time(time.time() - t0)))

print("")
print("Training complete!")


Training...
  Batch    10  of    990.    Elapsed: 0:00:52.
  Batch    20  of    990.    Elapsed: 0:01:39.
  Batch    30  of    990.    Elapsed: 0:02:21.
  Batch    40  of    990.    Elapsed: 0:03:02.
  Batch    50  of    990.    Elapsed: 0:03:44.
  Batch    60  of    990.    Elapsed: 0:04:26.
  Batch    70  of    990.    Elapsed: 0:05:07.
  Batch    80  of    990.    Elapsed: 0:05:48.
  Batch    90  of    990.    Elapsed: 0:06:31.
  Batch   100  of    990.    Elapsed: 0:07:11.
  Batch   110  of    990.    Elapsed: 0:07:52.
  Batch   120  of    990.    Elapsed: 0:08:32.
  Batch   130  of    990.    Elapsed: 0:09:12.
  Batch   140  of    990.    Elapsed: 0:09:53.
  Batch   150  of    990.    Elapsed: 0:10:34.
  Batch   160  of    990.    Elapsed: 0:11:25.
  Batch   170  of    990.    Elapsed: 0:12:06.
  Batch   180  of    990.    Elapsed: 0:12:46.
  Batch   190  of    990.    Elapsed: 0:13:26.
  Batch   200  of    990.    Elapsed: 0:14:06.
  Batch   210  of    990.    Elapsed: 0:14:47.


In [None]:
# TensorDataset only accepts tensors, while `test_labels` is still a plain Python
# list unless it has been replaced by a tensor beforehand. torch.as_tensor handles
# both cases: it converts a list and passes an existing tensor through.
test_data = TensorDataset(test_inputs, test_masks, torch.as_tensor(test_labels))
test_sampler = SequentialSampler(test_data)

# Serve the whole test set as one batch: the prediction loop below overwrites
# `pred` on every batch and the CSV cell writes `pred`, so a single batch keeps
# all predictions. The size follows the dataset instead of a hard-coded count.
# NOTE(review): one batch of this size may not fit in memory on every machine.
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=len(test_data))

In [None]:
# We used 0 as the test labels, only for using the pre-written function.
# One placeholder per test input, so the count follows the data instead of being
# hard-coded.
# NOTE(review): this cell sits after the one that builds `test_data`; when the
# notebook is run top to bottom, the placeholders do not reach that dataset.
test_labels = torch.zeros(len(test_inputs))
print(test_labels)

In [None]:
# Run the whole test set through the model in a single forward pass.
model.eval()
with torch.no_grad():
    # The model lives on `device`, so its inputs have to be moved there as well.
    outputs = model(test_inputs.to(device), token_type_ids=None,
                    attention_mask=test_masks.to(device))

    # No labels are passed, so the first element of the output is the logits.
    logits = outputs[0]

    # Predicted class = index of the largest logit. torch.argmax works wherever
    # the tensor lives (np.argmax cannot read a CUDA tensor); the result is
    # brought back to the CPU as a tensor of class indices.
    pred = torch.argmax(logits, dim=1).cpu()

In [None]:
# Prediction on test set

print('Predicting labels for {:,} test sentences...'.format(len(test_inputs)))

model.eval()

# Per-batch predictions and labels, concatenated after the loop.
predictions , true_labels = [], []

# Predict
idx = 0
correct = 0
for batch in test_dataloader:

    print("Batch {}".format(idx + 1))
    idx += 1

    # Add batch to device
    batch = tuple(t.to(device) for t in batch)

    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels = batch

    # Telling the model not to compute or store gradients, saving memory and
    # speeding up prediction
    with torch.no_grad():
        # Forward pass, calculate logit predictions
        outputs = model(b_input_ids, token_type_ids=None,
                        attention_mask=b_input_mask)

    # Raw class scores (logits), not probabilities.
    logits = outputs[0]

    # Predicted class = index of the largest logit. Done with torch so that it
    # also works while the logits are on the GPU.
    batch_pred = torch.argmax(logits, dim=1)

    # Running count of predictions equal to the labels; compared on the device
    # both tensors already share.
    correct += (batch_pred == b_labels).sum().item()
    print("correct = {}\n".format(correct))

    predictions.append(batch_pred.cpu())
    true_labels.append(b_labels.cpu())

# Predictions for the whole test set in dataloader order, regardless of how many
# batches the loader produced.
pred = torch.cat(predictions) if predictions else torch.empty(0, dtype=torch.long)

In [None]:
MODEL_NAME = "BERT"

# Write one "id,label" row per prediction, the id being the position in `pred`.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open("{}.csv".format(MODEL_NAME), "w") as f:
    f.write("id,label\n")
    # str() is kept on purpose: the next cell parses the label text exactly as
    # str() renders each element of `pred`.
    f.writelines(str(i) + "," + str(l) + "\n" for i, l in enumerate(pred))

In [None]:
df = pd.read_csv('BERT.csv')
# Labels may have been written as tensor reprs such as "tensor(1)" or may already
# be plain integers. Taking the first integer found in each entry handles both,
# so this cell can be re-run on an already converted file without failing.
df['label'] = df['label'].astype(str).str.extract(r'(-?\d+)', expand=False).astype(int)
# Saving the modified data back to a CSV file
df.to_csv('BERT.csv', index=False)