importing libraries and connecting to gpu

In [None]:
# showing which GPU (if any) this runtime has been given
!nvidia-smi

In [2]:
import tensorflow as tf
import torch

In [3]:
# connecting torch to the GPU when one is available; otherwise falling back to the CPU
# so that the notebook can still be run (slowly) on a runtime without CUDA
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# installing interface used for working with the BERT model, 
!pip install transformers

In [5]:
import pandas as pd
import numpy as np

taking dataset from google drive

In [None]:
# mounting Google Drive at /content/drive so that the dataset and the checkpoints can be
# read from / written to it; force_remount=True is passed so the mount is redone on every run
from google.colab import drive
drive.mount('/content/drive',force_remount=True)
# os is used further down to create the checkpoint folder and to build checkpoint paths
import os

In [None]:
# reading the labelled dataset from the mounted Drive folder into a DataFrame
# (the file is decoded as ISO-8859-1)
dataset_path = '/content/drive/My Drive/richard bot/dataset.csv'
df = pd.read_csv(dataset_path, encoding='ISO-8859-1')

In [None]:
# previewing the first rows of the dataset
df.head()

In [None]:
# bar chart of the number of rows per toxicity value, to see how balanced the classes are
df.groupby(['toxicity']).size().plot.bar()

In [None]:
# summary statistics of the other columns, computed separately for each toxicity value
df.groupby('toxicity').describe()

balancing dataset using downsampling

In [None]:
# keeping only the rows labelled as toxic (toxicity == 1)
is_toxic = df['toxicity'] == 1
df_toxic = df[is_toxic]
df_toxic.shape

In [None]:
# keeping only the rows labelled as non-toxic (toxicity == 0)
is_nontoxic = df['toxicity'] == 0
df_nontoxic = df[is_nontoxic]
df_nontoxic.shape

In [None]:
# drawing as many non-toxic rows as there are toxic rows
# random_state pins the draw so that re-running the notebook gives the same balanced dataset
# (the global seeds further down are only set after this cell has already run)
df_nontoxic_downsampled = df_nontoxic.sample(n=df_toxic.shape[0], random_state=42)
df_nontoxic_downsampled.shape

In [None]:
# concatenating the toxic rows and the downsampled non-toxic rows into one balanced dataset
balanced_parts = [df_toxic, df_nontoxic_downsampled]
df_balanced = pd.concat(balanced_parts)
df_balanced.shape

In [None]:
# checking the result of the downsampling: number of rows per toxicity value
df_balanced['toxicity'].value_counts()

creating two numpy arrays with the sentences and their toxicity label


In [None]:
# pulling the raw text and the toxicity labels out of the balanced frame as numpy arrays
# (both arrays are in the same row order)
sentences = df_balanced['text'].values
labels = df_balanced['toxicity'].values

In [None]:
# displaying the array of raw sentences
sentences

In [None]:
# displaying the array of toxicity labels
labels

preparing the data for the BERT model - tokenizing

In [None]:
from transformers import BertTokenizer

# loading the BERT tokenizer.
# 'bert-base-uncased' is the same checkpoint name the classification model is loaded from further down
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [None]:
# tokenizing sentences - turning the words into numbers (token ids)
# add_special_tokens=True adds the special tokens: [CLS] at start, [SEP] at end
# nothing is padded or truncated here; that is done in the padding step

# one list of token ids per sentence, in the same order as `sentences`
tokenized_sentences = [
    tokenizer.encode(sent, add_special_tokens=True)
    for sent in sentences
]

preparing the data for the BERT model - padding

---



In [None]:
# counters that the next cell fills in:
# a = number of tokenized sentences with at least 512 tokens
# b = number of tokenized sentences with 256 to 511 tokens
a = 0
b = 0
# c is not used anywhere else in this notebook
c = 0

In [None]:
#seeing how many sentences are a certain length so that max_len for padding can be determined
# the counters are reset in this cell so that re-running it does not add to the previous counts
a = 0  # sentences with at least 512 tokens
b = 0  # sentences with 256 to 511 tokens
for sent in tokenized_sentences:
    n_tokens = len(sent)
    if n_tokens >= 512:
        a += 1
    elif n_tokens >= 256:
        b += 1


In [None]:
# printing both length counts, one per line (first a, then b)
print(a, b, sep='\n')

1180
2304


In [None]:
from keras.preprocessing.sequence import pad_sequences

In [None]:
# setting the maximum sequence length to 256
MAX_LEN = 256

# padding encoded sentences with value 0.
# "post" indicates that we want to pad and truncate at the end of the sequence instead of the beginning
tokenized_sentences = pad_sequences(tokenized_sentences, maxlen=MAX_LEN, dtype="long",
                          value=0, truncating="post", padding="post")

# truncating at the end cuts the closing [SEP] token off every sentence longer than MAX_LEN.
# a row whose last position is not padding is completely filled, so its last token is set to [SEP]
# (rows that were exactly MAX_LEN long already end in [SEP] and are left unchanged by this)
full_rows = tokenized_sentences[:, -1] != 0
tokenized_sentences[full_rows, -1] = tokenizer.sep_token_id



In [None]:
# displaying the padded array of token ids (one row per sentence, MAX_LEN columns)
tokenized_sentences

preparing the data for the BERT model - attention masks

In [None]:
#attention masks to recognize whether something is padding or not
# one mask per sentence: 1 where the token id is greater than 0, 0 where it is padding (id 0)
attention_masks = [
    [int(token_id > 0) for token_id in sent]
    for sent in tokenized_sentences
]

splitting into train and validation set

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# 90% for training and 10% for validation
# inputs, masks and labels are passed to one call so that all three are shuffled and cut identically
(train_inputs, validation_inputs,
 train_masks, validation_masks,
 train_labels, validation_labels) = train_test_split(
    tokenized_sentences, attention_masks, labels,
    random_state=2018, test_size=0.1)

converting numpy arrays to tensors for model usage

In [None]:
# turning every split into a torch tensor; the variable names are reused for the tensors
train_inputs, validation_inputs = torch.tensor(train_inputs), torch.tensor(validation_inputs)
train_labels, validation_labels = torch.tensor(train_labels), torch.tensor(validation_labels)
train_masks, validation_masks = torch.tensor(train_masks), torch.tensor(validation_masks)

creating DataLoaders that hand out the data in batches during training (only the current batch is copied to the GPU, not the whole dataset)

In [None]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

In [None]:
# setting a batch size
batch_size = 16

# bundling (input ids, attention masks, labels) of each split into a dataset
train_data = TensorDataset(train_inputs, train_masks, train_labels)
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)

# training examples are drawn in random order, validation examples in their stored order
train_sampler = RandomSampler(train_data)
validation_sampler = SequentialSampler(validation_data)

# creating the DataLoaders that hand out the batches
train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)
validation_dataloader = DataLoader(validation_data, batch_size=batch_size, sampler=validation_sampler)

Choosing a BERT model
since it is a classification task (toxic or non-toxic), we use BertForSequenceClassification

In [None]:
from transformers import BertForSequenceClassification, AdamW, BertConfig

In [None]:
# loading the model with a linear classification layer on top

model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", # the 12-layer BERT model which does not care about uppercase or lower case
    num_labels = 2, # the number of output labels which is 2 in this case (binary classification)
    output_attentions = False,
    output_hidden_states = False,
)

# moving the model to the same device the batches are copied to during training
# (model.cuda() ignores `device` and only works on a GPU runtime)
model.to(device)

creating an optimizer for training the model

In [None]:
# using AdamW from torch.optim; the AdamW class shipped with transformers is deprecated in favour of it
# weight_decay is set to 0.0 explicitly because torch's default is 0.01,
# whereas the transformers class that was used here before defaulted to 0.0
optimizer = torch.optim.AdamW(model.parameters(),
                  lr = 2e-5, # how much the weights are adjusted each time
                  eps = 1e-8,
                  weight_decay = 0.0
                )

creating a learning rate scheduler for our model

In [None]:
from transformers import get_linear_schedule_with_warmup

In [None]:
# number of training epochs (full passes over the training set)
epochs = 4

# the scheduler is stepped once per batch, so the total number of steps is
# the number of epochs times the number of batches per epoch
total_steps = epochs * len(train_dataloader)

# creating the learning rate scheduler (no warmup steps)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

TRAINING THE MODEL

helper function to calculate current accuracy through every epoch

In [None]:
def accuracy(preds, labels):
    """Return the fraction of rows in `preds` whose highest-scoring column equals the label.

    preds:  2-D array-like of scores, one row per example and one column per class.
    labels: numpy array of the true class indices; it is flattened before comparing.
    """
    predicted = np.argmax(preds, axis=1).flatten()
    expected = labels.flatten()
    n_correct = (predicted == expected).sum()
    return n_correct / expected.size

helper function for calculating elapsed times (takes time in seconds and formats it nicely)

In [None]:
import time
import datetime

In [None]:
def format_time(elapsed):
    """Format a duration given in seconds as an h:mm:ss string.

    The duration is rounded to whole seconds first; the string is the one
    produced by datetime.timedelta.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)


starting the training

In [None]:
import random

In [None]:
# a fixed seed value makes the training reproducible
seed_val = 42

# seeding Python's random module, numpy, torch on the CPU and torch on all GPUs with the same value
for set_seed in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    set_seed(seed_val)

In [None]:
# to store the average loss after each epoch
# (the training cell appends one value per epoch; the list is only emptied by re-running this cell)
loss_values = []

In [None]:
# counter of training runs: the training cell increments it by one every time it is run
# and uses the new value as the name of the folder the checkpoints are saved in
model_run = 0

In [None]:
#for each time you run the cell
# every run of this cell gets its own numbered checkpoint folder on Drive
model_run += 1
run_dir = '/content/drive/My Drive/richard bot/{}'.format(model_run)
# makedirs(exist_ok=True) instead of mkdir: after a kernel restart model_run starts from 0 again,
# so the folder can already exist from an earlier session (its checkpoints are then overwritten)
os.makedirs(run_dir, exist_ok=True)

# for each epoch
for epoch_i in range(0, epochs):
    # TRAINING STEP
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))

    # to measure how long each training takes
    t0 = time.time()
    # resetting the total loss for epoch
    total_loss = 0

    # putting the model into training mode
    model.train()

    # for every batch of data
    for step, batch in enumerate(train_dataloader):

        # Progress update every 200 batches.
        if step % 200 == 0 and not step == 0:
            # Elapsed time since the start of the epoch, formatted as h:mm:ss.
            elapsed = format_time(time.time() - t0)

            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        #  unpacking each batch and copying the tensors to the device (using .to)
        #   [0]: input ids
        #   [1]: attention masks
        #   [2]: labels
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # clearing gradients left over from the previous batch
        model.zero_grad()

        # performing a forward pass (passing the training data into the model)
        # because labels are passed, the first element of the output is the loss
        outputs = model(b_input_ids,
                    token_type_ids=None,
                    attention_mask=b_input_mask,
                    labels=b_labels)

        # taking the loss value out of the returns
        loss = outputs[0]

        # adding the losses of each batch (will calculate average loss at the end using this)
        total_loss += loss.item()

        # doing a backwards pass to calculate gradients
        loss.backward()

        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # updating parameters based on computed gradient to minimize loss
        # modified differently according to how optimizer is defined
        optimizer.step()

        # updating the learning rate.
        scheduler.step()

    # calculating average loss per batch for this epoch
    avg_train_loss = total_loss / len(train_dataloader)

    # storing the average loss of this epoch
    loss_values.append(avg_train_loss)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(format_time(time.time() - t0)))

    # VALIDATION STEP
    # measuring performance of model after each epoch

    print("")
    print("Running Validation...")

    t0 = time.time()

    # putting model in evaluation mode
    model.eval()

    # tracking variables: correctly classified examples and examples seen so far
    eval_correct, eval_examples = 0, 0

    # evaluating data for one epoch
    for batch in validation_dataloader:

        # copying the batch to the device
        batch = tuple(t.to(device) for t in batch)

        # unpacking inputs from dataloader
        b_input_ids, b_input_mask, b_labels = batch

        # telling model to not compute gradients (since we are not training the model here)
        with torch.no_grad():

            # no labels are passed, so the first element of the output is the logits
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask)

        # getting the "logits"
        logits = outputs[0]

        # moving logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # weighting the accuracy of the batch by its size, so that a smaller last batch
        # does not count as much as a full one in the overall accuracy
        eval_correct += accuracy(logits, label_ids) * len(label_ids)
        eval_examples += len(label_ids)

    # accuracy over all validation examples
    eval_accuracy = eval_correct / eval_examples

    # reporting the final accuracy of the validation set
    print("  Accuracy: {0:.2f}".format(eval_accuracy))
    print("  Validation took: {:}".format(format_time(time.time() - t0)))

    # saving the whole model object after every epoch (epoch-0.pth, epoch-1.pth, ...)
    # NOTE(review): this pickles the full module; the loading cells further down rely on that format
    torch.save(model,  os.path.join(run_dir,'epoch-{}.pth'.format(epoch_i)))

print("")
print("TRAINING COMPLETE")

testing the models

In [None]:
# loading the checkpoint saved after the first epoch (epoch index 0) of the current training run
# the training cell increments model_run before it creates the folder, so it never writes to a folder named 0
# note that this replaces the `model` that was just trained
model = torch.load(os.path.join('/content/drive/My Drive/richard bot/{}'.format(model_run), 'epoch-0.pth'))

In [None]:
# loading the checkpoint saved after the second epoch (epoch index 1) of the current training run
# the training cell increments model_run before it creates the folder, so it never writes to a folder named 0
model2 = torch.load(os.path.join('/content/drive/My Drive/richard bot/{}'.format(model_run), 'epoch-1.pth'))

In [None]:
# loading the checkpoint saved after the third epoch (epoch index 2) of the current training run
# the training cell increments model_run before it creates the folder, so it never writes to a folder named 0
model3 = torch.load(os.path.join('/content/drive/My Drive/richard bot/{}'.format(model_run), 'epoch-2.pth'))

In [None]:
import pandas as pd

# a handful of hand-written test sentences with the label each one is expected to get
# (this reuses the names `sentences`, `labels`, `attention_masks` and `batch_size` from the training part)
sentences = ["love you","fuck you","you're a loser","you're so cool"]
labels = [0,1,1,0]

# tokenizing every sentence, with the special tokens added
input_ids = [
    tokenizer.encode(sent, add_special_tokens = True)
    for sent in sentences
]

# padding (and truncating) at the end to MAX_LEN tokens
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN,
                          dtype="long", truncating="post", padding="post")

# attention masks: 1.0 where the token id is greater than 0, 0.0 for padding
attention_masks = [
    [float(token_id > 0) for token_id in seq]
    for seq in input_ids
]

# converting to tensors
prediction_inputs = torch.tensor(input_ids)
prediction_masks = torch.tensor(attention_masks)
prediction_labels = torch.tensor(labels)

# setting batch size
batch_size = 16

# creating a DataLoader that goes through the test sentences in order
prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, batch_size=batch_size, sampler=prediction_sampler)

In [None]:
# putting the model under test (model2) in evaluation mode
model2.eval()

# one array of logits / labels per batch
predictions , true_labels = [], []

for batch in prediction_dataloader:
  # copying the batch to the device
  batch = tuple(t.to(device) for t in batch)

  # unpacking inputs from dataloader
  b_input_ids, b_input_mask, b_labels = batch

  # making predictions with the same model that was put in evaluation mode above
  # (the forward pass used to go through `model` although `model2` was the one prepared)
  with torch.no_grad():
      outputs = model2(b_input_ids, token_type_ids=None,
                       attention_mask=b_input_mask)

  # no labels are passed, so the first element of the output is the logits
  logits = outputs[0]

  # moving logits and labels to CPU
  logits = logits.detach().cpu().numpy()
  label_ids = b_labels.to('cpu').numpy()

  # storing predictions and true labels
  predictions.append(logits)
  true_labels.append(label_ids)


In [None]:
# turning predictions into 0 for nontoxic and 1 for toxic
# first collecting the logit rows of all batches in one list ...
logit_rows = []
for batch_logits in predictions:
    logit_rows.extend(batch_logits)

# ... then taking the index of the larger logit in every row as the predicted class
flat_predictions = np.argmax(logit_rows, axis=1).flatten()

In [None]:
# displaying the predicted class of every test sentence, in the order of `sentences`
flat_predictions

array([0, 1, 1, 0])