In [9]:
import os
import pandas as pd
import numpy as np
import time
import random
import torch
import matplotlib.pyplot as plt
import seaborn as sns

from transformers import AutoTokenizer, AutoModelForSequenceClassification, FunnelForSequenceClassification, get_linear_schedule_with_warmup
from torch.optim import AdamW
from sklearn.metrics import classification_report, matthews_corrcoef

import modules.deep_learning_modules as dl_modules

#### Source: https://mccormickml.com/2019/07/22/BERT-fine-tuning/#31-bert-tokenizer
#### https://huggingface.co/docs/transformers/tasks/sequence_classification

In [10]:
# --- Run configuration -------------------------------------------------------
# Everything a reader is likely to tune lives in this cell.
SEED_VAL = 42            # seed handed to the RNGs and to the data split
EPOCHS = 1               # number of passes over the training set
BATCH_SIZE = 16          # batch size used by every data loader
MAX_LEN = 128            # sequence length passed to dl_modules.tokenize_data
LEARNING_RATE = 2e-5     # learning rate given to AdamW
MODELTYPE = "transformer"  # must be one of the keys of `model_mapping`

# Short model alias -> Hugging Face Hub checkpoint identifier.
model_mapping = dict(
    bert_base="google-bert/bert-base-uncased",
    distilbert="distilbert/distilbert-base-uncased",
    albert_base="albert/albert-base-v2",
    mistral_small="mistralai/Mistral-7B-v0.1",
    transformer="funnel-transformer/small-base",
)

## Test if GPU is available

In [11]:
# Pick the compute device once: CUDA when PyTorch can see a GPU, CPU otherwise.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

if not use_cuda:
    print('No GPU available, using the CPU instead.')
else:
    # Report how many GPUs are visible and the name of device 0.
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('Use the GPU:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
Use the GPU: NVIDIA GeForce RTX 3050 Laptop GPU


## Data Loading, Tokenization & Input Formatting

In [12]:
# Load the raw texts and their labels through the project helper.
# NOTE(review): the data source and the exact return types are defined in
# dl_modules.read_data, which is not visible here; later cells index
# `sentences[0]` and treat each element as a string — confirm against the helper.
sentences, labels = dl_modules.read_data()

In [13]:
# Fetch the tokenizer that belongs to the checkpoint selected via MODELTYPE.
print('Loading Model tokenizer...')
checkpoint_name = model_mapping[MODELTYPE]
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name, do_lower_case=True)

Loading Model tokenizer...


In [14]:
# Show what the tokenizer does to the first sentence: raw text, word pieces,
# and the vocabulary ids of those word pieces (no special tokens added here).
example_sentence = sentences[0]
example_tokens = tokenizer.tokenize(example_sentence)

print('Original: ', example_sentence)
print('Tokenized: ', example_tokens)
print('Token IDs: ', tokenizer.convert_tokens_to_ids(example_tokens))

Original:  i hate woman
Tokenized:  ['i', 'hate', 'woman']
Token IDs:  [1045, 5223, 2450]


In [15]:
# Longest encoded sentence in the corpus, counted in token ids and including
# the special tokens that `encode` adds. Falls back to 0 for an empty corpus.
max_len = max(
    (len(tokenizer.encode(sent, add_special_tokens=True)) for sent in sentences),
    default=0,
)

print('Max sentence length:', max_len)

Max sentence length: 443


In [16]:
# Turn the raw sentences into model inputs via the project helper. `labels` is
# rebound to whatever the helper returns for it.
# NOTE(review): the printed example below shows `input_ids[0]` as a tensor of
# MAX_LEN ids padded with 0; the measured maximum sentence length (443) exceeds
# MAX_LEN (128), so longer inputs are presumably truncated inside
# dl_modules.tokenize_data — confirm against the helper.
input_ids, attention_masks, labels = dl_modules.tokenize_data(sentences, labels, tokenizer, MAX_LEN)

# Print sentence 0, now as a list of IDs.
print('Original:', sentences[0])
print('Token IDs:', input_ids[0])

Original: i hate woman
Token IDs: tensor([ 101, 1045, 5223, 2450,  102,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0])


## Data Splitting

In [17]:
# Split the encoded data into train / validation / test sets with the project
# helper; SEED_VAL is forwarded so the split can be reproduced later.
train_dataset, val_dataset, test_dataset = dl_modules.create_data_split(
    input_ids, attention_masks, labels, SEED_VAL
)

# Report the size of every split and of their union.
n_train, n_val, n_test = len(train_dataset), len(val_dataset), len(test_dataset)

print('{:>5,} training samples'.format(n_train))
print('{:>6,} validation samples'.format(n_val))
print('{:>6,} test samples\n'.format(n_test))

print('Whole dataset size:', n_train + n_val + n_test)

28,134 training samples
 9,378 validation samples
 9,378 test samples

Whole dataset size: 46890


In [18]:
# Wrap the train and validation splits in batched loaders via the project helper.
# NOTE(review): the third positional argument (False) passed for the validation
# loader is presumably a shuffle flag that defaults to True for training —
# confirm against dl_modules.create_data_loader.
train_dataloader = dl_modules.create_data_loader(train_dataset, BATCH_SIZE)
validation_dataloader = dl_modules.create_data_loader(val_dataset, BATCH_SIZE, False)

In [19]:
# Drop the names that are no longer needed so the objects they refer to can be
# garbage-collected; only the two data loaders are used during training. The
# test split is rebuilt from scratch in the "Load Model" section further down.
del sentences, labels, input_ids, attention_masks, train_dataset, val_dataset, test_dataset

## Creating Model

In [20]:
# The funnel checkpoint is loaded through its dedicated class; every other
# entry of `model_mapping` goes through the generic Auto class.
model_cls = (
    FunnelForSequenceClassification
    if MODELTYPE == "transformer"
    else AutoModelForSequenceClassification
)

# Binary classification head; attentions and hidden states are not returned.
model = model_cls.from_pretrained(
    model_mapping[MODELTYPE],
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False,
)

# Move the model to the selected device (the repr is shown as the cell output).
model.to(device)

model.safetensors:   0%|          | 0.00/462M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of FunnelForSequenceClassification were not initialized from the model checkpoint at funnel-transformer/small-base and are newly initialized: ['classifier.linear_out.bias', 'classifier.linear_hidden.weight', 'classifier.linear_out.weight', 'classifier.linear_hidden.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


FunnelForSequenceClassification(
  (funnel): FunnelBaseModel(
    (embeddings): FunnelEmbeddings(
      (word_embeddings): Embedding(30522, 768)
      (layer_norm): LayerNorm((768,), eps=1e-09, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): FunnelEncoder(
      (attention_structure): FunnelAttentionStructure(
        (sin_dropout): Dropout(p=0.1, inplace=False)
        (cos_dropout): Dropout(p=0.1, inplace=False)
      )
      (blocks): ModuleList(
        (0-2): 3 x ModuleList(
          (0-3): 4 x FunnelLayer(
            (attention): FunnelRelMultiheadAttention(
              (hidden_dropout): Dropout(p=0.1, inplace=False)
              (attention_dropout): Dropout(p=0.1, inplace=False)
              (q_head): Linear(in_features=768, out_features=768, bias=False)
              (k_head): Linear(in_features=768, out_features=768, bias=True)
              (v_head): Linear(in_features=768, out_features=768, bias=True)
              (post_pro

In [21]:
# Materialise the (name, tensor) pairs of every model parameter.
params = list(model.named_parameters())

print(f'The {model_mapping[MODELTYPE]} model has {len(params)} different named parameters.\n')

# Heading -> slice of `params` printed under it. The slices are fixed index
# ranges, so for architectures whose layout differs the headings are only
# approximate (for the funnel model the first slice already contains two
# attention bias tensors, as the recorded output shows).
sections = (
    ('==== Embedding Layer ====\n', params[0:5]),
    ('\n==== First Transformer ====\n', params[5:21]),
    ('\n==== Output Layer ====\n', params[-4:]),
)

for heading, section_params in sections:
    print(heading)
    for name, tensor in section_params:
        print("{:<55} {:>12}".format(name, str(tuple(tensor.size()))))

The funnel-transformer/small-base model has 247 different named parameters.

==== Embedding Layer ====

funnel.embeddings.word_embeddings.weight                (30522, 768)
funnel.embeddings.layer_norm.weight                           (768,)
funnel.embeddings.layer_norm.bias                             (768,)
funnel.encoder.blocks.0.0.attention.r_w_bias                (12, 64)
funnel.encoder.blocks.0.0.attention.r_r_bias                (12, 64)

==== First Transformer ====

funnel.encoder.blocks.0.0.attention.r_kernel            (768, 12, 64)
funnel.encoder.blocks.0.0.attention.r_s_bias                (12, 64)
funnel.encoder.blocks.0.0.attention.seg_embed            (2, 12, 64)
funnel.encoder.blocks.0.0.attention.q_head.weight         (768, 768)
funnel.encoder.blocks.0.0.attention.k_head.weight         (768, 768)
funnel.encoder.blocks.0.0.attention.k_head.bias               (768,)
funnel.encoder.blocks.0.0.attention.v_head.weight         (768, 768)
funnel.encoder.blocks.0.0.attention.v

In [22]:
# AdamW over all model parameters with the configured learning rate.
optimizer = AdamW(
    model.parameters(),
    lr=LEARNING_RATE,
    eps=1e-8,  # numerical-stability term
)

In [23]:
# One optimizer step is taken per training batch, so the schedule length is
# (batches per epoch) x (epochs) — not the number of training samples.
steps_per_epoch = len(train_dataloader)
total_steps = steps_per_epoch * EPOCHS

# Linear learning-rate schedule over `total_steps` with no warm-up phase.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

## Training the Model

In [None]:
# Fine-tune the model for EPOCHS epochs, validating after every epoch and
# writing the accumulated per-epoch statistics to a CSV file.

# Seed every RNG in play so that a run can be repeated.
random.seed(SEED_VAL)
np.random.seed(SEED_VAL)
torch.manual_seed(SEED_VAL)
torch.cuda.manual_seed_all(SEED_VAL)

training_stats = []
total_t0 = time.time()

# Where the per-epoch statistics are written.
stats_dir = "../performance_statistics"
stats_path = f"{stats_dir}/training_stats_{MODELTYPE}_{LEARNING_RATE}.csv"

# For each epoch...
for epoch_i in range(0, EPOCHS):

    # ========================================
    #               Training
    # ========================================

    # Perform one full pass over the training set.

    print("")
    print(f'======== Epoch {epoch_i + 1} / {EPOCHS} ========')
    print('Training...')

    # Measure how long the training epoch takes.
    t0 = time.time()

    # Reset the running totals for this epoch.
    total_train_accuracy = 0
    total_train_loss = 0
    model.train()

    # For each batch of training data...
    for step, batch in enumerate(train_dataloader):

        # Progress update every 200 batches.
        if step % 200 == 0 and not step == 0:
            elapsed = dl_modules.format_time(time.time() - t0)
            print(f'  Batch {step:>5,}  of  {len(train_dataloader):>5,}.    Elapsed: {elapsed}.')

        # `batch` contains three pytorch tensors:
        #   [0]: input ids
        #   [1]: attention masks
        #   [2]: labels
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Clear gradients left over from the previous step.
        model.zero_grad()

        # A single forward pass yields both the loss and the logits. Running
        # the model twice would double the cost and, with dropout active in
        # train mode, produce logits that do not belong to the loss that is
        # back-propagated.
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels)
        loss = outputs.loss

        total_train_loss += loss.item()

        # Move logits and labels to CPU for the accuracy computation; the
        # logits are detached so they do not keep the graph alive.
        logits = outputs.logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        total_train_accuracy += dl_modules.flat_accuracy(logits, label_ids)

        # Backward pass, gradient clipping (max norm 1.0), then update the
        # weights and advance the learning-rate schedule.
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    # Average accuracy and loss over all training batches.
    avg_train_accuracy = total_train_accuracy / len(train_dataloader)
    avg_train_loss = total_train_loss / len(train_dataloader)

    # Measure how long this epoch took.
    training_time = dl_modules.format_time(time.time() - t0)

    print("")
    print(f"  Average training accuracy: {avg_train_accuracy:.2f}")
    print(f"  Average training loss: {avg_train_loss:.2f}")
    print(f"  Training epoch took: {training_time:}")

    # ========================================
    #               Validation
    # ========================================

    print("")
    print("Running Validation...")

    t0 = time.time()
    model.eval()

    # Running totals for the validation pass.
    total_eval_accuracy = 0
    total_eval_loss = 0

    # Evaluate data for one epoch
    for batch in validation_dataloader:

        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # No gradients are needed for evaluation; one forward pass gives both
        # the loss and the logits.
        with torch.no_grad():
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask,
                            labels=b_labels)

        # Accumulate the validation loss.
        total_eval_loss += outputs.loss.item()

        # Move logits and labels to CPU
        logits = outputs.logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        total_eval_accuracy += dl_modules.flat_accuracy(logits, label_ids)

    # Report the final accuracy for this validation run.
    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print("  Average Validation Accuracy: {0:.2f}".format(avg_val_accuracy))

    # Calculate the average loss over all of the batches.
    avg_val_loss = total_eval_loss / len(validation_dataloader)

    # Measure how long the validation run took.
    validation_time = dl_modules.format_time(time.time() - t0)

    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    # Record all statistics from this epoch.
    training_stats.append(
        {
            'Epoch': epoch_i + 1,
            'Training_Loss': avg_train_loss,
            'Training_Accuracy': avg_train_accuracy,
            'Validation_Loss': avg_val_loss,
            'Validation_Accuracy': avg_val_accuracy,
            'Training_Time': training_time,
            'Validation_Time': validation_time
        }
    )

    # Rewrite the statistics file after every epoch so that partial results
    # survive an interrupted run.
    df_stats = pd.DataFrame(data=training_stats)
    os.makedirs(stats_dir, exist_ok=True)
    df_stats.to_csv(stats_path, index=True)

print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(dl_modules.format_time(time.time()-total_t0)))

In [None]:
# Models are stored under ../models/<filename>. Keeping the default file names
# inside that directory lets the model be reloaded with from_pretrained().
models_root = '../models/'
filename = f"model_{MODELTYPE}_{LEARNING_RATE}"

# Make sure the parent directory exists before anything is written to it.
if not os.path.exists(models_root):
    os.makedirs(models_root)

output_dir = models_root + filename

In [None]:
print(f"Saving model to {output_dir}")

# If the model is wrapped in an object that exposes the real model as
# `.module`, save the wrapped model; otherwise save the model itself.
model_to_save = getattr(model, 'module', model)
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

## View Training/Validation Results

In [None]:
# Reload the per-epoch statistics written by the training cell.
# The file name must include the learning rate, exactly as it was written
# during training; without it the file is not found.
stats_path = f"../performance_statistics/training_stats_{MODELTYPE}_{LEARNING_RATE}.csv"

# The CSV was saved with its row index (index=True), so the first column is
# that unnamed index; consume it here instead of keeping it as a data column.
df_stats = pd.read_csv(stats_path, index_col=0)

# Use the epoch number as the row index for display and plotting.
df_stats = df_stats.set_index('Epoch')
df_stats

In [None]:
# Learning curve: training vs. validation loss per epoch.
plt.figure(figsize=(12, 6))

plt.plot(df_stats['Training_Loss'], 'b-o', label="Training")
plt.plot(df_stats['Validation_Loss'], 'g-o', label="Validation")

# Label the plot.
plt.title("Training & Validation Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
# One tick per epoch. The tick positions must be a flat 1-D sequence; wrapping
# the array in a list would turn it into a 2-D array.
plt.xticks(np.arange(1, EPOCHS + 1))

plt.show()

In [None]:
# Learning curve: training vs. validation accuracy per epoch.
plt.figure(figsize=(12,6))

plt.plot(df_stats['Training_Accuracy'], 'b-o', label="Training")
plt.plot(df_stats['Validation_Accuracy'], 'g-o', label="Validation")

# Label the plot.
plt.title("Training & Validation Accuracy")
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.legend()
# One tick per epoch. The tick positions must be a flat 1-D sequence; wrapping
# the array in a list would turn it into a 2-D array.
plt.xticks(np.arange(1, EPOCHS + 1))

plt.show()

In [None]:
# Time spent per epoch on training and on validation.
# NOTE(review): the time columns hold whatever dl_modules.format_time returns;
# if that is a formatted string (e.g. "h:mm:ss") matplotlib will treat the
# values as categories rather than durations — confirm and convert if needed.
plt.figure(figsize=(12,6))

plt.plot(df_stats['Training_Time'], 'b-o', label="Training")
plt.plot(df_stats['Validation_Time'], 'g-o', label="Validation")

# Label the plot.
plt.title("Training & Validation Time")
plt.xlabel("Epoch")
plt.ylabel("Time")
plt.legend()
# One tick per epoch. The tick positions must be a flat 1-D sequence; wrapping
# the array in a list would turn it into a 2-D array.
plt.xticks(np.arange(1, EPOCHS + 1))

plt.show()

## Load Model

In [None]:
# Restore the fine-tuned artefacts that were saved to `output_dir`.
tokenizer = AutoTokenizer.from_pretrained(output_dir)
model = AutoModelForSequenceClassification.from_pretrained(output_dir)

# Place the restored model on the selected device (repr shown as cell output).
model.to(device)

In [None]:
# Rebuild the test split from the raw data (the earlier one was deleted to
# free memory). The same SEED_VAL is passed to the split helper as before.
sentences, labels = dl_modules.read_data()
input_ids, attention_masks, labels = dl_modules.tokenize_data(sentences, labels, tokenizer, MAX_LEN)

# Only the third element of the split (the test set) is needed here.
data_split = dl_modules.create_data_split(input_ids, attention_masks, labels, SEED_VAL)
_, _, test_dataset = data_split

testing_dataloader = dl_modules.create_data_loader(test_dataset, BATCH_SIZE, False)

## Eval on Test Set

In [None]:
# Run the model over the test set and collect its outputs.
print(f'Predicting labels for {len(test_dataset)} test sentences...')

# Evaluation mode for the model.
model.eval()

# Per-batch raw logits / labels (used for the MCC cells below) ...
predictions, true_labels = [], []
# ... and flat per-sample predicted / true classes (used for the report).
predictions_accuracy, true_labels_accuracy = [], []

n_test_batches = len(testing_dataloader)

for index, batch in enumerate(testing_dataloader):

    # Move the three tensors of the batch to the device and unpack them.
    b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)

    # Forward pass without gradient tracking: saves memory and time.
    with torch.no_grad():
        outputs = model(b_input_ids, token_type_ids=None,
                        attention_mask=b_input_mask)

    # First element of the output holds the logits; bring everything to CPU.
    logits = outputs[0].detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    predictions.append(logits)
    true_labels.append(label_ids)

    # Predicted class = index of the largest logit in each row.
    predictions_accuracy.extend(np.argmax(logits, axis=1).flatten())
    true_labels_accuracy.extend(label_ids.flatten())

    if index % 50 == 0:
        print(f"Done with {index} out of {n_test_batches} batches.")

print('DONE.')

In [None]:
# Per-class precision / recall / F1 on the test set.
# scikit-learn expects the ground truth first and the predictions second;
# passing them the other way round swaps precision and recall.
report = classification_report(
    true_labels_accuracy,
    predictions_accuracy,
    target_names=['not_hate', 'hate'],
)

# Save the classification report to a file (create the folder if it is missing,
# e.g. when this section is run without the training cell).
report_dir = '../performance_statistics'
os.makedirs(report_dir, exist_ok=True)
with open(f'{report_dir}/classification_report_{MODELTYPE}.txt', 'w') as file:
    file.write(report)

print("Classification Report:\n", report)

## Matthews Correlation Coefficient

In [None]:
# Share of positive (hateful) labels. `labels` was rebuilt from the full data
# set in the "Load Model" section, so this covers all samples, not only the
# test split. The count is converted to a plain int so it prints as a number,
# and the length is taken from `labels` itself (it has no `.label` attribute).
n_positive = int(labels.sum())
n_total = len(labels)
print(f'Positive hateful samples: {n_positive} of {n_total} ({n_positive / n_total * 100:.2f}%)')

In [None]:
# Matthews correlation coefficient, computed separately for every test batch.
print('Calculating Matthews Corr. Coef. for each batch...')

# Each entry of `predictions` is a 2-column array of logits (column 0 for
# class "0", column 1 for class "1"); argmax over the columns turns it into
# 0/1 labels that are then compared with the true labels of the same batch.
matthews_set = [
    matthews_corrcoef(batch_true, np.argmax(batch_logits, axis=1).flatten())
    for batch_true, batch_logits in zip(true_labels, predictions)
]

In [None]:
# Bar plot of the per-batch MCC score for the last batches of the test set.
plt.figure(figsize=(25, 12))

# Show at most the last `n_shown` batches. The start index is clamped at 0 so
# that x and y keep the same length when fewer batches are available.
n_shown = 20
first_batch = max(len(matthews_set) - n_shown, 0)
batch_numbers = list(range(first_batch, len(matthews_set)))

ax = sns.barplot(x=batch_numbers, y=matthews_set[first_batch:], errorbar=None)

# Write the score above every bar.
for p in ax.patches:
    ax.annotate(f"{p.get_height():.2f}", (p.get_x() + p.get_width() / 2., p.get_height()), ha = 'center', va = 'center', xytext = (0, 10), textcoords = 'offset points')

plt.title('MCC Score per Batch')
plt.ylabel('MCC Score (-1 to +1)')
plt.xlabel('Batch #')

plt.show()

In [None]:
# MCC over the whole test set rather than per batch.

# Stack the per-batch logits and reduce each row to its highest-scoring class.
all_logits = np.concatenate(predictions, axis=0)
flat_predictions = np.argmax(all_logits, axis=1).flatten()

# Stack the per-batch true labels in the same order.
flat_true_labels = np.concatenate(true_labels, axis=0)

mcc = matthews_corrcoef(flat_true_labels, flat_predictions)

print(f'Total MCC: {mcc:.3f}')