In [1]:
import os
import pandas as pd
import numpy as np
import time
import random
import torch
import matplotlib.pyplot as plt
import seaborn as sns

from transformers import AutoTokenizer, AutoModelForSequenceClassification, FunnelForSequenceClassification, get_linear_schedule_with_warmup
from torch.optim import AdamW

import modules.deep_learning_modules as dl_modules

#### Source: https://mccormickml.com/2019/07/22/BERT-fine-tuning/#31-bert-tokenizer
#### https://huggingface.co/docs/transformers/tasks/sequence_classification

In [2]:
# --- Run configuration ------------------------------------------------------
# MODELTYPE must be one of the keys of `model_mapping` below.
MODELTYPE = "bert_base"
# Available datasets: dataset_preprocessed_no_transformation, dataset_preprocessed_stopwords
DATASET = 'dataset_preprocessed_no_transformation'

# --- Hyperparameters --------------------------------------------------------
EPOCHS = 1
BATCH_SIZE = 6
LEARNING_RATE = 2e-5
MAX_LEN = 512  # sequence length handed to dl_modules.tokenize_data
SEED_VAL = 42

# CSV file holding the selected preprocessed dataset.
base_path = f'../datasets/01_preprocessed_datasets/{DATASET}.csv'

# Short model name -> Hugging Face Hub checkpoint identifier.
model_mapping = dict(
    bert_base="google-bert/bert-base-uncased",
    distilbert="distilbert/distilbert-base-uncased",
    albert_base="albert/albert-base-v2",
    roberta="openai-community/roberta-base-openai-detector",
    transformer="funnel-transformer/small-base",
)

## Test if GPU is available

In [3]:
# Pick the compute device once: CUDA when PyTorch can see a GPU, CPU otherwise.
use_gpu = torch.cuda.is_available()
device = torch.device("cuda" if use_gpu else "cpu")

if use_gpu:
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('Use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

There are 1 GPU(s) available.
Use the GPU: NVIDIA GeForce RTX 3050 Laptop GPU


## Data Loading, Tokenization & Input Formatting

In [4]:
# Read the texts and their labels from the CSV at `base_path` (built in the config cell).
# NOTE(review): the container types returned by dl_modules.read_data are not visible here;
# later cells only rely on `sentences` being indexable and iterable.
sentences, labels = dl_modules.read_data(base_path)

In [5]:
# Load the tokenizer belonging to the checkpoint selected by MODELTYPE.
print('Loading Model tokenizer...')
# NOTE(review): do_lower_case is forwarded as-is for every entry of model_mapping;
# whether a given checkpoint's tokenizer honours it is not checked here — confirm.
tokenizer = AutoTokenizer.from_pretrained(model_mapping[MODELTYPE], do_lower_case=True)

Loading Model tokenizer...


In [6]:
# Demonstrate the tokenizer on the first sample: raw text -> tokens -> vocabulary IDs.
sample_text = sentences[0]
sample_tokens = tokenizer.tokenize(sample_text)

print('Original: ', sample_text)
print('Tokenized: ', sample_tokens)
print('Token IDs: ', tokenizer.convert_tokens_to_ids(sample_tokens))

Original:  i hate woman
Tokenized:  ['i', 'hate', 'woman']
Token IDs:  [1045, 5223, 2450]


In [7]:
# Length of the longest encoded sample, counting the special tokens the tokenizer
# adds around each text; stays 0 when there are no samples at all.
max_len = max(
    (len(tokenizer.encode(text, add_special_tokens=True)) for text in sentences),
    default=0,
)

print('Max sentence length:', max_len)

Max sentence length: 443


In [9]:
# Turn every sentence into token IDs and an attention mask, using MAX_LEN as the
# sequence length passed to the helper.
# NOTE(review): `labels` is both an argument and a result of this call, so re-running
# the cell feeds the previous result back into tokenize_data — confirm the helper
# tolerates that, or restart the kernel before re-running.
input_ids, attention_masks, labels = dl_modules.tokenize_data(sentences, labels, tokenizer, MAX_LEN)

# Print sentence 0, now as a list of IDs.
print('Original:', sentences[0])
print('Token IDs:', input_ids[0])

Original: i hate woman
Token IDs: tensor([ 101, 1045, 5223, 2450,  102,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
      

  labels = torch.tensor(labels)


## Data Splitting

In [None]:
# Split the encoded data into train / validation / test subsets; SEED_VAL is
# handed to the helper together with the tensors.
train_dataset, val_dataset, test_dataset = dl_modules.create_data_split(
    input_ids, attention_masks, labels, SEED_VAL
)

n_train = len(train_dataset)
n_val = len(val_dataset)
n_test = len(test_dataset)

print(f'{n_train:>5,} training samples')
print(f'{n_val:>6,} validation samples')
print(f'{n_test:>6,} test samples\n')

print('Whole dataset size:', n_train + n_val + n_test)

In [None]:
# Wrap the train and validation subsets in batch iterators of BATCH_SIZE samples.
# NOTE(review): the third positional argument (False) for the validation loader is
# presumably a shuffle flag — confirm against dl_modules.create_data_loader.
train_dataloader = dl_modules.create_data_loader(train_dataset, BATCH_SIZE)
validation_dataloader = dl_modules.create_data_loader(val_dataset, BATCH_SIZE, False)

In [None]:
# Write every batch to its own .pt file so the training loop can read batches back
# from disk one at a time.
model_cache_dir = f'../datasets/02_LLM_datasets/{MODELTYPE}'
run_cache_dir = f'{model_cache_dir}/{DATASET}/{MAX_LEN}'
train_cache_dir = f'{run_cache_dir}/training'
val_cache_dir = f'{run_cache_dir}/validation'

if not os.path.exists(model_cache_dir):
    os.makedirs(model_cache_dir)

# An existing run directory is taken to mean the batches were already written;
# nothing is regenerated in that case.
# NOTE(review): the directory name does not encode BATCH_SIZE, so a cache written
# with a different batch size would be reused as-is — confirm this is intended.
if not os.path.exists(run_cache_dir):

    os.makedirs(train_cache_dir)
    os.makedirs(val_cache_dir)

    for batch_idx, batch_data in enumerate(train_dataloader):
        torch.save(batch_data, f'{train_cache_dir}/batch_{batch_idx}.pt')

    for batch_idx, batch_data in enumerate(validation_dataloader):
        torch.save(batch_data, f'{val_cache_dir}/validation_batch_{batch_idx}.pt')

In [None]:
# Drop the names of the raw, encoded and split data; from here on this notebook only
# uses the dataloaders and the batch files on disk.
# NOTE(review): test_dataset is discarded here without being written anywhere in the
# visible cells — confirm the test split is recreated elsewhere when it is needed.
del sentences, labels, input_ids, attention_masks, train_dataset, val_dataset, test_dataset

## Creating Model

In [None]:
# The Funnel checkpoint is loaded through its dedicated class; every other entry of
# model_mapping goes through the Auto class. Both are configured identically.
if MODELTYPE == "transformer":
    model_cls = FunnelForSequenceClassification
else:
    model_cls = AutoModelForSequenceClassification

model = model_cls.from_pretrained(
    model_mapping[MODELTYPE],
    num_labels=2,                # two-class classification head
    output_attentions=False,     # do not return attention weights
    output_hidden_states=False,  # do not return all hidden states
)

# Move the model to the device chosen earlier (GPU when available).
model.to(device)

In [None]:
# AdamW over all model parameters, using the learning rate from the config cell;
# eps is spelled out explicitly as 1e-8.
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, eps=1e-8)

In [None]:
# Keep the batch counts as plain integers; the training loop iterates over
# range(length_train_dataloader) / range(length_val_dataloader).
length_train_dataloader = len(train_dataloader)
length_val_dataloader = len(validation_dataloader)

# One scheduler step per training batch, for every epoch.
total_steps = length_train_dataloader * EPOCHS

# Linear learning-rate schedule over the whole run, with no warmup steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,  # Default value in run_glue.py
    num_training_steps=total_steps,
)

In [None]:
# The dataloader objects are no longer referenced below: their lengths were stored in
# length_train_dataloader / length_val_dataloader and training reads batches from disk.
del train_dataloader, validation_dataloader

## Training the Model

In [None]:
# Seed the Python, NumPy and PyTorch (CPU + CUDA) random number generators.
random.seed(SEED_VAL)
np.random.seed(SEED_VAL)
torch.manual_seed(SEED_VAL)
torch.cuda.manual_seed_all(SEED_VAL)

# Folder holding the batch files written by the caching cell:
#   training/batch_<i>.pt  and  validation/validation_batch_<i>.pt
batch_root = f'../datasets/02_LLM_datasets/{MODELTYPE}/{DATASET}/{MAX_LEN}'

# The statistics CSV is written into a per-model sub-folder, so that exact folder
# (not just its parent) must exist. Creating it up front surfaces path problems
# before any training time is spent.
stats_dir = f"../performance_statistics/{MODELTYPE}"
os.makedirs(stats_dir, exist_ok=True)
stats_path = f"{stats_dir}/training_stats_{MODELTYPE}_{LEARNING_RATE}_{MAX_LEN}_{DATASET}.csv"

training_stats = []
total_t0 = time.time()

# For each epoch...
for epoch_i in range(0, EPOCHS):

    # ========================================
    #               Training
    # ========================================

    # Perform one full pass over the training set.

    print("")
    print(f'======== Epoch {epoch_i + 1} / {EPOCHS} ========')
    print('Training...')

    # Measure how long the training epoch takes.
    t0 = time.time()

    # Reset the running totals for this epoch.
    total_train_accuracy = 0
    total_train_loss = 0
    model.train()

    # For each batch of training data...
    for step in range(length_train_dataloader):

        batch = torch.load(f'{batch_root}/training/batch_{step}.pt')

        # Progress update every 10 batches.
        if step % 10 == 0 and not step == 0:
            # Elapsed time since the start of this epoch, formatted by the helper.
            elapsed = dl_modules.format_time(time.time() - t0)

            # Report progress.
            print(f'  Batch {step:>5,}  of  {length_train_dataloader:>5,}.    Elapsed: {elapsed}.')

        # `batch` contains three pytorch tensors:
        #   [0]: input ids
        #   [1]: attention masks
        #   [2]: labels
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        model.zero_grad()

        # A single forward pass yields both the loss and the logits. Running the
        # model twice would double the cost and, in train mode, produce logits
        # from a different dropout mask than the loss being optimised.
        # token_type_ids is not passed: it was only ever None, and some model
        # classes do not accept that keyword at all.
        outputs = model(b_input_ids,
                        attention_mask=b_input_mask,
                        labels=b_labels)
        loss = outputs.loss
        logits = outputs.logits

        total_train_loss += loss.item()

        # Move logits and labels to CPU for the accuracy helper.
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        total_train_accuracy += dl_modules.flat_accuracy(logits, label_ids)

        # Backward pass, gradient clipping (max norm 1.0), then parameter and
        # learning-rate updates.
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    # Calculate the average accuracy over all of the batches.
    avg_train_accuracy = total_train_accuracy / length_train_dataloader

    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / length_train_dataloader

    # Measure how long this epoch took.
    training_time = dl_modules.format_time(time.time() - t0)

    print("")
    print(f"  Average training accuracy: {avg_train_accuracy:.2f}")
    print(f"  Average training loss: {avg_train_loss:.2f}")
    print(f"  Training epoch took: {training_time:}")

    # ========================================
    #               Validation
    # ========================================

    print("")
    print("Running Validation...")

    t0 = time.time()
    model.eval()

    # Running totals for this validation pass.
    total_eval_accuracy = 0
    total_eval_loss = 0

    # Evaluate data for one epoch
    for step in range(length_val_dataloader):

        # File name matches what the caching cell writes (validation_batch_<i>.pt).
        batch = torch.load(f'{batch_root}/validation/validation_batch_{step}.pt')

        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Tell pytorch not to bother with constructing the compute graph during
        # the forward pass, since this is only needed for backprop (training).
        with torch.no_grad():
            outputs = model(b_input_ids,
                            attention_mask=b_input_mask,
                            labels=b_labels)

        loss = outputs.loss
        logits = outputs.logits

        # Accumulate the validation loss.
        total_eval_loss += loss.item()

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        total_eval_accuracy += dl_modules.flat_accuracy(logits, label_ids)

    # Report the final accuracy for this validation run.
    avg_val_accuracy = total_eval_accuracy / length_val_dataloader
    print("  Average Validation Accuracy: {0:.2f}".format(avg_val_accuracy))

    # Calculate the average loss over all of the batches.
    avg_val_loss = total_eval_loss / length_val_dataloader

    # Measure how long the validation run took.
    validation_time = dl_modules.format_time(time.time() - t0)

    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    # Record all statistics from this epoch.
    training_stats.append(
        {
            'Epoch': epoch_i + 1,
            'Training_Loss': avg_train_loss,
            'Training_Accuracy': avg_train_accuracy,
            'Validation_Loss': avg_val_loss,
            'Validation_Accuracy': avg_val_accuracy,
            'Training_Time': training_time,
            'Validation_Time': validation_time
        }
    )

    # Rewrite the CSV after every epoch so partial results survive an interrupted run.
    df_stats = pd.DataFrame(data=training_stats)
    df_stats.to_csv(stats_path, index=False)

print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(dl_modules.format_time(time.time()-total_t0)))

In [None]:
# Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
models_root = '../models/'
# The folder name encodes the model type, learning rate, sequence length and dataset.
folder_name = f"model_{MODELTYPE}_{LEARNING_RATE}_{MAX_LEN}_{DATASET}"

# Make sure the parent folder for saved models exists.
if not os.path.exists(models_root):
    os.makedirs(models_root)

output_dir = models_root + folder_name

In [None]:
print(f"Saving model to {output_dir}")

# If the model is wrapped in an object exposing `.module`, save the wrapped model;
# otherwise save the model itself. The tokenizer goes into the same folder.
model_to_save = getattr(model, 'module', model)
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)