# TODO:

In [None]:
pip install gdown pytorch_pretrained_bert transformers

In [None]:
import pandas as pd
import numpy as np
import torch
from pytorch_pretrained_bert import BertForSequenceClassification

from transformers import BertTokenizer, get_linear_schedule_with_warmup
from torch.optim import AdamW
from torch.utils.data import DataLoader, TensorDataset, random_split, RandomSampler

import matplotlib.pyplot as plt

import random
import pickle
import time
from datetime import datetime

import os
import gdown
import gc

Setting random seeds for reproducibility:

In [None]:
# One seed value shared by every random number generator used in this notebook.
seed = 31

# Torch generators first (CPU, current CUDA device, all CUDA devices),
# then NumPy and Python's built-in `random`.
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

np.random.seed(seed)
random.seed(seed)

def seed_worker(worker_id):
    """Seed NumPy and Python's `random` from Torch's current initial seed.

    Passed as ``worker_init_fn`` to the DataLoaders in this notebook. The seed
    is ``torch.initial_seed()`` reduced modulo 2**32.

    Args:
        worker_id (int): Worker index supplied by the caller; not used.
    """
    derived_seed = torch.initial_seed() % (2 ** 32)
    random.seed(derived_seed)
    np.random.seed(derived_seed)

# Dedicated, explicitly seeded generator (manual_seed returns the generator itself).
g_seed = torch.Generator().manual_seed(seed)

In [None]:
# Pre-trained tokenizer for the "bert-base-uncased" checkpoint; used as a
# module-level global by process_data_frame.
my_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

Defining some helper functions:

In [None]:
# Tokenize, pad, and tensorize the features.
# `apply` returns a Series of dict-like tokenizer outputs, so we convert it to a list and then
# to a DataFrame in order to keep all the tokenizer fields together.
def process_data_frame(input_df):
    """Tokenize the text column and attach tensorized labels.

    Every entry of ``input_df['text']`` is passed through the module-level
    ``my_tokenizer`` (truncated and padded to 512 tokens, PyTorch tensors
    requested). The per-row tokenizer outputs are gathered into a new
    DataFrame, and a ``label`` column built from ``input_df['label']`` is added.

    Args:
        input_df (pandas.DataFrame): Frame with ``text`` and ``label`` columns.

    Returns:
        pandas.DataFrame: One row per input row holding the tokenizer outputs
        plus the ``label`` column.
    """
    def _encode(text):
        # One tokenizer call per row, always padded to the same fixed length.
        return my_tokenizer(text,
                            truncation=True,
                            max_length=512,
                            add_special_tokens=True,
                            padding='max_length',
                            return_tensors='pt',
                            return_attention_mask=True)

    encoded_rows = list(input_df['text'].apply(_encode))
    tensor_df = pd.DataFrame(encoded_rows)

    # `.values` drops the pandas index, so labels are attached purely by position.
    tensor_df['label'] = torch.tensor(input_df['label'].values)

    return tensor_df

#Turn the pandas dataframes into lists then tensors as shown in https://mccormickml.com/2019/07/22/BERT-fine-tuning/#31-bert-tokenizer
def custom_train_test_split(df,features = 'input_ids',target = 'label',attention = 'attention_mask' ,test_size = 0.2,val_size = 0.2,gen_seed= g_seed):
    """Build a TensorDataset from `df` and split it into train/test/validation parts.

    Args:
        df (pandas.DataFrame): Frame holding the feature, attention and target columns.
        features (str, optional): Column with the tokenized inputs. Defaults to 'input_ids'.
        target (str, optional): Column with the labels. Defaults to 'label'.
        attention (str, optional): Column with the attention masks. Defaults to 'attention_mask'.
        test_size (float, optional): Fraction of the samples assigned to testing. Defaults to 0.2.
        val_size (float, optional): Fraction of the samples assigned to validation. Defaults to 0.2.
        gen_seed (torch.Generator, optional): Generator driving the random split.
            Defaults to the module-level g_seed.

    Returns:
        tuple: (training, testing, validation) dataset subsets, in that order.
    """
    # Subset sizes: test and validation are rounded down, training takes the remainder.
    total = df.shape[0]
    n_test = int(total*test_size)
    n_val = int(total*val_size)
    n_train = total - n_test - n_val

    # Concatenate the per-row tensors along dim 0 and wrap everything in one dataset.
    token_ids = torch.cat(df[features].to_list(), dim=0)
    attention_masks = torch.cat(df[attention].to_list(), dim=0)
    labels = torch.tensor(df[target].to_list())
    full_dataset = TensorDataset(token_ids, attention_masks, labels)

    train_data, test_data, val_data = random_split(full_dataset,
                                                   [n_train, n_test, n_val],
                                                   generator=gen_seed)
    return train_data, test_data, val_data

def set_device():
  """Choose the compute device and print which one is in use.

  Returns:
    str: "cuda" when torch.cuda.is_available() is true, otherwise "cpu".
  """
  if torch.cuda.is_available():
    print("GPU is enabled in this notebook.")
    return "cuda"

  print("WARNING: Running on cpu ")
  return "cpu"

def freeze_all_but_classifier(model, freeze_status = True):
  """Freeze (or unfreeze) every parameter outside the classification head.

  Parameters whose name contains 'classifier' are never touched.

  Args:
      model (torch.nn.Module): Model whose parameters are updated in place.
      freeze_status (bool, optional): True freezes the non-classifier
          parameters (requires_grad = False); False leaves them trainable
          (requires_grad = True). Defaults to True.

  Returns:
      torch.nn.Module: The same model object that was passed in.
  """
  # A frozen parameter must NOT require gradients, so the flag is the negation
  # of freeze_status (assigning freeze_status directly inverted the behaviour).
  trainable = not freeze_status
  for name, param in model.named_parameters():
    if 'classifier' not in name:
      param.requires_grad = trainable

  return model

def save_checkpoint_model(empty_model,empty_optim,curr_epoch=None,train_losses=None,val_losses=None,accuracies=None,save_path = '10min_2023_07_checkpoint'):
  """Save the model, the optimizer and the training history to disk.

  Three files are written: `save_path` (pickled history dictionary),
  `{save_path}_model` and `{save_path}_optim` (both via torch.save).
  The parent directory of `save_path` is created when it does not exist.

  Args:
      empty_model (BERT model) : model to save.
      empty_optim (torch.optim) : optimizer to save.
      curr_epoch (int) : epoch the checkpoint belongs to.
      train_losses (list) : training losses calculated so far.
      val_losses (list) : validation losses calculated so far.
      accuracies (list) : accuracies calculated so far.
      save_path (str) : path prefix to save to.

  Returns:
      None
  """
  # Without this, open() fails when e.g. the 'executions/' folder has not been created yet.
  target_dir = os.path.dirname(save_path)
  if target_dir:
    os.makedirs(target_dir, exist_ok=True)

  checkpoint_dict = {'curr_epoch':curr_epoch,'train_losses':train_losses,
                       'val_losses':val_losses,'accuracies':accuracies}

  with open(f'{save_path}','wb') as fid:
    pickle.dump(checkpoint_dict,fid,protocol = pickle.HIGHEST_PROTOCOL)

  torch.save(empty_model,f'{save_path}_model') #save model
  torch.save(empty_optim,f'{save_path}_optim') #save optimization method
  
def load_checkpoint_model(load_path = '10min_2023_07_checkpoint'):
  """Read back a checkpoint: history from `load_path`, model and optimizer from its siblings.

  Args:
      load_path (str) : path prefix to load from; `{load_path}_model` and
          `{load_path}_optim` are read in addition to `load_path` itself.

  Returns:
      tuple : model, optimizer, curr_epoch, train_losses, val_losses, accuracies
  """
  # NOTE(review): pickle.load and torch.load can execute arbitrary code while
  # unpickling; only load checkpoints that come from a trusted source.
  model_file = f'{load_path}_model'
  optim_file = f'{load_path}_optim'

  with open(f'{load_path}','rb') as fid:
    history = pickle.load(fid)

  restored_model = torch.load(model_file)
  restored_optim = torch.load(optim_file)

  history_fields = tuple(history[key] for key in ('curr_epoch', 'train_losses', 'val_losses', 'accuracies'))
  return (restored_model, restored_optim) + history_fields
  

def generate_random_id():
  """Return a 6-digit identifier taken from the current time's microsecond field.

  The value is zero-padded, so the result always has exactly six digits
  (a plain str() of the microsecond value would be shorter below 100000).

  NOTE(review): presumably the clock is used instead of `random` because
  `random` is seeded at the top of the notebook and would yield the same ID
  on every run -- confirm.

  Returns:
      str: six decimal digits.
  """
  return f'{datetime.now().microsecond:06d}'

Getting the dataset:

In [None]:
if not os.path.isfile('data/modeling_data.zip'):
    # Make sure the target folder exists before the download is written into it.
    os.makedirs('data', exist_ok=True)
    # Google Drive file id; named file_id so the builtin `id` is not shadowed.
    file_id = '1p3wPDlGq7fsIjiEV2hce90N-OI3YCOdw'
    output = "data/modeling_data.zip"
    gdown.download(id=file_id, output=output, quiet=False)

In [None]:
if os.path.isfile('data/modeling_data.zip'):
    # Already processed into tensors: read the pickled frame back.
    df = pd.read_pickle('data/modeling_data.zip')
else:
    # Not processed yet: read the csv, drop rows with NaNs, tokenize, and cache the result.
    df = pd.read_csv('data/cleandata.zip')
    df.dropna(inplace=True)

    df = process_data_frame(df)
    df.to_pickle('data/modeling_data.zip')

In [None]:
# Store the label column as unsigned 8-bit integers.
df.label = df.label.astype('uint8')

Define Hyperparameters:

In [None]:
# --- Batching ---
max_batches = 798  # only run a subset of batches; set to np.nan to ignore
batch_size = 260
batch_n_report = 6  # report every this many batches

# --- Optimisation ---
lr = 0.001  # increased to make use of the scheduler
eps = 1e-8
epochs = 6  # the BERT authors recommend between 2 and 4; adjust to 2 if overfitting
freeze_layers = False  # set to True to freeze all the layers but the classification layer

# --- Data split fractions ---
test_data_size = 0.15
val_data_size = 0.15

# Snapshot of the run configuration.
info = dict(max_batches=max_batches,
            batch_size=batch_size,
            lr=lr,
            eps=eps,
            epochs=epochs,
            freeze_layers=freeze_layers,
            seed=seed,
            val_data_size=val_data_size,
            test_data_size=test_data_size)

# Simple identifier generated once per execution.
execution_id = generate_random_id()

In [None]:
# Split once, then wrap each subset in its own DataLoader.
train_dataset, test_dataset, validation_dataset = custom_train_test_split(
    df,
    test_size=test_data_size,
    val_size=val_data_size,
)

# Evaluation loaders keep a fixed order; only the training loader shuffles
# and drops the last incomplete batch.
test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=0,
    worker_init_fn=seed_worker,
    generator=g_seed,
)
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    drop_last=True,
    shuffle=True,
    worker_init_fn=seed_worker,
    generator=g_seed,
)
validation_loader = DataLoader(
    validation_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=0,
    worker_init_fn=seed_worker,
    generator=g_seed,
)

In [None]:
this_model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels = 2)
this_model = freeze_all_but_classifier(this_model,freeze_status = freeze_layers)
this_optim = my_optim = AdamW(this_model.parameters(),
                              lr=lr,
                              eps = eps)

# Total number of training steps is [number of batches] x [number of epochs]. NOT the total number of samples.
# NaN never compares equal to anything (not even itself), so `max_batches == np.nan` is always
# False; np.isnan is the reliable way to detect the "no batch limit" setting.
if max_batches is None or np.isnan(max_batches):
    scheduler_total_steps = len(train_loader) * epochs
else:
    scheduler_total_steps = max_batches * epochs

scheduler = get_linear_schedule_with_warmup(my_optim,
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = scheduler_total_steps)

info['scheduler_total_steps'] = scheduler_total_steps

In [None]:
def train_loop(model,
               optimizer,
               train_dataset,
               test_dataset,
               val_dataset = [],
               epochs = 4,
               device = 'cpu',
               report_every_n_batches = 10,
               max_batch = np.nan,
               load = '',
               skip_batches = 300):
    """Train `model`, then compute validation loss and test accuracy after every epoch.

    Besides its arguments the function uses these module-level names:
    `scheduler` (stepped after every optimizer step), `execution_id`,
    `save_checkpoint_model` and `load_checkpoint_model`.

    When `load` is given, the checkpoint's weights and optimizer state are
    copied INTO the `model` / `optimizer` objects passed by the caller. This
    keeps the optimizer attached to the model's own parameters, keeps the
    module-level scheduler attached to the optimizer, and leaves the caller
    holding the trained model when the function returns.

    Args:
        model (torch.model): Returns a loss when called with labels and logits otherwise.
        optimizer (torch.optimizer): Optimizer with the model parameters already fed in.
        train_dataset (torch.DataLoader): Training batches of (features, attention, labels).
        test_dataset (torch.DataLoader): Test batches of (features, attention, labels).
        val_dataset (torch.DataLoader, optional): Validation batches of (features, attention, labels).
            Defaults to [] (never mutated).
        epochs (int, optional): Number of epochs to run in this call. Defaults to 4.
        device (str, optional): Device to move the model and batches to. Defaults to 'cpu'.
        report_every_n_batches (int, optional): Print the running loss every this many batches.
            Defaults to 10.
        max_batch (int, optional): Maximum batch index (exclusive) visited per loader.
            None or NaN disables the limit. Defaults to np.nan.
        load (str, optional): Path of a checkpoint to resume from. Defaults to ''.
        skip_batches (int, optional): Number of leading training batches to skip in every
            epoch; validation and test loaders skip 15% of this number. Defaults to 300.

    Returns:
        tuple: (average training loss per epoch, average validation loss per epoch,
        test accuracy in percent per epoch, total time in seconds)
    """
    ts = time.time()
    limited = not _no_batch_limit(max_batch)
    append = ''
    if limited:
        append  = f'.\nOnly doing {max_batch} batches.'

    print(f'Begin model Training. \nTotal epochs: {epochs}.\nTotal training batches: {len(train_dataset)}.\
          \nTotal validation batches: {len(val_dataset)}.\nTotal testing batches: {len(test_dataset)}' + append + '\n')

    all_train_loss, all_eval_loss, all_accuracy = [],[],[]
    start_epoch = 0

    checkpoint = load_checkpoint_model(load_path = load) if load != '' else None

    if checkpoint is not None: #resume from a previous run
        saved_model, saved_optim, last_epoch, saved_train, saved_eval, saved_acc = checkpoint
        all_train_loss = list(saved_train or [])
        all_eval_loss = list(saved_eval or [])
        all_accuracy = list(saved_acc or [])
        if last_epoch is not None:
            start_epoch = last_epoch + 1 #the checkpoint is written after its epoch has finished
        model.load_state_dict(_state_dict_of(saved_model))

    model.to(device)

    if checkpoint is not None:
        # Loaded after model.to(device) so the optimizer state lands on the parameters' device.
        optimizer.load_state_dict(_state_dict_of(saved_optim))

    # Added to address partial data executions: always run `epochs` more epochs
    end_epoch = start_epoch + epochs

    # Validation/test loaders skip a share scaled to their 15% data split
    eval_skip = skip_batches * 0.15

    for ep in range(start_epoch, end_epoch): #iterate the desired epochs
        tb = time.time()

        print(f'------------------- Epoch {ep} / {end_epoch - 1} ------------------- ')
        train_loss, eval_loss = 0.0, 0.0
        train_batches, eval_batches = 0, 0

        #Perform training of the model
        model.train()
        for train_iter,batch in enumerate(train_dataset): #iterate through all batches
            t0 = time.time()
            if limited and train_iter >= max_batch: #in case we want to do a batch_subset
                break

            if train_iter < skip_batches: #skipped batches are still drawn from the loader
                continue

            feature = batch[0].to(device) #extract relevant values and move to device
            attention = batch[1].to(device)
            label = batch[2].to(device)

            optimizer.zero_grad() #zero the gradients
            loss = model(feature,token_type_ids = None, attention_mask = attention, labels = label)

            train_loss += loss.item() #accumulate the epoch loss as a plain float
            train_batches += 1

            loss.backward() #estimate the gradients
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) #clip gradients to avoid "exploding gradients"
            optimizer.step() #adjust parameters
            scheduler.step() #adjust the (module-level) scheduler

            if train_iter % report_every_n_batches == 0 and not train_iter ==0: #periodic progress report
                elapsed_t = round(time.time() - t0,2)
                print(f'Batch {train_iter} done in {elapsed_t} seconds. Total training loss: {np.round(float(train_loss),4)}')

        #Perform validation of the model
        model.eval()
        for eval_iter, batch in enumerate(val_dataset):
            if limited and eval_iter >= max_batch: #in case we want to do a batch_subset
                break

            if eval_iter < eval_skip:
                continue

            feature = batch[0].to(device) #extract relevant values and move to device
            attention = batch[1].to(device)
            label = batch[2].to(device)

            with torch.no_grad(): #we don't want to estimate gradients when validating
                loss = model(feature,token_type_ids = None, attention_mask = attention, labels = label)

            eval_loss += loss.item()
            eval_batches += 1

        #Measure accuracy on the test loader
        correct, seen = 0, 0
        for test_iter,batch in enumerate(test_dataset):
            if limited and test_iter >= max_batch: #in case we want to do a batch subset
                break

            if test_iter < eval_skip:
                continue

            feature = batch[0].to(device)
            attention = batch[1].to(device)
            labels = batch[2]

            with torch.no_grad(): #we don't want to estimate gradients when testing
                logits = model(feature,token_type_ids = None, attention_mask = attention)

            preds = logits.argmax(dim = 1).cpu() #predicted class = largest logit
            correct += int((preds == labels.cpu()).sum().item())
            seen += labels.numel()

        # Averages use the number of batches/samples actually processed (not the last
        # loop index) and fall back to NaN when nothing was processed.
        avg_train_loss = _safe_mean(train_loss, train_batches)
        avg_eval_loss = _safe_mean(eval_loss, eval_batches)
        accuracy = _safe_mean(correct, seen) * 100

        all_train_loss.append(avg_train_loss)
        all_eval_loss.append(avg_eval_loss) #append losses to visualize across epochs
        all_accuracy.append(accuracy)

        save_checkpoint_model(model,
                         optimizer,
                         curr_epoch=ep,
                         train_losses=all_train_loss,
                         val_losses=all_eval_loss,
                         accuracies=all_accuracy,
                         save_path = f'executions/{execution_id}_{datetime.now().strftime("%Y_%m_%d_%H_%Mmin_info")}'
                            )

        t_ep = round(time.time() - tb,2)
        print(f'\nEpoch {ep} done in {t_ep} seconds.\nTotal training loss: {np.round(float(train_loss),4)}, Total validation loss: {np.round(float(eval_loss),4)}')
        print(f'Averaged across batches: Training loss: {np.round(avg_train_loss,4)}, Validation loss: {np.round(avg_eval_loss,4)}')
        print(f'Accuracy on Testing Dataset for this epoch: {np.round(accuracy,2)}%\n')

    total_time = time.time() - ts
    print(f'-------------------  DONE TRAINING -------------------------------------- \n total time: {round(total_time,2)} seconds')
    return all_train_loss,all_eval_loss,all_accuracy,total_time


def _no_batch_limit(max_batch):
    """Return True when `max_batch` is None or NaN, i.e. no batch limit applies."""
    return max_batch is None or bool(np.isnan(max_batch))


def _state_dict_of(saved):
    """Return the state dict of a saved object, or the object itself if it already is one."""
    return saved.state_dict() if hasattr(saved, 'state_dict') else saved


def _safe_mean(total, count):
    """Return total / count, or NaN when count is zero."""
    return total / count if count else float('nan')
    

```python
def train_loop(model,
               optimizer,
               train_dataset,
               test_dataset,
               val_dataset = [],
               epochs = 4,
               device = 'cpu',
               report_every_n_batches = 10,
               max_batch = np.nan,
               load = ''): 
    """Train the model and, after every epoch, compute validation loss and test accuracy.

    This variant visits every batch from the start of each loader (no batches
    are skipped). Besides its arguments it uses the module-level names
    `scheduler`, `execution_id`, `save_checkpoint_model` and
    `load_checkpoint_model`.

    Args:
        model (torch.model): Output of model has to be a loss, not labels
        optimizer (torch.optimizer): Optimizer with the model parameters already fed in
        train_dataset (torch.DataLoader) : Training dataset that has 3 outputs (features, attention, labels)
        test_dataset (torch.DataLoader) : Test dataset that has 3 outputs (features, attention, labels)
        val_dataset (torch.DataLoader,optional): Validation dataset that has 3 outputs (features, attention, labels). Defaults to [].
        epochs (int, optional): Epoch index at which training stops (exclusive). Defaults to 4.
        device (str, optional): Device to load the data onto. Defaults to 'cpu'.
        report_every_n_batches (int, optional): Number of batches between progress reports. Defaults to 10.
        max_batch(int, optional): Max number of batches to assess. Defaults to np.nan.
        load(str,optional): Path to load a checkpoint model. Defaults to ''.

    Returns:
        tuple: Training loss, Validation Loss, Accuracies, total time
    """
  
    ts = time.time()
    append = ''
    # NOTE(review): identity test -- only the np.nan object itself disables the limit;
    # any other NaN value (e.g. float('nan')) is treated as a limit.
    if max_batch is not np.nan:
        append  = f'.\nOnly doing {max_batch} batches.'
        
    print(f'Begin model Training. \nTotal epochs: {epochs}.\nTotal training batches: {len(train_dataset)}.\
          \nTotal validation batches: {len(val_dataset)}.\nTotal testing batches: {len(test_dataset)}' + append + '\n')
    
    all_train_loss, all_eval_loss, all_accuracy = [],[],[]
    
    if load != '': #If we did early stopping, we can resume 
        # NOTE(review): the loaded model/optimizer are bound to local names only; the objects
        # passed in by the caller are not updated. start_epoch is the epoch stored in the checkpoint.
        model,optimizer,start_epoch,all_train_loss,all_eval_loss,all_accuracy = load_checkpoint_model(load_path = load)
    else:
        start_epoch = 0
    
    model.to(device)
    
    # When resuming, only the epochs from start_epoch up to (not including) `epochs` are run
    for ep in range(start_epoch,epochs): #iterate the desired epochs
        tb = time.time()
        
        print(f'------------------- Epoch {ep} ------------------- ')
        train_loss,eval_loss,accuracy = 0 ,0 ,0
        
        #Perform training of the model
        model.train()
        for train_iter,batch in enumerate(train_dataset): #iterate through all batches
            t0 = time.time()    
            # NOTE(review): `>` lets batch indices 0..max_batch through, i.e. max_batch + 1 batches
            if max_batch is not np.nan and train_iter > max_batch: #in case we want to do a batch_subset
                break
            
            feature = batch[0].to(device) #extract relevant values and move to device
            attention = batch[1].to(device)
            label = batch[2].to(device)
            
            optimizer.zero_grad() #zero the gradients
            loss = model(feature,token_type_ids = None, attention_mask = attention, labels = label)
            
            train_loss += loss.detach().cpu().numpy() #calculate epoch loss and detach
            
            loss.backward() #estimate the gradients
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) #clip gradients to avoid "exploding gradients"
            optimizer.step() #adjust parameters
            scheduler.step() #adjust the scheduler (module-level global, not a parameter)
            
            if train_iter % report_every_n_batches == 0 and not train_iter ==0: #update output every report_every_n_batches batches
                elapsed_t = round(time.time() - t0,2)
                print(f'Batch {train_iter} done in {elapsed_t} seconds. Total training loss: {np.round(float(train_loss),4)}')
        
        #Perform validation of the model
        model.eval()
        for eval_iter,batch in enumerate(val_dataset):
            if max_batch is not np.nan and eval_iter > max_batch: #in case we want to do a batch_subset
                break
            
            feature = batch[0].to(device) #extract relevant values and move to device
            attention = batch[1].to(device)
            label = batch[2].to(device)
            
            with torch.no_grad(): #we don't want to estimate gradients when validating
                loss = model(feature,token_type_ids = None, attention_mask = attention, labels = label)
            
            eval_loss += loss.detach().cpu().numpy() #detach from tensors and turn into numpy arrays to plot and calculate
        
        #Measure accuracy on the test loader
        number_test_batches = 0
        
        for test_iter,batch in enumerate(test_dataset):
            if max_batch is not np.nan and test_iter > max_batch: #in case we want to do a batch subset
                break
                
            feature = batch[0].to(device)
            attention = batch[1].to(device)
            labels = batch[2]
            
            with torch.no_grad(): #we don't want to estimate gradients when testing
                logits = model(feature,token_type_ids = None, attention_mask = attention)
                
            # logits.detach().to('cpu').numpy()
            preds = np.argmax(logits.detach().cpu().numpy(),axis = 1) #index of the largest logit per sample
            
            accuracy  += np.sum(preds == labels.detach().cpu().numpy())/ len(labels.detach().cpu().numpy()) #fraction of accurate predictions in this batch
            number_test_batches += 1
        
        # Mean of the per-batch accuracies, expressed in percent
        accuracy = accuracy / number_test_batches * 100
                             
        # NOTE(review): the divisors are the last loop indices (train_iter / eval_iter),
        # not the number of batches that were processed.
        all_train_loss.append(train_loss/train_iter)
        all_eval_loss.append(eval_loss/eval_iter) #append losses to visualize across epochs
        all_accuracy.append(accuracy)
        
        # Checkpoint after every epoch; the file name carries execution_id and a timestamp
        save_checkpoint_model(model,
                         optimizer,
                         curr_epoch=ep,
                         train_losses=all_train_loss,
                         val_losses=all_eval_loss,
                         accuracies=all_accuracy,
                         save_path = f'executions/{execution_id}_{datetime.now().strftime("%Y_%m_%d_%H_%Mmin_info")}'
                            )
        
        
        t_ep = round(time.time() - tb,2)
        print(f'\nEpoch {ep} done in {t_ep} seconds.\nTotal training loss: {np.round(float(train_loss),4)}, Total validation loss: {np.round(float(eval_loss),4)}')
        print(f'Averaged across batches: Training loss: {np.round(train_loss / train_iter,4)}, Validation loss: {np.round(eval_loss / eval_iter,4)}')
        print(f'Accuracy on Testing Dataset for this epoch: {np.round(accuracy,2)}%\n')
      
    total_time = time.time() - ts
    print(f'-------------------  DONE TRAINING -------------------------------------- \n total time: {round(total_time,2)} seconds')
    return all_train_loss,all_eval_loss,all_accuracy,total_time
    
```

In [None]:
# Free PyTorch's cached CUDA memory and run Python's garbage collector before training starts.
torch.cuda.empty_cache()
gc.collect()

In [None]:
# Checkpoint to resume from; load_path is passed to train_loop through its `load` argument.
checkpoint_name = '911665_04min_2023_07_info'
load_path = f'executions/{checkpoint_name}'

In [None]:
DEVICE = set_device()

# Run the training loop, resuming from the checkpoint at load_path.
train_loss, eval_loss, accuracies, total_time = train_loop(
    this_model,
    this_optim,
    train_loader,
    test_loader,
    val_dataset=validation_loader,
    epochs=epochs,
    report_every_n_batches=batch_n_report,
    device=DEVICE,
    max_batch=max_batches,
    load=load_path,
)

# Add the results to the run configuration and persist everything.
info.update(avg_train_loss=train_loss,
            avg_eval_loss=eval_loss,
            accuracies=accuracies,
            total_time=total_time)
f_name_to_save = f'{execution_id}_{datetime.now().strftime("%Y_%m_%d_%H_%Mmin_summary_info")}'

# Summary of training/validation results together with the parameters
with open(f'executions/{f_name_to_save}','wb') as fid:
    pickle.dump(info, fid, protocol=pickle.HIGHEST_PROTOCOL)

torch.save(this_model, f'executions/{execution_id}_{f_name_to_save}_model')
torch.save(this_optim, f'executions/{execution_id}_{f_name_to_save}_optim')

# Learning curves: losses in the top panel, test accuracy in the bottom panel.
fig, ax = plt.subplots(nrows=2, ncols=1, sharex=True, figsize=(8,8))

ax[0].plot(range(len(train_loss)), train_loss,
           color='black', linestyle='solid', marker='o',
           label='Training Loss')
ax[0].plot(range(len(train_loss)), eval_loss,
           color='black', linestyle='dashed', marker='o', fillstyle='none',
           label='Validation Loss')
ax[0].set_ylabel('Average Loss Across Batches')
ax[0].legend()

ax[1].plot(range(len(train_loss)), accuracies,
           color='black', linestyle='solid', marker='o')
ax[1].set_xlabel('Epoch')
ax[1].set_ylabel('Accuracy on Testing Dataset');

fig.savefig(f'executions/{execution_id}_curves.png')   # save the figure to file

In [None]:
## To read file:
# with open('executions/22min_2023_07_info_summary','rb') as rfid:
#    info = pickle.load(rfid)

In [None]:
# info