# TODO:
* Check seed worker reference  because the parameter is currently not being used

In [35]:
import pandas as pd
import numpy as np
import torch
from pytorch_pretrained_bert import BertForSequenceClassification

from transformers import BertTokenizer
from torch.optim import SGD
from torch.utils.data import DataLoader, TensorDataset, random_split, RandomSampler

import matplotlib.pyplot as plt

import random

import os

In [2]:
seed = 31

## Set the random seeds for Python and Torch
random.seed(seed)
np.random.seed(seed)

torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.cuda.manual_seed(seed)

def seed_worker(worker_id): #function to initalize the seeds for the workers of DataLoader
    worker_seed = torch.initial_seed() %2 ** 32
    np.random.seed(worker_seed)
    random.seed(worker_seed)

g_seed = torch.Generator()
g_seed.manual_seed(seed)

<torch._C.Generator at 0x12b526930>

In [3]:
my_tokenizer = BertTokenizer.from_pretrained("bert-base-cased") #load pre-trained tokenizer

In [42]:
#Tokenize, pad, and tensorize the features
#the apply returns a series of dictionaries, so we turn into a list -> DataFrame so that we can 
#store everything together efficiently
def process_data_frame(input_df):
    """Process DataFrame to format required for Pytorch

    Args:
        input_df (pandas.DataFrame): DataFrame read from csv

    Returns:
        pd.DataFrame: DataFrame with tensor data
    """
    tensor_df = pd.DataFrame(list(input_df['text'].apply(lambda x: my_tokenizer(x,truncation = True, 
                                                           max_length = 512, 
                                                           add_special_tokens= True,
                                                           padding = 'max_length',
                                                           return_tensors = 'pt',
                                                           return_attention_mask = True))))
    tensor_df['label'] = torch.tensor(input_df['label'].values) #tensorize labels
    
    return tensor_df

#Turn the pandas dataframes into lists then tensors as shown in https://mccormickml.com/2019/07/22/BERT-fine-tuning/#31-bert-tokenizer
def custom_train_test_split(df,features = 'input_ids',target = 'label',attention = 'attention_mask' ,test_size = 0.2):
    """Return two dataset objects of training and testing samples respectively

    Args:
        df (pandas.DataFrame): DataFrame containing all the relevant columns 
        features (str, optional): DataFrame column correspondign to the feature components. Defaults to 'input_ids'.
        target (str, optional): DataFrame column corresponding to the label/target . Defaults to 'label'.
        attention (str, optional): DataFrame column corresponding to the attention tokens. Defaults to 'attention_mask'.
        test_size (float, optional): Percent size assigned to testing. Defaults to 0.2.

    Returns:
        tuple: Training and testing dataset objects respectively
    """
    
    
    #Turn DataFrame into tensor objects and then into a dataset
    X_label = torch.cat(df[features].to_list(),dim = 0)
    X_attention = torch.cat(df[attention].to_list(),dim = 0)
    y = torch.tensor(df[target].to_list())
    dataset = TensorDataset(X_label,X_attention,y)
    
    #Split into training and testing datasets
    num_samps = df.shape[0]
    num_test = int(num_samps*test_size)
    num_train = num_samps - num_test
    
    train_data, test_data = random_split(dataset,[num_train,num_test])
    
    return train_data,test_data

In [5]:
if not os.path.isfile('data/modeling_data.zip'): #if the data has not been processed into tensors, read from csv and process
    df = pd.read_csv('data/cleandata.zip') #load dataset
    df.dropna(inplace = True) #drop nans (4 samples)
    
    df = process_data_frame(df)
    df.to_pickle('data/modeling_data.zip')
else: #read from pickle object if it has already been processed
    df = pd.read_pickle('data/modeling_data.zip')

In [43]:
train_dataset, test_dataset = custom_train_test_split(df)

In [51]:
#Define Hyperparameters
batch_size = 1
lr = 0.01
epochs = 100

In [45]:
test_loader = DataLoader(test_dataset,batch_size = batch_size , shuffle = False, num_workers = 0, worker_init_fn = seed_worker, generator = g_seed)
train_loader = DataLoader(train_dataset,batch_size = batch_size , drop_last = True, shuffle = True, worker_init_fn = seed_worker, generator = g_seed)

In [40]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels = 2)
model.train() #set the model to train mode
                                                   
my_optim = SGD(model.parameters(), lr=lr)

In [53]:
#Test that this work on a single batch
batch = next(iter(train_loader))
features = batch[0]
attention = batch[1]
labels = batch[2]

loss = model(features,token_type_ids = None ,attention_mask = attention, labels = labels)

print(loss)

tensor(0.9050, grad_fn=<NllLossBackward0>)
