In [2]:
# Installing the transformers library and additional libraries if looking process 

!pip install -q transformers

# Code for TPU packages install
# !curl -q https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py
# !python pytorch-xla-env-setup.py --apt-packages libomp5 libopenblas-dev

In [82]:
# Importing stock ml libraries
import numpy as np
import pandas as pd
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig
from sklearn import preprocessing
import csv

# Preparing for TPU usage
# import torch_xla
# import torch_xla.core.xla_model as xm
# device = xm.xla_device()

In [39]:
# # Setting up the device for GPU usage

from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [None]:
# #this mounts your Google Drive to the Colab VM.
# from google.colab import drive
# drive.mount('/content/drive', force_remount=True)

# # enter the foldername in the Shared Google Drive
# FOLDERNAME = 'My Drive/Colab Notebooks'
# assert FOLDERNAME is not None, "[!] Enter the foldername."

# # now that we've mounted your Drive, this ensures that
# # the Python interpreter of the Colab VM can load
# # python files from within it.
# import sys
# sys.path.append('/content/drive/{}'.format(FOLDERNAME))

# %cd /content/drive/$FOLDERNAME/

#### Global, One-time declarations

In [83]:
df = pd.read_csv('datasets/lcp_single_train.tsv', delimiter='\t')
df_trial = pd.read_csv('datasets/lcp_single_trial.tsv', delimiter='\t')
#------------------------------------------------------------------------------
complexity = df['complexity']
trial_complexity = df_trial['complexity']
#------------------------------------------------------------------------------
# linreg = LinearRegression()
# clf = Ridge(alpha=30)
#------------------------------------------------------------------------------
f = open('datasets/unigram_freq.csv')
reader = csv.reader(f)
#------------------------------------------------------------------------------
wordToCounts = {}
for row in reader: wordToCounts[row[0]] = row[1]
    

#### Declaring all manual features:

In [84]:
freqCounts = [wordToCounts[str(word).lower()] 
               if str(word).lower() in wordToCounts else 0.0001 
               for word in df['token'].tolist()]  
freqCounts = np.reshape(np.array(freqCounts, dtype='float64'), (len(complexity), 1))
trial_freqCounts = [wordToCounts[str(word).lower()] if str(word).lower() in wordToCounts else 0.0001 for word in df_trial['token'].tolist()]
trial_freqCounts = np.reshape(np.array(trial_freqCounts, dtype='float64'), (len(trial_complexity), 1))
#------------------------------------------------------------------------------
charCounts = np.reshape(np.array([len(str(word)) for word in df['token']]), (len(complexity), 1))
trial_charCounts = np.reshape(np.array([len(str(word)) for word in df_trial['token']], dtype='float64'), (len(trial_complexity), 1))
#------------------------------------------------------------------------------
capitalCounts = []
for word in df['token']:
    count = 0
    for ch in str(word):
        if ch.isupper():
            count += 1
    capitalCounts.append(count)
    
capitalCounts = np.reshape(np.array(capitalCounts), (len(complexity), 1))
                           
trial_capCounts = []
for word in df_trial['token']:
    count = 0
    for ch in str(word):
        if ch.isupper():
            count += 1
    trial_capCounts.append(count)
trial_capCounts = np.reshape(np.array(trial_capCounts), (len(trial_complexity), 1))

#### Standard scaling all features

In [86]:
freqScaler = preprocessing.StandardScaler().fit(freqCounts)
freqCounts = freqScaler.transform(freqCounts)
trial_freqCounts = freqScaler.transform(trial_freqCounts)
charScaler = preprocessing.StandardScaler().fit(charCounts)
charCounts = charScaler.transform(charCounts)
trial_charCounts = charScaler.transform(trial_charCounts)
capitalScaler = preprocessing.StandardScaler().fit(capitalCounts)
capitalCounts = capitalScaler.transform(capitalCounts)
trial_capCounts = capitalScaler.transform(trial_capCounts)

In [93]:
df['freqCounts'] = freqCounts;
df['charCounts'] = charCounts;
df['capitalCounts'] = capitalCounts;
df['list'] = df[df.columns[0:]].values.tolist()
df = df[['sentence', 'freqCounts', 'charCounts', 'capitalCounts', 'list', 'complexity']].copy()

df_trial['freqCounts'] = trial_freqCounts;
df_trial['charCounts'] = trial_charCounts;
df_trial['capitalCounts'] = trial_capCounts;
df_trial['list'] = df_trial[df_trial.columns[0:]].values.tolist()
df_trial = df_trial[['sentence', 'freqCounts', 'charCounts', 'capitalCounts', 'list', 'complexity']].copy()

# df.head()
# df_trial.head()

In [94]:
# Sections of config

# Defining some key variables that will be used later on in the training
MAX_LEN = 200
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
EPOCHS = 1
LEARNING_RATE = 1e-05
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [95]:
class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.comment_text = dataframe.sentence
        self.targets = self.data.complexity
        self.max_len = max_len

        print(self.targets)

    def __len__(self):
        return len(self.comment_text)

    def __getitem__(self, index):
        comment_text = str(self.comment_text[index])
        comment_text = " ".join(comment_text.split())

        inputs = self.tokenizer.encode_plus(
            comment_text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]


        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [96]:
# Creating the dataset and dataloader for the neural network
print("TRAIN Dataset: {}".format(df.shape))
print("TEST Dataset: {}".format(df_trial.shape))

training_set = CustomDataset(df, tokenizer, MAX_LEN)
testing_set = CustomDataset(df_trial, tokenizer, MAX_LEN)

TRAIN Dataset: (7232, 6)
TEST Dataset: (311, 6)
0       0.000000
1       0.000000
2       0.050000
3       0.150000
4       0.055556
          ...   
7227    0.656250
7228    0.671875
7229    0.675000
7230    0.678571
7231    0.680556
Name: complexity, Length: 7232, dtype: float64
0      0.000000
1      0.102941
2      0.109375
3      0.160714
4      0.000000
         ...   
306    0.482143
307    0.500000
308    0.605263
309    0.482143
310    0.571429
Name: complexity, Length: 311, dtype: float64


In [97]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [98]:
# Creating the customized model, by adding a drop out and a dense layer on top of distil bert to get the final output for the model. 

class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.l1 = transformers.BertModel.from_pretrained('bert-base-uncased')
        self.l2 = torch.nn.Dropout(0.3)
        self.l3 = torch.nn.Linear(768, 1)
    
    def forward(self, ids, mask, token_type_ids):
        output = self.l1(ids, attention_mask = mask, token_type_ids = token_type_ids, return_dict=True)
        output_2 = self.l2(output.pooler_output)
        output = self.l3(output_2)
        return output

model = BERTClass()
model.to(device)

BERTClass(
  (l1): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    

In [99]:
def loss_fn(outputs, targets):
    return torch.nn.L1Loss()(outputs, targets)

In [100]:
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [101]:
def train(epoch):
    model.train()
    for _,data in enumerate(training_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        # print(ids.dtype)

        outputs = model(ids, mask, token_type_ids).squeeze()

        optimizer.zero_grad()
        loss = loss_fn(outputs, targets)
        if _%5000==0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [102]:
print(model.device)

ModuleAttributeError: 'BERTClass' object has no attribute 'device'

In [103]:
for epoch in range(EPOCHS):
    train(epoch)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Epoch: 0, Loss:  0.2995516061782837


KeyboardInterrupt: 

<a id='section06'></a>
### Validating the Model

During the validation stage we pass the unseen data(Testing Dataset) to the model. This step determines how good the model performs on the unseen data. 

This unseen data is the 20% of `train.csv` which was seperated during the Dataset creation stage. 
During the validation stage the weights of the model are not updated. Only the final output is compared to the actual value. This comparison is then used to calcuate the accuracy of the model. 

As defined above to get a measure of our models performance we are using the following metrics. 
- Accuracy Score
- F1 Micro
- F1 Macro

We are getting amazing results for all these 3 categories just by training the model for 1 Epoch.

In [None]:
def validation(epoch):
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    with torch.no_grad():
        for _, data in enumerate(testing_loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            outputs = model(ids, mask, token_type_ids)
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
    return fin_outputs, fin_targets

In [None]:
for epoch in range(EPOCHS):
    outputs, targets = validation(epoch)
    outputs = np.array(outputs) >= 0.5
    accuracy = metrics.accuracy_score(targets, outputs)
    f1_score_micro = metrics.f1_score(targets, outputs, average='micro')
    f1_score_macro = metrics.f1_score(targets, outputs, average='macro')
    print(f"Accuracy Score = {accuracy}")
    print(f"F1 Score (Micro) = {f1_score_micro}")
    print(f"F1 Score (Macro) = {f1_score_macro}")



ValueError: ignored

<a id='section07'></a>
### Saving the Trained Model Artifacts for inference

This is the final step in the process of fine tuning the model. 

The model and its vocabulary are saved locally. These files are then used in the future to make inference on new inputs of news headlines.

Please remember that a trained neural network is only useful when used in actual inference after its training. 

In the lifecycle of an ML projects this is only half the job done. We will leave the inference of these models for some other day. 