For the introduction and problem statement, please refer to notebook 1.

Note: this notebook may take more than 10 hours to run. To shorten the run time, I ran it on a Google Colab GPU.

In [2]:
#mount Google Drive at /content/drive so that files stored there can be read from this Colab runtime
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [17]:
#google colab requires a pip install of transformers everytime its refreshed. hence this cell 
#is required if this notebook is to be run again on google colab
pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/48/35/ad2c5b1b8f99feaaf9d7cdadaeef261f098c6e1a6a2935d4d07662a6b780/transformers-2.11.0-py3-none-any.whl (674kB)
[K     |████████████████████████████████| 675kB 7.1MB/s 
[?25hCollecting tokenizers==0.7.0
[?25l  Downloading https://files.pythonhosted.org/packages/14/e5/a26eb4716523808bb0a799fcfdceb6ebf77a18169d9591b2f46a9adb87d9/tokenizers-0.7.0-cp36-cp36m-manylinux1_x86_64.whl (3.8MB)
[K     |████████████████████████████████| 3.8MB 18.4MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 41.5MB/s 
Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)
[K   

In [3]:
#importing the library
import pandas as pd 
import matplotlib.pyplot as plt
import numpy as np
import spacy
from nltk import tokenize
from nltk.corpus import stopwords 
import re
from bs4 import BeautifulSoup
from nltk.stem import WordNetLemmatizer

In [4]:
#location of the cleaned reviews csv on the mounted Google Drive
reviews_csv_path = '/content/drive/My Drive/Capstone pytorch/cleaned_combined_data_with_keywords.csv'

#read the csv; na_filter=False switches off missing-value detection while parsing
new_reviews = pd.read_csv(reviews_csv_path, na_filter=False)

## With BERT

In [6]:
#importing libary
import torch
import pandas as pd
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split

In [7]:
#confirm the shape (rows, columns) of the loaded reviews dataframe
new_reviews.shape

(22041, 26)

In [9]:
#share of each sentiment class in the target column
new_reviews['multi_class_sentiment'].value_counts(normalize=True)
#baseline accuracy is 63.3% (the share of the largest class, label 2).
#The same baseline was also shown when we did logistic regression modelling.

2    0.633592
0    0.269906
1    0.096502
Name: multi_class_sentiment, dtype: float64

In [10]:
#stratified 75/25 split of the row index into a train and a validation set,
#so that both sets keep the class proportions of the full data
sentiment_labels = new_reviews.multi_class_sentiment.values
X_train, X_val, y_train, y_val = train_test_split(
    new_reviews.index.values,
    sentiment_labels,
    test_size=0.25,
    random_state=42,
    stratify=sentiment_labels,
)

In [11]:
#confirm the number of labels in the train split
y_train.shape

(16530,)

In [12]:
#confirm the number of labels in the validation split
y_val.shape

(5511,)

In [13]:
#add a data_type column holding the placeholder "not_set" on every row;
#it is overwritten with the train / validation label afterwards
new_reviews['data_type'] = 'not_set'

In [14]:
#mark each row with the split its index was assigned to by train_test_split
for split_label, split_index in (('train', X_train), ('val', X_val)):
    new_reviews.loc[split_index, 'data_type'] = split_label

In [15]:
#preview the first two rows, including the new data_type column
new_reviews.head(2)

Unnamed: 0,asin,name,rating,date,verified,review_title,body,helpfulVotes,brand,item_title,url,image,reviewUrl,totalReviews,price,originalPrice,sellers,reviews,cleaned_reviews,pos_neg_reviews,tokens,summary,features_and_sentiments,filter summary,data_type,multi_class_sentiment
0,B0000SX2UC,Janet,3,"October 11, 2005",False,"Def not best, but not worst",I had the Samsung A600 for awhile which is abs...,1.0,,Dual-Band / Tri-Mode Sprint PCS Phone w/ Voice...,https://www.amazon.com/Dual-Band-Tri-Mode-Acti...,https://m.media-amazon.com/images/I/2143EBQ210...,https://www.amazon.com/product-reviews/B0000SX2UC,14,0.0,0.0,Flip n Smart Phones,"Def not best, but not worst I had the Samsung ...",def best worst samsung awhile absolute doo doo...,0,"['def', 'best', 'worst', 'samsung', 'awhile', ...","['nice bright large screen', 'ring tone loud e...","[(5, 'screen'), (5, 'battery')]",True,val,1
1,B0000SX2UC,Brooke,5,"December 30, 2003",False,Love This Phone,"This is a great, reliable phone. I also purcha...",5.0,,Dual-Band / Tri-Mode Sprint PCS Phone w/ Voice...,https://www.amazon.com/Dual-Band-Tri-Mode-Acti...,https://m.media-amazon.com/images/I/2143EBQ210...,https://www.amazon.com/product-reviews/B0000SX2UC,14,0.0,0.0,Flip n Smart Phones,"Love This Phone This is a great, reliable phon...",love great reliable also purchased samsung die...,1,"['love', 'great', 'reliable', 'also', 'purchas...",['however ringtones not available online downl...,"[(3, 'ringtones')]",True,train,2


In [16]:
#sanity check: count the rows per (class, split) pair to confirm that
#train and val are represented in the same proportion within each class
new_reviews.groupby(['multi_class_sentiment', 'data_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,asin,name,rating,date,verified,review_title,body,helpfulVotes,brand,item_title,url,image,reviewUrl,totalReviews,price,originalPrice,sellers,reviews,cleaned_reviews,pos_neg_reviews,tokens,summary,features_and_sentiments,filter summary
multi_class_sentiment,data_type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
0,train,4462,4462,4462,4462,4462,4462,4462,4462,4462,4462,4462,4462,4462,4462,4462,4462,4462,4462,4462,4462,4462,4462,4462,4462
0,val,1487,1487,1487,1487,1487,1487,1487,1487,1487,1487,1487,1487,1487,1487,1487,1487,1487,1487,1487,1487,1487,1487,1487,1487
1,train,1595,1595,1595,1595,1595,1595,1595,1595,1595,1595,1595,1595,1595,1595,1595,1595,1595,1595,1595,1595,1595,1595,1595,1595
1,val,532,532,532,532,532,532,532,532,532,532,532,532,532,532,532,532,532,532,532,532,532,532,532,532
2,train,10473,10473,10473,10473,10473,10473,10473,10473,10473,10473,10473,10473,10473,10473,10473,10473,10473,10473,10473,10473,10473,10473,10473,10473
2,val,3492,3492,3492,3492,3492,3492,3492,3492,3492,3492,3492,3492,3492,3492,3492,3492,3492,3492,3492,3492,3492,3492,3492,3492


## Loading Tokenizer and Encoding Data

In [18]:
#importing libraries
from transformers import BertTokenizer
from torch.utils.data import TensorDataset

In [19]:
#loading the BERT tokenizer of the 'bert-base-uncased' checkpoint;
#do_lower_case=True asks the tokenizer to lower-case the input text
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', 
                                          do_lower_case=True)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [21]:
#prepare the data in a format that is readable by BERT
def _encode_split(split_name):
    """
    Tokenise the reviews of one split and collect its labels.

    Parameter
    ----------
    split_name: value of the data_type column to select ('train' or 'val')

    Returns
    -------
    the output of tokenizer.batch_encode_plus for that split and
    a tensor holding the multi_class_sentiment labels of the same rows
    """
    split_frame = new_reviews[new_reviews.data_type == split_name]
    #every review is padded to max_length=256 and returned as pytorch tensors
    encoded = tokenizer.batch_encode_plus(
        split_frame.reviews.values,
        add_special_tokens=True,
        return_attention_mask=True,
        pad_to_max_length=True,
        max_length=256,
        return_tensors='pt',
    )
    labels = torch.tensor(split_frame.multi_class_sentiment.values)
    return encoded, labels


encoded_data_train, labels_train = _encode_split('train')
encoded_data_val, labels_val = _encode_split('val')

#token ids and attention masks of the train split
input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']

#token ids and attention masks of the validation split
input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']

In [22]:
#preparing the data for BERT input: each dataset item is the triple
#(input ids, attention mask, label) of one review
dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

In [24]:
#sanity check: number of examples in the validation dataset
len(dataset_val)

5511

In [24]:
#sanity check: number of examples in the validation dataset
#NOTE(review): this cell repeats the validation check; the training dataset may have been intended - confirm
len(dataset_val)

5511

## Setting up BERT Pretrained Model

In [25]:
#import library
from transformers import BertForSequenceClassification

In [26]:
#instantiate the pre-trained model
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=3,
                                                      output_attentions=False,
                                                      output_hidden_states=False)


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




## Creating Data Loaders

In [27]:
#importing library
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

In [28]:
#defining batchsize
batch_size = 32

#final processing of data input
dataloader_train = DataLoader(dataset_train, 
                              sampler=RandomSampler(dataset_train), 
                              batch_size=batch_size)

dataloader_validation = DataLoader(dataset_val, 
                                   sampler=SequentialSampler(dataset_val), 
                                   batch_size=batch_size)

In [29]:
#display the training data loader object
dataloader_train 

<torch.utils.data.dataloader.DataLoader at 0x7f4011aa7048>

In [30]:
#training examples divided by the batch size: 516 full batches plus one partial batch per epoch
16530/32

516.5625

## Setting Up Optimiser and Scheduler

In [31]:
from transformers import AdamW, get_linear_schedule_with_warmup

In [32]:
#AdamW optimiser over all model parameters, with learning rate 1e-5 and epsilon 1e-8
optimizer = AdamW(model.parameters(),
                  lr=1e-5, 
                  eps=1e-8)

In [33]:
#number of passes over the training set
epochs = 3

#the scheduler is stepped once per training batch, over all epochs
total_training_steps = len(dataloader_train) * epochs

#linear learning-rate schedule without warm-up steps
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_training_steps,
)

## Creating Training Loop

In [35]:
import random

#use one seed for every random number generator involved (python, numpy, torch cpu and torch cuda)
#NOTE(review): the model and the data loaders are created in earlier cells, before these seeds
#are set - any randomness consumed there is not covered; confirm whether this cell should run first
seed_val = 42
for set_seed in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    set_seed(seed_val)

In [36]:
#run on the GPU when one is available, otherwise fall back to the CPU
use_gpu = torch.cuda.is_available()
device = torch.device('cuda' if use_gpu else 'cpu')

#move the model weights to the chosen device
model.to(device)

print(device)

cuda


In [37]:
def evaluate(dataloader_val):
    """
    Run the model over a validation data loader without tracking gradients.

    Used in this notebook to report the validation loss after each epoch,
    which is the basis for choosing the best epoch.

    Parameter
    ----------
    dataloader_val: data loader yielding batches of (input_ids, attention_mask, labels)

    Returns
    -------
    validation loss averaged over the batches, the logits of every example
    (numpy array) and the matching true labels (numpy array)
    """
    #switch the model to evaluation mode
    model.eval()

    running_loss = 0
    batch_logits, batch_labels = [], []

    #go through the validation data batch by batch
    for batch in tqdm(dataloader_val):

        #move the tensors of the batch onto the device the model lives on
        moved = [tensor.to(device) for tensor in batch]
        input_ids, attention_mask, labels = moved[0], moved[1], moved[2]

        #forward pass only - gradient calculation is disabled
        with torch.no_grad():
            outputs = model(input_ids=input_ids,
                            attention_mask=attention_mask,
                            labels=labels)

        #the first output is the loss, the second holds the logits
        running_loss += outputs[0].item()

        #collect logits and labels as numpy arrays on the cpu
        batch_logits.append(outputs[1].detach().cpu().numpy())
        batch_labels.append(labels.cpu().numpy())

    loss_val_avg = running_loss/len(dataloader_val)

    #stack the per-batch arrays into one array per result
    predictions = np.concatenate(batch_logits, axis=0)
    true_vals = np.concatenate(batch_labels, axis=0)

    return loss_val_avg, predictions, true_vals

In [38]:
#wrapping every epoch with tqdm to show the progress
for epoch in tqdm(range(1, epochs+1)):
    #setting the model to training mode
    model.train()
    
    loss_train_total = 0
    
    #wrapping the dataloader input in tqdm to create progress bar 
    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    
    #fine-tuning the model
    for batch in progress_bar:

        model.zero_grad()
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }       

        outputs = model(**inputs)
        
        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()
        
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})
         
    #save the model and export it     
    torch.save(model.state_dict(), f'finetuned_BERT_epoch_{epoch}.model')
    
    #print the epoch
    tqdm.write(f'\nEpoch {epoch}')
    
    #calculate training loss and print it as fine-tuning happens
    loss_train_avg = loss_train_total/len(dataloader_train)            
    #print the training loss
    tqdm.write(f'Training loss: {loss_train_avg}')
    
    #calculate validation loss and print it as fine-tuning happens
    val_loss, predictions, true_vals = evaluate(dataloader_validation)
    
    tqdm.write(f'Validation loss: {val_loss}')
    

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, description='Epoch 1', max=517.0, style=ProgressStyle(description_widt…


Epoch 1
Training loss: 0.47671686403986563


HBox(children=(FloatProgress(value=0.0, max=173.0), HTML(value='')))


Validation loss: 0.3717515245079994


HBox(children=(FloatProgress(value=0.0, description='Epoch 2', max=517.0, style=ProgressStyle(description_widt…


Epoch 2
Training loss: 0.3232948561442198


HBox(children=(FloatProgress(value=0.0, max=173.0), HTML(value='')))


Validation loss: 0.3608986152432902


HBox(children=(FloatProgress(value=0.0, description='Epoch 3', max=517.0, style=ProgressStyle(description_widt…


Epoch 3
Training loss: 0.2729301833884513


HBox(children=(FloatProgress(value=0.0, max=173.0), HTML(value='')))


Validation loss: 0.364634551043283

