In [2]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/19/22/aff234f4a841f8999e68a7a94bdd4b60b4cebcfeca5d67d61cd08c9179de/transformers-3.3.1-py3-none-any.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 3.5MB/s 
Collecting sentencepiece!=0.1.92
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 19.2MB/s 
Collecting tokenizers==0.8.1.rc2
[?25l  Downloading https://files.pythonhosted.org/packages/80/83/8b9fccb9e48eeb575ee19179e2bdde0ee9a1904f97de5f02d19016b8804f/tokenizers-0.8.1rc2-cp36-cp36m-manylinux1_x86_64.whl (3.0MB)
[K     |████████████████████████████████| 3.0MB 30.2MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB

In [3]:
import glob
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, BertModel, BertForSequenceClassification

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Test sentence
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
input_sentence = "Hello my name is Jin"
encoded = tokenizer(input_sentence)
print('Input sentence: ', input_sentence, '\n')
print('Encoded: ', encoded, '\n')
print('Decoded: ', tokenizer.decode(encoded['input_ids']), '\n')


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…


Input sentence:  Hello my name is Jin 

Encoded:  {'input_ids': [101, 7592, 2026, 2171, 2003, 9743, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]} 

Decoded:  [CLS] hello my name is jin [SEP] 



## Get Preprocessed Review Data

In [6]:
# Get all review files
file_path = '/content/drive/My Drive/BERT project/Reviews/*[0-9].csv'
files = glob.glob(file_path)

In [7]:
# Concat all review data from different products into one big dataframe
df_list = []

for file in files:
    df = pd.read_csv(file)
    df_list.append(df)
    
df = pd.concat(df_list, axis=0, ignore_index=True)
df

Unnamed: 0,comment,stars,verified,date,country,helpful,has-media
0,I could sit here and write all about the specs...,5,0,2019-6-25,1,3,0
1,A very reasonably priced laptop for basic comp...,4,0,2019-7-2,1,1,1
2,"This is the best laptop deal you can get, full...",5,1,2019-8-3,1,1,1
3,A few months after the purchase....It is still...,5,1,2019-7-12,1,1,0
4,BUYER BE AWARE: This computer has Microsoft 10...,1,1,2019-8-7,1,6,0
...,...,...,...,...,...,...,...
120384,Great overall product,5,1,2019-12-28,1,0,0
120385,wow~^^very good,5,1,2019-12-30,1,0,0
120386,The sound is amazing!,5,1,2020-1-9,1,0,0
120387,I love my AirPods Pro,5,1,2019-11-23,1,0,0


## Data Exploration

In [8]:
# Check for missing values
df.isnull().any()

comment       True
stars        False
verified     False
date         False
country      False
helpful      False
has-media    False
dtype: bool

In [9]:
# Find reviews with missing comments
missing_indices = df[df['comment'].isnull()].index.tolist()
print('Number of reviews missing comments: ', len(missing_indices))
print('Missing indices: ', missing_indices)

Number of reviews missing comments:  17
Missing indices:  [24835, 37237, 37277, 40072, 50852, 64895, 69760, 80854, 81562, 84103, 86420, 92670, 98539, 98769, 105389, 105405, 112139]


In [10]:
print('Max comment length (of all products): ', int(df.comment.str.len().max()))

Max comment length (of all products):  5127


In [11]:
# Get only the comments and star (labels) data
comment_df = df[['comment', 'stars']]
comment_df

Unnamed: 0,comment,stars
0,I could sit here and write all about the specs...,5
1,A very reasonably priced laptop for basic comp...,4
2,"This is the best laptop deal you can get, full...",5
3,A few months after the purchase....It is still...,5
4,BUYER BE AWARE: This computer has Microsoft 10...,1
...,...,...
120384,Great overall product,5
120385,wow~^^very good,5
120386,The sound is amazing!,5
120387,I love my AirPods Pro,5


In [12]:
# print first 5 reviews
for idx, row in comment_df[:5].iterrows():
    print(row['comment'] + '\n')

I could sit here and write all about the specs on this computer, but they are already in the description, and If you are like me... you don't really understand it anyways.So I am going to tell you what I LOVE about this computer and what I use it for. I am a full time college student as well as a single mother who stays busy. I have previously used a HP All In one computer that I bought brand new a year ago and I hate that thing... It is so slow!!! When I first opened this item, I was just hoping that it would be a little faster! What I got instead was an amazing computer that is faster than I could have ever imagined. Now I don't use this thing for much more than amazon reviews, school work, and papers. But this is exactly what I needed.

A very reasonably priced laptop for basic computing needs. The specs that stick out to me for describing this as "basic needs" is 4GB of RAM, and 128GB M.2 SSD. Both are at the bare minimum in today's needs. Cell phones now come with those specs( hig

## Define the Reviews Dataset
Each item in the dataset will return a dictionary consisting of:


*   input_ids: the input token ids
*   attn_mask: the attention mask of the input sequence
*   label: the target star rating of the input review

In [13]:
class ReviewsDataset(Dataset):
    def __init__(self, df, max_length=1024):
        self.df = df
        self.tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
        self.max_length = max_length 
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        # input=review, label=stars
        review = self.df.loc[idx, 'comment']
        # labels are 0-indexed
        label = int(self.df.loc[idx, 'stars']) - 1
        
        encoded = self.tokenizer(
            review,                      # review to encode
            max_length=self.max_length,  # Truncate all segments to max_length
            padding='max_length',        # pad all reviews with the [PAD] token to the max_length
            return_attention_mask=True,  # Construct attention masks.
            return_tensors='pt'
        )
        
        input_ids = encoded['input_ids']
        attn_mask = encoded['attention_mask']
        
        return {
            'input_ids': input_ids, 
            'attn_mask': attn_mask, 
            'label': label
        }

Define some constants that are important later on.

In [14]:
MAX_LEN = 512
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 2
EPOCHS = 1
LEARNING_RATE = 1e-05

In [15]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

## Create Datasets / DataLoaders
Create the train and test datasets and dataloaders for the neural network.

In [16]:
train_size = 0.8
train_dataset=df.sample(frac=train_size,random_state=200)
test_dataset=df.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)

training_set = ReviewsDataset(train_dataset, MAX_LEN)
testing_set = ReviewsDataset(test_dataset, MAX_LEN)

print("# of samples in train set: {}".format(len(training_set)))
print("# of samples in test set: {}".format(len(testing_set)))

# of samples in train set: 96311
# of samples in test set: 24078


In [17]:
train_params = {
                'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {
                'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

## Define the neural model for fine tuning
Given a review as an input sequence, we want to predict its star rating. This is a multi-class sequence classification task.

For out model, we will use BertForSequenceClassification and set the num_labels argument to the number of unique values for Amazon star ratings.

In [18]:
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels = len(df['stars'].unique()), # number of unique labels for our multi-class classification problem
    output_attentions = False,
    output_hidden_states = False,
)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

## Fine Tuning the Model on Train Dataset

In [19]:
def train(epoch):
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    model.train()
    for _, data in enumerate(training_loader, 0):
        ids = data['input_ids'].to(device, dtype = torch.long)
        mask = data['attn_mask'].to(device, dtype = torch.long)
        targets = data['labels'].to(device, dtype = torch.long)

        outputs = model(ids, mask)
        loss = loss_function(outputs, targets)
        tr_loss += loss.item()
        big_val, big_idx = torch.max(outputs.data, dim=1)
        n_correct += calcuate_accu(big_idx, targets)

        nb_tr_steps += 1
        nb_tr_examples+=targets.size(0)
        
        if _%5000==0:
            loss_step = tr_loss/nb_tr_steps
            accu_step = (n_correct*100)/nb_tr_examples 
            print(f"Training Loss per 5000 steps: {loss_step}")
            print(f"Training Accuracy per 5000 steps: {accu_step}")

        optimizer.zero_grad()
        loss.backward()
        # When using GPU
        optimizer.step()

    print(f'The Total Accuracy for Epoch {epoch}: {(n_correct*100)/nb_tr_examples}')
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accu}")

    return