In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
import pandas as pd
import numpy as np
import spacy
import string
import matplotlib.pyplot as plt
import seaborn as sns
import timeit
import collections
from IPython.display import HTML, IFrame
from textblob import TextBlob
from wordcloud import WordCloud
from tqdm import tqdm_notebook
from torchtext import data
import torch

In [3]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/99/84/7bc03215279f603125d844bf81c3fb3f2d50fe8e511546eb4897e4be2067/transformers-4.0.0-py3-none-any.whl (1.4MB)
[K     |████████████████████████████████| 1.4MB 19.0MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 55.9MB/s 
[?25hCollecting tokenizers==0.9.4
[?25l  Downloading https://files.pythonhosted.org/packages/0f/1c/e789a8b12e28be5bc1ce2156cf87cb522b379be9cadc7ad8091a4cc107c4/tokenizers-0.9.4-cp36-cp36m-manylinux2010_x86_64.whl (2.9MB)
[K     |████████████████████████████████| 2.9MB 50.5MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.43-cp36-none-any.whl size=893257 sha256=45eb68ba3928

In [4]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [5]:
len(tokenizer.vocab)

30522

In [6]:
tokens = tokenizer.tokenize('Hello WORLD how ARE yoU?')

print(tokens)

indexes = tokenizer.convert_tokens_to_ids(tokens)

print(indexes)

['hello', 'world', 'how', 'are', 'you', '?']
[7592, 2088, 2129, 2024, 2017, 1029]


In [7]:
init_token = tokenizer.cls_token
eos_token = tokenizer.sep_token
pad_token = tokenizer.pad_token
unk_token = tokenizer.unk_token

print(init_token, eos_token, pad_token, unk_token)

[CLS] [SEP] [PAD] [UNK]


In [8]:
init_token_idx = tokenizer.convert_tokens_to_ids(init_token)
eos_token_idx = tokenizer.convert_tokens_to_ids(eos_token)
pad_token_idx = tokenizer.convert_tokens_to_ids(pad_token)
unk_token_idx = tokenizer.convert_tokens_to_ids(unk_token)

print(init_token_idx, eos_token_idx, pad_token_idx, unk_token_idx)

101 102 0 100


In [9]:
init_token_idx = tokenizer.cls_token_id
eos_token_idx = tokenizer.sep_token_id
pad_token_idx = tokenizer.pad_token_id
unk_token_idx = tokenizer.unk_token_id

print(init_token_idx, eos_token_idx, pad_token_idx, unk_token_idx)

101 102 0 100


In [10]:
max_input_length = tokenizer.max_model_input_sizes['bert-base-uncased']

print(max_input_length)

512


In [11]:
def tokenize_and_cut(sentence):
    tokens = tokenizer.tokenize(sentence) 
    tokens = tokens[:max_input_length-2]
    return tokens

In [12]:
from torchtext import data

TEXT = data.Field(batch_first = True,
                  use_vocab = False,
                  tokenize = tokenize_and_cut,
                  preprocessing = tokenizer.convert_tokens_to_ids,
                  init_token = init_token_idx,
                  eos_token = eos_token_idx,
                  pad_token = pad_token_idx,
                  unk_token = unk_token_idx)

LABEL = data.LabelField(dtype = torch.float)
print("loading from csv ...")
tv_datafields = [("review", TEXT), ("label", LABEL)]

# Step two construction our dataset.
train, valid, test = data.TabularDataset.splits(path='/content/drive/MyDrive/SENTIMENT',
                                                train="train.csv", validation="valid.csv",
                                                test="test_dataset.csv", format="csv",
                                                skip_header=True, fields=tv_datafields)
print(train[0].__dict__.keys())

loading from csv ...
dict_keys(['review', 'label'])


In [13]:
print(f"Number of training examples: {len(train)}")
print(f"Number of validation examples: {len(valid)}")
print(f"Number of testing examples: {len(test)}")

Number of training examples: 17500
Number of validation examples: 7500
Number of testing examples: 25000


In [14]:
print(vars(train.examples[6]))

{'review': [1045, 10312, 2023, 3185, 2043, 1045, 2001, 1037, 26612, 1999, 2267, 1012, 1045, 1005, 2310, 2699, 2000, 3422, 2009, 2093, 2335, 1010, 1996, 2117, 2048, 2138, 2814, 2359, 2000, 2156, 2009, 1012, 1000, 4086, 1010, 4205, 5472, 3917, 1010, 1045, 1005, 2310, 2196, 2657, 1997, 2023, 3185, 1010, 2021, 2144, 2002, 1005, 1055, 2061, 6057, 2049, 10657, 2022, 6057, 1012, 1000, 3308, 999, 1045, 2064, 1005, 1056, 2191, 2870, 3422, 2023, 8632, 1997, 10231, 2044, 1996, 3959, 8362, 2674, 1013, 15301, 2162, 1010, 2073, 5255, 1996, 3124, 2007, 1037, 2204, 1062, 9912, 5320, 2115, 7116, 3558, 3255, 1012, 2017, 2052, 2228, 2008, 6659, 4038, 11878, 2017, 2003, 9951, 1010, 2021, 2044, 3666, 2023, 2017, 1005, 2222, 2113, 2049, 2995, 1012, 2023, 3185, 3475, 1005, 1056, 4276, 1996, 3976, 1045, 3825, 2005, 2009, 1012, 1045, 1005, 2310, 3427, 1037, 10228, 1997, 7112, 16562, 2140, 5691, 1010, 1998, 1045, 1005, 2310, 2130, 3427, 16760, 3807, 1012, 1012, 1012, 2021, 1045, 2145, 2481, 1005, 1056, 3422, 20

In [15]:
tokens = tokenizer.convert_ids_to_tokens(vars(train.examples[6])['review'])

print(tokens)

['i', 'stole', 'this', 'movie', 'when', 'i', 'was', 'a', 'freshmen', 'in', 'college', '.', 'i', "'", 've', 'tried', 'to', 'watch', 'it', 'three', 'times', ',', 'the', 'second', 'two', 'because', 'friends', 'wanted', 'to', 'see', 'it', '.', '"', 'sweet', ',', 'adam', 'sand', '##ler', ',', 'i', "'", 've', 'never', 'heard', 'of', 'this', 'movie', ',', 'but', 'since', 'he', "'", 's', 'so', 'funny', 'its', 'gotta', 'be', 'funny', '.', '"', 'wrong', '!', 'i', 'can', "'", 't', 'make', 'myself', 'watch', 'this', 'pile', 'of', 'crap', 'after', 'the', 'dream', 'boxing', 'match', '/', 'insult', 'war', ',', 'where', 'burning', 'the', 'guy', 'with', 'a', 'good', 'z', '##inger', 'causes', 'your', 'opponent', 'physical', 'pain', '.', 'you', 'would', 'think', 'that', 'terrible', 'comedy', 'hurting', 'you', 'is', 'ridiculous', ',', 'but', 'after', 'watching', 'this', 'you', "'", 'll', 'know', 'its', 'true', '.', 'this', 'movie', 'isn', "'", 't', 'worth', 'the', 'price', 'i', 'paid', 'for', 'it', '.', '

In [16]:
LABEL.build_vocab(train)

In [17]:
print(LABEL.vocab.stoi)

defaultdict(<function _default_unk_index at 0x7fb22a043268>, {'0': 0, '1': 1})


In [18]:
BATCH_SIZE = 128

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_iter , valid_iter , test_iter = data.BucketIterator.splits((train,valid,test), device=device, 
                                                 batch_sizes=(32,32,32), sort_key=lambda x: len(x.review),
                                                 sort_within_batch=True, repeat=False)

In [19]:
from transformers import BertTokenizer, BertModel

bert = BertModel.from_pretrained('bert-base-uncased')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




In [20]:
import torch.nn as nn

class BERTGRUSentiment(nn.Module):
    def __init__(self,
                 bert,
                 hidden_dim,
                 output_dim,
                 n_layers,
                 bidirectional,
                 dropout):
        
        super().__init__()
        
        self.bert = bert
        
        embedding_dim = bert.config.to_dict()['hidden_size']
        
        self.rnn = nn.GRU(embedding_dim,
                          hidden_dim,
                          num_layers = n_layers,
                          bidirectional = bidirectional,
                          batch_first = True,
                          dropout = 0 if n_layers < 2 else dropout)
        
        self.out = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text):
        
        #text = [batch size, sent len]
                
        with torch.no_grad():
            embedded = self.bert(text)[0]
                
        #embedded = [batch size, sent len, emb dim]
        
        _, hidden = self.rnn(embedded)
        
        #hidden = [n layers * n directions, batch size, emb dim]
        
        if self.rnn.bidirectional:
            hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1))
        else:
            hidden = self.dropout(hidden[-1,:,:])
                
        #hidden = [batch size, hid dim]
        
        output = self.out(hidden)
        
        #output = [batch size, out dim]
        
        return output

In [21]:
HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.25

model = BERTGRUSentiment(bert,
                         HIDDEN_DIM,
                         OUTPUT_DIM,
                         N_LAYERS,
                         BIDIRECTIONAL,
                         DROPOUT)

In [22]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 112,241,409 trainable parameters


In [23]:
for name, param in model.named_parameters():                
    if name.startswith('bert'):
        param.requires_grad = False

In [24]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 2,759,169 trainable parameters


In [25]:
for name, param in model.named_parameters():                
    if param.requires_grad:
        print(name)

rnn.weight_ih_l0
rnn.weight_hh_l0
rnn.bias_ih_l0
rnn.bias_hh_l0
rnn.weight_ih_l0_reverse
rnn.weight_hh_l0_reverse
rnn.bias_ih_l0_reverse
rnn.bias_hh_l0_reverse
rnn.weight_ih_l1
rnn.weight_hh_l1
rnn.bias_ih_l1
rnn.bias_hh_l1
rnn.weight_ih_l1_reverse
rnn.weight_hh_l1_reverse
rnn.bias_ih_l1_reverse
rnn.bias_hh_l1_reverse
out.weight
out.bias


In [26]:
import torch.optim as optim

optimizer = optim.Adam(model.parameters())

In [27]:
criterion = nn.BCEWithLogitsLoss()

In [28]:
model = model.to(device)
criterion = criterion.to(device)

In [29]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc

In [30]:
def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        
        optimizer.zero_grad()
        
        predictions = model(batch.review).squeeze(1)
        
        loss = criterion(predictions, batch.label)
        
        acc = binary_accuracy(predictions, batch.label)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [31]:
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:

            predictions = model(batch.review).squeeze(1)
            
            loss = criterion(predictions, batch.label)
            
            acc = binary_accuracy(predictions, batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [32]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [None]:
N_EPOCHS = 5

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_loss, train_acc = train(model, train_iter, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iter, criterion)
        
    end_time = time.time()
        
        
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
        
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut6-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 8m 33s
	Train Loss: 0.358 | Train Acc: 83.61%
	 Val. Loss: 0.246 |  Val. Acc: 90.46%


In [None]:
model.load_state_dict(torch.load('tut6-model.pt'))

test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

In [None]:
def predict_sentiment(model, tokenizer, sentence):
    model.eval()
    tokens = tokenizer.tokenize(sentence)
    tokens = tokens[:max_input_length-2]
    indexed = [init_token_idx] + tokenizer.convert_tokens_to_ids(tokens) + [eos_token_idx]
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(0)
    prediction = torch.sigmoid(model(tensor))
    return prediction.item()

In [None]:
predict_sentiment(model, tokenizer, "This film is terrible")

In [None]:
predict_sentiment(model, tokenizer, "This film is great")