In [1]:
#bert_env\Scripts\activate.bat 
import torch
import random
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Seed every RNG the notebook has imported so a fresh run is repeatable.
seed=10
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
# The flag lives on the cudnn submodule. Assigning to
# torch.backends.cudnndeterministic only creates an unused attribute.
torch.backends.cudnn.deterministic=True

In [3]:
#!pip install transformers

In [4]:
from transformers import AutoTokenizer

In [5]:
# Checkpoint name; the same name is used for the tokenizer here and for
# AutoModel.from_pretrained further down.
base_model='bert-base-uncased'
tokenizer=AutoTokenizer.from_pretrained(base_model)

In [6]:
# Number of entries in the tokenizer vocabulary.
len(tokenizer.vocab)

30522

In [7]:
# Split a sample sentence into tokens.
tokens=tokenizer.tokenize('Welcome to workshop')
print(tokens)

['welcome', 'to', 'workshop']


In [8]:
# Map the sample tokens to their vocabulary ids.
indexes=tokenizer.convert_tokens_to_ids(tokens)
print(indexes)

[6160, 2000, 8395]


In [9]:
# Special tokens exposed by the tokenizer (classification, separator,
# padding and unknown).
init_token = tokenizer.cls_token
eos_token = tokenizer.sep_token
pad_token = tokenizer.pad_token
unk_token = tokenizer.unk_token

print(init_token, eos_token, pad_token, unk_token)

[CLS] [SEP] [PAD] [UNK]


In [10]:
# Vocabulary ids of the special tokens; these are handed to data.Field and
# used by predict_sentiment.
init_token_idx = tokenizer.convert_tokens_to_ids(init_token)
eos_token_idx = tokenizer.convert_tokens_to_ids(eos_token)
pad_token_idx = tokenizer.convert_tokens_to_ids(pad_token)
unk_token_idx = tokenizer.convert_tokens_to_ids(unk_token)

In [11]:
# Show the special-token ids.
print(init_token_idx, eos_token_idx, pad_token_idx, unk_token_idx)

101 102 0 100


In [12]:
# Maximum input size the tokenizer records for this checkpoint
# (tokenize_and_cut truncates reviews against this value).
max_input_len = tokenizer.max_model_input_sizes[base_model]
print(max_input_len)

512


In [13]:
# NOTE(review): this is a different name from max_input_len above.
# tokenize_and_cut truncates with max_input_len, so this value does not
# affect how the dataset is preprocessed.
max_input_length = 10

In [14]:
def tokenize_and_cut(sentence):
    """Tokenize ``sentence`` and keep at most ``max_input_len - 2`` tokens.

    Two positions are held back from the limit; the torchtext field adds an
    init token and an eos token to every example.
    """
    token_budget = max_input_len - 2
    return tokenizer.tokenize(sentence)[:token_budget]

## Data Collection

In [15]:
from torchtext import data

# Reviews are converted to BERT vocabulary ids during preprocessing, so the
# field keeps no vocabulary of its own and is given the special-token ids
# directly.
text = data.Field(
    tokenize=tokenize_and_cut,
    preprocessing=tokenizer.convert_tokens_to_ids,
    use_vocab=False,
    batch_first=True,
    init_token=init_token_idx,
    eos_token=eos_token_idx,
    pad_token=pad_token_idx,
    unk_token=unk_token_idx,
)

# Labels are stored as floats.
label = data.LabelField(dtype=torch.float)

In [17]:
from torchtext import datasets

train_data, test_data = datasets.IMDB.splits(text, label)

# random.seed() returns None, so passing its result as random_state gave the
# splitter no state at all. Seed first, then pass the generator state
# explicitly so the train/validation split is tied to `seed`.
random.seed(seed)
train_data, valid_data = train_data.split(random_state = random.getstate())

Token indices sequence length is longer than the specified maximum sequence length for this model (557 > 512). Running this sequence through the model will result in indexing errors


In [18]:
# Report the size of each split.
print(f"Number of training examples: {len(train_data)}")
print(f"Number of validation examples: {len(valid_data)}")
print(f"Number of testing examples: {len(test_data)}")

Number of training examples: 17500
Number of validation examples: 7500
Number of testing examples: 25000


In [19]:
# Inspect one training example; its text is already a list of token ids.
print(vars(train_data.examples[10]))

{'text': [2061, 1010, 2067, 2043, 12810, 2666, 2081, 2010, 2034, 3311, 1010, 1045, 2001, 6669, 3407, 3666, 4670, 3557, 14757, 2185, 1012, 1045, 2069, 2359, 2000, 2022, 21474, 2005, 1037, 2261, 2847, 1998, 4521, 15241, 12136, 2098, 24593, 1012, 2085, 1010, 6854, 1010, 1045, 2031, 10908, 1997, 1037, 15544, 19510, 2075, 1013, 26380, 2466, 7188, 1045, 3422, 1037, 3185, 1010, 2065, 1045, 1005, 1049, 2025, 2006, 2070, 4066, 1997, 14667, 1012, 1998, 2023, 2003, 2178, 2204, 3185, 2005, 1996, 19960, 17872, 1012, 2045, 2024, 2053, 2350, 11680, 1010, 2053, 3375, 5436, 3210, 1010, 2053, 3697, 21438, 1012, 12810, 2666, 3929, 8209, 2003, 2307, 2005, 1996, 3929, 8209, 1012, 1026, 7987, 1013, 1028, 1026, 7987, 1013, 1028, 2023, 2001, 1996, 2034, 2051, 1045, 2018, 2464, 2474, 8840, 4819, 2006, 1996, 3898, 2144, 2016, 29176, 3182, 2007, 6175, 3389, 9195, 1006, 1045, 2245, 2016, 2001, 6581, 1999, 2008, 1007, 1010, 1998, 1045, 2064, 1005, 1056, 2360, 1045, 2001, 16668, 7622, 2023, 2051, 2105, 1012, 4998, 

In [20]:
# Map the ids back to tokens to check the preprocessing by eye.
tokens = tokenizer.convert_ids_to_tokens(vars(train_data.examples[10])['text'])

print(tokens)

['so', ',', 'back', 'when', 'herb', '##ie', 'made', 'his', 'first', 'appearance', ',', 'i', 'was', 'perfectly', 'happy', 'watching', 'dean', 'jones', 'mug', 'away', '.', 'i', 'only', 'wanted', 'to', 'be', 'entertained', 'for', 'a', 'few', 'hours', 'and', 'eat', 'overly', 'butter', '##ed', 'popcorn', '.', 'now', ',', 'unfortunately', ',', 'i', 'have', 'expectations', 'of', 'a', 'ri', '##vet', '##ing', '/', 'delightful', 'story', 'whenever', 'i', 'watch', 'a', 'movie', ',', 'if', 'i', "'", 'm', 'not', 'on', 'some', 'sort', 'of', 'medication', '.', 'and', 'this', 'is', 'another', 'good', 'movie', 'for', 'the', 'med', '##icated', '.', 'there', 'are', 'no', 'major', 'laughs', ',', 'no', 'complex', 'plot', 'lines', ',', 'no', 'difficult', 'twists', '.', 'herb', '##ie', 'fully', 'loaded', 'is', 'great', 'for', 'the', 'fully', 'loaded', '.', '<', 'br', '/', '>', '<', 'br', '/', '>', 'this', 'was', 'the', 'first', 'time', 'i', 'had', 'seen', 'la', 'lo', '##han', 'on', 'the', 'screen', 'since', 

In [21]:
# NOTE(review): this cell is identical to the one before it.
tokens = tokenizer.convert_ids_to_tokens(vars(train_data.examples[10])['text'])

print(tokens)

['so', ',', 'back', 'when', 'herb', '##ie', 'made', 'his', 'first', 'appearance', ',', 'i', 'was', 'perfectly', 'happy', 'watching', 'dean', 'jones', 'mug', 'away', '.', 'i', 'only', 'wanted', 'to', 'be', 'entertained', 'for', 'a', 'few', 'hours', 'and', 'eat', 'overly', 'butter', '##ed', 'popcorn', '.', 'now', ',', 'unfortunately', ',', 'i', 'have', 'expectations', 'of', 'a', 'ri', '##vet', '##ing', '/', 'delightful', 'story', 'whenever', 'i', 'watch', 'a', 'movie', ',', 'if', 'i', "'", 'm', 'not', 'on', 'some', 'sort', 'of', 'medication', '.', 'and', 'this', 'is', 'another', 'good', 'movie', 'for', 'the', 'med', '##icated', '.', 'there', 'are', 'no', 'major', 'laughs', ',', 'no', 'complex', 'plot', 'lines', ',', 'no', 'difficult', 'twists', '.', 'herb', '##ie', 'fully', 'loaded', 'is', 'great', 'for', 'the', 'fully', 'loaded', '.', '<', 'br', '/', '>', '<', 'br', '/', '>', 'this', 'was', 'the', 'first', 'time', 'i', 'had', 'seen', 'la', 'lo', '##han', 'on', 'the', 'screen', 'since', 

In [25]:
# Build the label vocabulary from the training split.
label.build_vocab(train_data)

In [26]:
# Show the label-string -> index mapping.
print(label.vocab.stoi)

defaultdict(None, {'neg': 0, 'pos': 1})


In [27]:
batch_size = 128

# Use the GPU when one is visible to torch, otherwise stay on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# One iterator per split, all with the same batch size and target device.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=batch_size,
    device=device,
)

In [None]:
from transformers import AutoTokenizer, AutoModel

# Load the pretrained encoder for the same checkpoint name as the tokenizer.
b_model = AutoModel.from_pretrained(base_model)

Downloading:  45%|██████████████████████████████▍                                    | 191M/420M [05:11<06:21, 630kB/s]

In [None]:
import torch.nn as nn

class SentimentAnalyzer(nn.Module):
    """GRU plus linear head on top of a supplied encoder.

    Args:
        b_model: encoder module. It must expose
            ``config.to_dict()['hidden_size']`` and return its per-token
            output as element 0 of its result.
        hidden_dim: hidden size of the GRU.
        output_dim: number of values produced per example.
        n_layers: number of stacked GRU layers.
        bidirectional: whether the GRU runs in both directions.
        dropout: dropout probability. It is applied to the final hidden state
            and, when ``n_layers`` is at least 2, between GRU layers.
    """

    def __init__(self, b_model, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        super().__init__()

        self.b_model = b_model

        # The encoder's hidden size is the GRU's input feature size.
        encoder_width = b_model.config.to_dict()['hidden_size']

        # Inter-layer dropout is only passed through for two or more layers.
        if n_layers < 2:
            rnn_dropout = 0
        else:
            rnn_dropout = dropout

        self.rnn = nn.GRU(
            encoder_width,
            hidden_dim,
            num_layers=n_layers,
            bidirectional=bidirectional,
            batch_first=True,
            dropout=rnn_dropout,
        )

        # A bidirectional GRU contributes two final states, concatenated in
        # forward(), so the linear layer takes twice the hidden size.
        if bidirectional:
            head_in = hidden_dim * 2
        else:
            head_in = hidden_dim
        self.out = nn.Linear(head_in, output_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        """Map ``text`` ([batch size, sent len]) to [batch size, out dim]."""
        # The encoder call is wrapped in no_grad: no graph is built through it.
        with torch.no_grad():
            embedded = self.b_model(text)[0]
        # embedded: [batch size, sent len, emb dim]

        _, final_states = self.rnn(embedded)
        # final_states: [n layers * n directions, batch size, hid dim]

        if self.rnn.bidirectional:
            # The last two entries are concatenated along the feature axis.
            pooled = torch.cat((final_states[-2], final_states[-1]), dim=1)
        else:
            pooled = final_states[-1]

        pooled = self.dropout(pooled)

        return self.out(pooled)

In [None]:
# Hyperparameters for the GRU head placed on top of the encoder.
hidden_dim = 256
op_dim = 1          # one output value per example
n_layers = 2
bidirectional = True
dropout = 0.25

# Wrap the loaded encoder (b_model) in the classifier.
model = SentimentAnalyzer(b_model,
                         hidden_dim,
                         op_dim,
                         n_layers,
                         bidirectional,
                         dropout)

In [None]:
def count_parameters(model):
    """Return the total element count of parameters with requires_grad set."""
    trainable = [weight for weight in model.parameters() if weight.requires_grad]
    return sum(weight.numel() for weight in trainable)

# Trainable-parameter count at this point in the notebook.
print(f'The model has {count_parameters(model):,} trainable parameters')

In [None]:
# Turn off gradients for every parameter whose name starts with 'b_model',
# i.e. the wrapped encoder.
for name, param in model.named_parameters():
    if not name.startswith('b_model'):
        continue
    param.requires_grad = False

In [None]:
# count_parameters is already defined in an earlier cell; re-defining it here
# only shadowed that definition. Re-run the count now that the 'b_model'
# parameters have requires_grad turned off.
print(f'The model has {count_parameters(model):,} trainable parameters')

In [None]:
# List the names of the parameters that still require gradients.
for name, param in model.named_parameters():                
    if param.requires_grad:
        print(name)

In [None]:
import torch.optim as optim

# Adam with its default settings over all of the model's parameters.
optimizer = optim.Adam(model.parameters())

In [None]:
# Loss that takes raw logits; binary_accuracy applies the sigmoid separately.
criterion = nn.BCEWithLogitsLoss()

In [None]:
# Move the model and the loss to the selected device.
model = model.to(device)
criterion = criterion.to(device)

In [None]:
def binary_accuracy(preds, y):
    """Return the fraction of correct predictions in a batch.

    ``preds`` are raw logits. They go through a sigmoid and are rounded to
    0 or 1 before being compared with ``y``. The result is a fraction
    (8 of 10 correct gives 0.8), not a count.
    """
    predicted_labels = torch.sigmoid(preds).round()
    # Cast the boolean matches to float so the division below is a float one.
    hits = (predicted_labels == y).float()
    return hits.sum() / len(hits)

In [None]:
def train(model, iterator, optimizer, criterion):
    """Run one optimisation pass over ``iterator``.

    Every batch must expose ``text`` and ``label``. The model is put in
    training mode and stepped once per batch.

    Returns:
        (mean batch loss, mean batch accuracy) over the pass.
    """
    model.train()

    running_loss = 0
    running_acc = 0

    for batch in iterator:
        optimizer.zero_grad()

        # Squeeze dim 1 of the model output before it is compared with the labels.
        logits = model(batch.text).squeeze(1)
        targets = batch.label

        batch_loss = criterion(logits, targets)
        batch_acc = binary_accuracy(logits, targets)

        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
        running_acc += batch_acc.item()

    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches

In [None]:
def evaluate(model, iterator, criterion):
    """Score ``model`` on ``iterator`` without updating it.

    Every batch must expose ``text`` and ``label``. The model is put in eval
    mode and the whole pass runs under ``torch.no_grad()``.

    Returns:
        (mean batch loss, mean batch accuracy) over the pass.
    """
    model.eval()

    running_loss = 0
    running_acc = 0

    with torch.no_grad():
        for batch in iterator:
            # Squeeze dim 1 of the model output before it is compared with the labels.
            logits = model(batch.text).squeeze(1)
            targets = batch.label

            running_loss += criterion(logits, targets).item()
            running_acc += binary_accuracy(logits, targets).item()

    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches

In [None]:
import time

def epoch_time(start_time, end_time):
    """Split the span between two timestamps (in seconds) into whole
    minutes and whole leftover seconds.

    Returns:
        (minutes, seconds) as ints; fractional seconds are dropped.
    """
    duration = end_time - start_time
    minutes = int(duration / 60)
    seconds = int(duration - minutes * 60)
    return minutes, seconds

In [None]:
# Number of full passes over the training iterator.
epochs = 3

# Lowest validation loss seen so far; starts at infinity so the first epoch
# always writes a checkpoint.
best_valid_loss = float('inf')

for epoch in range(epochs):
    
    start_time = time.time()
    
    # One training pass followed by one validation pass.
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
        
    end_time = time.time()
        
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
        
    # Save the weights only when validation loss improves.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'dev-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

In [None]:
# Report test metrics for the checkpoint with the lowest validation loss
# (written to 'dev-model.pt' by the training loop), not for whatever weights
# the last epoch happened to leave in memory.
model.load_state_dict(torch.load('dev-model.pt', map_location=device))

test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

In [None]:
def predict_sentiment(model, tokenizer, sentence):
    """Score a single sentence with ``model``.

    The sentence is tokenized, truncated, wrapped in the init/eos token ids
    and passed through the model as a batch of one.

    Args:
        model: module mapping a [1, sent len] LongTensor of ids to one logit.
        tokenizer: object providing ``tokenize`` and ``convert_tokens_to_ids``.
        sentence: raw input string.

    Returns:
        The sigmoid of the model output as a Python float.
    """
    model.eval()
    tokens = tokenizer.tokenize(sentence)
    # Truncate with the same limit tokenize_and_cut applies to the dataset
    # (max_input_len), keeping two slots for the init/eos ids. Using the
    # unrelated max_input_length (10) would drop everything past 8 tokens.
    tokens = tokens[:max_input_len - 2]
    indexed = [init_token_idx] + tokenizer.convert_tokens_to_ids(tokens) + [eos_token_idx]
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(0)  # add the batch dimension: [1, sent len]
    # Inference only, so skip building an autograd graph.
    with torch.no_grad():
        prediction = torch.sigmoid(model(tensor))
    return prediction.item()

In [None]:
# Same hyperparameters as the trained model, so the saved state dict fits.
hidden_dim = 256
op_dim = 1
n_layers = 2
bidirectional = True
dropout = 0.25

# Fresh classifier instance to receive the saved weights.
model = SentimentAnalyzer(b_model,
                         hidden_dim,
                         op_dim,
                         n_layers,
                         bidirectional,
                         dropout)

In [None]:
# map_location remaps the stored tensors onto the current device, so a
# checkpoint saved on a GPU still loads on a CPU-only machine.
model.load_state_dict(torch.load('dev-model.pt', map_location=device))
model.to(device)

In [None]:
# Score a positively worded sentence.
predict_sentiment(model, tokenizer, "This workshop is awesome")

In [None]:
# Score a negatively worded sentence.
predict_sentiment(model, tokenizer, "This workshop is boring")