https://github.com/bentrevett/pytorch-sentiment-analysis/blob/master/6%20-%20Transformers%20for%20Sentiment%20Analysis.ipynb

In [9]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [10]:
import torch
import random

SEED = 1111
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [11]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [12]:
tokens = tokenizer.tokenize("What's going on?")

print(tokens)

['what', "'", 's', 'going', 'on', '?']


In [13]:
cls_token_idx = tokenizer.cls_token_id
sep_token_idx = tokenizer.sep_token_id
pad_token_idx = tokenizer.pad_token_id
unk_token_idx = tokenizer.unk_token_id

print(cls_token_idx, sep_token_idx, pad_token_idx, unk_token_idx)

101 102 0 100


In [14]:
def tokenize(sentence):
    tokens = tokenizer.tokenize(sentence) 
    tokens = tokens[:254-2]
    return tokens

In [15]:
from torchtext import data

TEXT = data.Field(batch_first = True,
                  use_vocab = False,
                  tokenize = tokenize,
                  preprocessing = tokenizer.convert_tokens_to_ids,
                  init_token = cls_token_idx,
                  eos_token = sep_token_idx,
                  pad_token = pad_token_idx,
                  unk_token = unk_token_idx)

LABEL = data.LabelField()



In [16]:
from torchtext import datasets

train_data, valid_data = datasets.IMDB.splits(TEXT, LABEL)


downloading aclImdb_v1.tar.gz


aclImdb_v1.tar.gz: 100%|██████████| 84.1M/84.1M [00:02<00:00, 29.1MB/s]


In [17]:
print(f"Number of training examples: {len(train_data)}")
print(f"Number of validation examples: {len(valid_data)}")

Number of training examples: 25000
Number of validation examples: 25000


In [18]:
LABEL.build_vocab(train_data)

In [19]:
BATCH_SIZE = 16

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator = data.BucketIterator.splits(
    (train_data, valid_data), 
    batch_size = BATCH_SIZE, 
    device = device)



In [20]:
from transformers import BertTokenizer, BertModel

bert = BertModel.from_pretrained('bert-base-uncased')

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [21]:
import torch.nn as nn

class BERTSentiment(nn.Module):
    def __init__(self,
                 bert,
                 output_dim):
        
        super().__init__()
        
        self.bert = bert
        
        embedding_dim = bert.config.to_dict()['hidden_size']
        
        self.out = nn.Linear(embedding_dim, output_dim)
        
    def forward(self, text):
        
        #text = [batch size, sent len]
                
        embedded = self.bert(text)[1]
                
        #embedded = [batch size, emb dim]
        
        output = self.out(embedded)
        
        #output = [batch size, out dim]
        
        return output

In [22]:

OUTPUT_DIM = 2

model = BERTSentiment(bert,
                     OUTPUT_DIM).to(device)

In [23]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 109,483,778 trainable parameters


In [24]:
import torch.optim as optim
from transformers import AdamW, get_constant_schedule_with_warmup

optimizer = AdamW(model.parameters(),lr=2e-5,eps=1e-6)

def get_scheduler(optimizer, warmup_steps):
    scheduler = get_constant_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps)
    return scheduler

In [25]:
criterion = nn.CrossEntropyLoss().to(device)

In [26]:
#to calculate accuracy

def categorical_accuracy(preds, y):
    max_preds = preds.argmax(dim = 1, keepdim = True)
    correct = (max_preds.squeeze(1)==y).float()
    return correct.sum() / len(y)

In [27]:
max_grad_norm = 1

def train(model, iterator, optimizer, criterion, scheduler):
    
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:

        optimizer.zero_grad() # clear gradients first
        torch.cuda.empty_cache() # releases all unoccupied cached memory 
        

        text = batch.text
        
        label = batch.label
        
        predictions = model(text)
        
        loss = criterion(predictions, label)
        
        acc = categorical_accuracy(predictions, label)
        #torch.nn.utils.clip_grad_norm_(optimizer, max_grad_norm)
        loss.backward()
        
        optimizer.step()
        scheduler.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [32]:
def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:

            text = batch.text
            label = batch.label
                        
            predictions = model(text)
            
            loss = criterion(predictions, label)
                
            acc = categorical_accuracy(predictions, label)
            
            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [29]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [35]:
import math
N_EPOCHS = 3
train_data_len = 25000

warmup_percent = 0.2
total_steps = math.ceil(N_EPOCHS*train_data_len*1./BATCH_SIZE)
warmup_steps = int(total_steps*warmup_percent)
scheduler = get_scheduler(optimizer, warmup_steps)

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion, scheduler)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'bert-nli.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 18m 5s
	Train Loss: 0.190 | Train Acc: 92.78%
	 Val. Loss: 0.235 |  Val. Acc: 90.65%
Epoch: 02 | Epoch Time: 18m 6s
	Train Loss: 0.132 | Train Acc: 95.39%
	 Val. Loss: 0.244 |  Val. Acc: 91.61%
Epoch: 03 | Epoch Time: 18m 6s
	Train Loss: 0.069 | Train Acc: 97.76%
	 Val. Loss: 0.260 |  Val. Acc: 91.63%
