Ref: https://github.com/bentrevett/pytorch-sentiment-analysis

In [None]:
import torch 
import pandas as pd
import numpy as np
import spacy 
from torchtext import data
from torchtext.legacy import data
from torchtext import datasets
import random
import regex as re
from spacy.lang.en import English
nlp = English()
from textblob import Word, TextBlob

In [None]:
# Text field: tokenized with spaCy's small English pipeline. include_lengths=True
# makes every batch expose the tweet as a (token ids, lengths) pair.
tweet = data.Field(
    sequential=True,
    include_lengths=True,
    tokenize='spacy',
    tokenizer_language='en_core_web_sm',
)
# Label field: one already-numeric value per example, so no vocabulary is built.
target = data.Field(use_vocab=False, sequential=False)

In [None]:
# Map CSV column names to (batch attribute name, Field) pairs: the 'Tweets'
# column is exposed as `batch.t`, the 'Target' column as `batch.s`.
fields = {
    'Tweets' : ('t',tweet),
    'Target' : ('s', target)
}
# The TabularDataset constructor builds a single dataset from a single file and
# has no `test` argument, so it cannot be unpacked into two datasets. Loading
# several files goes through the `splits` classmethod, which returns one
# dataset per file name given (train first, then validation).
train_data, valid_data = data.TabularDataset.splits(
    path=".",
    train="clean_df_train.csv",
    validation="clean_df_valid.csv",
    format="csv",
    fields=fields,
)

In [None]:
# Build the tweet vocabulary from the training split only, capped at 10,000
# tokens (special tokens come on top of that), and attach 100-d GloVe vectors.
# unk_init supplies the vector for tokens that have no pretrained entry.
tweet.build_vocab(
    train_data,
    min_freq=1,
    max_size=10000,
    vectors="glove.6B.100d",
    unk_init=torch.Tensor.normal_,
)

.vector_cache/glove.6B.zip: 862MB [02:40, 5.36MB/s]                           
100%|█████████▉| 398394/400000 [00:14<00:00, 26650.72it/s]

In [None]:
BATCH_SIZE = 64

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def _tweet_length(example):
    """Sort key for bucketing: number of tokens in the example's tweet."""
    return len(example.t)


# Bucketed iterators over both splits. Batches are not sorted internally;
# the model packs sequences with enforce_sorted=False, so that is fine.
train_iterator, valid_iterator = data.BucketIterator.splits(
    (train_data, valid_data),
    device=device,
    batch_size=BATCH_SIZE,
    sort_within_batch=False,
    sort_key=_tweet_length,
)

In [None]:
import torch.nn as nn

class RNN(nn.Module):
    """Embedding -> (multi-layer, optionally bidirectional) LSTM -> linear head.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each token embedding.
        hidden_dim: LSTM hidden size per direction.
        output_dim: size of the final linear output.
        n_layers: number of stacked LSTM layers.
        bidirectional: whether the LSTM runs in both directions.
        dropout: dropout probability applied to the embeddings, between LSTM
            layers (only when n_layers > 1) and to the final hidden state.
        pad_idx: vocabulary index of the padding token.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, 
                 bidirectional, dropout, pad_idx):
        
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        
        # Inter-layer dropout is meaningless for a single layer and makes
        # PyTorch emit a warning, so it is only enabled when layers are stacked.
        self.rnn = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers=n_layers, 
                           bidirectional=bidirectional, 
                           dropout=dropout if n_layers > 1 else 0.0)
        
        # The head consumes one final hidden state per direction.
        num_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text, text_lengths):
        """Return one output row per sequence in the batch.

        Args:
            text: token ids, [sent len, batch size].
            text_lengths: unpadded length of each sequence, [batch size].

        Returns:
            Tensor of shape [batch size, output_dim].
        """
        
        #text = [sent len, batch size]
        
        embedded = self.dropout(self.embedding(text))
        
        #embedded = [sent len, batch size, emb dim]
        
        #pack sequence so the LSTM skips padded positions
        # lengths need to be on CPU!
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded,text_lengths.to('cpu'),enforce_sorted=False)
        
        # Only the final hidden state is used; the per-step outputs and the
        # cell state are discarded, so the packed output is never unpacked.
        _, (hidden, _) = self.rnn(packed_embedded)
        
        #hidden = [num layers * num directions, batch size, hid dim]
        
        if self.rnn.bidirectional:
            #concat the final forward (hidden[-2,:,:]) and backward (hidden[-1,:,:]) hidden layers
            hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)
        else:
            # Unidirectional: the last entry is the top layer's final state.
            hidden = hidden[-1,:,:]
        
        #hidden = [batch size, hid dim * num directions]
        
        return self.fc(self.dropout(hidden))

In [None]:
# Model hyperparameters. INPUT_DIM and PAD_IDX come from the built vocabulary;
# EMBEDDING_DIM matches the 100-d pretrained vectors requested for it.
INPUT_DIM = len(tweet.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5
PAD_IDX = tweet.vocab.stoi[tweet.pad_token]

# Keyword arguments keep the eight positional hyperparameters from being mixed up.
model = RNN(
    vocab_size=INPUT_DIM,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
    n_layers=N_LAYERS,
    bidirectional=BIDIRECTIONAL,
    dropout=DROPOUT,
    pad_idx=PAD_IDX,
)

In [None]:
def count_parameters(model):
    """Return the total number of elements across the model's trainable parameters.

    Parameters whose ``requires_grad`` is False are not counted.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Report the trainable-parameter total, formatted with thousands separators.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 3,310,857 trainable parameters


In [None]:
# Vectors attached to the vocabulary by build_vocab(vectors=...).
pretrained_embeddings = tweet.vocab.vectors

# Sanity check: the first dimension should match len(tweet.vocab) and the
# second should match EMBEDDING_DIM, otherwise the copy into the model fails.
print(pretrained_embeddings.shape)

torch.Size([10002, 100])


In [None]:
# Overwrite the embedding layer's initial weights in place with the
# vocabulary's pretrained vectors (shapes must match exactly).
model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[ 0.6911, -0.6631,  0.0450,  ...,  1.9049, -0.7128, -0.1678],
        [ 0.4257, -0.2420, -1.3958,  ..., -0.8480,  0.3458, -1.6876],
        [-0.3669,  0.4154,  0.1348,  ...,  0.0244,  0.2211,  0.4317],
        ...,
        [-0.6065,  0.2374, -0.3696,  ..., -0.9791,  1.3997, -0.8333],
        [-0.7169, -0.9050, -0.1154,  ..., -0.4203,  0.1224, -0.3669],
        [ 0.3782,  0.6037,  0.3379,  ...,  0.2614,  0.4405,  0.1442]])

In [None]:
UNK_IDX = tweet.vocab.stoi[tweet.unk_token]

# Copying the vocabulary vectors into the embedding layer also replaced the
# <unk> and <pad> rows with whatever unk_init generated for them, including
# the all-zero row that nn.Embedding sets up for padding_idx. Reset both rows
# to zeros so neither special token starts from an arbitrary random vector.
model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

In [None]:
import torch.optim as optim
# Adam with its default hyperparameters over every model parameter
# (the embedding weights are included, so they are fine-tuned too).
optimizer = optim.Adam(model.parameters())

In [None]:
# NOTE(review): the loss is mean squared error on the model's raw outputs, while
# binary_accuracy applies a sigmoid and rounds to 0/1. Those two only agree if
# 'Target' holds binary labels, in which case nn.BCEWithLogitsLoss would be the
# matching loss; if 'Target' is a real-valued score, the accuracy metric is the
# part that does not apply. Confirm the target definition before relying on
# the reported accuracy.
model = model.to(device)
criterion = nn.MSELoss().to(device)

In [None]:
def binary_accuracy(preds, y):
    """
    Fraction of predictions in the batch that match the targets
    (e.g. 8 correct out of 10 gives 0.8, not 8).

    `preds` are passed through a sigmoid and rounded to the nearest integer
    before being compared element-wise with `y`.
    """
    probabilities = torch.sigmoid(preds)
    hard_labels = torch.round(probabilities)
    # cast the boolean matches to float so they can be summed and divided
    hits = (hard_labels == y).float()
    return hits.sum() / len(hits)

In [None]:
def train(model, iterator, optimizer, criterion):
    """Run one optimization pass over `iterator`.

    Each batch provides the tweet as `batch.t` (token ids, lengths) and the
    target as `batch.s`. Returns the per-batch mean of the loss and of
    binary_accuracy as a `(loss, accuracy)` tuple.
    """
    model.train()

    running_loss, running_acc = 0, 0

    for batch in iterator:
        tokens, lengths = batch.t
        labels = batch.s.float()

        optimizer.zero_grad()

        # model output is [batch size, 1]; drop the trailing dimension
        outputs = model(tokens, lengths).squeeze(1).float()

        batch_loss = criterion(outputs, labels)
        batch_acc = binary_accuracy(outputs, labels)

        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
        running_acc += batch_acc.item()

    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches

In [None]:
def evaluate(model, iterator, criterion):
    """Score the model on `iterator` without updating any weights.

    The model is switched to eval mode and gradients are disabled. Returns
    the per-batch mean of the loss and of binary_accuracy as a
    `(loss, accuracy)` tuple.
    """
    model.eval()

    loss_total = 0
    acc_total = 0

    with torch.no_grad():
        for batch in iterator:
            tokens, lengths = batch.t
            labels = batch.s.float()

            # model output is [batch size, 1]; drop the trailing dimension
            outputs = model(tokens, lengths).squeeze(1).float()

            loss_total += criterion(outputs, labels).item()
            acc_total += binary_accuracy(outputs, labels).item()

    n_batches = len(iterator)
    return loss_total / n_batches, acc_total / n_batches

In [None]:
import time

def epoch_time(start_time, end_time):
    """Split the span between two timestamps (in seconds) into whole
    minutes and whole leftover seconds, returned as `(minutes, seconds)`."""
    duration = end_time - start_time
    minutes = int(duration / 60)
    seconds = int(duration - 60 * minutes)
    return minutes, seconds

In [None]:
# Number of full passes over the training data.
N_EPOCHS = 5

# Lowest validation loss seen so far; starts at infinity so the first epoch always checkpoints.
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    # Time the epoch, covering both the training and the validation pass.
    start_time = time.time()
    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    # Checkpoint only when validation loss improves, so './model.pt' always
    # holds the weights of the best epoch so far rather than the latest one.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), './model.pt')
    
    # Accuracies are fractions in [0, 1]; they are scaled to percentages for display.
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

100%|█████████▉| 398394/400000 [00:30<00:00, 26650.72it/s]

Epoch: 01 | Epoch Time: 3m 6s
	Train Loss: 2.602 | Train Acc: 1.01%
	 Val. Loss: 2.370 |  Val. Acc: 0.86%
Epoch: 02 | Epoch Time: 3m 6s
	Train Loss: 2.407 | Train Acc: 1.83%
	 Val. Loss: 2.329 |  Val. Acc: 5.15%
Epoch: 03 | Epoch Time: 3m 6s
	Train Loss: 2.350 | Train Acc: 2.43%
	 Val. Loss: 2.304 |  Val. Acc: 0.00%
Epoch: 04 | Epoch Time: 3m 6s
	Train Loss: 2.313 | Train Acc: 2.71%
	 Val. Loss: 2.300 |  Val. Acc: 1.22%
