In [20]:
import operator
import os, math
import string
import numpy as np
import random
import time
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
from torch.nn import functional as F
from torchtext import data
from torchtext import datasets
from torchtext import vocab
from ekphrasis.classes.tokenizer import SocialTokenizer

# Seed every random number generator the notebook touches so runs are repeatable.
seed = 2019
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
    # Seed all visible GPUs, not only the current one: the model may later be
    # wrapped in nn.DataParallel and replicated across several devices.
    torch.cuda.manual_seed_all(seed)

# Location of the CSV splits and the embeddings folder. The original absolute
# path stays the default; set RBANS_DATA_DIR to run the notebook elsewhere.
data_dir = os.environ.get('RBANS_DATA_DIR', '/home/dfsnow/rbans/data')
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
batch_size = 256
max_vocab_size = 25000

In [21]:
# Some user-defined helper functions
def flatten(l):
    """Concatenate the items of every sub-iterable in ``l`` into one flat list."""
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat

def generate_bigrams(x):
    """Append each distinct bigram of a token list to that same list.

    Bigrams are built from adjacent tokens and joined with a single space.
    Every distinct bigram is appended exactly once, in order of first
    occurrence, so the result is identical across interpreter runs (the
    previous set-based iteration order depended on string hash randomisation).

    Args:
        x: list of string tokens. It is modified in place.

    Returns:
        The same list object, now holding the original tokens followed by the
        distinct bigrams.
    """
    # dict.fromkeys de-duplicates while preserving insertion order; the pairs are
    # fully materialised before the list is extended, so extending is safe.
    unique_bigrams = dict.fromkeys(zip(x, x[1:]))
    x.extend(' '.join(pair) for pair in unique_bigrams)
    return x


def count_parameters(model):
    """Return the total number of elements across parameters that require grad."""
    trainable = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable += param.numel()
    return trainable


def binary_accuracy(preds, y):
    """Share of predictions that match the labels.

    ``preds`` are passed through a sigmoid and rounded before being compared
    element-wise with ``y``; the number of matches is divided by ``len`` of the
    comparison result.
    """
    probabilities = torch.sigmoid(preds)
    hard_labels = torch.round(probabilities)
    hits = (hard_labels == y).float()
    return hits.sum() / len(hits)


def epoch_time(start_time, end_time):
    """Split the interval between two timestamps (in seconds) into (minutes, seconds).

    Both parts are truncated to integers.
    """
    elapsed = end_time - start_time
    whole_minutes = int(elapsed / 60)
    leftover_seconds = int(elapsed - 60 * whole_minutes)
    return whole_minutes, leftover_seconds


def predict_sentiment(model, sentence):
    """Classify one raw sentence with ``model``.

    The sentence is tokenised with a lower-casing SocialTokenizer, extended
    with bigrams via generate_bigrams, and mapped to indices through the
    notebook-level ``TEXT`` vocabulary, so ``TEXT.build_vocab`` must have run.

    Args:
        model: module called with a LongTensor of shape [num_tokens, 1].
        sentence: raw text string.

    Returns:
        The rounded sigmoid of the model output as a Python float (0.0 or 1.0).
    """
    model.eval()
    tokenized = generate_bigrams(list(SocialTokenizer(lowercase=True).tokenize(sentence)))
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    # unsqueeze(1) adds a batch axis of size one after the token axis.
    tensor = torch.LongTensor(indexed).to(device).unsqueeze(1)
    # Inference only: skip autograd bookkeeping, as evaluate_model does.
    with torch.no_grad():
        prediction = torch.round(torch.sigmoid(model(tensor)))
    return prediction.item()

In [22]:
# Describe how each CSV column is turned into model input.
# Text column: tokenised and lower-cased, then extended with bigrams.
TEXT = data.Field(
    tokenize=SocialTokenizer(lowercase=True).tokenize,
    preprocessing=generate_bigrams,
    lower=True,
    sequential=True)

# Label column: a single float value, used as-is with no vocabulary.
LABEL = data.Field(
    sequential=False,
    use_vocab=False,
    dtype=torch.float,
    unk_token=None,
    pad_token=None)

# One (name, field) pair per column, in column order; columns mapped to None
# ("id" and "score") are not loaded.
rnn_fields = [
    ("id", None),
    ("score", None),
    ("body", TEXT),
    ("label", LABEL),
]

In [None]:
# Load the train / validation / test CSV files from data_dir; the fields
# declared in rnn_fields tokenise and preprocess each row while loading.
train, validate, test = data.TabularDataset.splits(
    path=data_dir,
    format='csv',
    fields=rnn_fields,
    skip_header=False,
    train='test_train.csv',
    validation='test_validate.csv',
    test='test_test.csv')

In [None]:
# Wrap each split in a bucketing iterator that groups examples by body length.
train_iter, validate_iter, test_iter = data.BucketIterator.splits(
    (train, validate, test),
    batch_size=batch_size,
    device=device,
    sort_key=lambda x: len(x.body),
    shuffle=True,
    repeat=False)

# Pre-trained 100-dimensional GloVe Twitter vectors, cached under data_dir/embeddings.
vec = vocab.Vectors(
    name='glove.twitter.27B.100d.txt',
    cache=os.path.join(data_dir, 'embeddings'))

# Build the vocabulary (capped at max_vocab_size) and attach the vectors;
# tokens missing from the vectors are initialised from a normal distribution.
# NOTE(review): the vocabulary is built from train AND validate - confirm that
# letting validation text influence the vocabulary is intended.
TEXT.build_vocab(
    train, validate,
    max_size=max_vocab_size,
    vectors=vec,
    unk_init=torch.Tensor.normal_)

In [None]:
# Create the FastText model outlined in this notebook:
# https://github.com/bentrevett/pytorch-sentiment-analysis/blob/master/3%20-%20Faster%20Sentiment%20Analysis.ipynb
# and further described in this paper:
# https://arxiv.org/abs/1607.01759

class FastText(nn.Module):
    """Bag-of-embeddings classifier: embed tokens, average over the sequence, apply a linear layer.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each embedding vector.
        output_dim: number of output units of the final linear layer.
        embedding_weights: tensor copied into the embedding table; must match
            the table's shape.
        padding_idx: index of the padding token; its embedding row is zeroed.
        unk_idx: index of the unknown token; its embedding row is zeroed.
            When omitted it is looked up on the notebook-level ``TEXT`` field,
            which matches the original behaviour.
    """

    def __init__(self, vocab_size, embedding_dim, output_dim, embedding_weights, padding_idx,
                 unk_idx=None):
        super().__init__()

        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.embedding_weights = embedding_weights
        self.output_dim = output_dim
        self.padding_idx = padding_idx

        if unk_idx is None:
            # Backward-compatible default: resolve the index from the global field.
            unk_idx = TEXT.vocab.stoi[TEXT.unk_token]
        self.unk_idx = unk_idx

        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx)
        with torch.no_grad():
            # Start from the supplied weights, then blank the <unk> and <pad> rows
            # (the copy overwrites the zero row nn.Embedding creates for padding_idx).
            self.embeddings.weight.copy_(embedding_weights)
            self.embeddings.weight[unk_idx].zero_()
            if padding_idx is not None:
                self.embeddings.weight[padding_idx].zero_()

        self.label = nn.Linear(embedding_dim, output_dim)

    def forward(self, text):
        """Return one row of ``output_dim`` values per column of ``text``.

        ``text`` is a 2-D index tensor; the callers in this notebook pass it as
        [seq_len, batch]. Padding positions take part in the average as zero
        vectors.
        """
        embedded = self.embeddings(text)          # [seq_len, batch, emb_dim]
        embedded = embedded.permute(1, 0, 2)      # [batch, seq_len, emb_dim]
        # Average over the whole sequence axis, then drop the collapsed axis.
        pooled = F.avg_pool2d(embedded, (embedded.shape[1], 1)).squeeze(1)

        return self.label(pooled)

In [None]:
def train_model(model, train_iter, optim, loss_func, epoch):
    """Run one optimisation pass over ``train_iter`` and return epoch metrics.

    Args:
        model: module called on ``batch.body``.
        train_iter: sized iterable of batches exposing ``.body`` and ``.label``.
        optim: optimizer stepping ``model``'s parameters.
        loss_func: loss applied to (predictions, labels). Assumed to return the
            mean over the batch, as nn.BCEWithLogitsLoss does by default.
        epoch: current epoch index; accepted for the caller's convenience and
            not used here.

    Returns:
        (mean loss, mean accuracy) over all examples seen. Each batch is
        weighted by its number of examples, so a shorter final batch no longer
        skews the averages.

    Raises:
        ZeroDivisionError: if ``train_iter`` yields no examples.
    """
    weighted_loss = 0.0
    weighted_acc = 0.0
    examples_seen = 0

    model.train()
    for batch in train_iter:
        optim.zero_grad()
        n_examples = batch.body.size(1)
        # NOTE(review): the slice hides a shape mismatch under nn.DataParallel.
        # DataParallel scatters along dim 0 by default, which for a
        # [seq_len, batch] input is the sequence axis, so each replica returns a
        # full batch of outputs for its share of the tokens. Keeping the first
        # n_examples rows keeps only the first replica's predictions.
        predictions = model(batch.body).squeeze(1)[:n_examples]
        loss = loss_func(predictions, batch.label)
        acc = binary_accuracy(predictions, batch.label)
        loss.backward()
        optim.step()

        weighted_loss += loss.item() * n_examples
        weighted_acc += acc.item() * n_examples
        examples_seen += n_examples

    return weighted_loss / examples_seen, weighted_acc / examples_seen


def evaluate_model(model, validate_iter, loss_func):
    """Score ``model`` on ``validate_iter`` without updating it.

    Args:
        model: module called on ``batch.body``.
        validate_iter: sized iterable of batches exposing ``.body`` and ``.label``.
        loss_func: loss applied to (predictions, labels). Assumed to return the
            mean over the batch, as nn.BCEWithLogitsLoss does by default.

    Returns:
        (mean loss, mean accuracy) over all examples. Each batch is weighted by
        its number of examples, so a shorter final batch no longer skews the
        averages.

    Raises:
        ZeroDivisionError: if ``validate_iter`` yields no examples.
    """
    weighted_loss = 0.0
    weighted_acc = 0.0
    examples_seen = 0

    model.eval()
    with torch.no_grad():
        for batch in validate_iter:
            n_examples = batch.body.size(1)
            # NOTE(review): the slice hides a shape mismatch under nn.DataParallel.
            # DataParallel scatters along dim 0 by default, which for a
            # [seq_len, batch] input is the sequence axis; keeping the first
            # n_examples rows keeps only the first replica's predictions.
            predictions = model(batch.body).squeeze(1)[:n_examples]
            loss = loss_func(predictions, batch.label)
            acc = binary_accuracy(predictions, batch.label)

            weighted_loss += loss.item() * n_examples
            weighted_acc += acc.item() * n_examples
            examples_seen += n_examples

    return weighted_loss / examples_seen, weighted_acc / examples_seen

In [None]:
# Initialize the model with the following params
vocab_size = len(TEXT.vocab)
embedding_weights = TEXT.vocab.vectors
# Take the width from the loaded vectors so it cannot drift from the GloVe file in use.
embedding_dim = embedding_weights.shape[1]
output_dim = 1
padding_idx = TEXT.vocab.stoi[TEXT.pad_token]

model = FastText(vocab_size, embedding_dim, output_dim, embedding_weights, padding_idx)
# NOTE: this rebinds `optim`, imported as the torch.optim module at the top of the
# notebook, to an optimizer instance; the training cell relies on this name.
optim = torch.optim.Adam(model.parameters())
loss = nn.BCEWithLogitsLoss()

if torch.cuda.device_count() > 1:
    print("Using", torch.cuda.device_count(), "GPUs for modelling...")
    # NOTE(review): DataParallel scatters inputs along dim 0 by default, which
    # for the [seq_len, batch] tensors used here is the sequence axis rather
    # than the batch axis - confirm this is what is wanted.
    model = nn.DataParallel(model)

# One device move covers both the wrapped and the unwrapped model.
model = model.to(device)
loss = loss.to(device)

print(f'The model has {count_parameters(model):,} trainable parameters')

In [None]:
# Train for a fixed number of epochs, timing each one and printing its training
# metrics. Validation metrics are computed once, after the final epoch.
num_epochs = 5
for epoch in range(num_epochs):

    start_time = time.time() 
    
    train_loss, train_acc = train_model(model, train_iter, optim, loss, epoch)
    
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    # The checkpoint is overwritten after every epoch, so the file always holds
    # the most recent weights (not necessarily the best ones).
    # NOTE(review): when the model is wrapped in nn.DataParallel the saved keys
    # presumably carry a "module." prefix - keep that in mind when loading.
    torch.save(model.state_dict(), 'tut3-model.pt')
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    
# Single validation pass with the final weights.
validate_loss, validate_acc = evaluate_model(model, validate_iter, loss)
print(f'Val. Loss: {validate_loss:.3f} |  Val. Acc: {validate_acc*100:.2f}%')

In [None]:
# Smoke-test the trained model on a hand-written sentence.
# NOTE: this uses the two-argument predict_sentiment(model, sentence) helper; a
# later cell redefines predict_sentiment with a single parameter, so re-running
# this cell after that one raises a TypeError.
predict_sentiment(model, "I really didn't like that movie")

In [None]:
def predict_sentiment(sentence):
    """Classify one already-tokenised example with the notebook-level ``model``.

    NOTE: this definition replaces the earlier
    ``predict_sentiment(model, sentence)`` helper. Unlike that helper it does no
    tokenisation: every element of ``sentence`` is looked up directly in the
    ``TEXT`` vocabulary.

    Args:
        sentence: iterable of tokens, e.g. the ``body`` of a dataset example.

    Returns:
        True when the sigmoid of the model output exceeds 0.5, otherwise False.
    """
    model.eval()
    indexed = [TEXT.vocab.stoi[t] for t in sentence]
    # unsqueeze(1) adds a batch axis of size one after the token axis.
    tensor = torch.LongTensor(indexed).to(device).unsqueeze(1)
    # Inference only: skip autograd bookkeeping, as evaluate_model does.
    with torch.no_grad():
        prediction = (torch.sigmoid(model(tensor)) > 0.5)
    return prediction.item()

# Predict every test example whose label is 1, keeping each prediction next to
# the tokens it was made from.
temp_list = [(predict_sentiment(x.body), x.body) for x in test if int(x.label) == 1]