In [14]:
import numpy as np
import torch
from torch import nn
import torchtext
from torchtext import data
from torchtext import vocab
import spacy
import matplotlib.pyplot as plt
import transformers
from transformers import get_linear_schedule_with_warmup

In [15]:
nlp = spacy.load('en',disable=['parser', 'tagger', 'ner'])

In [16]:
def tokenizer(s):
    return [w.text.lower() for w in nlp(s)]

In [17]:
rew_field = data.Field(sequential=True, 
                       tokenize=tokenizer, 
                       include_lengths=True, 
                       use_vocab=True,
                       fix_length=512)

label_field = data.Field(sequential=False, 
                         use_vocab=False, 
                         pad_token=None, 
                         unk_token=None)

train_val_fields = [
    ('Review', rew_field), 
    ('Label', label_field)
]

dtrain, dval, dtest = data.TabularDataset.splits(path='/kaggle/input/sentimentanalysis/', 
                                            format='csv', 
                                            train='train_df.csv', 
                                            validation='valid_df.csv',
                                            test='test_df.csv',
                                            fields=train_val_fields, 
                                            skip_header=True)

In [18]:
%%time
vec = vocab.Vectors("../input/sentimentanalysis/glove.6B.200d.txt")
rew_field.build_vocab(dtrain, dval, dtest, max_size=100000, vectors=vec)

CPU times: user 2.44 s, sys: 274 ms, total: 2.72 s
Wall time: 2.73 s


In [19]:
rew_field.vocab.freqs.most_common(20)

[(' ', 641638),
 ('the', 336713),
 ('.', 327192),
 ('and', 164107),
 ('a', 163009),
 ('of', 145864),
 ('to', 135720),
 ('is', 107351),
 ('br', 101872),
 ('it', 96357),
 ('in', 93968),
 ('  ', 91970),
 ('i', 87832),
 ('this', 76000),
 ('that', 73623),
 ('s', 65969),
 ('   ', 63412),
 ('was', 48233),
 ('as', 46933),
 ('for', 44343)]

In [20]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

bs = 100
trainl, validl, testl = data.BucketIterator.splits(datasets=(dtrain, dval, dtest),
                                                   batch_sizes=(bs, bs, bs),
                                                   sort_key=lambda x: len(x.Review),
                                                   device=device, 
                                                   sort_within_batch=True)
                                                   

In [21]:
batch = next(iter(trainl))
print(len(batch))
batch.dataset.fields

100


{'Review': <torchtext.data.field.Field at 0x7f7e1f5d4a10>,
 'Label': <torchtext.data.field.Field at 0x7f7e1f5d4a50>}

In [22]:
class BatchGenerator:
    
    def __init__(self, batch_obj, data_field, label_field):
        self.batch_obj = batch_obj
        self.data_field = data_field
        self.label_field = label_field
        
    def __len__(self):
        return len(self.batch_obj)
    
    def __iter__(self):
        for batch in self.batch_obj:
            data = getattr(batch, self.data_field)
            label = getattr(batch, self.label_field)
            yield data, label

In [23]:
train_batch = BatchGenerator(trainl, 'Review', 'Label')
batch = next(iter(train_batch))
ba, l = batch[0]

In [24]:
import torch.nn as nn
from torch.nn import functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

class SentimentRNN(nn.Module):
    """
    The LSTM model for sentiment analysis
    """

    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, pretrained_emb, drop_prob=0.5, bidirectional=False):
        """
        Initialize the model by setting up the layers
        """
        super(SentimentRNN, self).__init__()

        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        self.bidirectional = bidirectional
        self.drop_prob = drop_prob
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim).from_pretrained(pretrained_emb)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, 
                            dropout=drop_prob, batch_first=True, bidirectional=self.bidirectional)
        
        self.dropout = nn.Dropout(self.drop_prob)
        self.lrelu = nn.LeakyReLU()
        
        if self.bidirectional:
            self.fc1 = nn.Linear(2*self.hidden_dim, self.hidden_dim)
            self.fc2 = nn.Linear(self.hidden_dim, self.output_size)
        else:
            self.fc1 = nn.Linear(self.hidden_dim, 100)
            self.fc2 = nn.Linear(100, self.output_size)
            
        self.sig = nn.Sigmoid()
        

    def forward(self, x, lengths, hidden=None):
        """
        Perform a forward pass of our model on some input and hidden state
        """
        batch_size = x.size(1)
        x = x.transpose(0, 1)
        
        if not hidden:
            hidden = self.init_hidden(batch_size)

        x = x.long()
        
        embeds = self.embedding(x)
      
        pack_embeds = pack_padded_sequence(embeds, lengths, batch_first=True)
        lstm_out, hidden = self.lstm(pack_embeds, hidden)
        lstm_out, _ = pad_packed_sequence(lstm_out, batch_first=True)

        out = F.adaptive_avg_pool1d(lstm_out.permute(0, 2, 1), 1).squeeze(2)
            
        out = self.dropout(out)
        out = self.fc1(out)
        out = self.dropout(self.lrelu(out))
        out = self.fc2(out)
        sig_out = self.sig(out)
        
        return sig_out, hidden
    
    
    def init_hidden(self, batch_size):
        """
        Initializes hidden state
        """
        weight = next(self.parameters()).data
        
        if device.type == 'cuda':
            hidden = (weight.new(self.n_layers*(1 + self.bidirectional), batch_size, 
                                 self.hidden_dim).zero_().cuda(),
                  weight.new(self.n_layers*(1 + self.bidirectional), batch_size, 
                                 self.hidden_dim).zero_().cuda())
        else:
            hidden = (weight.new(self.n_layers*(1 + self.bidirectional), batch_size, 
                                 self.hidden_dim).zero_(),
                      weight.new(self.n_layers*(1 + self.bidirectional), batch_size, 
                                 self.hidden_dim).zero_())
        
        return hidden
        

In [25]:
train_loader = BatchGenerator(trainl, 'Review', 'Label') 
valid_loader = BatchGenerator(validl, 'Review', 'Label') 
test_loader = BatchGenerator(testl, 'Review', 'Label') 

output_size = 1
embedding_dim = rew_field.vocab.vectors.shape[1]
hidden_dim = 256
n_layers = 3
epochs = 20

vocab_size = len(rew_field.vocab.stoi)
pretrained_emb = rew_field.vocab.vectors 
net = SentimentRNN(vocab_size, output_size, embedding_dim, hidden_dim, n_layers, 
                   pretrained_emb, bidirectional=True)

net.to(device)
print(net)

lr=0.001
wd = 0.0001

criterion = nn.BCELoss()
optimizer = torch.optim.Adam(net.parameters(), lr=lr) 

total_steps = len(train_loader)*epochs
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)

SentimentRNN(
  (embedding): Embedding(74067, 200)
  (lstm): LSTM(200, 256, num_layers=3, batch_first=True, dropout=0.5, bidirectional=True)
  (dropout): Dropout(p=0.5, inplace=False)
  (lrelu): LeakyReLU(negative_slope=0.01)
  (fc1): Linear(in_features=512, out_features=256, bias=True)
  (fc2): Linear(in_features=256, out_features=1, bias=True)
  (sig): Sigmoid()
)


In [26]:
# training params
train_loss = []
val_loss = []

train_len = len(train_loader)
val_len = len(valid_loader)



epochs = 5 # 4-5 is approx where I noticed the validation loss stop decreasing

counter = 0
print_every = 200
clip=5 # gradient clipping

for e in range(epochs):

    h = net.init_hidden(bs)

    tl = 0
    for (inputs, lengths), labels in train_loader:
        net.train()
        counter += 1

        h = tuple([each.data for each in h])

        net.zero_grad()

        output, h = net(inputs, lengths, h)

        loss = criterion(output.squeeze(), labels.float())
        
        loss.backward()
        
        nn.utils.clip_grad_norm_(net.parameters(), clip)
        optimizer.step()
        scheduler.step()

        if counter % print_every == 0:
            vl = 0
            train_loss.append(loss)

            net.eval()
            for (inputs, lengths), labels in valid_loader:
                output, val_h = net(inputs, lengths)
                vloss = criterion(output.squeeze(), labels.float())
                vl += vloss.item()
                torch.cuda.empty_cache()
                
            val_loss.append(vl/val_len)

            print("Epoch: {}/{}...".format(e+1, epochs),
                      "Step: {}...".format(counter),
                      "Loss: {:.6f}...".format(loss),
                      "Val Loss: {:.6f}".format(vl/val_len))

Epoch: 1/5... Step: 200... Loss: 0.342713... Val Loss: 0.360510
Epoch: 2/5... Step: 400... Loss: 0.419579... Val Loss: 0.314463
Epoch: 3/5... Step: 600... Loss: 0.459653... Val Loss: 0.282613
Epoch: 4/5... Step: 800... Loss: 0.347845... Val Loss: 0.284480
Epoch: 5/5... Step: 1000... Loss: 0.265060... Val Loss: 0.267617


In [29]:
def get_accuracy(loader=test_loader):
    test_losses = [] 
    num_correct = 0

    net.eval()

    for (inputs, lengths), labels in test_loader:

        output, h = net(inputs, lengths)

        test_loss = criterion(output.squeeze(), labels.float())
        test_losses.append(test_loss.item())

        pred = torch.round(output.squeeze())  

        correct_tensor = pred.eq(labels.float().view_as(pred))
        correct = np.squeeze(correct_tensor.numpy()) if not device.type == 'cuda' else np.squeeze(correct_tensor.cpu().numpy())
        num_correct += np.sum(correct)

    
    test_len = 2500
    test_acc = num_correct/test_len
    print("Test accuracy: {:.3f}".format(test_acc))

In [30]:
get_accuracy()

Test accuracy: 0.889


In [32]:
def predict_sentiment(model, review):
    model.eval()
    tokenized = tokenizer(review)
    indexed = [rew_field.vocab.stoi[word] for word in tokenized]
    tensor = torch.LongTensor(indexed).to(device).unsqueeze(1)
    prediction = net(tensor, torch.LongTensor([tensor.size(0)]).to(device))
    return prediction[0].item()

In [40]:
pos_rew = "This film is great"
predict_sentiment(net, pos_rew)

0.0035714663099497557

In [34]:
neg_rew = "This film is terrible"
predict_sentiment(net, neg_rew)

3.7507160044469856e-08