In [2]:
!unzip -q "stanfordSentimentTreebank.zip"

### Preparing the dataset

In [7]:
import pandas as pd

In [8]:
!ls stanfordSentimentTreebank/

datasetSentences.txt  dictionary.txt		README.txt	      SOStr.txt
datasetSplit.txt      original_rt_snippets.txt	sentiment_labels.txt  STree.txt


In [9]:
df_dictionary = pd.read_csv("stanfordSentimentTreebank/dictionary.txt",sep='|',header = None)
df_labels = pd.read_csv("stanfordSentimentTreebank/sentiment_labels.txt",sep='|')

In [10]:
df = pd.merge(df_dictionary, df_labels, how='inner', left_on=1, right_on='phrase ids')

In [11]:
def score_to_label(score):
  score = int (score * 24)+1
  return score

In [12]:
df['label'] = df.apply(lambda row: score_to_label(row['sentiment values']), axis=1)

In [13]:
df.head()

Unnamed: 0,0,1,phrase ids,sentiment values,label
0,!,0,0,0.5,13
1,! ',22935,22935,0.52778,13
2,! '',18235,18235,0.5,13
3,! Alas,179257,179257,0.44444,11
4,! Brilliant,22936,22936,0.86111,21


In [14]:
df.shape

(239232, 5)

In [15]:
df['label'].value_counts()

13    58467
14    23319
11    20303
17    14091
8     12733
10    12175
16    11247
15    10510
12     9630
20     9490
19     9272
7      8330
18     6989
5      6845
9      6454
6      5082
22     3130
4      2821
21     2802
23     1696
3      1538
2      1471
1       437
24      264
25      136
Name: label, dtype: int64

In [16]:
# Import Library
import random
import torch, torchtext
from torchtext import data 

# Manual Seed
SEED = 43
torch.manual_seed(SEED)

<torch._C.Generator at 0x7f25caf14fd8>

In [17]:
Review = data.Field(sequential = True, tokenize = 'spacy', batch_first =True, include_lengths=True)
Label = data.LabelField(is_target=True, batch_first =True, dtype=torch.float, sequential=False)

In [18]:
fields = [('review', Review),('labels',Label)]

In [19]:
example = [data.Example.fromlist([df[0][i],df['label'][i]], fields) for i in range(df.shape[0])]

In [20]:
dataset = data.Dataset(example, fields)

In [22]:
(train, valid) = dataset.split(split_ratio=[0.80, 0.20])

In [23]:
(len(train), len(valid))

(191386, 47846)

In [24]:
vars(train.examples[13])

{'labels': 12, 'review': ['Strangely']}

In [25]:
Review.build_vocab(train)
Label.build_vocab(train)

In [26]:
print('Size of input vocab : ', len(Review.vocab))
print('Size of label vocab : ', len(Label.vocab))
print('Top 10 words appreared repeatedly :', list(Review.vocab.freqs.most_common(10)))
print('Labels : ', Label.vocab.stoi)

Size of input vocab :  20816
Size of label vocab :  25
Top 10 words appreared repeatedly : [('the', 61179), (',', 56385), ('a', 43695), ('of', 41697), ('and', 41488), ('.', 30394), ('to', 29724), ('-', 28889), ("'s", 22483), ('is', 18198)]
Labels :  defaultdict(<function _default_unk_index at 0x7f257cd96f28>, {13: 0, 14: 1, 11: 2, 17: 3, 8: 4, 10: 5, 16: 6, 15: 7, 12: 8, 20: 9, 19: 10, 7: 11, 18: 12, 5: 13, 9: 14, 6: 15, 22: 16, 4: 17, 21: 18, 23: 19, 3: 20, 2: 21, 1: 22, 24: 23, 25: 24})


In [27]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [28]:
train_iterator, valid_iterator = data.BucketIterator.splits((train, valid), batch_size = 32, 
                                                            sort_key = lambda x: len(x.review),
                                                            sort_within_batch=True, device = device)

In [30]:
import os, pickle
with open('tomato_tokenizer.pkl', 'wb') as tokens: 
    pickle.dump(Review.vocab.stoi, tokens)

### Building the model

In [31]:
import torch.nn as nn
import torch.nn.functional as F

class classifier(nn.Module):
    
    # Define all the layers used in model
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, dropout):
        
        super().__init__()          
        
        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        # LSTM layer
        self.encoder = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers=n_layers, 
                           dropout=dropout,
                           batch_first=True)
        # try using nn.GRU or nn.RNN here and compare their performances
        # try bidirectional and compare their performances
        
        # Dense layer
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, text, text_lengths):
        
        # text = [batch size, sent_length]
        embedded = self.embedding(text)
        # embedded = [batch size, sent_len, emb dim]
      
        # packed sequence
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths.cpu(), batch_first=True)
        
        packed_output, (hidden, cell) = self.encoder(packed_embedded)
        #hidden = [batch size, num layers * num directions,hid dim]
        #cell = [batch size, num layers * num directions,hid dim]
    
        # Hidden = [batch size, hid dim * num directions]
        dense_outputs = self.fc(hidden[-1])   
        
        # Final activation function softmax
        output = F.softmax(dense_outputs, dim=-1)
            
        return output

In [32]:
size_of_vocab = len(Review.vocab)
embedding_dim = 300
num_hidden_nodes = 200
num_output_nodes = 25
num_layers = 2
dropout = 0.2

# Instantiate the model
model = classifier(size_of_vocab, embedding_dim, num_hidden_nodes, num_output_nodes, num_layers, dropout = dropout)

In [37]:
model

classifier(
  (embedding): Embedding(20816, 300)
  (encoder): LSTM(300, 200, num_layers=2, batch_first=True, dropout=0.2)
  (fc): Linear(in_features=200, out_features=25, bias=True)
)

In [38]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)
    
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 6,973,025 trainable parameters


In [39]:
import torch.optim as optim

# define optimizer and loss
optimizer = optim.Adam(model.parameters(), lr=2e-4)
criterion = nn.CrossEntropyLoss()

# define metric
def binary_accuracy(preds, y):
    #round predictions to the closest integer
    _, predictions = torch.max(preds, 1)
    
    correct = (predictions == y).float() 
    acc = correct.sum() / len(correct)
    return acc
    
# push to cuda if available
model = model.to(device)
criterion = criterion.to(device)

Training

In [40]:

def train_loop(model, iterator, optimizer, criterion):
    
    # initialize every epoch 
    epoch_loss = 0
    epoch_acc = 0
    
    # set the model in training phase
    model.train()  
    
    for batch in iterator:
        
        # resets the gradients after every batch
        optimizer.zero_grad()   
        
        # retrieve text and no. of words
        review, review_lengths = batch.review   
        
        # convert to 1D tensor
        predictions = model(review, review_lengths).squeeze()  
        
        # compute the loss
        loss = criterion(predictions, batch.labels.long())
        
        # compute the binary accuracy
        acc = binary_accuracy(predictions, batch.labels)   
        
        # backpropage the loss and compute the gradients
        loss.backward()       
        
        # update the weights
        optimizer.step()      
        
        # loss and accuracy
        epoch_loss += loss.item()  
        epoch_acc += acc.item()    
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

Evaluation

In [41]:
def evaluate(model, iterator, criterion):
    
    # initialize every epoch
    epoch_loss = 0
    epoch_acc = 0

    # deactivating dropout layers
    model.eval()
    
    # deactivates autograd
    with torch.no_grad():
    
        for batch in iterator:
        
            # retrieve text and no. of words
            review, review_lengths = batch.review
            
            # convert to 1d tensor
            predictions = model(review, review_lengths).squeeze()
            
            # compute loss and accuracy
            loss = criterion(predictions, batch.labels.long())
            acc = binary_accuracy(predictions, batch.labels)
            
            # keep track of loss and accuracy
            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [42]:
N_EPOCHS = 15
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
     
    # train the model
    train_loss, train_acc = train_loop(model, train_iterator, optimizer, criterion)
    
    # evaluate the model
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    # save the best model
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'saved_weights.pt')
    
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}% \n')

	Train Loss: 3.047 | Train Acc: 24.43%
	 Val. Loss: 3.043 |  Val. Acc: 24.21% 

	Train Loss: 3.040 | Train Acc: 24.49%
	 Val. Loss: 3.043 |  Val. Acc: 24.21% 

	Train Loss: 3.040 | Train Acc: 24.49%
	 Val. Loss: 3.043 |  Val. Acc: 24.21% 

	Train Loss: 3.040 | Train Acc: 24.49%
	 Val. Loss: 3.043 |  Val. Acc: 24.21% 

	Train Loss: 3.040 | Train Acc: 24.49%
	 Val. Loss: 3.043 |  Val. Acc: 24.21% 

	Train Loss: 3.040 | Train Acc: 24.50%
	 Val. Loss: 3.043 |  Val. Acc: 24.21% 

	Train Loss: 3.040 | Train Acc: 24.50%
	 Val. Loss: 3.043 |  Val. Acc: 24.21% 

	Train Loss: 3.040 | Train Acc: 24.51%
	 Val. Loss: 3.043 |  Val. Acc: 24.21% 

	Train Loss: 3.040 | Train Acc: 24.52%
	 Val. Loss: 3.043 |  Val. Acc: 24.21% 

	Train Loss: 3.040 | Train Acc: 24.52%
	 Val. Loss: 3.043 |  Val. Acc: 24.21% 

	Train Loss: 3.040 | Train Acc: 24.53%
	 Val. Loss: 3.043 |  Val. Acc: 24.21% 

	Train Loss: 3.040 | Train Acc: 24.55%
	 Val. Loss: 3.043 |  Val. Acc: 24.21% 

	Train Loss: 3.040 | Train Acc: 24.57%
	

Model Evaluation

In [43]:
path='./saved_weights.pt'
model.load_state_dict(torch.load(path));
model.eval();
tokenizer_file = open('./tokenizer.pkl', 'rb')
tokenizer = pickle.load(tokenizer_file)

#inference 

import spacy
nlp = spacy.load('en')

def classify_review(tweet):
    
    categories = {
        0: "Very Negative",
        1: "Negative",
        2: "Neutral",
        3: "Positive",
        4: "Very Positive"
      }
    
    # tokenize the tweet 
    tokenized = [tok.text for tok in nlp.tokenizer(tweet)] 
    # convert to integer sequence using predefined tokenizer dictionary
    indexed = [tokenizer[t] for t in tokenized]        
    # compute no. of words        
    length = [len(indexed)]
    # convert to tensor                                    
    tensor = torch.LongTensor(indexed).to(device)   
    # reshape in form of batch, no. of words           
    tensor = tensor.unsqueeze(1).T  
    # convert to tensor                          
    length_tensor = torch.LongTensor(length)
    # Get the model prediction                  
    prediction = model(tensor, length_tensor)

    _, pred = torch.max(prediction, 1) 
    
    return categories[Label.vocab.stoi[pred.item()]]

In [44]:
classify_review("Profoundly, exhaustingly mediocre, a fussily overdone bit of mindless prestige filmmaking.")

'Very Negative'

In [46]:
classify_review("Best entertainer of the year , edge of seat thriller.")

'Very Negative'

### Augmentation techniques for text

### Random Deletion

In [47]:
def random_deletion(words, p=0.5): 
    if len(words) == 1: # return if single word
        return words
    remaining = list(filter(lambda x: random.uniform(0,1) > p,words)) 
    if len(remaining) == 0: # if not left, sample a random word
        return [random.choice(words)] 
    else:
        return remaining

In [50]:
random_deletion(["This"," movie", "is" , "very" ," good"])

['This', ' movie', 'is']

### Random Swap

In [56]:
def random_swap(sentence, n=5): 
    length = range(len(sentence)) 
    for _ in range(n):
        idx1, idx2 = random.sample(length, 2)
        sentence[idx1], sentence[idx2] = sentence[idx2], sentence[idx1] 
    return sentence

In [57]:
random_swap('Dialogs were very good in this movie'.split(' '))

['Dialogs', 'were', 'in', 'this', 'good', 'very', 'movie']

### Back Translation

In [58]:
!pip install google_trans_new

Collecting google_trans_new
  Downloading https://files.pythonhosted.org/packages/f9/7b/9f136106dc5824dc98185c97991d3cd9b53e70a197154dd49f7b899128f6/google_trans_new-1.1.9-py3-none-any.whl
Installing collected packages: google-trans-new
Successfully installed google-trans-new-1.1.9


In [64]:
from google_trans_new import google_translator  

translator = google_translator()  
translate_text = translator.translate('Tomorrow is wednesday',lang_tgt='fr')  
print(translate_text)

Demain c'est mercredi 
