In [44]:
# !wget http://nlp.stanford.edu/~socherr/stanfordSentimentTreebank.zip
# !unzip -q stanfordSentimentTreebank.zip

In [45]:
import pandas as pd

In [46]:
datasetSentences = pd.read_csv('/content/stanfordSentimentTreebank/datasetSentences.txt', sep='\t')
# datasetSplit = pd.read_csv('/content/stanfordSentimentTreebank/datasetSplit.txt')
sentimentLabels = pd.read_csv('/content/stanfordSentimentTreebank/sentiment_labels.txt', sep='|')
dictionary = pd.read_csv('/content/stanfordSentimentTreebank/dictionary.txt', sep='|', names=['sentence', 'phrase ids'])

In [47]:
dictionary.tail()

Unnamed: 0,sentence,phrase ids
239227,zoning ordinances to protect your community fr...,220441
239228,zzzzzzzzz,179256
239229,élan,220442
239230,É,220443
239231,É um passatempo descompromissado,220444


In [48]:
sentimentLabels.tail()

Unnamed: 0,phrase ids,sentiment values
239227,239227,0.36111
239228,239228,0.38889
239229,239229,0.33333
239230,239230,0.88889
239231,239231,0.5


In [49]:
sentimentLabels.shape, dictionary.shape

((239232, 2), (239232, 2))

In [50]:
dataset_df = pd.merge(dictionary, sentimentLabels, on='phrase ids' )

In [51]:
dataset_df.set_index('phrase ids', inplace=True)

In [52]:
dataset_df.shape

(239232, 2)

In [53]:
dataset_df.tail()

Unnamed: 0_level_0,sentence,sentiment values
phrase ids,Unnamed: 1_level_1,Unnamed: 2_level_1
220441,zoning ordinances to protect your community fr...,0.13889
179256,zzzzzzzzz,0.19444
220442,élan,0.51389
220443,É,0.5
220444,É um passatempo descompromissado,0.5


In [54]:
dist_labels='very negative, negative, neutral, positive, very positive'.split(',')
dist_labels

['very negative', ' negative', ' neutral', ' positive', ' very positive']

In [55]:
dataset_df['sentiment_labels'] = pd.qcut(dataset_df['sentiment values'], q=[0, .2, .4, .6, .8, 1], labels=dist_labels )

In [56]:
dataset_df.tail()

Unnamed: 0_level_0,sentence,sentiment values,sentiment_labels
phrase ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
220441,zoning ordinances to protect your community fr...,0.13889,very negative
179256,zzzzzzzzz,0.19444,very negative
220442,élan,0.51389,neutral
220443,É,0.5,negative
220444,É um passatempo descompromissado,0.5,negative


In [57]:
dataset_df['sentiment_labels'].value_counts()/dataset_df.shape[0]

 negative         0.352190
 positive         0.209190
very negative     0.203969
 very positive    0.179395
 neutral          0.055256
Name: sentiment_labels, dtype: float64

### I guess its pretty hard to sate the hunger of man...
More than 55% of all the sentiments are negative....

In [58]:
# Import Library
import random
import torch, torchtext
from torchtext import data 

# Manual Seed
SEED = 43
torch.manual_seed(SEED)

<torch._C.Generator at 0x7fa35256ad80>

In [59]:
def random_deletion(words, p=0.5): 
    if len(words) == 1: # return if single word
        return words
    remaining = list(filter(lambda x: random.uniform(0,1) > p,words)) 
    if len(remaining) == 0: # if not left, sample a random word
        return [random.choice(words)] 
    else:
        return remaining

In [60]:
def random_swap(sentence, n=5): 
    length = range(len(sentence))
    if len(sentence) > 1: 
        for _ in range(n):
            idx1, idx2 = random.sample(length, 2)
            sentence[idx1], sentence[idx2] = sentence[idx2], sentence[idx1] 
    return sentence

In [61]:
!pip install googletrans==3.1.0a0
import random
import googletrans
from googletrans import Translator

translator = Translator()
sentence = ['The dog slept on the rug']

available_langs = list(googletrans.LANGUAGES.keys()) 
trans_lang = random.choice(available_langs) 
print(f"Translating to {googletrans.LANGUAGES[trans_lang]}")

translations = translator.translate(sentence, dest=trans_lang) 
t_text = [t.text for t in translations]
print(t_text)

translations_en_random = translator.translate(t_text, src=trans_lang, dest='en') 
en_text = [t.text for t in translations_en_random]
print(en_text)

Translating to zulu
['The dog slept on the rug']
['The dog slept on the rug']


In [62]:
translator = Translator()
available_langs = list(googletrans.LANGUAGES.keys()) 
def back_translate(sentence):
    trans_lang = random.choice(available_langs) 
    translations = translator.translate(sentence, dest=trans_lang)
    t_text = [t.text for t in translations]
    translations_en_random = translator.translate(t_text, src=trans_lang, dest='en') 
    en_text = [t.text for t in translations_en_random]
    return en_text

In [63]:
Tweet = data.Field(sequential = True, tokenize = 'spacy', batch_first =True, include_lengths=True)
Label = data.LabelField(tokenize ='spacy', is_target=True, batch_first =True, sequential =False)

In [64]:
fields = [('tweets', Tweet),('labels',Label)]

In [65]:
example = [data.Example.fromlist([dataset_df.sentence[i], dataset_df.sentiment_labels[i]], fields) for i in range(dataset_df.shape[0])] 

In [66]:
len(example)

239232

In [67]:
treebankDataset = data.Dataset(example, fields)

In [68]:
(train, valid) = treebankDataset.split(split_ratio=[0.85, 0.15], random_state=random.seed(SEED))

In [69]:
(len(train), len(valid))

(203347, 35885)

Exemple grate

In [70]:
vars(train.examples[100])

{'labels': ' negative', 'tweets': [',', 'no', 'less']}

In [71]:
train_len = len(train.examples)

### Random deletion

In [72]:
from tqdm.auto import tqdm
# for i in tqdm(range(len(train_data.examples))):
#     vars(train_data.examples[i])['text'].reverse()
random_series = random.sample(range(train_len), random.randint(0,train_len-1) )
for i in tqdm(random_series):
    vars(train.examples[i])['tweets'] = random_deletion(vars(train.examples[i])['tweets'])

HBox(children=(FloatProgress(value=0.0, max=10106.0), HTML(value='')))




## Random swap

In [73]:
random_series = random.sample(range(train_len), random.randint(0,train_len-1) )
for i in tqdm(random_series):
    vars(train.examples[i])['tweets'] = random_swap(vars(train.examples[i])['tweets'], n=2)

HBox(children=(FloatProgress(value=0.0, max=98143.0), HTML(value='')))




Lets keep back translate low, for 1 sentence takes around 2.3 sec.

In [74]:
random_series = random.sample(range(train_len), 10) #random.randint(0,train_len-1) )
for i in tqdm(random_series):
    vars(train.examples[i])['tweets'] = back_translate(vars(train.examples[i])['tweets'])

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))




In [75]:
Tweet.build_vocab(train)
Label.build_vocab(train)

In [76]:
print('Size of input vocab : ', len(Tweet.vocab))
print('Size of label vocab : ', len(Label.vocab))
print('Top 10 words appreared repeatedly :', list(Tweet.vocab.freqs.most_common(10)))
print('Labels : ', Label.vocab.stoi)

Size of input vocab :  20826
Size of label vocab :  5
Top 10 words appreared repeatedly : [('the', 63532), (',', 58584), ('a', 45218), ('of', 43301), ('and', 43133), ('.', 31778), ('to', 30825), ('-', 29989), ("'s", 23408), ('is', 18945)]
Labels :  defaultdict(<function _default_unk_index at 0x7fa3017f97b8>, {' negative': 0, ' positive': 1, 'very negative': 2, ' very positive': 3, ' neutral': 4})


In [77]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [78]:
train_iterator, valid_iterator = data.BucketIterator.splits((train, valid), batch_size = 32, 
                                                            sort_key = lambda x: len(x.tweets),
                                                            sort_within_batch=True, device = device)

In [79]:
import torch.nn as nn
import torch.nn.functional as F

class classifier(nn.Module):
    
    # Define all the layers used in model
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, dropout):
        
        super().__init__()          
        
        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        # LSTM layer
        self.encoder = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers=n_layers, 
                           dropout=dropout,
                           batch_first=True)
        # try using nn.GRU or nn.RNN here and compare their performances
        # try bidirectional and compare their performances
        
        # Dense layer
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, text, text_lengths):
        
        # text = [batch size, sent_length]
        embedded = self.embedding(text)
        # embedded = [batch size, sent_len, emb dim]
      
        # packed sequence
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths.cpu(), batch_first=True)
        
        packed_output, (hidden, cell) = self.encoder(packed_embedded)
        #hidden = [batch size, num layers * num directions,hid dim]
        #cell = [batch size, num layers * num directions,hid dim]
    
        # Hidden = [batch size, hid dim * num directions]
        dense_outputs = self.fc(hidden)   
        
        # Final activation function softmax
        output = F.softmax(dense_outputs[0], dim=1)
            
        return output

In [80]:
# Define hyperparameters
size_of_vocab = len(Tweet.vocab)
embedding_dim = 600
num_hidden_nodes = 200
num_output_nodes = 5
num_layers = 4
dropout = 0.2

# Instantiate the model
model = classifier(size_of_vocab, embedding_dim, num_hidden_nodes, num_output_nodes, num_layers, dropout = dropout)

In [81]:
print(model)

#No. of trianable parameters
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)
    
print(f'The model has {count_parameters(model):,} trainable parameters')

classifier(
  (embedding): Embedding(20826, 600)
  (encoder): LSTM(600, 200, num_layers=4, batch_first=True, dropout=0.2)
  (fc): Linear(in_features=200, out_features=5, bias=True)
)
The model has 14,103,005 trainable parameters


In [82]:
import torch.optim as optim

# define optimizer and loss
optimizer = optim.Adam(model.parameters(), lr=2e-4)
criterion = nn.CrossEntropyLoss()

# define metric
def binary_accuracy(preds, y):
    #round predictions to the closest integer
    _, predictions = torch.max(preds, 1)
    
    correct = (predictions == y).float() 
    acc = correct.sum() / len(correct)
    return acc
    
# push to cuda if available
model = model.to(device)
criterion = criterion.to(device)

In [83]:
def train(model, iterator, optimizer, criterion):
    
    # initialize every epoch 
    epoch_loss = 0
    epoch_acc = 0
    
    # set the model in training phase
    model.train()  
    
    for batch in iterator:
        
        # resets the gradients after every batch
        optimizer.zero_grad()   
        
        # retrieve text and no. of words
        tweet, tweet_lengths = batch.tweets   
        
        # convert to 1D tensor
        predictions = model(tweet, tweet_lengths).squeeze()  
        
        # compute the loss
        loss = criterion(predictions, batch.labels)        
        
        # compute the binary accuracy
        acc = binary_accuracy(predictions, batch.labels)   
        
        # backpropage the loss and compute the gradients
        loss.backward()       
        
        # update the weights
        optimizer.step()      
        
        # loss and accuracy
        epoch_loss += loss.item()  
        epoch_acc += acc.item()    
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [84]:
def evaluate(model, iterator, criterion):
    
    # initialize every epoch
    epoch_loss = 0
    epoch_acc = 0

    # deactivating dropout layers
    model.eval()
    
    # deactivates autograd
    with torch.no_grad():
    
        for batch in iterator:
        
            # retrieve text and no. of words
            tweet, tweet_lengths = batch.tweets
            
            # convert to 1d tensor
            predictions = model(tweet, tweet_lengths).squeeze()
            
            # compute loss and accuracy
            loss = criterion(predictions, batch.labels)
            acc = binary_accuracy(predictions, batch.labels)
            
            # keep track of loss and accuracy
            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [None]:
N_EPOCHS = 20
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
     
    # train the model
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    
    # evaluate the model
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    # save the best model
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'saved_weights.pt')
    
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}% \n')

	Train Loss: 1.417 | Train Acc: 48.82%
	 Val. Loss: 1.359 |  Val. Acc: 54.17% 

	Train Loss: 1.326 | Train Acc: 57.48%
	 Val. Loss: 1.330 |  Val. Acc: 57.00% 

	Train Loss: 1.285 | Train Acc: 61.78%
	 Val. Loss: 1.318 |  Val. Acc: 58.13% 

	Train Loss: 1.256 | Train Acc: 64.94%
	 Val. Loss: 1.311 |  Val. Acc: 58.73% 

	Train Loss: 1.232 | Train Acc: 67.44%
	 Val. Loss: 1.308 |  Val. Acc: 59.11% 

	Train Loss: 1.212 | Train Acc: 69.50%
	 Val. Loss: 1.306 |  Val. Acc: 59.37% 

	Train Loss: 1.196 | Train Acc: 71.16%
	 Val. Loss: 1.305 |  Val. Acc: 59.47% 

	Train Loss: 1.182 | Train Acc: 72.61%
	 Val. Loss: 1.306 |  Val. Acc: 59.40% 

	Train Loss: 1.169 | Train Acc: 73.87%
	 Val. Loss: 1.305 |  Val. Acc: 59.44% 

	Train Loss: 1.159 | Train Acc: 74.86%
	 Val. Loss: 1.306 |  Val. Acc: 59.35% 

	Train Loss: 1.150 | Train Acc: 75.68%
	 Val. Loss: 1.306 |  Val. Acc: 59.34% 

	Train Loss: 1.143 | Train Acc: 76.33%
	 Val. Loss: 1.307 |  Val. Acc: 59.22% 

	Train Loss: 1.137 | Train Acc: 76.95%
	

### Without Data augmentation

Train Loss: 1.440 | Train Acc: 46.64%
    Val. Loss: 1.392 |  Val. Acc: 50.80% 

Train Loss: 1.345 | Train Acc: 55.56%
    Val. Loss: 1.347 |  Val. Acc: 55.33% 

Train Loss: 1.307 | Train Acc: 59.56%
    Val. Loss: 1.330 |  Val. Acc: 57.07% 

Train Loss: 1.281 | Train Acc: 62.31%
    Val. Loss: 1.320 |  Val. Acc: 58.06% 

Train Loss: 1.262 | Train Acc: 64.24%
    Val. Loss: 1.317 |  Val. Acc: 58.33% 

Train Loss: 1.246 | Train Acc: 65.96%
    Val. Loss: 1.311 |  Val. Acc: 58.80% 

Train Loss: 1.234 | Train Acc: 67.31%
    Val. Loss: 1.310 |  Val. Acc: 58.99% 

Train Loss: 1.222 | Train Acc: 68.52%
    Val. Loss: 1.307 |  Val. Acc: 59.31% 

Train Loss: 1.212 | Train Acc: 69.54%
    Val. Loss: 1.310 |  Val. Acc: 58.94% 

Train Loss: 1.203 | Train Acc: 70.45%
    Val. Loss: 1.307 |  Val. Acc: 59.31%