In [63]:

import random
import time

import pandas as pd
import spacy
import torch
import torch.nn.functional as F
import torchtext

# Ask cuDNN to use deterministic algorithms so repeated runs give the same
# results. This flag only matters when running on a CUDA device.
torch.backends.cudnn.deterministic = True

In [64]:
# Device used for the model and every batch in this notebook (CPU only).
DEVICE = torch.device('cpu')

Mapping (values of the `sentiment` column in the CSV):

- 0: sad
- 1: anger
- 2: fear
- 3: happy

In [65]:
# Load the cleaned caption/sentiment table with pandas for a quick visual check.
# torchtext reads the same CSV file again further down when building the dataset.
df = pd.read_csv('text_ds_clean.csv')
df

Unnamed: 0,caption,sentiment
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,1
3,i am feeling grouchy,1
4,ive been feeling a little burdened lately wasn...,0
...,...,...
17635,im having ssa examination tomorrow in the morn...,0
17636,i constantly worry about their fight against n...,3
17637,i feel its important to share this info for th...,3
17638,i truly feel that if you are passionate enough...,3


Inspired by lectures from STAT 453: Intro to Deep Learning @ UW-Madison (Spring 2021) by Sebastian Raschka


In [66]:
# Field describing how raw caption text is processed: captions are tokenized
# with spaCy's 'en_core_web_sm' English pipeline.
TEXT = torchtext.legacy.data.Field(
    tokenize='spacy', 
    tokenizer_language='en_core_web_sm'
)

In [67]:
# Field for the class label; labels are numericalized as int64 (torch.long),
# which is the target dtype F.cross_entropy expects for class indices.
LABEL = torchtext.legacy.data.LabelField(dtype=torch.long)

In [68]:
# Pair each example attribute name with the Field that processes it.
# NOTE(review): with a list of (name, field) tuples the fields are presumably
# matched to CSV columns by position — confirm the file's column order.
fields = [('caption', TEXT), ('sentiment', LABEL)]

# Read the CSV into a torchtext dataset; the header row is skipped.
dataset = torchtext.legacy.data.TabularDataset(
    path='text_ds_clean.csv', format='csv',
    skip_header=True, fields=fields)

## Split Dataset into Train/Validation/Test
Split the dataset into training, validation, and test partitions:

In [69]:
# Hold out 20% of the examples as the test set.
#
# random.seed() returns None, so passing its result as `random_state` only
# worked because the seeding happened as a side effect and split() falls back
# to the current global RNG state when it receives None. Seed first and hand
# the state over explicitly; the resulting split is the same.
random.seed(100)
train_data, test_data = dataset.split(
    split_ratio=[0.8, 0.2],
    random_state=random.getstate())


In [70]:
# Carve 15% of the remaining training examples out as the validation set.
# Caution: this cell rebinds `train_data`, so re-running it on its own splits
# the already-reduced training set again; re-run the previous cell first.
#
# random.seed() returns None, so it must not be passed as `random_state`
# directly; seed the global RNG and pass its state explicitly instead.
random.seed(100)
train_data, valid_data = train_data.split(
    split_ratio=[0.85, 0.15],
    random_state=random.getstate())


In [71]:
# Inspect one training example: its tokenized caption and its raw label string.
print(vars(train_data.examples[5]))

{'caption': ['i', 'm', 'also', 'feeling', 'cranky', 'about', 'it', 'because', 'the', 'main', 'characters', 'scientist', 'brother', 'observing', 'the', 'moon', 'mentions', 'that', 'there', 'is', 'zero', 'gravity', 'there'], 'sentiment': '1'}


## Build Vocabulary

The vocabulary keeps only the 15,000 most frequent tokens from the training split (plus the special `<unk>` and `<pad>` tokens).

In [72]:
# Build both vocabularies from the training split only, so nothing from the
# validation/test splits influences the token or label indices.
# max_size caps the number of regular tokens kept in the text vocabulary.
TEXT.build_vocab(train_data, max_size=15000)
LABEL.build_vocab(train_data)

Some sanity checks:

In [73]:
# The 20 most frequent tokens in the training split, with their counts.
print(TEXT.vocab.freqs.most_common(20))

[('i', 21741), ('feel', 8428), ('and', 7023), ('to', 6645), ('the', 6121), ('a', 4586), ('feeling', 3843), ('that', 3800), ('of', 3659), ('my', 3145), ('in', 2515), ('it', 2290), ('m', 2147), ('like', 2099), ('so', 1892), ('for', 1738), ('have', 1720), ('was', 1716), ('me', 1698), ('but', 1645)]


In [74]:
# itos = integer-to-string: the tokens stored at indices 0..9.
print(TEXT.vocab.itos[:10]) 

['<unk>', '<pad>', 'i', 'feel', 'and', 'to', 'the', 'a', 'feeling', 'that']


In [75]:
# Look up the integer index assigned to a single token.
print(TEXT.vocab.stoi['the']) # stoi = string-to-integer

6


**Class labels:**

In [76]:
# Mapping from the raw label string in the CSV to the class index the model
# is trained on. The two are not the same number, so predictions must be
# decoded with this mapping.
print(LABEL.vocab.stoi)

defaultdict(None, {'3': 0, '0': 1, '1': 2, '2': 3})


**Class label count:**

In [77]:
# Number of training examples per raw label (shows the class imbalance).
LABEL.vocab.freqs

Counter({'0': 3956, '1': 1825, '2': 1575, '3': 4639})

In [78]:
# Create one batch iterator per split (train / validation / test), each
# yielding batches of 128 examples placed on DEVICE.
# sort_key tells the iterator how to measure an example's length (number of
# caption tokens); examples inside a batch are not sorted.
train_loader, valid_loader, test_loader = \
    torchtext.legacy.data.BucketIterator.splits(
        (train_data, valid_data, test_data),
         batch_size=128,
         sort_within_batch=False,
         sort_key=lambda x: len(x.caption),
         device=DEVICE
    )

In [79]:
class RNN(torch.nn.Module):
    """LSTM text classifier: embedding -> LSTM -> linear layer on the final hidden state."""

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        """Create the three layers.

        Args:
            input_dim: number of rows in the embedding table (vocabulary size).
            embedding_dim: size of each token embedding vector.
            hidden_dim: size of the LSTM hidden state.
            output_dim: number of logits produced per input sequence.
        """
        super().__init__()

        # Layers are created in this order on purpose: parameter initialisation
        # consumes the global torch RNG, so the order affects seeded results.
        self.embedding = torch.nn.Embedding(num_embeddings=input_dim,
                                            embedding_dim=embedding_dim)
        self.rnn = torch.nn.LSTM(input_size=embedding_dim,
                                 hidden_size=hidden_dim)
        self.fc = torch.nn.Linear(in_features=hidden_dim,
                                  out_features=output_dim)

    def forward(self, text):
        """Map a batch of token indices to class logits.

        Args:
            text: integer tensor of shape [sentence length, batch size].

        Returns:
            Tensor of shape [batch size, output_dim] holding unnormalized logits.
        """
        # [sentence length, batch size] -> [sentence length, batch size, embedding dim]
        token_vectors = self.embedding(text)

        # Only the final hidden state is used; it has shape [1, batch size, hidden dim].
        step_outputs, (last_hidden, last_cell) = self.rnn(token_vectors)

        # Drop the leading layer axis -> [batch size, hidden dim], then classify.
        return self.fc(last_hidden.squeeze(0))

In [80]:
# Seed torch's RNG so the weight initialisation below is repeatable.
torch.manual_seed(100)
model = RNN(input_dim=len(TEXT.vocab),
            embedding_dim=128,
            hidden_dim=256,
            output_dim=4 # one logit per emotion class (four classes in this dataset)
)

# Move the parameters to DEVICE before creating the optimizer that tracks them.
model = model.to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)

In [81]:
def compute_accuracy(model, data_loader, device):
    """Return the classification accuracy of `model` over `data_loader`, in percent.

    The model is evaluated in eval mode with gradients disabled; its previous
    train/eval mode is restored before returning.

    Args:
        model: callable (normally a torch.nn.Module) mapping a feature batch
            to logits of shape [batch size, number of classes].
        data_loader: iterable of batches, each unpacking into
            (features, targets) tensors.
        device: device the batches are moved to before the forward pass.

    Returns:
        0-dim float tensor with the accuracy in percent, or NaN when
        `data_loader` yields no examples (accuracy is undefined then).
    """
    # Remember the current mode so evaluation does not leave the model in eval
    # mode in the middle of training.
    was_training = getattr(model, 'training', False)
    if was_training:
        model.eval()

    # A tensor accumulator keeps the return type consistent even for an empty
    # loader (a plain int 0 has no .float()).
    correct_pred = torch.zeros((), dtype=torch.long, device=device)
    num_examples = 0

    try:
        with torch.no_grad():
            for features, targets in data_loader:
                features = features.to(device)
                targets = targets.to(device)

                logits = model(features)
                predicted_labels = torch.argmax(logits, dim=1)

                num_examples += targets.size(0)
                correct_pred += (predicted_labels == targets).sum()
    finally:
        if was_training:
            model.train()

    if num_examples == 0:
        return torch.tensor(float('nan'), device=device)
    return correct_pred.float() / num_examples * 100

In [82]:
# Wall-clock timestamp taken when this cell runs (not referenced by train_Model).
start_time = time.time()


def train_Model(no_of_epochs):
    """Train the module-level `model` for `no_of_epochs` epochs.

    Uses the notebook globals `model`, `optimizer`, `train_loader`,
    `valid_loader`, `test_loader` and `DEVICE`. Prints the loss every 50
    batches, the training/validation accuracy after each epoch, and the test
    accuracy once training has finished.

    Args:
        no_of_epochs: number of full passes over `train_loader`.
    """
    for epoch in range(no_of_epochs):
        model.train()
        for batch_idx, batch_data in enumerate(train_loader):

            text = batch_data.caption.to(DEVICE)
            labels = batch_data.sentiment.to(DEVICE)

            ### FORWARD AND BACK PROP
            logits = model(text)
            loss = F.cross_entropy(logits, labels)
            optimizer.zero_grad()

            loss.backward()

            ### UPDATE MODEL PARAMETERS
            optimizer.step()

            ### LOGGING
            if not batch_idx % 50:
                # Report the requested epoch count rather than a hard-coded 15.
                print(f'Epoch: {epoch+1:03d}/{no_of_epochs:03d} | '
                      f'Loss: {loss:.4f}')

        with torch.no_grad():
            print(f'training accuracy: '
                  f'{compute_accuracy(model, train_loader, DEVICE):.2f}%'
                  f'\nvalid accuracy: '
                  f'{compute_accuracy(model, valid_loader, DEVICE):.2f}%')

    print(f'Test accuracy: {compute_accuracy(model, test_loader, DEVICE):.2f}%')

In [83]:
# Run the full training loop for 15 epochs (prints progress as it goes).
train_Model(15)

Epoch: 001/015 | Loss: 1.4114
Epoch: 001/015 | Loss: 1.2333
training accuracy: 38.78%
valid accuracy: 36.94%
Epoch: 002/015 | Loss: 1.1817
Epoch: 002/015 | Loss: 1.2753
training accuracy: 38.82%
valid accuracy: 36.32%
Epoch: 003/015 | Loss: 1.3504
Epoch: 003/015 | Loss: 1.2867
training accuracy: 38.87%
valid accuracy: 36.18%
Epoch: 004/015 | Loss: 1.3251
Epoch: 004/015 | Loss: 1.2286
training accuracy: 39.02%
valid accuracy: 34.86%
Epoch: 005/015 | Loss: 1.2905
Epoch: 005/015 | Loss: 1.3399
training accuracy: 39.05%
valid accuracy: 34.81%
Epoch: 006/015 | Loss: 1.2435
Epoch: 006/015 | Loss: 1.1908
training accuracy: 50.13%
valid accuracy: 46.53%
Epoch: 007/015 | Loss: 0.9830
Epoch: 007/015 | Loss: 0.8144
training accuracy: 63.51%
valid accuracy: 57.96%
Epoch: 008/015 | Loss: 0.6768
Epoch: 008/015 | Loss: 0.4585
training accuracy: 83.56%
valid accuracy: 78.79%
Epoch: 009/015 | Loss: 0.2842
Epoch: 009/015 | Loss: 0.2833
training accuracy: 95.78%
valid accuracy: 87.01%
Epoch: 010/015 | Lo

Model output index → emotion (this follows `LABEL.vocab.stoi` above, not the raw CSV labels):

- Happy : 0
- Sad   : 1
- Anger : 2
- Fear  : 3

In [100]:
# A blank English pipeline is enough here: only its tokenizer is used.
nlp = spacy.blank("en")

def predict_sentiment(model, sentence):
    """Run `model` on a single raw sentence.

    Args:
        model: trained classifier taking a [sentence length, 1] index tensor.
        sentence: raw text to classify.

    Returns:
        Tuple (prediction, shape): the model's logits for the sentence and the
        shape of the index tensor that was fed to the model.

    Raises:
        ValueError: if `sentence` contains no tokens.
    """
    model.eval()
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    if not tokenized:
        raise ValueError("sentence must contain at least one token")

    # Use .get() instead of indexing: `stoi` is a defaultdict, and indexing it
    # with an unknown token would insert that token into the vocabulary as a
    # side effect (and into any vocab file written afterwards).
    stoi = TEXT.vocab.stoi
    unk_index = stoi[TEXT.unk_token]
    indexed = [stoi.get(t, unk_index) for t in tokenized]

    # Shape [sentence length] -> [sentence length, 1] (a batch of one).
    tensor = torch.LongTensor(indexed).to(DEVICE)
    tensor = tensor.unsqueeze(1)

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        prediction = model(tensor)
    return prediction, tensor.shape

predict_sentiment(model, "count not the troubles but the joys")

(tensor([[ 0.4724, -0.2632, -0.0916, -0.5106]], grad_fn=<AddmmBackward>),
 torch.Size([7, 1]))

### Exporting our model to TorchScript (for mobile) and saving it:


In [85]:
# Make sure the parameters live on the CPU before exporting.
cpu_model = model.cpu()
# Compile the module to TorchScript so it can be loaded without this Python code.
torchscript_model = torch.jit.script(cpu_model)
# Serialize the scripted module to disk.
torch.jit.save(torchscript_model, "MemeSentiment_model_V4.pt")

### Downloading the vocab

In [86]:
def save_vocab(vocab, path):
    """Write the token -> index table of `vocab` to a text file.

    One entry is written per line as ``<token>,<index> `` followed by a
    newline. The space before the newline is kept so files stay identical to
    those produced before. Tokens are written verbatim, so a token that itself
    contains a comma must be parsed by splitting on the last comma of the line.

    Args:
        vocab: object exposing a `stoi` mapping from token string to index.
        path: destination file; an existing file is overwritten.
    """
    # Explicit UTF-8 keeps the file independent of the platform's default
    # encoding, which matters for non-ASCII tokens.
    with open(path, 'w', encoding='utf-8') as f:
        f.writelines(f'{token},{index} \n' for token, index in vocab.stoi.items())

In [87]:
# Export the text vocabulary so the token -> index mapping is available
# alongside the saved model.
save_vocab(TEXT.vocab,'vocab.txt')