In [1]:
# simple LSTM
import numpy as np
import pandas as pd
from string import punctuation
import torch
from nltk.tokenize import word_tokenize
from torch.utils.data import TensorDataset, DataLoader
from torch import nn
from torch import optim

In [2]:
with open("../data/sentiment_labelled_sentences/sentiment.txt") as f:
    reviews = f.read()

In [3]:
data =  pd.DataFrame([review.split("\t") for review in reviews.split("\n")])
data.columns = ["review", 'sentiment']
data.head()

Unnamed: 0,review,sentiment
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [4]:
def split_words_reviews(data):
    text = data.review.tolist()
    clean_text = []
    for t in text:
        clean_text.append(t.translate(str.maketrans('', '', punctuation)).lower().rstrip())
    tokenized = [word_tokenize(x) for x in clean_text]
    all_text  = []
    for tokens in tokenized:
        for to in tokens:
            all_text.append(to)
    return tokenized, set(all_text)
                
    

In [5]:
reviews, vocab = split_words_reviews(data)
labels = np.array([int(x) for x in data['sentiment'].values])


In [6]:
# assemble lookuo dictionaires

In [7]:
def create_dictionaries(words):

    word_to_int_dict = {w: i+1 for i, w in enumerate(words)}
    int_to_word_dict = {i: w for w, i in word_to_int_dict.items()}
    return word_to_int_dict, int_to_word_dict
    

In [8]:
word_to_int_dict, int_to_word_dict = create_dictionaries(vocab)
print(word_to_int_dict['imperial'])
int_to_word_dict[2]

3725


'clearly'

In [9]:
# decide on padding (max & mean length)
lengths = [len(x) for x in reviews]
print(np.max(lengths), np.median(lengths), np.mean(lengths))

70 10.0 11.783666666666667


In [10]:
input_size = 50


In [11]:
def pad_text(tokenized_reviews, seq_length):
    reviews = []
    for review in tokenized_reviews:
        if len(review) >= seq_length:
            reviews.append(review[:seq_length])
        else:
            reviews.append(['']*(seq_length - len(review)) + review )
    return np.array(reviews)

In [12]:
padded_sentences = pad_text(reviews, seq_length = 50)

In [13]:
padded_sentences[0]

array(['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
       '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
       '', '', '', 'a', 'very', 'very', 'very', 'slowmoving', 'aimless',
       'movie', 'about', 'a', 'distressed', 'drifting', 'young', 'man'],
      dtype='<U33')

In [14]:
# add empty token to dictionary
word_to_int_dict[''] = 0
int_to_word_dict[0] = ''

In [15]:
# now encode sentences
encoded_sentences = np.array([[word_to_int_dict[word] for word in review ] for review in padded_sentences])

In [16]:
encoded_sentences[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0, 3257,  606,  606,  606, 4777, 1626, 2794,
       4597, 3257, 3551, 3225, 2139, 3714])

In [17]:
# model architecture

In [18]:
from torch import nn

In [34]:
class SentimentLSTM(nn.Module):
    def __init__(self, n_vocab, n_embed, n_hidden, n_output, n_layers, drop_p=0.8):
        super().__init__()
        self.n_vocab = n_vocab
        self.n_embed = n_embed
        self.n_hidden = n_hidden
        self.n_layers = n_layers
        
        self.embedding = nn.Embedding(n_vocab, n_embed)
        self.lstm = nn.LSTM(n_embed, n_hidden, n_layers, batch_first = True, dropout = drop_p)
        self.dropout = nn.Dropout(drop_p)
        self.fc = nn.Linear(n_hidden, n_output)
        self.sigmoid = nn.Sigmoid()
        
    
    def forward(self, input_words):
        
        embedded_words = self.embedding(input_words)
        lstm_out, h = self.lstm(embedded_words)
        lstm_out = self.dropout(lstm_out)
        lstm_out = lstm_out.contiguous().view(-1, self.n_hidden)
        fc_out = self.fc(lstm_out)
        sigmoid_out = self.sigmoid(fc_out)
        sigmoid_out = sigmoid_out.view(batch_size, -1)
        sigmoid_last = sigmoid_out[:, -1]
        
        return sigmoid_last, h
    
    def init_hidden(self, batch_size):
        device = "cpu"
        weights = next(self.parameters()).data
        h = (weights.new(self.n_layers, 
                         batch_size,
                         self.n_hidden
                        ).zero_().to(device),
             weights.new(self.n_layers, 
                         batch_size,
                         self.n_hidden
                        ).zero_().to(device),
            )
        return h
    

In [35]:
n_vocab = len(word_to_int_dict)
n_embed = 50
n_hidden = 100
n_output = 1
n_layers = 2

net = SentimentLSTM(n_vocab, n_embed, n_hidden, n_output, n_layers)
net

SentimentLSTM(
  (embedding): Embedding(5401, 50)
  (lstm): LSTM(50, 100, num_layers=2, batch_first=True, dropout=0.8)
  (dropout): Dropout(p=0.8, inplace=False)
  (fc): Linear(in_features=100, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [29]:
# training loop

In [30]:
train_ratio = 0.8
valid_ratio = (1 - train_ratio) / 2


In [23]:
total = len(encoded_sentences)
train_cutoff = int(total*train_ratio)
valid_cutoff = train_cutoff + int(total*valid_ratio) + 1

train_x, train_y = torch.Tensor(encoded_sentences[:train_cutoff]).long(), torch.Tensor(labels[:train_cutoff]).long()
valid_x, valid_y = torch.Tensor(encoded_sentences[train_cutoff:valid_cutoff]).long(), torch.Tensor(labels[train_cutoff:valid_cutoff]).long()
test_x, test_y = torch.Tensor(encoded_sentences[valid_cutoff:]).long(), torch.Tensor(labels[valid_cutoff:]).long()

train_data = TensorDataset(train_x, train_y)
valid_data = TensorDataset(valid_x, valid_y)
test_data = TensorDataset(test_x, test_y)


In [31]:
total, len(train_data), len(valid_data), len(test_data)

(3000, 2400, 300, 300)

In [36]:
batch_size = 1 # single sentence

train_loader = DataLoader(train_data, batch_size = batch_size, shuffle = True)
valid_loader = DataLoader(valid_data, batch_size = batch_size, shuffle = True)
test_loader = DataLoader(test_data, batch_size = batch_size, shuffle = True)

In [None]:
print_every = 200
step = 0
n_epochs = 5
clip = 5
criterion = nn.BCELoss()
optimizer = optim.Adam(net.parameters(), lr = 0.001)

for epoch in range(n_epochs):
    h = net.init_hidden(batch_size)
    for inputs, labels in train_loader:
        step +=1
        net.zero_grad()
        output, h = net(inputs)
        loss  =  criterion(output.squeeze(), labels.float())
        loss.backward()
        nn.utils.clip_grad_norm(net.parameters(), clip)
        optimizer.step()
        
        if (step % print_every) == 0:
            net.eval()
            valid_losses = []
            for v_inputs, v_labels in valid_loader:
                v_output, v_h = net(v_inputs)
                v_loss = criterion(v_output.squeeze(), v_labels.float())
                valid_losses.append(v_loss.item())
                print(f"epoch {epoch}/{n_epochs}, step {step}, training loss: {loss.item()} , valid_loss: {np.mean(valid_losses)}")

                net.train()
        
        

  nn.utils.clip_grad_norm(net.parameters(), clip)
