In [1]:
# simple LSTM
import numpy as np
import pandas as pd
from string import punctuation
import torch
from nltk.tokenize import word_tokenize
from torch.utils.data import TensorDataset, DataLoader
from torch import nn
from torch import optim

In [2]:
with open("../data/sentiment_labelled_sentences/sentiment.txt") as f:
    reviews = f.read()

In [3]:
data =  pd.DataFrame([review.split("\t") for review in reviews.split("\n")])
data.columns = ["review", 'sentiment']
data = data.sample(frac=1, random_state=0)

In [4]:
data.head()

Unnamed: 0,review,sentiment
311,One more thing: I can tolerate political incor...,0
1025,That's right....the red velvet cake.....ohhh t...,1
1587,I hate those things as much as cheap quality b...,0
2941,I have tried these cables with my computer and...,1
2980,Echo Problem....Very unsatisfactory,0


In [5]:
def split_words_reviews(data):
    text = data.review.tolist()
    clean_text = []
    for t in text:
        clean_text.append(t.translate(str.maketrans('', '', punctuation)).lower().rstrip())
    tokenized = [word_tokenize(x) for x in clean_text]
    all_text  = []
    for tokens in tokenized:
        for to in tokens:
            all_text.append(to)
    return tokenized, set(all_text)
                
    

In [6]:
reviews, vocab = split_words_reviews(data)
labels = np.array([int(x) for x in data['sentiment'].values])
reviews[0]

['one',
 'more',
 'thing',
 'i',
 'can',
 'tolerate',
 'political',
 'incorrectness',
 'very',
 'well',
 'im',
 'all',
 'for',
 'artistic',
 'freedom',
 'and',
 'suspension',
 'of',
 'disbelief',
 'but',
 'the',
 'slavic',
 'female',
 'character',
 'was',
 'just',
 'too',
 'much']

In [7]:
# assemble lookuo dictionaires

In [8]:
def create_dictionaries(words):

    word_to_int_dict = {w: i+1 for i, w in enumerate(words)}
    int_to_word_dict = {i: w for w, i in word_to_int_dict.items()}
    return word_to_int_dict, int_to_word_dict
    

In [9]:
word_to_int_dict, int_to_word_dict = create_dictionaries(sorted(vocab))
int_to_word_dict

{1: '010',
 2: '1',
 3: '10',
 4: '100',
 5: '1010',
 6: '11',
 7: '110',
 8: '1199',
 9: '12',
 10: '13',
 11: '15',
 12: '15lb',
 13: '17',
 14: '18',
 15: '18th',
 16: '1928',
 17: '1947',
 18: '1948',
 19: '1949',
 20: '1971',
 21: '1973',
 22: '1979',
 23: '1980s',
 24: '1986',
 25: '1995',
 26: '1998',
 27: '2',
 28: '20',
 29: '2000',
 30: '2005',
 31: '2006',
 32: '2007',
 33: '20th',
 34: '20the',
 35: '2160',
 36: '23',
 37: '24',
 38: '25',
 39: '2mp',
 40: '3',
 41: '30',
 42: '30s',
 43: '325',
 44: '34ths',
 45: '35',
 46: '350',
 47: '375',
 48: '3o',
 49: '4',
 50: '40',
 51: '400',
 52: '40min',
 53: '42',
 54: '45',
 55: '4s',
 56: '5',
 57: '50',
 58: '5020',
 59: '510',
 60: '5320',
 61: '54',
 62: '5of',
 63: '5year',
 64: '6',
 65: '680',
 66: '7',
 67: '70',
 68: '70000',
 69: '700w',
 70: '70s',
 71: '744',
 72: '750',
 73: '785',
 74: '8',
 75: '80',
 76: '80s',
 77: '810',
 78: '8125',
 79: '815pm',
 80: '8525',
 81: '8530',
 82: '8pm',
 83: '9',
 84: '90',
 8

In [10]:
# decide on padding (max & mean length)
lengths = [len(x) for x in reviews]
print(np.max(lengths), np.median(lengths), np.mean(lengths))

70 10.0 11.783666666666667


In [11]:
input_size = 50


In [12]:
def pad_text(tokenized_reviews, seq_length):
    reviews = []
    for review in tokenized_reviews:
        if len(review) >= seq_length:
            reviews.append(review[:seq_length])
        else:
            reviews.append(['']*(seq_length - len(review)) + review )
    return np.array(reviews)

In [13]:
padded_sentences = pad_text(reviews, seq_length = 50)

In [14]:
padded_sentences[0]

array(['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
       '', '', '', '', '', 'one', 'more', 'thing', 'i', 'can', 'tolerate',
       'political', 'incorrectness', 'very', 'well', 'im', 'all', 'for',
       'artistic', 'freedom', 'and', 'suspension', 'of', 'disbelief',
       'but', 'the', 'slavic', 'female', 'character', 'was', 'just',
       'too', 'much'], dtype='<U33')

In [15]:
# add empty token to dictionary
word_to_int_dict[''] = 0
int_to_word_dict[0] = ''

In [16]:
# now encode sentences
encoded_sentences = np.array([[word_to_int_dict[word] for word in review ] for review in padded_sentences])

In [17]:
encoded_sentences[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
       3272, 3059, 4752, 2364,  726, 4828, 3568, 2426, 5109, 5231, 2387,
        204, 1908,  326, 1943,  245, 4629, 3244, 1366,  688, 4728, 4300,
       1797,  815, 5187, 2600, 4839, 3094])

In [18]:
# model architecture

In [20]:
class SentimentLSTM(nn.Module):
    def __init__(self, n_vocab, n_embed, n_hidden, n_output, n_layers, drop_p=0.8):
        super().__init__()
        self.n_vocab = n_vocab
        self.n_embed = n_embed
        self.n_hidden = n_hidden
        self.n_layers = n_layers
        
        self.embedding = nn.Embedding(n_vocab, n_embed)
        self.lstm = nn.LSTM(n_embed, n_hidden, n_layers, batch_first = True, dropout = drop_p)
        self.dropout = nn.Dropout(drop_p)
        self.fc = nn.Linear(n_hidden, n_output)
        self.sigmoid = nn.Sigmoid()
        
    
    def forward(self, input_words):
        
        embedded_words = self.embedding(input_words)
        lstm_out, h = self.lstm(embedded_words)
        lstm_out = self.dropout(lstm_out)
        lstm_out = lstm_out.contiguous().view(-1, self.n_hidden)
        fc_out = self.fc(lstm_out)
        sigmoid_out = self.sigmoid(fc_out)
        sigmoid_out = sigmoid_out.view(batch_size, -1)
        
        sigmoid_last = sigmoid_out[:, -1]
        
        return sigmoid_last, h
    
    def init_hidden(self, batch_size):
        device = "cpu"
        weights = next(self.parameters()).data
        h = (weights.new(self.n_layers, 
                         batch_size,
                         self.n_hidden
                        ).zero_().to(device),
             weights.new(self.n_layers, 
                         batch_size,
                         self.n_hidden
                        ).zero_().to(device),
            )
        return h
    

In [21]:
n_vocab = len(word_to_int_dict)
n_embed = 50
n_hidden = 100
n_output = 1
n_layers = 2

net = SentimentLSTM(n_vocab, n_embed, n_hidden, n_output, n_layers)
net

SentimentLSTM(
  (embedding): Embedding(5401, 50)
  (lstm): LSTM(50, 100, num_layers=2, batch_first=True, dropout=0.8)
  (dropout): Dropout(p=0.8, inplace=False)
  (fc): Linear(in_features=100, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [22]:
# training loop

In [23]:
sentence_labels = np.array([int(x) for x in data['sentiment'].values])


In [24]:
train_ratio = 0.8
valid_ratio = (1 - train_ratio) / 2


In [25]:
total = len(encoded_sentences)
train_cutoff = int(total*train_ratio)
valid_cutoff = int(total * (1 - valid_ratio))
print(train_cutoff, valid_cutoff)

train_x, train_y = torch.Tensor(encoded_sentences[:train_cutoff]).long(),               torch.Tensor(sentence_labels[:train_cutoff]).long()
valid_x, valid_y = torch.Tensor(encoded_sentences[train_cutoff : valid_cutoff]).long(), torch.Tensor(sentence_labels[train_cutoff:valid_cutoff]).long()
test_x, test_y   = torch.Tensor(encoded_sentences[valid_cutoff:]).long(),               torch.Tensor(sentence_labels[valid_cutoff:])

train_data = TensorDataset(train_x, train_y)
valid_data = TensorDataset(valid_x, valid_y)
test_data  = TensorDataset(test_x, test_y)


In [26]:
total, len(train_data), len(valid_data), len(test_data)

(3000, 2400, 300, 300)

In [27]:
batch_size = 1 # single sentence

train_loader = DataLoader(train_data, batch_size = batch_size, shuffle = True)
valid_loader = DataLoader(valid_data, batch_size = batch_size, shuffle = True)
test_loader = DataLoader(test_data, batch_size = batch_size, shuffle = True)

In [28]:
print_every = 2400
step = 0
n_epochs = 5
clip = 5
criterion = nn.BCELoss()
optimizer = optim.Adam(net.parameters(), lr = 0.001)


In [29]:

for epoch in range(n_epochs):
    h = net.init_hidden(batch_size)
    
    for inputs, labels in train_loader:
        step +=1
        net.zero_grad()
        output, h = net(inputs)
        loss  =  criterion(output.squeeze(), labels.float())
        loss.backward()
        nn.utils.clip_grad_norm(net.parameters(), clip)
        optimizer.step()
        
        if (step % print_every) == 0:
            net.eval()
            valid_losses = []
            
            for v_inputs, v_labels in valid_loader:
                v_output, v_h = net(v_inputs)
                v_loss = criterion(v_output.squeeze(), v_labels.float())
                valid_losses.append(v_loss.item())
                
            print("Epoch: {}/{}".format((epoch+1), n_epochs),
                  "Step: {}".format(step),
                  "Training Loss: {:.4f}".format(loss.item()),
                  "Validation Loss: {:.4f}".format(np.mean(valid_losses)))
            
            net.train()
        
        

  return F.binary_cross_entropy(input, target, weight=self.weight, reduction=self.reduction)
  nn.utils.clip_grad_norm(net.parameters(), clip)


Epoch: 1/5 Step: 2400 Training Loss: 0.8810 Validation Loss: 0.6239
Epoch: 2/5 Step: 4800 Training Loss: 1.1317 Validation Loss: 0.6526
Epoch: 3/5 Step: 7200 Training Loss: 0.0112 Validation Loss: 0.6450
Epoch: 4/5 Step: 9600 Training Loss: 0.0564 Validation Loss: 0.7117
Epoch: 5/5 Step: 12000 Training Loss: 0.0079 Validation Loss: 0.7279


In [30]:
torch.save(net.state_dict(), 'model.pkl')

In [31]:
net = SentimentLSTM(n_vocab, n_embed, n_hidden, n_output, n_layers)
net.load_state_dict(torch.load('model.pkl'))

<All keys matched successfully>

In [32]:
labels

tensor([1])

In [33]:
net.eval()
test_losses = []
num_correct = 0

for inputs, labels in  test_loader:
    
    test_output, test_h = net(inputs)
    loss = criterion(test_output, labels.float())
    test_losses.append(loss.item())
    preds =  torch.round(test_output.squeeze())
    correct_tensor = preds.eq(labels.float().view_as(preds))
    correct = np.squeeze(correct_tensor.numpy())
    num_correct += np.sum(correct)
print(f"test loss {np.mean(test_losses)}")
print(f"test accuracy {num_correct / len(test_loader.dataset)}")

test loss 1.0144421436911215
test accuracy 0.7533333333333333


In [34]:
seq_length = 50

In [58]:
def preprocess_review(review):
    seq_length = 50
    review.translate(str.maketrans('', '', punctuation)).lower().rstrip()
    tokenized  = word_tokenize(review)
    if len(tokenized) >= 50:
        review = tokenized[:seq_length]
    else:
        review = ['']*(seq_length - len(tokenized)) + tokenized 
        
    final  = []
    for token in review:
        try:
            final.append(word_to_int_dict[token])
        except:
            final.append(word_to_int_dict['']) # if new word
            
    return final

In [61]:
def predict(review):
    net.eval()
    words = np.array([preprocess_review(review)])
    padded_words = torch.from_numpy(words)
    pred_loader = DataLoader(padded_words, batch_size=1, shuffle=True)
    for x in pred_loader:
        output = net(x)[0].item()
    msg = "Positive rev." if output > 0.5 else "Negative rev."
    print(msg)
    print(f"prediction: {output}")
    

In [65]:
predict("The film was good")

Positive rev.
prediction: 0.9965888261795044


In [66]:
predict("The film was bad")

Negative rev.
prediction: 0.02365453727543354
