In [225]:
# importing packages

import re
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim

from torch.utils.data import DataLoader

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [226]:
print(torch.backends.cudnn.is_available())
print(torch.backends.cudnn.version())

True
8302


## Read the data

In [227]:
f = open('sentiment labelled sentences/imdb_labelled.txt','r').readlines() + open('sentiment labelled sentences/yelp_labelled.txt','r').readlines() + open('sentiment labelled sentences/amazon_cells_labelled.txt','r').readlines()

## Train split test

In [228]:
# prepare the training and testing sets
x_test_r, y_test = [], []
x_train_r, y_train = [], []

# Take 10% of the whole dataset for testing
for line in f[:300]:
    line = line.split('\t')
    # if len(line[0].split(" ")) < 30 and len(line[0].split(" ")) > 0:
    y_test.append(int(line[-1]))
    x_test_r.append(line[0].lower())

# Take 90% of the whole dataset for training
for line in f[300:]:
    line = line.split('\t')

    # For better training, we will eliminate some too long sentences and too short sentences. Here, I put all sentences < 30
    # if len(line[0].split(" ")) < 30 and len(line[0].split(" ")) > 0:
    y_train.append(int(line[-1]))
    x_train_r.append(line[0].lower())

In [229]:
# Remove all the special mark from our dataset
def clean_mark(li):
    char = '\!|\@|\#|\$|\%|\^|\&|\*|\(|\)|\_|\+|\{|\}|\[|\]|\;|\:|\'|\"|\<|\>|\,|\.|\/|\?|\`|\~|\-'

    formatted = []

    # replace the special char with empty strings and strip the sentences
    li_char = list(map(lambda x: re.sub(char, "", x).strip(), li))

    # Split the sentences into words
    formatted = list(map(lambda x: [i for i in x.split(" ") if i != ''], li_char))

    return formatted

In [230]:
x_train_mk, x_test_mk = clean_mark(x_train_r), clean_mark(x_test_r)

## Vocab and look up table

In [231]:
# Create the vocab and look up table from the training set, any word that is not in alphabet is set as <ukn>
vocab_r = list(set(word if word.isalpha() else "<ukn>" for senten in x_train_mk for word in senten))
vocab_r.sort()

# Put the <pad> into the vocab for padding purpose
vocab = ['<pad>'] + vocab_r

In [232]:
# Create the look up table
ind_to_word = {i: vocab[i] for i in range(len(vocab))}
word_to_ind = {w: i for i, w in ind_to_word.items()}

## Clean up data

In [233]:
# Replace all the unknown word with <ukn> in the testing dataset

def clean_unk(li, vocab):
    for senten in li:
        for i in range(len(senten)):
            if senten[i] not in vocab:
                senten[i] = '<ukn>'

    return li

x_train_ukn, x_test_ukn = clean_unk(x_train_mk, vocab), clean_unk(x_test_mk, vocab)

In [234]:
# This is where I check if the lengths of each sentence in the training dataset do not vary too much. If
# then, I can change the condition at the begining of the reading dataset.

len_train = []
for i in x_train_ukn:
    len_train.append(len(i))

print("Max:", max(len_train))
print("Min:", min(len_train))
print("Mean:", sum(len_train)/len(len_train))
print("Len:", len(x_train_ukn))
print()
# x_test_ukn = x_test_ukn[: len(x_train_ukn)//10]
print("Len test:", len(x_test_ukn))

Max: 70
Min: 1
Mean: 11.690740740740742
Len: 2700

Len test: 300


In [235]:
# Padding the datasets, the longest sentence will be the standard length of all sentences.
total = x_train_ukn + x_test_ukn

max_le = 0
for i in total:
    if len(i) > max_le:
        max_le = len(i)

# Add the <pad> to all sentences in both dataset
x_train_pad = [x + ['<pad>']*(max_le - len(x)) for x in x_train_ukn]
x_test_pad = [x + ['<pad>']*(max_le - len(x)) for x in x_test_ukn]

## Sentence to Index

In [236]:
## Function to convert word to index using the look up table

def sentence_to_ind(li, word_to_ind):
    sen_ind = []
    for sentence in li:
        sen_ind.append( [ word_to_ind[x] for x in sentence ] )

    return sen_ind

x_train, x_test = sentence_to_ind(x_train_pad, word_to_ind), sentence_to_ind(x_test_pad, word_to_ind)

In [237]:
len(x_train[0])

70

## Batching

In [238]:
# Convert all the training and testing datasets to tensor

def convert_to_tensor(x_li, y_li):
    tensor_x_li = torch.tensor(x_li)
    tensor_y_li = torch.tensor([ int(y) for y in y_li])
    return tensor_x_li, tensor_y_li

x_train_tensor, y_train_tensor = convert_to_tensor(x_train, y_train)
x_test_tensor, y_test_tensor = convert_to_tensor(x_test, y_test)

In [239]:
# Batching the datasets, with the batch size of 128

data = list(zip(x_train_tensor, y_train_tensor))
batch_size = 128
shuffle = True

loader = DataLoader(data, batch_size=batch_size, shuffle = shuffle)

## Models implementation

In [240]:
# Bidirectional LSTM model

class modelSentiment(nn.Module):
    def __init__(self, n_class, n_hidden):
        super(modelSentiment, self).__init__()

        # Declare variables
        self.n_class = n_class
        self.n_hidden = n_hidden

        # Embedding layer
        self.embed = nn.Embedding(num_embeddings=n_class, embedding_dim=100)

        # LSTM layer, bidirectional = True -> the output will have the dimesion of 2 * n_hidden
        self.lstm = nn.LSTM(input_size = 100, hidden_size = n_hidden, bidirectional=True)

        # Output layer
        self.W = nn.Linear(n_hidden * 2, 2, bias=True)
        self.probabilities = nn.Sigmoid()

    def forward(self, tensor_train): # batch_size x len_sentence [128, 19]

        # Transpose because this allows us to take the last hidden state later
        tensor_train = tensor_train.transpose(0, 1) # L x B [19, 128]

        # Embedding layer
        embed = self.embed(tensor_train) # L x B x E [19, 128, 100]
        
        # BiLSTM layer
        model, (_, _) = self.lstm(embed) # L x B x H [19, 128, 256]

        # Take the last hidden state
        model = model[-1] # B x H [128, 256]

        # Output layer
        model = self.W(model) # B x 2 [128, 2]
        model = self.probabilities(model) # B x 2 [128, 2]     
        
        return model

In [241]:
# Attention model

class Attention(nn.Module):
    def __init__(self, n_class, n_hidden):
        super(Attention, self).__init__()

        # Declare variables
        self.n_class = n_class
        self.n_hidden = n_hidden

        # Embedding layer
        self.embed = nn.Embedding(num_embeddings=n_class, embedding_dim=100)

        # LSTM layer
        # We need two RRN network, one for encoding and one for decoding.
        self.enc = nn.LSTM(input_size = 100, hidden_size = n_hidden)
        self.dec = nn.LSTM(input_size = n_hidden, hidden_size = 1)

        # Softmax layer for the weights
        self.sm = nn.Softmax(dim=1)

        # Output layer
        self.W = nn.Linear(n_hidden, 2, bias=True)
        self.probabilities = nn.Sigmoid()

    def forward(self, tensor_train): # batch_size x len_sentence [128, 19]
        
        # Embedding layer
        embed = self.embed(tensor_train) # B x L x E [128, 19, 100]
        
        # Pass the last hidden states of the encoding to the decoding to get the weights
        enc_outputs, (_, _) = self.enc(embed) # B x L x H [128, 19, 256]
        dec_outputs, (_, _) = self.dec(enc_outputs) # B x L x 1 [128, 19, 1]

        # Softmax the weights
        dec_outputs_sm = torch.tensor([])
        dec_outputs_sm = dec_outputs_sm.to(device)

        for i in dec_outputs:
            dec_outputs_sm = torch.cat( ( dec_outputs_sm, self.sm(i.squeeze(1).unsqueeze(0).to(device)).to(device) ),  0)

        dec_outputs_sm = dec_outputs_sm.unsqueeze(1) # B x 1 x L [128, 1, 19]

        # Calculate the weight sum
        weight_sum = torch.matmul(dec_outputs_sm, enc_outputs) # B x 1 x H [128, 1, 256]

        # Output layer
        model = self.W(weight_sum) # B x 1 x 2 [128, 1, 2]
        model = model.squeeze(1) # B x 2 [128, 2]
        model = self.probabilities(model) # B x 2 [128, 2]
        
        return model

## Training model LSTM

In [242]:
# Declare the n_class = number of word in the vocab
# Hidden state = 128
n_class = len(vocab)
n_hidden = 128

In [243]:
# Init model LSTM
model_lstm = modelSentiment(n_class, n_hidden)
criterion = nn.CrossEntropyLoss() 
optimizer = optim.Adam(model_lstm.parameters(), lr=0.0001)

# Put the model into the cuda for computatiing purpose on cuda
model_lstm = model_lstm.to(device)

In [244]:
# Train the data for the "epoch = 500" times
for epoch in range(500):
    tot_loss = 0 # calculate the total loss of each batch

    for x_train_l, y_train_l in loader:

        # Put the batch onto cuda
        x_train_l, y_train_l = x_train_l.to(device), y_train_l.to(device)
        
        # Set the optimizer back to 0
        optimizer.zero_grad()

        # Feed the x_train to the model
        output  = model_lstm(x_train_l)

        # Calculate the loss and backproping
        loss = criterion(output, y_train_l.squeeze(0))
        loss.backward()
    
        # Calculate the total loss and optimize
        tot_loss += loss.item()
        optimizer.step()

    # print the total loss
    if (epoch + 1) % 10 == 0:
        print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(tot_loss))


Epoch: 0010 cost = 15.258881
Epoch: 0020 cost = 15.244247
Epoch: 0030 cost = 15.250197
Epoch: 0040 cost = 15.244195
Epoch: 0050 cost = 15.242241
Epoch: 0060 cost = 15.253512
Epoch: 0070 cost = 15.246881
Epoch: 0080 cost = 15.242767
Epoch: 0090 cost = 15.255420
Epoch: 0100 cost = 15.254030
Epoch: 0110 cost = 15.246911
Epoch: 0120 cost = 15.235857
Epoch: 0130 cost = 15.248224
Epoch: 0140 cost = 15.244251
Epoch: 0150 cost = 15.245834
Epoch: 0160 cost = 15.246093
Epoch: 0170 cost = 15.256033
Epoch: 0180 cost = 15.241031
Epoch: 0190 cost = 15.247469
Epoch: 0200 cost = 15.126817
Epoch: 0210 cost = 15.286882
Epoch: 0220 cost = 15.249876
Epoch: 0230 cost = 15.244909
Epoch: 0240 cost = 15.240001
Epoch: 0250 cost = 15.244601
Epoch: 0260 cost = 15.244958
Epoch: 0270 cost = 15.240298
Epoch: 0280 cost = 15.249593
Epoch: 0290 cost = 15.251680
Epoch: 0300 cost = 15.245340
Epoch: 0310 cost = 15.249856
Epoch: 0320 cost = 15.255143
Epoch: 0330 cost = 15.249961
Epoch: 0340 cost = 15.246250
Epoch: 0350 co

## Training model Attention

In [245]:
## Training model Attention
# Declare the n_class = number of word in the vocab
# Hidden state = 128
n_class = len(vocab)
n_hidden = 256

In [246]:
# Init model Attention
model_attention = Attention(n_class, n_hidden)
criterion = nn.CrossEntropyLoss() 
optimizer = optim.Adam(model_attention.parameters(), lr=0.0001)

# Put the model into the cuda for computatiing purpose on cuda
model_attention = model_attention.to(device)

In [247]:
# Train the data for the "epoch = 500" times
for epoch in range(500):
    tot_loss = 0 # calculate the total loss of each batch

    for x_train_a, y_train_a in loader:

        # Put the batch onto cuda
        x_train_a, y_train_a = x_train_a.to(device), y_train_a.to(device)
        
        # Set the optimizer back to 0
        optimizer.zero_grad()

        # Feed the x_train to the model
        output = model_attention(x_train_a)

        # Calculate the loss and backproping
        loss = criterion(output, y_train_a.squeeze(0))
        loss.backward()
    
        # Calculate the total loss and optimize
        tot_loss += loss.item()
        optimizer.step()

    # print the total loss
    if (epoch + 1) % 10 == 0:
        print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(tot_loss))


Epoch: 0010 cost = 15.228873
Epoch: 0020 cost = 15.218556
Epoch: 0030 cost = 15.117722
Epoch: 0040 cost = 14.758621
Epoch: 0050 cost = 14.054735
Epoch: 0060 cost = 13.288648
Epoch: 0070 cost = 12.548198
Epoch: 0080 cost = 12.114475
Epoch: 0090 cost = 11.619663
Epoch: 0100 cost = 11.173743
Epoch: 0110 cost = 11.003304
Epoch: 0120 cost = 10.546913
Epoch: 0130 cost = 10.329826
Epoch: 0140 cost = 10.246729
Epoch: 0150 cost = 9.959751
Epoch: 0160 cost = 9.779602
Epoch: 0170 cost = 9.477413
Epoch: 0180 cost = 9.363777
Epoch: 0190 cost = 9.235867
Epoch: 0200 cost = 9.038111
Epoch: 0210 cost = 8.882900
Epoch: 0220 cost = 8.745993
Epoch: 0230 cost = 8.773301
Epoch: 0240 cost = 8.598092
Epoch: 0250 cost = 8.490643
Epoch: 0260 cost = 8.365248
Epoch: 0270 cost = 8.357336
Epoch: 0280 cost = 8.226789
Epoch: 0290 cost = 8.155202
Epoch: 0300 cost = 8.076519
Epoch: 0310 cost = 8.001636
Epoch: 0320 cost = 8.006361
Epoch: 0330 cost = 7.949170
Epoch: 0340 cost = 8.026319
Epoch: 0350 cost = 7.838537
Epoch:

## Testing

In [248]:
x_test_tensor = x_test_tensor.to(device)
y_test_tensor = y_test_tensor.to(device)
x_test_tensor

tensor([[   2, 4621, 4621,  ...,    0,    0,    0],
        [2886, 4173, 4759,  ...,    0,    0,    0],
        [   1,    1, 4800,  ...,    0,    0,    0],
        ...,
        [ 146, 2035, 4133,  ...,    0,    0,    0],
        [2117, 1968, 4357,  ...,    0,    0,    0],
        [2117,  945, 4299,  ...,    0,    0,    0]], device='cuda:0')

In [249]:
# Test the model
predict = model_lstm(x_test_tensor).data
predict_1 = [int(np.argmax(x)) for i, x in enumerate(predict.cpu())]

# Count the number of correct prediction
count = 0
for i in range(len(x_test_tensor)):
    if predict_1[i] == int(y_test_tensor[i]):
        count += 1
    
# Print the result
print("Accuracy: ", count/len(y_test_tensor)*100, "%")

Accuracy:  42.333333333333336 %


In [250]:
# Test the model
predict = model_attention(x_test_tensor).data
predict_1 = [int(np.argmax(x)) for i, x in enumerate(predict.cpu())]

# Count the number of correct prediction
count = 0
for i in range(len(x_test_tensor)):
    if predict_1[i] == y_test_tensor[i]:
        count += 1

# Print the result
print("Accuracy: ", count/len(y_test_tensor)*100, "%")

Accuracy:  78.0 %
