In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

FOLDERNAME = 'Colab\ Notebooks/Amazon Reviews for Sentiment Analysis'
%cd drive/MyDrive/$FOLDERNAME/

Mounted at /content/drive
/content/drive/MyDrive/Colab Notebooks/Amazon Reviews for Sentiment Analysis


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import pandas as pd
import os
import time

In [None]:
# Reproducibility: fixed RNG seed, deterministic cuDNN kernels, no cuDNN autotuning
torch.manual_seed(seed=42)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [None]:
# Use the first GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
  device = torch.device("cuda:0")
else:
  device = torch.device("cpu")
print(device)

cuda:0


In [None]:
def get_reviews_and_labels(filepath, fraction=0.506):
  """Read labelled reviews from a text file and keep the leading part of it.

  Each non-blank line is read positionally: the character at index 9 is taken
  as a 1-based integer label and everything from index 10 on is the review text.

  Args:
    filepath: Path of the text file to read.
    fraction: Share of the records (from the top of the file) to keep, in [0, 1].
      Defaults to 0.506, the value previously hard-coded here.

  Returns:
    A `(reviews, labels)` pair of equal-length lists; labels are shifted to be
    0-based (`int(line[9]) - 1`).

  Raises:
    ValueError: If `fraction` is outside [0, 1].
  """
  if not 0 <= fraction <= 1:
    raise ValueError(f'fraction must be within [0, 1], got {fraction!r}')

  reviews = []
  labels = []
  # Explicit encoding so the result does not depend on the platform default.
  with open(filepath, 'r', encoding='utf-8') as f:
    for line in f:
      # Blank lines (e.g. a trailing newline) have no label at index 9; skip them.
      if not line.strip():
        continue
      reviews.append(line[10:].strip())
      labels.append(int(line[9]) - 1)

  keep = int(len(reviews) * fraction)
  return reviews[:keep], labels[:keep]

# Load the labelled reviews for the training file and the test file
train_reviews, train_labels = get_reviews_and_labels(filepath='train.ft.txt')
test_reviews, test_labels = get_reviews_and_labels(filepath='test.ft.txt')

In [None]:
# Each pair is (substring to find, text to put in its place); the pairs are
# split column-wise into the two parallel lists used by preprocessing().
patterns, replacements = (
    list(column)
    for column in zip(
        ('<br />', ' '),
        ('--', ' '),
        ('.', ' '),
        (',', ' '),
        ('!', ' '),
        ('?', ' '),
        (')', ' '),
        ('(', ' '),
        (';', ' '),
        (':', ' '),
        ('*', ' '),
        ('~', ' '),
        ('_', ' '),
        ("'", ''),
        ('"', ''),
    )
)

In [None]:
def preprocessing(reviews, patterns, replacements):
  """Lower-case every review and apply the substring substitutions in order.

  The list is modified in place: each element is overwritten with its cleaned
  text, and the very same list object is returned.

  Args:
    reviews: List of review strings; overwritten element by element.
    patterns: Substrings to look for, applied in the given order.
    replacements: Replacement text paired position-wise with `patterns`.

  Returns:
    The `reviews` list that was passed in, now holding the cleaned strings.
  """
  for position, raw_text in enumerate(reviews):
    cleaned = raw_text.lower()
    for old, new in zip(patterns, replacements):
      cleaned = cleaned.replace(old, new)
    reviews[position] = cleaned
  return reviews

In [None]:
# preprocessing() overwrites the list it is given and returns that same list,
# so after this cell `test_datas` and `test_reviews` name one shared, cleaned list.
train_reviews = preprocessing(reviews=train_reviews, patterns=patterns, replacements=replacements)
test_datas = preprocessing(reviews=test_reviews, patterns=patterns, replacements=replacements)

In [None]:
from sklearn import model_selection

# Hold out 40% of the loaded training reviews for validation (fixed random_state).
# The training names are rebound to the remaining 60%.
train_reviews, val_reviews, train_labels, val_labels = model_selection.train_test_split(
    train_reviews,
    train_labels,
    test_size=0.4,
    random_state=42,
)

In [None]:
# Maximum number of tokens kept per review before truncation/padding
longest_num_tokens = 250
# Number of reviews in each split
num_train, num_val, num_test = (
    len(split) for split in (train_reviews, val_reviews, test_datas)
)

In [None]:
def indexing_tokens(reviews=None):
  """Build the token -> integer id vocabulary from whitespace-split reviews.

  Ids 0-3 are reserved for '<start>', '<end>', '<pad>' and '<unk>'. Every other
  token receives the next free id in order of first appearance.

  Args:
    reviews: Iterable of review strings. When omitted, the notebook-level
      `train_reviews` list is used, which keeps the original zero-argument
      call working.

  Returns:
    Dict mapping each token (and the four special tokens) to its id.
  """
  if reviews is None:
    # Iterate the list itself instead of range(num_train), so a stale
    # `num_train` can no longer truncate or overrun the vocabulary pass.
    reviews = train_reviews
  indices = {'<start>': 0, '<end>': 1, '<pad>': 2, '<unk>': 3}
  for review in reviews:
    for token in review.split():
      # len(indices) is always the next unused id because ids are dense from 0.
      indices.setdefault(token, len(indices))
  return indices

In [None]:
def _encode_review(review, indices, longest_num_tokens):
  """Encode one review as [<start>, ids..., <pad>..., <end>] of fixed length."""
  unk_id = indices['<unk>']
  token_ids = [indices.get(token, unk_id) for token in review.split()[:longest_num_tokens]]
  # Right-pad up to the fixed length; the padding sits before the <end> marker.
  token_ids.extend([indices['<pad>']] * (longest_num_tokens - len(token_ids)))
  return [indices['<start>']] + token_ids + [indices['<end>']]


def get_data(indices, longest_num_tokens, mode='train'):
  """Encode one dataset split into fixed-length id sequences plus labels.

  The split is read from notebook-level lists: `train_reviews`/`train_labels`
  for 'train', `val_reviews`/`val_labels` for 'val', and
  `test_datas`/`test_labels` for any other `mode` value.

  Each review is whitespace-split, truncated to `longest_num_tokens` tokens,
  padded with '<pad>' up to that length, then wrapped in '<start>' ... '<end>',
  giving `longest_num_tokens + 2` ids per review. Tokens missing from
  `indices` map to '<unk>' in every mode (train mode used to raise KeyError).

  Args:
    indices: Token -> id dict containing the four special tokens.
    longest_num_tokens: Non-negative number of real/pad token slots per review.
    mode: 'train', 'val', or anything else for the test split.

  Returns:
    `(data, Y)`: list of id lists and the matching list of labels.

  Raises:
    ValueError: If `longest_num_tokens` is negative.
  """
  if longest_num_tokens < 0:
    raise ValueError('longest_num_tokens must be non-negative')

  if mode == 'train':
    reviews, labels = train_reviews, train_labels
  elif mode == 'val':
    reviews, labels = val_reviews, val_labels
  else:
    reviews, labels = test_datas, test_labels

  data = []
  Y = []
  for label, review in zip(labels, reviews):
    # The old code called insert(indices['<start>'], 0), i.e. with index and
    # value swapped; it only worked because '<start>' happens to have id 0.
    data.append(_encode_review(review, indices, longest_num_tokens))
    Y.append(label)
  return data, Y

In [None]:
# Build the vocabulary, then encode the training and validation splits.
# Note: `train_labels` / `val_labels` are rebound to the label lists returned by get_data().
indices = indexing_tokens()
train_data, train_labels = get_data(indices=indices, longest_num_tokens=longest_num_tokens, mode='train')
val_data, val_labels = get_data(indices=indices, longest_num_tokens=longest_num_tokens, mode='val')

In [None]:
# Report how many encoded samples each split has and how large the vocabulary is
for caption, size in (
    ('Number of training:', len(train_data)),
    ('Number of validation:', len(val_data)),
    ('Length of corpus:', len(indices)),
):
  print(caption, size)

Number of training: 1092960
Number of validation: 728640
Length of corpus: 672492


In [None]:
# Wrap the encoded train/validation sequences and their labels as tensors
train_tensor, train_labels_tensor = torch.tensor(train_data), torch.tensor(train_labels)
val_tensor, val_labels_tensor = torch.tensor(val_data), torch.tensor(val_labels)

In [None]:
# Show the shape of every train/validation tensor
for caption, shown in (
    ('Train Tensor:', train_tensor),
    ('Val Tensor:', val_tensor),
    ('Train Label Tensor:', train_labels_tensor),
    ('Val Label Tensor:', val_labels_tensor),
):
  print(caption, shown.shape)

Train Tensor: torch.Size([1092960, 252])
Val Tensor: torch.Size([728640, 252])
Train Label Tensor: torch.Size([1092960])
Val Label Tensor: torch.Size([728640])


In [None]:
# --- Model dimensions ---
num_embeddings = len(indices)   # one embedding row per vocabulary entry
embedding_dim, hidden_dim, output_dim = 300, 256, 2
sequence_len = 252              # presumably longest_num_tokens (250) + <start> + <end>

# --- Training loop settings ---
batch_size = 64
print_every = 6000              # train() runs an evaluation every `print_every` batches

In [None]:
class MyModel(nn.Module):
  """Sequence classifier: token embedding -> single LSTM -> linear layer.

  Args:
    num_embeddings: Number of rows in the embedding table.
    embedding_dim: Size of each embedding vector (LSTM input size).
    hidden_dim: LSTM hidden state size.
    output_dim: Number of scores produced per sequence.
  """

  def __init__(self, num_embeddings, embedding_dim, hidden_dim, output_dim):
    super().__init__()
    self.embedding_layer = nn.Embedding(num_embeddings=num_embeddings, embedding_dim=embedding_dim)
    self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, batch_first=True)
    self.fc = nn.Linear(in_features=hidden_dim, out_features=output_dim)

  def forward(self, x):
    """Score a batch of id sequences.

    `x` is a (batch, seq_len) tensor of token ids. Embedding yields
    (batch, seq_len, embedding_dim) and the batch-first LSTM yields
    (batch, seq_len, hidden_dim). Only the output at the final time step is
    passed to the linear layer, giving (batch, output_dim).
    """
    embedded = self.embedding_layer(x)
    per_step_states, _ = self.lstm(embedded)
    last_step = per_step_states[:, -1, :]
    return self.fc(last_step)

In [None]:
# Build the classifier and move it to the device selected earlier in the notebook.
# `.to(device)` follows the CPU fallback of `device`, whereas `.cuda()` fails on
# runtimes without a GPU even though the batches are moved with `.to(device)`.
model = MyModel(num_embeddings, embedding_dim, hidden_dim, output_dim)
model = model.to(device)

In [None]:
# Inputs and labels are batched by separate loaders that are later zipped together,
# so both must iterate in the same order: shuffle stays off (the default) on all of them.
mini_trains = DataLoader(dataset=train_tensor, batch_size=batch_size, shuffle=False)
mini_train_labels = DataLoader(dataset=train_labels_tensor, batch_size=batch_size, shuffle=False)

mini_vals = DataLoader(dataset=val_tensor, batch_size=batch_size, shuffle=False)
mini_val_labels = DataLoader(dataset=val_labels_tensor, batch_size=batch_size, shuffle=False)

In [None]:
# Training loop
def train(num_epoch, model, mini_trains, mini_train_labels, mini_vals, mini_val_labels, device, loss_function, optimizer):
  """Optimise `model` for `num_epoch` passes over the paired training batches.

  Input batches and label batches come from two separate iterables that are
  zipped together. Every `print_every` batches (a notebook-level setting),
  starting with the first batch of each epoch, the validation accuracy is
  reported through `evaluate_predictor`.

  Args:
    num_epoch: Number of passes over the training batches.
    model: Module to train.
    mini_trains / mini_train_labels: Iterables of input and label batches.
    mini_vals / mini_val_labels: Validation batches forwarded to the evaluator.
    device: Device each batch is moved to before the forward pass.
    loss_function: Callable taking (scores, labels) and returning the loss.
    optimizer: Optimizer updating the model parameters.
  """
  for epoch in range(num_epoch):
    step = 0
    for inputs, targets in zip(mini_trains, mini_train_labels):
      # evaluate_predictor() switches the model to eval mode, so re-enable
      # training mode at the start of every step.
      model.train()
      inputs, targets = inputs.to(device), targets.to(device)
      loss = loss_function(model(inputs), targets)
      optimizer.zero_grad()
      loss.backward()
      optimizer.step()
      if step % print_every == 0:
        evaluate_predictor(model, epoch, mini_vals, mini_val_labels, device)
      step += 1

In [None]:
def evaluate_predictor(model, epoch, mini_vals, mini_val_labels, device):
  """Print and return the accuracy of `model` over the given paired batches.

  The model is put in eval mode and left there. Accuracy is the number of
  correct arg-max predictions divided by the number of labels actually seen,
  so the result no longer depends on a notebook-level dataset variable.

  Args:
    model: Module mapping an input batch to per-class scores.
    epoch: Zero-based epoch index; printed as `epoch + 1`.
    mini_vals: Iterable of input batches.
    mini_val_labels: Iterable of label batches, aligned with `mini_vals`.
    device: Device each batch is moved to.

  Returns:
    The accuracy as a float (0.0 when there are no batches).
  """
  model.eval()
  correct = 0
  seen = 0
  with torch.no_grad():
    for x, y in zip(mini_vals, mini_val_labels):
      x = x.to(device)
      y = y.to(device)
      scores = model(x)
      predictions = scores.max(1)[1]
      correct += predictions.eq(y).sum().item()
      seen += y.size(0)
  # Guard against empty loaders instead of dividing by zero.
  accuracy = correct / seen if seen else 0.0
  print(f'Epoch[{epoch+1}] Acc: {accuracy}')
  return accuracy

In [None]:
# import torch.optim as optim

# Cross-entropy over the class scores, optimised with Adam at a learning rate of 1e-4
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-4)

In [None]:
# Run five training epochs; validation accuracy is printed along the way
train(
    num_epoch=5,
    model=model,
    mini_trains=mini_trains,
    mini_train_labels=mini_train_labels,
    mini_vals=mini_vals,
    mini_val_labels=mini_val_labels,
    device=device,
    loss_function=loss_function,
    optimizer=optimizer,
)

Epoch[1] Acc: 0.5017937527448397
Epoch[1] Acc: 0.4982062472551603
Epoch[1] Acc: 0.4982062472551603
Epoch[2] Acc: 0.4982062472551603
Epoch[2] Acc: 0.6306200592885376
Epoch[2] Acc: 0.8600914031620553
Epoch[3] Acc: 0.9091650197628458
Epoch[3] Acc: 0.9240365612648221
Epoch[3] Acc: 0.9294452678963548
Epoch[4] Acc: 0.9348622090469917
Epoch[4] Acc: 0.9352011967501098
Epoch[4] Acc: 0.9397686100131752
Epoch[5] Acc: 0.9411959266578832
Epoch[5] Acc: 0.9415884387351778
Epoch[5] Acc: 0.9436525581906017


In [None]:
# Encode the test split (`test_labels` is rebound to the labels returned by get_data()),
# then wrap the sequences and labels as tensors
test_data, test_labels = get_data(indices=indices, longest_num_tokens=longest_num_tokens, mode='test')
test_tensor, test_labels_tensor = torch.tensor(test_data), torch.tensor(test_labels)

In [None]:
# Show the shapes of the test tensors
for caption, shown in (
    ('Test Tensor:', test_tensor),
    ('Test Label Tensor:', test_labels_tensor),
):
  print(caption, shown.shape)

Test Tensor: torch.Size([202400, 252])
Test Label Tensor: torch.Size([202400])


In [None]:
# One review per batch, in dataset order, for both the inputs and the labels
mini_tests = DataLoader(dataset=test_tensor, batch_size=1, shuffle=False)
mini_test_labels = DataLoader(dataset=test_labels_tensor, batch_size=1)

In [None]:
def predict(model, mini_tests, mini_test_labels, device):
  """Predict a class for every sample and report accuracy and elapsed time.

  Works for any batch size: each batch's arg-max predictions are appended with
  `tolist()` (the earlier `.item()` call only accepted single-sample batches).
  Accuracy is computed over the labels actually seen rather than over a
  notebook-level dataset variable.

  Args:
    model: Module mapping an input batch to per-class scores; put in eval mode.
    mini_tests: Iterable of input batches.
    mini_test_labels: Iterable of label batches, aligned with `mini_tests`.
    device: Device each batch is moved to.

  Returns:
    Flat list of predicted class indices (Python ints), in iteration order.
  """
  model.eval()
  start = time.time()
  correct = 0
  seen = 0
  predictions = []
  with torch.no_grad():
    for x, y in zip(mini_tests, mini_test_labels):
      x = x.to(device)
      y = y.to(device)
      scores = model(x)
      prediction = scores.max(1)[1]
      predictions.extend(prediction.tolist())
      correct += prediction.eq(y).sum().item()
      seen += y.size(0)
  elapsed = time.time() - start
  # Guard against empty loaders instead of dividing by zero.
  accuracy = correct / seen if seen else 0.0
  print(f'Testing acc: {accuracy}, tiem_spent: {round(elapsed, 2)} sec')
  return predictions

In [None]:
# Predict a label for every test review with the trained model
predictions = predict(
    model=model,
    mini_tests=mini_tests,
    mini_test_labels=mini_test_labels,
    device=device,
)
# submission = pd.DataFrame(zip(test_reviews, predictions), columns=['text','label'])
# submission.to_csv('submission_lstm.csv', index=False)

Testing acc: 0.942895256916996, tiem_spent: 636.72 sec


In [None]:
# submission.to_csv('submission_lstm.csv', index=False)

In [None]:
# submission = pd.DataFrame(zip(test_reviews, predictions, test_labels), columns=['text','label', 'ans'])
# submission.to_csv('Amazon_Reviews_for_Sentiment_Analysis_lstm.csv', index=False)

In [None]:
def out_file(predictions, test_reviews, test_labels, out_filename):
    """Write predictions to a text file, one `__label__N<TAB>review` line each.

    Args:
        predictions: Iterable of 0-based predicted class indices; written 1-based.
        test_reviews: Iterable of review texts, paired position-wise with
            `predictions` (pairing stops at the shorter of the two).
        test_labels: Unused; kept so existing callers keep working.
        out_filename: Path of the file to create or overwrite.
    """
    print('\n===============================================')
    print(f'Writing predictions to --> {out_filename}')
    # Explicit encoding so non-ASCII review text is written the same way on every platform.
    with open(out_filename, 'w', encoding='utf-8') as out:
        for prediction, test_review in zip(predictions, test_reviews):
            out.write('__label__' + str(prediction + 1) + '\t' + str(test_review) + '\n')
    print('===============================================')

# Save the predicted labels next to their review text
out_file(
    predictions=predictions,
    test_reviews=test_reviews,
    test_labels=test_labels,
    out_filename='Amazon_Reviews_for_Sentiment_Analysis_lstm_0.506.txt',
)


Writing predictions to --> Amazon_Reviews_for_Sentiment_Analysis_lstm_0.506.txt
