In [44]:
# Colab-only step: mount Google Drive at /content/drive so that files stored
# there can be read by path from later cells.
from google.colab import drive

drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [45]:
import numpy as np
import pandas as pd
import torch
import sys
import string
from collections import Counter

In [46]:
# Module-level vocabulary state shared by the helper functions. It is filled in
# place by initialise_vocab() and add_token() with the keys 't_2_i'
# (token -> index), 'i_2_t' (index -> token), 'add_unk', 'unk_token' and
# 'unk_token_idx'.
vocab = {}

In [47]:
def initialise_vocab():
  """Reset the global ``vocab`` so that it contains only the unknown token.

  Fresh, empty token->index ('t_2_i') and index->token ('i_2_t') tables
  replace whatever was stored before. '<UNK>' is then registered through
  ``add_token`` and its index is recorded under 'unk_token_idx' so lookups
  can fall back to it. Nothing is returned; ``vocab`` is mutated in place.
  """
  # Install the empty tables first: add_token() writes into them.
  vocab.update({'t_2_i': {}, 'i_2_t': {}})
  unknown = '<UNK>'
  vocab.update(add_unk=True, unk_token=unknown, unk_token_idx=add_token(unknown))

In [48]:
def add_token(token):
  """Return the index of ``token``, registering it first if it is new.

  A new token receives the next free index (the current size of the
  token->index table) and is written to both 't_2_i' and 'i_2_t' of the
  global ``vocab``. A token that is already known is left untouched.
  """
  token_to_index = vocab['t_2_i']
  if token not in token_to_index:
    next_idx = len(token_to_index)
    token_to_index[token] = next_idx
    vocab['i_2_t'][next_idx] = token
  return token_to_index[token]

In [49]:
def add_many_tokens(tokens):
  """Register every token in ``tokens`` and return their indices in order.

  Each token goes through ``add_token``, so repeated tokens map to the same
  index and new ones are appended to the global vocabulary.
  """
  return list(map(add_token, tokens))

In [50]:
def look_up_token(token):
  """Return the vocabulary index of ``token``.

  When an unknown-token index is configured (``vocab['unk_token_idx'] >= 0``)
  unseen tokens map to that index; otherwise an unseen token raises KeyError.
  """
  unk_idx = vocab['unk_token_idx']
  token_to_index = vocab['t_2_i']
  if unk_idx < 0:
    # No fallback configured: strict lookup.
    return token_to_index[token]
  return token_to_index.get(token, unk_idx)

In [51]:
def look_up_index(idx):
  """Return the token stored at vocabulary index ``idx``.

  Raises:
    KeyError: if ``idx`` is not present in the index->token table.
  """
  index_to_token = vocab['i_2_t']
  if idx in index_to_token:
    return index_to_token[idx]
  raise KeyError(f"Index {idx} is not there")

In [52]:
def vocab_from_dataframe(df, cutoff=25):
  """Rebuild the global vocabulary from the ``review`` column of ``df``.

  The vocabulary is reset first, then every space-separated word is counted.
  Words that are substrings of ``string.punctuation`` (which includes the
  empty string produced by consecutive spaces) are ignored. Only words seen
  strictly more than ``cutoff`` times are added, in order of first
  appearance.
  """
  initialise_vocab()
  frequencies = Counter(
      word
      for text in df.review
      for word in text.split(" ")
      if word not in string.punctuation
  )
  frequent_words = (word for word, seen in frequencies.items() if seen > cutoff)
  for word in frequent_words:
    add_token(word)

In [53]:
# Load the review dataset; later cells read its 'review' and 'rating' columns.
# NOTE(review): this is an absolute Google Drive path, so it only resolves in
# Colab after the Drive mount and with this exact folder layout.
df = pd.read_csv("/content/drive/MyDrive/ML_Projects/Mini_project_Sentiment_Classification_NLP_with_RNN/reviews.csv")

In [54]:
# Build the global vocabulary from the full dataset, keeping only words that
# occur more than 25 times.
vocab_from_dataframe(df, cutoff=25)

In [55]:
look_up_token('this')

128

In [56]:
look_up_index(128)

'this'

In [57]:
len(vocab['t_2_i'])

8945

In [58]:
#Vectorizer: given a review, it returns a 2-D NumPy array with one column per
#token, each column being that token's one-hot encoded feature vector

def vectorize(review):
  """One-hot encode a review against the current global vocabulary.

  The review is split on single spaces. Tokens that are substrings of
  ``string.punctuation`` (including the empty string produced by repeated
  spaces) are dropped, and every remaining token is mapped to its vocabulary
  index with ``look_up_token``.

  Args:
    review: the review text as a string.

  Returns:
    A float64 NumPy array of shape (vocabulary size, number of kept tokens).
    Column ``t`` is all zeros except for a 1 in the row of token ``t``.
    A review with no usable token yields an array with zero columns instead
    of failing on an unassigned local variable.
  """
  vocab_size = len(vocab['t_2_i'])
  # An explicit integer dtype keeps the empty case a valid index array.
  token_ids = np.array(
      [look_up_token(token)
       for token in review.split(" ")
       if token not in string.punctuation],
      dtype=np.intp,
  )
  # Allocate the whole matrix once rather than re-copying it with hstack for
  # every token, then set one entry per column.
  x_f = np.zeros((vocab_size, token_ids.size))
  x_f[token_ids, np.arange(token_ids.size)] = 1
  return x_f

In [59]:
x_f = vectorize(df['review'][0])

In [60]:
x_f.shape
#(vocabulary size, number of non-punctuation tokens in the review)

(8945, 102)

In [61]:
x_f = vectorize(df['review'][1])
x_f.shape

(8945, 17)

In [62]:
#Take only the first 5 positive and the first 5 negative reviews so training is fast.
#Positives are stacked above negatives and the original row labels are kept.
small_df_pos = df[df['rating'] == 'positive'].iloc[:5]
small_df_neg = df[df['rating'] == 'negative'].iloc[:5]
df_small = pd.concat([small_df_pos, small_df_neg], axis=0)

In [63]:
df_small

Unnamed: 0,rating,review
28000,positive,my experience was by far the most pleasant i h...
28001,positive,i have been to this place a couple of times on...
28002,positive,very popular sushi bar in the heart of old tow...
28003,positive,the staff is nice . it s pretty clean . they u...
28004,positive,my co worker picked up lunch for us from this ...
0,negative,terrible place to work for i just heard a stor...
1,negative,"hours , minutes total time for an extremely s..."
2,negative,my less than stellar review is for service . w...
3,negative,i m granting one star because there s no way t...
4,negative,the food here is mediocre at best . i went aft...


In [64]:
# Rebuild the global vocabulary from the small subset only. This resets vocab,
# discarding the full-dataset vocabulary built earlier; cutoff=0 keeps every
# word that occurs at least once.
vocab_from_dataframe(df_small, cutoff=0)

In [65]:
len(vocab['t_2_i']) #number of tokens = 491

491

In [66]:
# RNN dimensions and parameters. Every tensor is built from a NumPy float array
# and therefore is float64.
num_features = len(vocab['t_2_i'])  # input size = current vocabulary size
hidden_units = 10
# Initial hidden state: a zero column vector that is not trained.
h0 = torch.tensor(np.zeros((hidden_units, 1)))
# Trainable weights, drawn uniformly from [0, 1).
# NOTE(review): no random seed is set in the visible cells, so each fresh run
# starts from different weights and the recorded losses will not reproduce.
Wx = torch.tensor(np.random.uniform(0, 1, (hidden_units, num_features)), requires_grad=True)  # input -> hidden
Wh = torch.tensor(np.random.uniform(0, 1, (hidden_units, hidden_units)), requires_grad=True)  # hidden -> hidden
Wy = torch.tensor(np.random.uniform(0, 1, (1, hidden_units)), requires_grad=True)  # hidden -> output
#Wy has 1 because we just have 1 output

In [87]:
#Applied sigmoid since the output is only 1
def stepForward(xt, Wx, Wh, Wy, prev_memory):
  """Run one recurrent time step.

  Args:
    xt: 1-D NumPy array holding the input for this step; it is turned into
      a column vector before being multiplied by ``Wx``.
    Wx: input-to-hidden weight tensor.
    Wh: hidden-to-hidden weight tensor.
    Wy: hidden-to-output weight tensor.
    prev_memory: hidden state from the previous step.

  Returns:
    ``(ht, yt_hat)``: the new hidden state ``tanh(Wx x + Wh h_prev)`` and the
    sigmoid of ``Wy ht``.
  """
  x_column = torch.from_numpy(xt[:, None])
  pre_activation = torch.matmul(Wx, x_column) + torch.matmul(Wh, prev_memory)
  ht = torch.tanh(pre_activation)
  yt_hat = torch.sigmoid(torch.matmul(Wy, ht))
  return ht, yt_hat

In [88]:
def full_forward_RNN(X, Wx, Wh, Wy, prev_memory):
  """Feed a whole sequence through the RNN and return the last prediction.

  Args:
    X: 2-D array whose columns are the per-step input vectors.
    Wx, Wh, Wy: weight tensors forwarded unchanged to ``stepForward``.
    prev_memory: hidden state used for the first step.

  Returns:
    The output of the final time step only; the outputs of earlier steps are
    discarded. If ``X`` has no columns the initial value 0 is returned.
  """
  final_output = 0  # a single prediction per sequence, so no list is kept
  hidden = prev_memory
  for step in range(X.shape[1]):
    # Thread the hidden state through; keep overwriting the output.
    hidden, final_output = stepForward(X[:, step], Wx, Wh, Wy, hidden)
  return final_output

In [89]:
def compute_loss(y, y_hat):
  """Mean binary cross-entropy over a batch, measured in bits (log base 2).

  Args:
    y: sequence of labels; a label equal to 1 is the positive class and
      anything else is treated as the negative class.
    y_hat: sequence of predicted positive-class probabilities (tensors), one
      per label.

  Returns:
    The summed per-example losses divided by the number of examples.

  Raises:
    ValueError: if the batch is empty or ``y`` and ``y_hat`` differ in
      length (``zip`` would otherwise silently drop the extra items).
  """
  if len(y) == 0:
    raise ValueError("compute_loss needs at least one example")
  if len(y) != len(y_hat):
    raise ValueError(
        f"y has {len(y)} labels but y_hat has {len(y_hat)} predictions")

  loss = 0
  for yi, yi_hat in zip(y, y_hat):
    if torch.is_floating_point(yi_hat):
      # A saturated sigmoid yields exactly 0.0 or 1.0, and log2(0) is -inf,
      # which gives an infinite loss and NaN gradients. Clamping by machine
      # epsilon leaves every other probability (and its gradient) unchanged.
      tiny = torch.finfo(yi_hat.dtype).eps
      yi_hat = torch.clamp(yi_hat, min=tiny, max=1 - tiny)
    if yi == 1:
      loss = loss + (-torch.log2(yi_hat))
    else:
      loss = loss + (-torch.log2(1 - yi_hat))
  return loss / len(y)

In [90]:
def update_params(Wx, Wh, Wy, dWx, dWh, dWy, lr):
  """Apply one gradient-descent step to the three weight tensors in place.

  Each weight has ``lr`` times its gradient subtracted. The update runs
  under ``torch.no_grad()`` so autograd does not record it.

  Returns:
    The same ``(Wx, Wh, Wy)`` tensor objects, now updated.
  """
  with torch.no_grad():
    for weight, gradient in ((Wx, dWx), (Wh, dWh), (Wy, dWy)):
      weight.sub_(lr * gradient)

  return Wx, Wh, Wy

In [95]:
def train_RNN(train_df, Wx, Wh, Wy, prev_memory, lr, n_epochs):
  """Train the RNN with full-batch gradient descent.

  In every epoch each review is vectorised and passed through the network,
  the mean loss over all reviews is back-propagated once, and the weights
  are updated in place with ``update_params``.

  Args:
    train_df: table with a 'review' column (text) and a 'rating' column;
      the rating 'positive' is label 1 and any other value is label 0.
    Wx, Wh, Wy: trainable weight tensors (``requires_grad=True``); they are
      modified in place.
    prev_memory: initial hidden state used for every review.
    lr: learning rate.
    n_epochs: number of passes over ``train_df``.

  Returns:
    ``(Wx, Wh, Wy, losses)`` where ``losses`` holds one Python float per
    epoch. Plain floats are stored instead of the loss tensors so the
    history carries no autograd state and can be plotted or converted with
    NumPy directly.
  """
  # backward() accumulates into .grad, so gradients left over from work done
  # before this call would otherwise leak into the first update.
  for param in (Wx, Wh, Wy):
    if param.grad is not None:
      param.grad.zero_()

  losses = []
  for epoch in range(n_epochs):
    y, y_hat = [], []
    for rv, rt in zip(train_df['review'], train_df['rating']):
      X = vectorize(rv)
      y_hat.append(full_forward_RNN(X, Wx, Wh, Wy, prev_memory))
      y.append(1 if rt == 'positive' else 0)

    loss = compute_loss(y, y_hat)
    loss.backward()
    loss_value = loss.item()
    losses.append(loss_value)
    print("Loss after epoch=%d: %f" % (epoch, loss_value))
    sys.stdout.flush()

    Wx, Wh, Wy = update_params(Wx, Wh, Wy, Wx.grad, Wh.grad, Wy.grad, lr)
    # Reset for the next epoch; otherwise gradients would add up.
    for param in (Wx, Wh, Wy):
      param.grad.zero_()

  return Wx, Wh, Wy, losses

In [98]:
# Train on the small subset for 20 epochs (one full-batch update per epoch).
# The returned weights are the same tensors that were passed in, updated in
# place, so re-running this cell continues training instead of starting over.
Wx, Wh, Wy, losses = train_RNN(df_small, Wx, Wh, Wy, h0, lr=0.01, n_epochs=20)

Loss after epoch=0: 4.036739
Loss after epoch=1: 3.985504
Loss after epoch=2: 3.934326
Loss after epoch=3: 3.883210
Loss after epoch=4: 3.832161
Loss after epoch=5: 3.781183
Loss after epoch=6: 3.730281
Loss after epoch=7: 3.679462
Loss after epoch=8: 3.628730
Loss after epoch=9: 3.578092
Loss after epoch=10: 3.527555
Loss after epoch=11: 3.477125
Loss after epoch=12: 3.426811
Loss after epoch=13: 3.376620
Loss after epoch=14: 3.326561
Loss after epoch=15: 3.276643
Loss after epoch=16: 3.226875
Loss after epoch=17: 3.177269
Loss after epoch=18: 3.127834
Loss after epoch=19: 3.078582


In [143]:
rev = df_small['review'].iloc[0]
y = df_small['rating'].iloc[0]

In [144]:
X = vectorize(rev)

In [146]:
y_hat = full_forward_RNN(X, Wx, Wh, Wy, h0)

In [147]:
y_hat

tensor([[0.9848]], dtype=torch.float64, grad_fn=<SigmoidBackward0>)

In [148]:
y

'positive'