In [1]:
import numpy as np
import pandas as pd
import torch
import sys
import string
from collections import Counter

In [2]:
# Global vocabulary store shared by every helper below; initialiaseVocabulary()
# fills in the token->index and index->token maps.
vocab = dict()

In [11]:
def initialiaseVocabulary():
  """Reset the global ``vocab`` so that it holds only the unknown-word token.

  Replaces the token->index map ('t_2_i') and the index->token map ('i_2_t')
  with fresh empty dicts, registers '<UNK>' through addToken, and records the
  'addUnk' flag, the unknown token and its index in ``vocab``.
  Other keys already present in ``vocab`` are left untouched.
  """
  # Both maps must exist before addToken is called, because addToken reads them.
  vocab['t_2_i'], vocab['i_2_t'] = {}, {}
  unknownToken = '<UNK>'
  unknownIdx = addToken(unknownToken)
  vocab.update({
    'addUnk': True,
    'unkToken': unknownToken,
    'unkTokenIdx': unknownIdx,
  })

In [12]:
def addToken(token):
  """Return the index of ``token``, registering it first if it is new.

  A new token receives the next free index, i.e. the current size of the
  token->index map, and is written to both ``vocab['t_2_i']`` and
  ``vocab['i_2_t']``. A token that is already known is left unchanged.
  """
  tokenToIndex = vocab['t_2_i']
  if token not in tokenToIndex:
    freshIdx = len(tokenToIndex)
    tokenToIndex[token] = freshIdx
    vocab['i_2_t'][freshIdx] = token
  return tokenToIndex[token]

In [13]:
def addManyTokens(tokens):
  """Register every token of an iterable and return their indices as a list.

  The returned list has one entry per input token, in input order; repeated
  tokens yield the same index each time (whatever addToken returns).
  """
  indices = []
  for tok in tokens:
    indices.append(addToken(tok))
  return indices

In [14]:
def lookUpToken(token):
  """Return the vocabulary index of ``token``.

  When ``vocab['unkTokenIdx']`` is non-negative, a token missing from the
  vocabulary maps to that index. Otherwise a missing token raises KeyError.
  """
  fallbackIdx = vocab['unkTokenIdx']
  tokenToIndex = vocab['t_2_i']
  if fallbackIdx >= 0:
    return tokenToIndex.get(token, fallbackIdx)
  # No unknown-token fallback configured: let the dict raise KeyError.
  return tokenToIndex[token]

In [15]:
def lookUpIndex(idx):
  """Return the token stored under index ``idx`` in the global vocabulary.

  Args:
    idx: key to look up in ``vocab['i_2_t']``.

  Returns:
    The token registered for ``idx``.

  Raises:
    KeyError: if ``idx`` is not a key of ``vocab['i_2_t']``.
  """
  indexToToken = vocab['i_2_t']
  if idx not in indexToToken:
    # "%s" with a 1-tuple formats any idx. The previous "% is" was parsed as
    # the "% i" conversion, which garbled the message for integers and raised
    # TypeError (not KeyError) for non-numeric keys.
    raise KeyError("the index %s is not there" % (idx,))
  return indexToToken[idx]

In [16]:
def vocabularyFromDataFrame(df,cutoff=25):
  """Rebuild the global vocabulary from the ``review`` column of ``df``.

  The vocabulary is reset first, then every review is split on single spaces
  and the words are counted. Words occurring strictly more than ``cutoff``
  times are registered with addToken, in order of first appearance.
  """
  initialiaseVocabulary()
  # `word not in string.punctuation` is a substring test: it drops punctuation
  # marks and also the empty strings produced by consecutive spaces.
  wordCounts = Counter(
    word
    for review in df.review
    for word in review.split(" ")
    if word not in string.punctuation
  )
  for word, occurrences in wordCounts.items():
    if occurrences <= cutoff:
      continue
    addToken(word)


In [17]:
# Load the reviews table; later cells read its 'review' and 'rating' columns.
# NOTE(review): this is an absolute, machine-specific path, so the notebook
# cannot be re-run on another machine without editing it.
df=pd.read_csv('/home/laksh/Documents/reviews.csv')

In [18]:
vocabularyFromDataFrame(df,cutoff=25)

In [19]:
lookUpToken('this')

128

In [21]:
lookUpIndex(128)

'this'

In [22]:
lookUpIndex(12)

'them'

In [23]:
len(vocab['t_2_i'])

8945

In [24]:
len(vocab['i_2_t'])

8945

In [25]:
def vectorize(review):
    """One-hot encode a review, one column per token.

    The review is split on single spaces; punctuation marks and empty strings
    are skipped (substring test against ``string.punctuation``). Each remaining
    token is mapped to a row index with lookUpToken.

    Args:
        review: the review text.

    Returns:
        A float64 array of shape (len(vocab['t_2_i']), number_of_tokens) whose
        column t has a single 1 in the row of token t. A review without any
        usable token yields an array with zero columns instead of raising.
    """
    tokens = [token for token in review.split(" ")
              if token not in string.punctuation]
    # Allocate the whole matrix once: stacking one column per token with
    # np.hstack copied the growing matrix on every iteration.
    oneHot = np.zeros((len(vocab['t_2_i']), len(tokens)))
    for position, token in enumerate(tokens):
        oneHot[lookUpToken(token), position] = 1
    return oneHot

In [26]:
xF = vectorize(df['review'][0])

In [27]:
xF.shape

(8945, 102)

In [28]:
df

Unnamed: 0,rating,review
0,negative,terrible place to work for i just heard a stor...
1,negative,"hours , minutes total time for an extremely s..."
2,negative,my less than stellar review is for service . w...
3,negative,i m granting one star because there s no way t...
4,negative,the food here is mediocre at best . i went aft...
...,...,...
55995,positive,"great food . wonderful , friendly service . i ..."
55996,positive,charlotte should be the new standard for moder...
55997,positive,get the encore sandwich ! ! make sure to get i...
55998,positive,i m a pretty big ice cream gelato fan . pretty...


In [29]:
# Tiny working set: the first five 'positive' rows followed by the first five
# 'negative' rows of df (original row labels are kept).
smallDf_pos = df.loc[df['rating'] == 'positive'].iloc[:5]
smallDf_neg = df.loc[df['rating'] == 'negative'].iloc[:5]
df_small = pd.concat((smallDf_pos, smallDf_neg), axis=0)

In [30]:
df_small

Unnamed: 0,rating,review
28000,positive,my experience was by far the most pleasant i h...
28001,positive,i have been to this place a couple of times on...
28002,positive,very popular sushi bar in the heart of old tow...
28003,positive,the staff is nice . it s pretty clean . they u...
28004,positive,my co worker picked up lunch for us from this ...
0,negative,terrible place to work for i just heard a stor...
1,negative,"hours , minutes total time for an extremely s..."
2,negative,my less than stellar review is for service . w...
3,negative,i m granting one star because there s no way t...
4,negative,the food here is mediocre at best . i went aft...


In [47]:
# Rebuild the global vocabulary from the 10-row sample only. With cutoff=0
# every word that occurs at least once is kept. Because `vocab` is global, the
# token indices shown by the earlier cells (built from the full df) no longer
# apply after this cell has run.
vocabularyFromDataFrame(df_small,cutoff=0)

In [48]:
len(vocab['t_2_i'])

491

In [49]:
# Input dimension of the RNN: the vocabulary size, which is also the number of
# rows of the one-hot matrix returned by vectorize.
numFeatures = len(vocab['t_2_i'])

In [50]:
hiddenUnits = 10
# Initial hidden state: a float64 column of zeros, not tracked by autograd.
h0 = torch.zeros((hiddenUnits, 1), dtype=torch.float64)
# Trainable weights drawn from U[0, 1) with NumPy's global RNG. No seed is set
# in this cell, so the values differ between runs unless one is set elsewhere.
Wx = torch.tensor(np.random.uniform(low=0, high=1, size=(hiddenUnits, numFeatures)), requires_grad=True)   # input  -> hidden
Wh = torch.tensor(np.random.uniform(low=0, high=1, size=(hiddenUnits, hiddenUnits)), requires_grad=True)   # hidden -> hidden
Wy = torch.tensor(np.random.uniform(low=0, high=1, size=(1, hiddenUnits)), requires_grad=True)             # hidden -> output

In [51]:
def stepForward(xt,Wx,Wy,Wh,prevMem):
  """Advance the RNN by one time step.

  Args:
    xt: NumPy vector for the current token; it is turned into a column tensor.
    Wx, Wy, Wh: input->hidden, hidden->output and hidden->hidden weights
      (note the order: Wy comes before Wh in this signature).
    prevMem: hidden state from the previous step.

  Returns:
    (ht, yt_hat): the new hidden state tanh(Wx·xt + Wh·prevMem) and the
    output sigmoid(Wy·ht).
  """
  column = torch.from_numpy(xt)[:, None]
  preActivation = Wx @ column + Wh @ prevMem
  ht = torch.tanh(preActivation)
  return ht, torch.sigmoid(Wy @ ht)

In [57]:
def fullForwardRNN(X,Wx,Wh,Wy,prevMem):
  """Run the RNN over every column of ``X`` and return the last output.

  Each column of ``X`` is fed to stepForward in order, carrying the hidden
  state from one step to the next, starting from ``prevMem``. Only the output
  of the final step is returned; if ``X`` has no columns the result is 0.
  """
  memory, prediction = prevMem, 0
  # Iterating X.T walks the columns of X from left to right.
  for xt in X.T:
    # stepForward expects Wy before Wh, unlike this function's own signature.
    memory, prediction = stepForward(xt, Wx, Wy, Wh, memory)
  return prediction


In [58]:
def computeLoss(y,y_hat):
  """Mean binary cross-entropy, measured in bits (base-2 logarithm).

  Args:
    y: sequence of labels; a label equal to 1 is treated as positive, any
      other value as negative.
    y_hat: sequence of predicted probabilities of the positive class, paired
      with ``y`` element by element.

  Returns:
    The sum of -log2(probability assigned to the true class) divided by len(y).
  """
  total = 0
  for target, predicted in zip(y, y_hat):
    # Probability the model gave to the class that is actually correct.
    likelihood = predicted if target == 1 else 1 - predicted
    total += -torch.log2(likelihood)
  return total/len(y)

In [59]:
def updateParams(Wx,Wh,Wy,dWx,dWh,dWy,lr):
  """Apply one gradient-descent step to the three weight tensors in place.

  Each weight W is replaced by W - lr * dW. The same (modified) objects are
  returned as the tuple (Wx, Wh, Wy).
  """
  # no_grad keeps the in-place edits out of the autograd graph.
  with torch.no_grad():
    stepX, stepH, stepY = lr*dWx, lr*dWh, lr*dWy
    Wx -= stepX
    Wh -= stepH
    Wy -= stepY
  return Wx, Wh, Wy


In [60]:
def trainRNN(train_df,Wx,Wh,Wy,prevMem,lr,nepoch):
  """Train the RNN weights with full-batch gradient descent.

  In every epoch each review of ``train_df`` is vectorized and run through
  fullForwardRNN, the predictions are scored with computeLoss, the loss is
  back-propagated once, and updateParams applies a single step.

  Args:
    train_df: table with 'review' and 'rating' columns. A rating equal to
      'positive' is labelled 1, anything else 0.
    Wx, Wh, Wy: weight tensors. Their ``.grad`` is read after backward(), so
      they are expected to be leaf tensors created with requires_grad=True.
    prevMem: initial hidden state used for every review.
    lr: learning rate passed to updateParams.
    nepoch: number of passes over ``train_df``.

  Returns:
    (Wx, Wh, Wy, losses), where ``losses`` holds one loss tensor per epoch,
    detached from the autograd graph.
  """
  losses = []
  for epoch in range(nepoch):
    y,y_hat = [],[]
    for rv,rt in zip(train_df['review'],train_df['rating']):
      y.append(1 if rt == 'positive' else 0)
      y_hat.append(fullForwardRNN(vectorize(rv),Wx,Wh,Wy,prevMem))
    loss = computeLoss(y,y_hat)
    loss.backward()
    # Store a detached copy: keeping `loss` itself would hold on to the
    # epoch's autograd graph and return tensors that still require grad
    # (which cannot be converted to NumPy for plotting).
    losses.append(loss.detach())
    print("loss after epoch %d is %f" %(epoch,loss.item()), flush=True)
    Wx,Wh,Wy=updateParams(Wx,Wh,Wy,Wx.grad,Wh.grad,Wy.grad,lr)
    # Gradients accumulate across backward() calls, so clear them each epoch.
    for weight in (Wx,Wh,Wy):
      weight.grad.zero_()
  return Wx,Wh,Wy,losses

In [65]:
# Train on the 10-row sample for 50 epochs with learning rate 0.005, starting
# every review from the zero hidden state h0. updateParams modifies the weight
# tensors in place, so re-running this cell continues from the already-updated
# weights rather than from the initial random ones.
Wx,Wh,Wy,losses = trainRNN(df_small,Wx,Wh,Wy,h0,0.005,50)

loss after epoch 0 is 2.346488
loss after epoch 1 is 2.324533
loss after epoch 2 is 2.302702
loss after epoch 3 is 2.281000
loss after epoch 4 is 2.259429
loss after epoch 5 is 2.237992
loss after epoch 6 is 2.216694
loss after epoch 7 is 2.195536
loss after epoch 8 is 2.174522
loss after epoch 9 is 2.153656
loss after epoch 10 is 2.132940
loss after epoch 11 is 2.112378
loss after epoch 12 is 2.091974
loss after epoch 13 is 2.071730
loss after epoch 14 is 2.051649
loss after epoch 15 is 2.031736
loss after epoch 16 is 2.011993
loss after epoch 17 is 1.992423
loss after epoch 18 is 1.973030
loss after epoch 19 is 1.953817
loss after epoch 20 is 1.934786
loss after epoch 21 is 1.915942
loss after epoch 22 is 1.897287
loss after epoch 23 is 1.878824
loss after epoch 24 is 1.860557
loss after epoch 25 is 1.842487
loss after epoch 26 is 1.824618
loss after epoch 27 is 1.806952
loss after epoch 28 is 1.789493
loss after epoch 29 is 1.772243
loss after epoch 30 is 1.755204
loss after epoch 3

In [68]:
# Spot check on the first row of df_small, which is one of the training rows
# (so this is not a held-out evaluation).
r = df_small['review'].iloc[0]
y = df_small['rating'].iloc[0]

In [69]:
X = vectorize(r)

In [70]:
y_hat = fullForwardRNN(X,Wx,Wh,Wy,h0)

In [71]:
y_hat

tensor([[0.8438]], dtype=torch.float64, grad_fn=<SigmoidBackward>)

In [72]:
y

'positive'