In [26]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,precision_score

In [2]:
df = pd.read_csv('yelp_review.csv').iloc[:20000,1:]
df.head()

Unnamed: 0,text,stars
0,"If you decide to eat here, just be aware it is...",3.0
1,I've taken a lot of spin classes over the year...,5.0
2,Family diner. Had the buffet. Eclectic assortm...,3.0
3,"Wow! Yummy, different, delicious. Our favo...",5.0
4,Cute interior and owner (?) gave us tour of up...,4.0


In [43]:
df1 = pd.read_csv('yelp_review.csv').iloc[:20000,2:]
df1.head()
df['stars']=df1['stars']

In [3]:
from string import punctuation

def clean(text):
    lst_of_tokens = [token.lower() for token in text.split() if token not in punctuation] 
    return ' '.join(lst_of_tokens)
df['cleaned'] = df['text'].apply(clean)
df['cleaned']

0        if you decide to eat here, just be aware it is...
1        i've taken a lot of spin classes over the year...
2        family diner. had the buffet. eclectic assortm...
3        wow! yummy, different, delicious. our favorite...
4        cute interior and owner (?) gave us tour of up...
                               ...                        
19995    manager is a complete asshole. if you have a b...
19996    if you're looking for bingo around st louis, t...
19997    unfortunately we had a bad experience here...w...
19998    meh. this pizza was basically a deep-dish grea...
19999    this place is amazing. excellent sushi burrito...
Name: cleaned, Length: 20000, dtype: object

In [4]:
from collections import Counter

all_text = ' '.join(df['cleaned'])
# create a list of words
words = all_text.split()
# Count all the words using Counter Method
count_words = Counter(words)

total_words = len(words)
sorted_words = count_words.most_common(total_words)
sorted_words[:25]

[('the', 103760),
 ('and', 71159),
 ('a', 52976),
 ('i', 50885),
 ('to', 46224),
 ('was', 36817),
 ('of', 28959),
 ('is', 24808),
 ('for', 23454),
 ('in', 21873),
 ('it', 21221),
 ('my', 17874),
 ('we', 17211),
 ('with', 16441),
 ('but', 15711),
 ('this', 15467),
 ('that', 15317),
 ('they', 14789),
 ('on', 13662),
 ('you', 12934),
 ('have', 12302),
 ('had', 11911),
 ('were', 11522),
 ('not', 11508),
 ('are', 10143)]

In [11]:
#Now we need to create a vocab which contains int in sequence starting from index 1 , becasue we will have <pad> as 0

vocab_to_int = {w:i+1 for i, (w,c) in enumerate(sorted_words)}


In [6]:
df['encoded_lst'] = df['cleaned'].apply(lambda x: [vocab_to_int[token] for token in x.split()])

In [48]:
df['stars_enc'] = df['stars'].apply(lambda x:1.0 if x>4.0 else 0)

In [8]:
all_encoded_lst  =[]
for i in range(len(df)):
    all_encoded_lst.append(df['encoded_lst'][i])

In [9]:
all_encoded_lst[:1]

[[35,
  20,
  1471,
  5,
  151,
  523,
  38,
  29,
  2211,
  11,
  8,
  120,
  5,
  130,
  54,
  184,
  476,
  44,
  2927,
  5,
  3303,
  13,
  21,
  174,
  11,
  992,
  1223,
  75,
  4,
  131,
  5,
  40,
  581,
  4,
  21,
  61,
  5,
  50,
  76,
  1925,
  10,
  6490,
  2,
  99,
  22,
  3,
  219,
  345,
  1,
  31,
  8,
  220,
  15,
  11,
  785,
  3,
  30,
  196,
  59,
  5,
  107,
  347,
  1,
  2239,
  8,
  30,
  7723,
  15,
  299,
  2240,
  13,
  21,
  38,
  22,
  113,
  146,
  1472,
  185,
  13,
  835,
  136,
  113,
  196,
  3021,
  13,
  299,
  5403,
  9,
  163,
  1572,
  49,
  114,
  19,
  1,
  5404,
  10,
  102,
  5,
  29,
  396,
  16226]]

In [24]:
def pad_sequences(len_of_dataset,seq_length):

    #len_of_dataset = df.shape[0]
    #seq_length = 200

    features = np.zeros((len_of_dataset,seq_length),dtype = int)    #20000,200
    for i,text  in enumerate(all_encoded_lst):
        #print(text)
        text_len = len(text)

        if text_len <= seq_length:
            zeros_appended = list(np.zeros(seq_length-text_len))
            new = zeros_appended + text
        else:
            new = text[:seq_length]
        features[i,:]   = np.array(new)
    #print(features[i])

<b>SPLIT THE DATA</b>

In [58]:
x = features
y = df['stars_enc']
x_train,x_test,y_train,y_test = train_test_split(x,y,stratify=y,train_size=0.7)

In [63]:
train_data = TensorDataset(torch.from_numpy(x_train),torch.from_numpy(y_train.values))
test_data = TensorDataset(torch.from_numpy(x_test),torch.from_numpy(y_test.values))


In [66]:
# make sure to SHUFFLE your data
batch_size = 100
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size)

In [67]:
dataiter = iter(train_loader)
sample_x, sample_y = dataiter.next()
print('Sample input size: ', sample_x.size()) # batch_size, seq_length
print('Sample input: \n', sample_x)
print()
print('Sample label size: ', sample_y.size()) # batch_size
print('Sample label: \n', sample_y)

Sample input size:  torch.Size([100, 200])
Sample input: 
 tensor([[   16,   435,     8,  ...,     8,  2081, 80343],
        [    0,     0,     0,  ...,    28,     4, 34321],
        [    0,     0,     0,  ...,    53,   240,   947],
        ...,
        [    0,     0,     0,  ...,    33,   184,   872],
        [    0,     0,     0,  ...,     1,    31,  1039],
        [    0,     0,     0,  ...,   288,   130,   108]], dtype=torch.int32)

Sample label size:  torch.Size([100])
Sample label: 
 tensor([0., 1., 0., 0., 1., 0., 1., 1., 0., 0., 0., 1., 1., 1., 0., 0., 0., 0.,
        1., 0., 1., 1., 1., 0., 0., 0., 1., 0., 1., 0., 0., 1., 1., 1., 0., 0.,
        0., 1., 0., 0., 1., 1., 0., 0., 1., 0., 0., 0., 1., 1., 1., 1., 1., 0.,
        1., 1., 1., 0., 1., 0., 1., 1., 1., 0., 1., 1., 1., 1., 0., 1., 0., 1.,
        0., 0., 1., 0., 0., 1., 1., 1., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1.,
        0., 1., 1., 1., 0., 1., 1., 0., 1., 1.], dtype=torch.float64)


In [71]:
batch_size = 2
seq_length = 3
hidden_dim = 4

lstm_out = torch.randn(batch_size, seq_length, hidden_dim)
print(lstm_out.size())  # output: torch.Size([2, 3, 4])

print(lstm_out.contiguous().size())#contigius creates a copy and stores in diff mem loc , usefull for operations such as transposse, reshape

torch.Size([2, 3, 4])
torch.Size([2, 3, 4])


In [72]:
lstm_out.view(-1,hidden_dim)

tensor([[-0.7923, -1.3737, -0.4511,  0.8726],
        [-0.9581, -0.9765,  0.0406,  0.1735],
        [ 0.2002,  1.0429, -0.1244, -1.3146],
        [-1.8264, -0.5591,  1.2189, -0.7296],
        [-0.6415, -1.8003, -0.3025,  1.0516],
        [ 0.0329,  2.8075,  0.2792, -0.7642]])

In [None]:
lstm_out = lstm_out.contiguous().view(-1, hidden_dim)
print(lstm_out.size())  # output: torch.Size([6, 4])


In [96]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [141]:
class LSTM_yelp(nn.Module):
    def __init__(self,vocab_size,emb_dim,hidden_dim,n_outputs,n_layer):
        super().__init__()
        #self.vocab_size = vocab_size
        self.hidden_dim = hidden_dim
        self.n_outputs = n_outputs
        self.n_layer= n_layer
        self.embedding = nn.Embedding(vocab_size,emb_dim)
        self.lstm = nn.LSTM(emb_dim,self.hidden_dim,batch_first=True,num_layers=n_layer)
        self.fc = nn.Linear(hidden_dim, self.n_outputs)
        self.sig = nn.Sigmoid()
    
    def forward(self,x,hidden)    :
        embeds = self.embedding(x)
        lstm_output, hidden = self.lstm(embeds, hidden)

        # stack up lstm outputs, for output to be fed in NN, it should have (batch*seq_length, hidden)
        lstm_output = lstm_output.contiguous().view(-1, self.hidden_dim)
        out = self.fc(lstm_output)
        sig_out = self.sig(out)
        sig_out = sig_out.view(batch_size, -1)
        sig_out = sig_out[:, -1]


        
        return sig_out,hidden
    
    def init_hidden(self, batch_size):
        ''' Initializes hidden state '''
        # Create two new tensors with sizes n_layers x batch_size x hidden_dim,
        # initialized to zero, for hidden state and cell state of LSTM
        
        weight = next(self.parameters()).data
        
        if torch.cuda.is_available():
            hidden = (weight.new(self.n_layer, batch_size, self.hidden_dim).zero_().cuda(),
                  weight.new(self.n_layer, batch_size, self.hidden_dim).zero_().cuda())
        else:
            hidden = (weight.new(self.n_layer, batch_size, self.hidden_dim).zero_(),
                      weight.new(self.n_layer, batch_size, self.hidden_dim).zero_())
        
        return hidden    
        

In [140]:
vocab_size = len(vocab_to_int)+1 # +1 for the 0 padding
n_outputs = 1
embedding_dim = 400
hidden_dim = 256
n_layers = 2
batch_size = 100

model = LSTM_yelp(vocab_size, embedding_dim,hidden_dim,n_outputs, n_layers)
print(model)

LSTM_yelp(
  (embedding): Embedding(82700, 400)
  (lstm): LSTM(400, 256, num_layers=2, batch_first=True)
  (fc): Linear(in_features=256, out_features=1, bias=True)
  (sig): Sigmoid()
)


In [133]:
h = model.init_hidden(batch_size)
print(h[1].size())

torch.Size([2, 2, 256])


In [122]:
tuple([i.data for i in h ])

(tensor([[[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]],
 
         [[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]]], device='cuda:0'),
 tensor([[[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]],
 
         [[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]]], device='cuda:0'))

In [143]:

# loss and optimization functions
lr=0.001

criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)


# training params

epochs = 4 # 3-4 is approx where I noticed the validation loss stop decreasing

counter = 0
print_every = 100
clip=5 # gradient clipping

# move model to GPU, if available
if(torch.cuda.is_available()):
    model.cuda()

model.train()
# train for some number of epochs
for e in range(epochs):
    # initialize hidden state
    h = model.init_hidden(batch_size)

    # batch loop
    for inputs, labels in train_loader:
        counter += 1

        if(torch.cuda.is_available()):
            #print('True cuda is avualable')
            inputs, labels = inputs.cuda(), labels.cuda()

        # Creating new variables for the hidden state, otherwise
        # we'd backprop through the entire training history
        h = tuple([each.data for each in h])

        # zero accumulated gradients
        model.zero_grad()

        # get the output from the model
        inputs = inputs.type(torch.LongTensor).cuda()
        
        output, h = model(inputs, h)

        # calculate the loss and perform backprop
        loss = criterion(output.squeeze(), labels.float())
        loss.backward()
        
        optimizer.step()

        # loss stats
        if counter % print_every == 0:
            # Get validation loss
            val_h = model.init_hidden(batch_size)
            val_losses = []
            model.eval()
            for inputs, labels in test_loader:

                # Creating new variables for the hidden state, otherwise
                # we'd backprop through the entire training history
                val_h = tuple([each.data for each in val_h])

                if(torch.cuda.is_available()):
                    inputs, labels = inputs.cuda(), labels.cuda()

                inputs = inputs.type(torch.LongTensor).cuda()
                output, val_h = model(inputs, val_h)
                val_loss = criterion(output.squeeze(), labels.float())

                val_losses.append(val_loss.item())

            model.train()
            print("Epoch: {}/{}...".format(e+1, epochs),
                  "Step: {}...".format(counter),
                  "Loss: {:.6f}...".format(loss.item()),
                  "Val Loss: {:.6f}".format(np.mean(val_losses)))

Epoch: 1/4... Step: 100... Loss: 0.577491... Val Loss: 0.575144
Epoch: 2/4... Step: 200... Loss: 0.422333... Val Loss: 0.576997
Epoch: 3/4... Step: 300... Loss: 0.179247... Val Loss: 0.621252
Epoch: 3/4... Step: 400... Loss: 0.269241... Val Loss: 0.685635
Epoch: 4/4... Step: 500... Loss: 0.198815... Val Loss: 0.771847


In [98]:
torch.cuda.is_available()

True

In [154]:
from string import punctuation

def tokenize_review(test_review):
    test_review = test_review.lower() # lowercase
    # get rid of punctuation
    test_text = ''.join([c for c in test_review if c not in punctuation])

    # splitting by spaces
    test_words = test_text.split()

    # tokens
    test_ints = []
    test_ints.append([vocab_to_int[word] for word in test_words])

    return test_ints

# test code and generate tokenized review

test_review = 'This movie had the best acting and the was so good. I loved it.'

test_ints = tokenize_review(test_review)
print(test_ints,len(test_ints[0]))


# test sequence padding
seq_length=200

features = pad_sequences(len(test_ints[0]), seq_length)

print(features)




[[16, 1800, 22, 1, 78, 7902, 2, 1, 6, 26, 36, 4, 242, 11]] 14


IndexError: index 14 is out of bounds for axis 0 with size 14

In [145]:
def predict(model, test_review, sequence_length=200):
    
    model.eval()
    
    # tokenize review
    test_ints = tokenize_review(test_review)
    
    # pad tokenized sequence
    seq_length=sequence_length
    features = pad_features(test_ints, seq_length)
    
    # convert to tensor to pass into your model
    feature_tensor = torch.from_numpy(features)
    
    batch_size = feature_tensor.size(0)
    
    # initialize hidden state
    h = model.init_hidden(batch_size)
    
    if(torch.cuda.is_available()):
        feature_tensor = feature_tensor.cuda()
    
    # get the output from the model
    output, h = model(feature_tensor, h)
    
    # convert output probabilities to predicted class (0 or 1)
    pred = torch.round(output.squeeze()) 
    # printing output value, before rounding
    print('Prediction value, pre-rounding: {:.6f}'.format(output.item()))
    
    # print custom response
    if(pred.item()==1):
        print("Positive review detected!")
    else:
        print("Negative review detected.")