In [5]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from nltk.corpus import stopwords 
from collections import Counter
import string
import re
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split

In [6]:
data = pd.read_csv("/kaggle/input/imdb-dataset-sentiment-analysis/IMDB_dataset.csv")
print(data.head())

from string import punctuation
def remove_punc(text):
    text = text.lower()
    return ("".join(i for i in text if i not in punctuation))

data["review"] = data["review"].apply(remove_punc)
print(data.head())
X = data["review"].values
y = data["sentiment"].values
X_train,X_test,y_train,y_test = train_test_split(X,y,stratify=y)
print(X_train.shape)
print(X_test.shape)



                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive
                                              review sentiment
0  one of the other reviewers has mentioned that ...  positive
1  a wonderful little production br br the filmin...  positive
2  i thought this was a wonderful way to spend ti...  positive
3  basically theres a family where a little boy j...  negative
4  petter matteis love in the time of money is a ...  positive
(37500,)
(12500,)


In [7]:
def process(string):
    string = re.sub(r"[^\w\s]", '', string)
    string = re.sub(r"\d", '', string)
    string = re.sub(r"\s+", '', string)
    return string

def tokenize(X_train,y_train,X_test,y_test):
    words = []
    stop_words = set(stopwords.words('english')) 
    for x in X_train:
        for word in x.split():
            word = process(word)
            if word not in stop_words and word != '':
                words.append(word)
                
    counts = Counter(words)
    vocab = sorted(counts, key=counts.get, reverse=True)[:1000]
    vocab_to_int = {word: ii for ii, word in enumerate(vocab,1)}
    new_X_train = []
    new_X_test = []
    for s in X_train:
            new_X_train.append([vocab_to_int[process(word)] for word in s.split() 
                                     if process(word) in vocab_to_int.keys()])
    for s in X_test:
            new_X_test.append([vocab_to_int[process(word)] for word in s.split() 
                                    if process(word) in vocab_to_int.keys()])
            
    new_y_train = [1 if label =='positive' else 0 for label in y_train]  
    new_y_test = [1 if label =='positive' else 0 for label in y_test]
    return new_X_train, new_y_train,new_X_test, new_y_test, vocab_to_int


In [8]:
X_train,y_train,X_test,y_test,vocab_to_int = tokenize(X_train,y_train,X_test,y_test)

In [58]:
'''
non_zero_idx_train = [ii for ii, review in enumerate(X_train) if len(review) != 0]
non_zero_idx_test = [ii for ii, review in enumerate(X_test) if len(review) != 0]
X_train = [X_train[ii] for ii in non_zero_idx_train]
X_test = [X_test[ii] for ii in non_zero_idx_test]
y_train = np.array([y_train[ii] for ii in non_zero_idx_train])
y_test = np.array([y_test[ii] for ii in non_zero_idx_test])
print(len(X_train), len(X_test))
'''

37499 12500


In [10]:
def padding(sentence, seqLength):
    #determine shape
    features = np.zeros((len(sentence), seqLength), dtype=int)
    for i, row in enumerate(sentence):
        if len(row) != 0:
            features[i, -len(row):] = np.array(row)[:seqLength]
    return features

In [11]:
X_train_pad = padding(X_train,500)
X_test_pad = padding(X_test,500)


In [12]:
#Tensor datasets
train_data = TensorDataset(torch.from_numpy(X_train_pad), torch.from_numpy(np.array(y_train)))
valid_data = TensorDataset(torch.from_numpy(X_test_pad), torch.from_numpy(np.array(y_test)))

batch_size = 50

#Shuffle for generalization
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
valid_loader = DataLoader(valid_data, shuffle=True, batch_size=batch_size)

#Obtain a batch of the training data
dataiter = iter(train_loader)
sampleX, sampley = next(dataiter)

print('Sample input size: ', sampleX.size())
print('Sample input: \n', sampleX)
print('Sample input: \n', sampley)

Sample input size:  torch.Size([50, 500])
Sample input: 
 tensor([[  0,   0,   0,  ..., 551,   5, 137],
        [  0,   0,   0,  ..., 380, 178, 215],
        [  0,   0,   0,  ..., 100, 448, 165],
        ...,
        [  0,   0,   0,  ..., 709, 118,  39],
        [  0,   0,   0,  ...,  76, 295, 531],
        [  0,   0,   0,  ..., 254, 650,  45]])
Sample input: 
 tensor([0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0,
        1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1,
        0, 0])


In [13]:
train_on_gpu=torch.cuda.is_available()

if(train_on_gpu):
    print('Training on GPU.')
else:
    print('No GPU available, training on CPU.')

Training on GPU.


In [14]:

class SentimentRNN(nn.Module):
    def __init__(self,no_layers,vocab_size,hidden_dim,embedding_dim,drop_prob=0.5):
        super(SentimentRNN,self).__init__()
        self.output_dim = output_dim
        self.hidden_dim = hidden_dim
        self.no_layers = no_layers
        self.vocab_size = vocab_size
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(input_size=embedding_dim,hidden_size=self.hidden_dim, num_layers=no_layers, batch_first=True)
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(self.hidden_dim, output_dim)
        self.sig = nn.Sigmoid()
       
    def forward(self,x,hidden):
        batch_size = x.size(0)
        embeds = self.embedding(x)
        lstm_out, hidden = self.lstm(embeds, hidden)
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim) 
        out = self.dropout(lstm_out)
        out = self.fc(out)
        sig_out = self.sig(out)
        sig_out = sig_out.view(batch_size, -1)
        sig_out = sig_out[:, -1]
        return sig_out, hidden
    
    def init_hidden(self, batch_size):
        #Hidden state
        weight = next(self.parameters()).data
        if (train_on_gpu):
            hidden = (weight.new(self.no_layers, batch_size, self.hidden_dim).zero_().cuda(),
                  weight.new(self.no_layers, batch_size, self.hidden_dim).zero_().cuda())
        else:
            hidden = (weight.new(self.no_layers, batch_size, self.hidden_dim).zero_(),
                      weight.new(self.no_layers, batch_size, self.hidden_dim).zero_())
        
        return hidden

In [15]:
no_layers = 2
vocab_size = len(vocab_to_int) + 1
embedding_dim = 64
output_dim = 1
hidden_dim = 256

model = SentimentRNN(no_layers,vocab_size,hidden_dim,embedding_dim,drop_prob=0.5)
print(model)


SentimentRNN(
  (embedding): Embedding(1001, 64)
  (lstm): LSTM(64, 256, num_layers=2, batch_first=True)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=256, out_features=1, bias=True)
  (sig): Sigmoid()
)


In [16]:
#Training
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [18]:
clip = 5
epochs = 4 

def acc(pred,label):
    pred = torch.round(pred.squeeze())
    return torch.sum(pred == label.squeeze()).item()

if(train_on_gpu):
    model.cuda()
    
for epoch in range(epochs):
    train_losses = []
    train_acc = 0.0
    #train mode
    model.train() 
    h = model.init_hidden(batch_size)
    for inputs, labels in train_loader:
        if(train_on_gpu):
            inputs, labels = inputs.cuda(), labels.cuda()  
        h = tuple([each.data for each in h])
        model.zero_grad()
        output,h = model(inputs,h)
        loss = criterion(output.squeeze(), labels.float())
        loss.backward()
        train_losses.append(loss.item())
        accuracy = acc(output,labels)
        train_acc += accuracy
        nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

test_losses = []
num_correct = 0

h = model.init_hidden(batch_size)
#Testing mode
model.eval()
for inputs, labels in valid_loader:
    h = tuple([each.data for each in h])
    if(train_on_gpu):
        inputs, labels = inputs.cuda(), labels.cuda()
    output, h = model(inputs, h)
    test_loss = criterion(output.squeeze(), labels.float())
    test_losses.append(test_loss.item())
    pred = torch.round(output.squeeze()) 
    # compare predictions to true label
    correct_tensor = pred.eq(labels.float().view_as(pred))
    correct = np.squeeze(correct_tensor.numpy()) if not train_on_gpu else np.squeeze(correct_tensor.cpu().numpy())
    num_correct += np.sum(correct)

print("Test loss: {:.3f}".format(np.mean(test_losses)))
test_acc = num_correct/len(valid_loader.dataset)
print("Test accuracy: {:.3f}".format(test_acc))



Test loss: 0.464
Test accuracy: 0.847


In [19]:
h = model.init_hidden(50)
h = tuple(each.data for each in h)
print(h)

(tensor([[[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        [[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]]], device='cuda:0'), tensor([[[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        [[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0.,

In [20]:
import requests
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
import pandas as pd
import nltk
nltk.download("stopwords")

class Scraper:
    def __init__(self,url):
        self.url = url
        
    def getText(self):
        response = requests.get(self.url)
        html_data = BeautifulSoup(response.content, "html.parser")
        text = html_data.get_text()
        words = text.split()
        words = [word.lower() for word in words if word.isalpha()]
        return ' '.join(words)
    
class ETL:
    def __init__(self,url):
        self.url = url
    
    def run(self):
        scrape = Scraper(self.url)
        text = scrape.getText()
        return text
    
if __name__ == "__main__":
    url = "https://www.imdb.com/title/tt9486184/reviews"
    pipeline = ETL(url)
    test_review = pipeline.run()
    print(test_review)


[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
positive positive user reviews imdb menumoviesrelease calendartop moviesmost popular moviesbrowse movies by genretop box officeshowtimes ticketsmovie newsindia movie spotlighttv on tv streamingtop tv showsmost popular tv showsbrowse tv shows by genretv newswatchwhat to watchlatest trailersimdb originalsimdb picksimdb spotlightimdb podcastsawards eventsoscarsemmystifffestival seasonstarmeter awardsawards centralall eventscelebsborn todaymost popular celebscelebrity newscommunityhelp centercontributor zonepollsfor industry professionalslanguageenglish supportedenglish supportedfrançais inenfully supportedenglish supportedfrançais app positive user reviews review this title reviews hide spoilers sort featured review date total votes prolific reviewer review rating filter by show all star stars stars stars stars stars stars stars stars stars positive effort may i can 

In [21]:

def tokenize_review(test_review):
    test_review = test_review.lower()
    test_text = ''.join([i for i in test_review if i not in punctuation])
    test_words = test_text.split()
    test_ints = []
    test_ints.append([vocab_to_int.get(word, 0) for word in test_words])
    return test_ints

def predict(net, test_review, sequence_length=500):
    model.eval()
    test_ints = tokenize_review(test_review)
    seq_length=sequence_length
    features = padding(test_ints, seq_length)
    feature_tensor = torch.from_numpy(features)
    batch_size = feature_tensor.size(0)
    h = net.init_hidden(batch_size)
    if(train_on_gpu):
        feature_tensor = feature_tensor.cuda()
    output, h = model(feature_tensor, h)
    print('Prediction value: {:.6f}'.format(output.item()))
    if(output.item() > 0.5):
        print("Positive review detected! With probability of:",output.item())
    else:
        print("Negative review detected! With probability of:", (1 - output.item()))
        
#test_review = 'The worst movie I have seen; acting was terrible and I want my money back. This movie had bad acting and the dialogue was slow.'
predict(model, test_review, 500)        

Prediction value: 0.235779
Negative review detected! With probability of: 0.7642206698656082


In [23]:
#Deploying PyTorch using Tracing Method
'''
import torch
class SentimentRNN(nn.Module):
    def __init__(self,no_layers,vocab_size,hidden_dim,embedding_dim,drop_prob=0.5):
        super(SentimentRNN,self).__init__()
        self.output_dim = output_dim
        self.hidden_dim = hidden_dim
        self.no_layers = no_layers
        self.vocab_size = vocab_size
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(input_size=embedding_dim,hidden_size=self.hidden_dim, num_layers=no_layers, batch_first=True)
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(self.hidden_dim, output_dim)
        self.sig = nn.Sigmoid()
    
    def forward(self,x,hidden):
        batch_size = x.size(0)
        embeds = self.embedding(x)
        lstm_out, hidden = self.lstm(embeds, hidden)
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim) 
        out = self.dropout(lstm_out)
        out = self.fc(out)
        sig_out = self.sig(out)
        sig_out = sig_out.view(batch_size, -1)
        sig_out = sig_out[:, -1]
        return sig_out, hidden
    
    def init_hidden(self, batch_size):
        #Hidden state
        weight = next(self.parameters()).data
        if (train_on_gpu):
            hidden = (weight.new(self.no_layers, batch_size, self.hidden_dim).zero_().cuda(),
                  weight.new(self.no_layers, batch_size, self.hidden_dim).zero_().cuda())
        else:
            hidden = (weight.new(self.no_layers, batch_size, self.hidden_dim).zero_(),
                      weight.new(self.no_layers, batch_size, self.hidden_dim).zero_())
        
        return hidden
'''
example = torch.zeros(50,500, dtype = torch.int)
if train_on_gpu:
    example = example.cuda()
print(example)

traced_script_module = torch.jit.trace(model, (example,h))
traced_script_module.save("sentiment_rnn.pt")
new_model = torch.jit.load("sentiment_rnn.pt")


tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]], device='cuda:0', dtype=torch.int32)
