In [1]:
import pandas as pd
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('/content/moviereviews.tsv',sep= '\t')

In [3]:
df.head()

Unnamed: 0,label,review
0,neg,how do films like mouse hunt get into theatres...
1,neg,some talented actresses are blessed with a dem...
2,pos,this has been an extraordinary year for austra...
3,pos,according to hollywood movies made in last few...
4,neg,my first press screening of 1998 and already i...


In [4]:
df.drop(['label'],axis=1,inplace=True)

In [5]:
df.head()

Unnamed: 0,review
0,how do films like mouse hunt get into theatres...
1,some talented actresses are blessed with a dem...
2,this has been an extraordinary year for austra...
3,according to hollywood movies made in last few...
4,my first press screening of 1998 and already i...


In [6]:
df.isnull().sum()

review    35
dtype: int64

In [7]:
df.dropna(inplace=True)

In [8]:
import nltk
nltk.download("vader_lexicon")
from typing import Sequence
from nltk.tokenize import word_tokenize
nltk.download("punkt")

from nltk.corpus import stopwords
nltk.download("stopwords")

from nltk.stem import PorterStemmer, WordNetLemmatizer
nltk.download("wordnet")
nltk.download('omw-1.4')

from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dropout,Dense, SimpleRNN, GRU, Embedding

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [9]:
def cleantext(text):
  token = [t for t in word_tokenize(text.lower())]
  ftoken = [t for t in token if(t.isalpha())]
  lemma = WordNetLemmatizer()
  ftoken = [lemma.lemmatize(t) for t in ftoken]
  return " ".join(ftoken)

In [10]:
df['review'] = df['review'].apply(cleantext)
df['review']

0       how do film like mouse hunt get into theatre i...
1       some talented actress are blessed with a demon...
2       this ha been an extraordinary year for austral...
3       according to hollywood movie made in last few ...
4       my first press screening of and already i gott...
                              ...                        
1995    i like movie with albert brook and i really li...
1996    it might surprise some to know that joel and e...
1997    the verdict drama from horror maestro stephen ...
1998    i want to correct what i wrote in a former ret...
1999    a couple of month ago when i first downloaded ...
Name: review, Length: 1965, dtype: object

In [11]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [12]:
sid = SentimentIntensityAnalyzer()

In [13]:
def getscore(sent):
    sid = SentimentIntensityAnalyzer()
    d = sid.polarity_scores(sent)
    v = d["compound"]
    
    if(v>0):
        return 1
    else:
        return 0

In [14]:
df['Label'] = df["review"].apply(getscore)

In [15]:
df.head()

Unnamed: 0,review,Label
0,how do film like mouse hunt get into theatre i...,0
1,some talented actress are blessed with a demon...,1
2,this ha been an extraordinary year for austral...,1
3,according to hollywood movie made in last few ...,1
4,my first press screening of and already i gott...,1


In [16]:
df["Label"].value_counts()

1    1472
0     493
Name: Label, dtype: int64

In [17]:
x = df["review"]
y = df["Label"]

In [18]:
from sklearn.model_selection import train_test_split
xtrain,xtest, ytrain,ytest = train_test_split(x,y,test_size=0.2,random_state=1)

In [19]:
sent_lens = []
for sent in df["review"]:
  sent_lens.append(len(word_tokenize(sent)))
df["sent_lens"] = sent_lens
df.head()

Unnamed: 0,review,Label,sent_lens
0,how do film like mouse hunt get into theatre i...,0,370
1,some talented actress are blessed with a demon...,1,573
2,this ha been an extraordinary year for austral...,1,612
3,according to hollywood movie made in last few ...,1,832
4,my first press screening of and already i gott...,1,715


In [20]:
np.quantile(sent_lens,0.95)

1145.6

In [21]:
max_len = int(np.quantile(sent_lens,0.95))

In [22]:
tok = Tokenizer(char_level=False, split=" ")
tok.fit_on_texts(xtrain)

In [23]:
tok.index_word

{1: 'the',
 2: 'a',
 3: 'and',
 4: 'of',
 5: 'to',
 6: 'is',
 7: 'in',
 8: 'it',
 9: 'that',
 10: 'film',
 11: 'with',
 12: 'for',
 13: 'his',
 14: 'this',
 15: 'he',
 16: 'i',
 17: 'but',
 18: 'are',
 19: 'on',
 20: 'movie',
 21: 'by',
 22: 'be',
 23: 'an',
 24: 'who',
 25: 'one',
 26: 'not',
 27: 'wa',
 28: 'you',
 29: 'have',
 30: 'from',
 31: 'at',
 32: 'ha',
 33: 'they',
 34: 'her',
 35: 'all',
 36: 'character',
 37: 'there',
 38: 'like',
 39: 'so',
 40: 'about',
 41: 'out',
 42: 'more',
 43: 'when',
 44: 'which',
 45: 'what',
 46: 'she',
 47: 'their',
 48: 'up',
 49: 'do',
 50: 'or',
 51: 'some',
 52: 'doe',
 53: 'time',
 54: 'just',
 55: 'get',
 56: 'we',
 57: 'if',
 58: 'him',
 59: 'make',
 60: 'scene',
 61: 'into',
 62: 'only',
 63: 'even',
 64: 'than',
 65: 'can',
 66: 'story',
 67: 'no',
 68: 'good',
 69: 'most',
 70: 'would',
 71: 'will',
 72: 'been',
 73: 'much',
 74: 'also',
 75: 'other',
 76: 'way',
 77: 'see',
 78: 'very',
 79: 'life',
 80: 'them',
 81: 'two',
 82: 'aft

In [24]:
vocab_len = len(tok.index_word)

In [25]:
vocab_len

30325

In [26]:
sequnces_train = tok.texts_to_sequences(xtrain)

sequnces_train

Output hidden; open in https://colab.research.google.com to view.

In [27]:
sequences_matrix_train = sequence.pad_sequences(sequnces_train,maxlen=max_len)

In [28]:
gru = Sequential()

gru.add(Embedding(vocab_len+1,700,input_length=max_len,mask_zero=True))
gru.add(GRU(32,activation='tanh'))
gru.add(Dense(32,activation='relu'))
gru.add(Dropout(0.2))

gru.add(Dense(1,activation="sigmoid"))

In [29]:
gru.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 1145, 700)         21228200  
                                                                 
 gru (GRU)                   (None, 32)                70464     
                                                                 
 dense (Dense)               (None, 32)                1056      
                                                                 
 dropout (Dropout)           (None, 32)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 33        
                                                                 
Total params: 21,299,753
Trainable params: 21,299,753
Non-trainable params: 0
_________________________________________________________________


In [30]:
gru.compile(loss="binary_crossentropy",optimizer='adam')
gru.fit(sequences_matrix_train,ytrain,batch_size=20,epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f6ef188e090>

In [31]:
sequences_matrix_test = sequence.pad_sequences(tok.texts_to_sequences(xtest),maxlen=max_len)

In [32]:
ypred = gru.predict(sequences_matrix_test)

In [33]:
ypred=ypred>0.5

In [34]:
from sklearn.metrics import classification_report
print(classification_report(ytest,ypred))

              precision    recall  f1-score   support

           0       0.33      0.18      0.23       107
           1       0.74      0.86      0.80       286

    accuracy                           0.68       393
   macro avg       0.53      0.52      0.51       393
weighted avg       0.63      0.68      0.64       393

