In [37]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
import tensorflow as tf
from tensorflow import estimator
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout,Bidirectional
from keras.optimizers import Adam

In [2]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Debnarayan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Debnarayan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
df=pd.read_csv("sentiment.csv")

In [4]:
df.shape

(50000, 2)

In [5]:
df.head(2)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive


In [6]:
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [7]:
tag_re = re.compile(r'<[^>]+>')
def remove_tags(text):
    return tag_re.sub('', text)

In [9]:
def preprocessing(txt):
    txt=txt.lower()
    txt=re.sub(r'<[^>]+>','', txt)
    txt = re.sub('[^a-zA-Z]', ' ', txt)
    txt= re.sub(r"\s+[a-zA-Z]\s+", ' ', txt)
    txt = re.sub(r'\s+', ' ', txt)
    pattern = re.compile(r'\b(' + r'|'.join(stopwords.words('english')) + r')\b\s*')
    txt = pattern.sub('', txt)

    return txt

In [10]:
df["review"]=df["review"].apply(preprocessing)

In [11]:
df['review'][2]

'thought wonderful way spend time hot summer weekend sitting air conditioned theater watching light hearted comedy plot simplistic dialogue witty characters likable even well bread suspected serial killer may disappointed realize match point risk addiction thought proof woody allen still fully control style many us grown love laughed one woody comedies years dare say decade never impressed scarlet johanson managed tone sexy image jumped right average spirited young woman may crown jewel career wittier devil wears prada interesting superman great comedy go see friends '

In [12]:
df["sentiment"] = np.array(list(map(lambda x: 1 if x=="positive" else 0,df["sentiment"])))

In [13]:
df

Unnamed: 0,review,sentiment
0,one reviewers mentioned watching oz episode ho...,1
1,wonderful little production filming technique ...,1
2,thought wonderful way spend time hot summer we...,1
3,basically family little boy jake thinks zombie...,0
4,petter mattei love time money visually stunnin...,1
...,...,...
49995,thought movie right good job creative original...,1
49996,bad plot bad dialogue bad acting idiotic direc...,0
49997,catholic taught parochial elementary schools n...,0
49998,going disagree previous comment side maltin on...,0


In [14]:
X_train, X_test, y_train, y_test = train_test_split(df['review'], df['sentiment'], test_size=0.20, random_state=42)

In [15]:

X_test.shape

(10000,)

In [16]:
word_tokenizer = Tokenizer()
word_tokenizer.fit_on_texts(X_train)

X_train = word_tokenizer.texts_to_sequences(X_train)
X_test = word_tokenizer.texts_to_sequences(X_test)

In [17]:
vocab_length = len(word_tokenizer.word_index) + 1

vocab_length

92394

In [18]:
n=100
X_train=pad_sequences(X_train,padding="post",maxlen=n)
X_test=pad_sequences(X_test,padding="post",maxlen=n)

In [19]:
X_train[2]


array([ 1173,    35,     9,  2652,  1114,    35,  4370,  2412,  1532,
        3749,   214,  4022,    16,  5506,  1185,     9,  2572, 16956,
         360,    13,  2038,  1455,    12,     5,   469,   210,  6554,
        1186,  3345,     9,  4023,    11,  1174,  2125,   176,  1349,
       32472,  1353, 12034,   273,   197,   569,     9,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0])

In [44]:
from numpy import asarray
from numpy import zeros

embeddings_dictionary = dict()
glove_file = open("glove.6B.100d.txt", "r", encoding="utf-8") 

for line in glove_file:
    records = line.split()
    word = records[0]
    vector_dimensions = asarray(records[1:], dtype='float32')
    embeddings_dictionary [word] = vector_dimensions
glove_file.close()

In [45]:
embedding_matrix = zeros((vocab_length, 100))
for word, index in word_tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector


In [49]:
model=Sequential()
model.add(Embedding(input_dim=vocab_length,output_dim=100,weights=[embedding_matrix],input_length=n,trainable=False))
model.add(LSTM(128))
model.add(Dense(1, activation='sigmoid'))


In [50]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [51]:
model.summary()

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 100, 100)          9239400   
_________________________________________________________________
lstm_7 (LSTM)                (None, 128)               117248    
_________________________________________________________________
dense_9 (Dense)              (None, 1)                 129       
Total params: 9,356,777
Trainable params: 117,377
Non-trainable params: 9,239,400
_________________________________________________________________


In [52]:
history = model.fit(X_train, y_train, validation_data=(X_test, y_test), batch_size=128, epochs=6)

Train on 40000 samples, validate on 10000 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


In [53]:
score=model.evaluate(X_test, y_test, verbose=1)



In [54]:
score

[0.3228968353271484, 0.8661999702453613]

In [58]:
model1=Sequential()
model1.add(Embedding(input_dim=vocab_length,output_dim=100,weights=[embedding_matrix],input_length=n,trainable=False))
model1.add(Bidirectional(LSTM(128, return_sequences=False, dropout=0.2, recurrent_dropout=0.2))),
model1.add(Dense(1, activation='sigmoid'))
model1.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


In [64]:
model1.summary()

Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, 100, 100)          9239400   
_________________________________________________________________
bidirectional_3 (Bidirection (None, 256)               234496    
_________________________________________________________________
dense_11 (Dense)             (None, 1)                 257       
Total params: 9,474,153
Trainable params: 234,753
Non-trainable params: 9,239,400
_________________________________________________________________


In [60]:
history = model1.fit(X_train, y_train, validation_data=(X_test, y_test), batch_size=128, epochs=6)

Train on 40000 samples, validate on 10000 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


In [62]:
score1=model1.evaluate(X_test, y_test, verbose=1)



In [63]:
score1

[0.33246750326156616, 0.8607000112533569]