In this project, the objective is to classify movie reviews into 'Positive' [1] and 'Negative' [0] by using different classification models.  
It is binary classification task, since the output is either '1' or '0'.

Reference: Hands-On Machine Learning with Scikit-Learn, Keras, and TensorFlow by Aurelien Geron

Import libraries

In [2]:
import tensorflow as tf
from tensorflow import keras
import pandas as pd


In [3]:
# Read dataset

imdb_data = pd.read_csv('../Sentiment_Analysis/IMDB_Dataset.csv')
imdb_data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
# Change positive & negative sentiment to [0,1]
imdb_data = imdb_data[imdb_data.sentiment != 'unsup']
imdb_data['sentiment'] = imdb_data['sentiment'].map({'positive':1, 'negative':0})
imdb_data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [5]:
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\808485\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\808485\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

Data Preprocessing

In [6]:
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Define a function to remove stopwords & lemmatize text(to get meaningful word)
def clean_text(text):
    text = re.sub(r'[^\w\s]','',text, re.UNICODE)
    text = text.lower()
    text = [lemmatizer.lemmatize(token) for token in text.split(" ")]
    text = [lemmatizer.lemmatize(token, "v") for token in text]
    text = [word for word in text if not word in stop_words]
    text = " ".join(text)
    return text

imdb_data['Processed_Reviews'] = imdb_data.review.apply(lambda x: clean_text(x))

In [7]:
imdb_data.head()

Unnamed: 0,review,sentiment,Processed_Reviews
0,One of the other reviewers has mentioned that ...,1,one reviewer ha mention watch 1 oz episode you...
1,A wonderful little production. <br /><br />The...,1,wonderful little production br br film techniq...
2,I thought this was a wonderful way to spend ti...,1,think wa wonderful way spend time hot summer w...
3,Basically there's a family where a little boy ...,0,basically family little boy jake think zombie ...
4,"Petter Mattei's ""Love in the Time of Money"" is...",1,petter matteis love time money visually stun f...


####  Model Building ####

In [8]:
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.layers import Dense , Input , LSTM , Embedding, Dropout , Activation, GRU, Flatten
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model, Sequential
from keras.layers import Convolution1D
from keras import initializers, regularizers, constraints, optimizers, layers
import keras
max_features = 6000
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(imdb_data['Processed_Reviews'])
list_tokenized_train = tokenizer.texts_to_sequences(imdb_data['Processed_Reviews'])

maxlen = 130
X_t = pad_sequences(list_tokenized_train, maxlen=maxlen)
y = imdb_data['sentiment']

embed_size = 128
model = Sequential()
model.add(Embedding(max_features, embed_size))
model.add(Bidirectional(LSTM(32, return_sequences = True)))
model.add(GlobalMaxPool1D())
model.add(Dense(20, activation="relu"))
model.add(Dropout(0.2))
model.add(Dense(1, activation="sigmoid"))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


history = model.fit(X_t,y, batch_size=32, epochs=10, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
