# Movie Review Sentiment Classification Machine Learning Model

By : Muhammad Fu'ad Saifuddin

Email : muh.fuad.saifuddin@gmail.com

Linkedin : https://www.linkedin.com/in/muhammad-fuad-saifuddin/

Github : https://github.com/fuad-saifuddin




In [None]:
# Import TensorFlow and clear any Keras state left over from models that were
# built earlier in this session.
import tensorflow as tf
tf.keras.backend.clear_session()

In [None]:
# Show the installed TensorFlow version.
tf.__version__

'2.9.2'

In [None]:
# Load the review dataset from Google Drive into a pandas DataFrame.
import pandas as pd

# Shareable "view" link; the file id is the second-to-last path segment.
url='https://drive.google.com/file/d/1fE0XjbqolexFf9XnGUADUR88ziEuffc_/view?usp=share_link'
file_id = url.rsplit('/', 2)[-2]

# Rebuild the link in the form that serves the raw file so read_csv can fetch it.
dwn_url = 'https://drive.google.com/uc?id=' + file_id
df = pd.read_csv(filepath_or_buffer=dwn_url)

In [None]:
# Display the loaded DataFrame (the notebook renders a truncated preview).
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [None]:
# Summarize the columns, non-null counts and dtypes to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [None]:
# Check the class balance of the raw sentiment labels.
df['sentiment'].value_counts()

positive    25000
negative    25000
Name: sentiment, dtype: int64

In [None]:
# Feature engineering: add a numeric target column `is_positive_review`
# that is 1 for positive reviews and 0 for every other sentiment value.
import numpy as np

is_positive = df['sentiment'].eq('positive')
df['is_positive_review'] = is_positive.astype(int)
df

Unnamed: 0,review,sentiment,is_positive_review
0,One of the other reviewers has mentioned that ...,positive,1
1,A wonderful little production. <br /><br />The...,positive,1
2,I thought this was a wonderful way to spend ti...,positive,1
3,Basically there's a family where a little boy ...,negative,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1
...,...,...,...
49995,I thought this movie did a down right good job...,positive,1
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative,0
49997,I am a Catholic taught in parochial elementary...,negative,0
49998,I'm going to have to disagree with the previou...,negative,0


In [None]:
# Sanity check: the encoded label should have the same distribution as `sentiment`.
df['is_positive_review'].value_counts()

1    25000
0    25000
Name: is_positive_review, dtype: int64

In [None]:
# Split the data in two stages:
#   1. all data   -> "train_0" (80%) + test (20%)
#   2. "train_0"  -> train (80%)     + validation (20%)
# Train data is used to fit the model, validation data to monitor it during
# training, and test data for the final performance evaluation.

from sklearn.model_selection import train_test_split

X = df['review']
y = df['is_positive_review']

# random_state makes the split reproducible between runs; stratify keeps the
# class ratio of `y` in every subset so the split itself adds no imbalance.
X_train_0, X_test, y_train_0, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_0, y_train_0, test_size=0.2, random_state=42, stratify=y_train_0)

In [None]:
# Inspect the class balance of the training split.
y_train.value_counts()

0    16081
1    15919
Name: is_positive_review, dtype: int64

In [None]:
# Tokenize the reviews and pad them to a common sequence length.

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Only the most frequent words (num_words) are kept when converting texts to
# sequences; every other word is replaced by the OOV token.
# '<OOV>' cannot collide with a real word because the tokenizer's default
# filters strip '<' and '>' from the texts, whereas 'x' also occurs as an
# ordinary token and would share its index with the OOV marker.
tokenizer = Tokenizer(num_words=8000, oov_token='<OOV>')

# Fit on the training reviews only so no vocabulary leaks in from val/test.
tokenizer.fit_on_texts(X_train)

seq_train = tokenizer.texts_to_sequences(X_train)
seq_val = tokenizer.texts_to_sequences(X_val)

# Training data is padded to its longest review. Validation data is padded
# (or truncated) to that same length: the model does not mask padding, so a
# different padded length would change what the pooling layer averages over.
pad_train = pad_sequences(seq_train)
pad_val = pad_sequences(seq_val, maxlen=pad_train.shape[1])

In [None]:
# Size of the full vocabulary seen during fitting. `num_words` does not shrink
# this dictionary; it only limits which indices texts_to_sequences emits.
print(len(tokenizer.index_word))

101283


In [None]:
# Randomly oversample the minority class so both classes have the same number
# of training samples.
from imblearn.over_sampling import RandomOverSampler

# Fixed seed so the duplicated samples are the same on every run.
oversampler = RandomOverSampler(random_state=42)
pad_train_ovsample, y_train_ovsample = oversampler.fit_resample(pad_train, y_train)

In [None]:
# Confirm that both classes now have the same number of training samples.
y_train_ovsample.value_counts()

0    16081
1    16081
Name: is_positive_review, dtype: int64

In [None]:
# Total number of training samples after oversampling.
len(pad_train_ovsample)

32162

In [None]:
# Length of one padded training sequence; pad_sequences without `maxlen` pads
# every row to the length of the longest sequence.
len(pad_train[2])

2493

In [None]:
# Build two sequential sub-models and combine them into one final model.
#   - model1: Embedding -> LSTM                   -> Dense(128) -> Dense(64) -> sigmoid
#   - model2: Embedding -> GlobalAveragePooling1D -> Dense(128) -> Dense(64) -> sigmoid
# The two sigmoid outputs are concatenated and fed to a final sigmoid unit.

import tensorflow as tf
# Import from tensorflow.keras (not the standalone `keras` package) so every
# layer in this notebook comes from the same namespace.
from tensorflow.keras.layers import Dense, Concatenate, Add

VOCAB_SIZE = 8000    # must match Tokenizer(num_words=...) used for the sequences
EMBEDDING_DIM = 16   # size of each learned word vector

model1 = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=VOCAB_SIZE, output_dim=EMBEDDING_DIM),
    tf.keras.layers.LSTM(64),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model2 = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=VOCAB_SIZE, output_dim=EMBEDDING_DIM),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# Stack the two per-branch predictions and let one more sigmoid unit learn how
# to weight them.
merge = Concatenate()([model1.output, model2.output])
merge = Dense(1, activation='sigmoid')(merge)

# The final model has two inputs (one per branch); callers pass the same
# padded sequences to both.
final_model = tf.keras.models.Model([model1.input, model2.input], merge)

final_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Print the layers, output shapes and parameter counts of the combined model.
final_model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 embedding_input (InputLayer)   [(None, None)]       0           []                               
                                                                                                  
 embedding_1_input (InputLayer)  [(None, None)]      0           []                               
                                                                                                  
 embedding (Embedding)          (None, None, 16)     128000      ['embedding_input[0][0]']        
                                                                                                  
 embedding_1 (Embedding)        (None, None, 16)     128000      ['embedding_1_input[0][0]']      
                                                                                              

In [None]:
# Train the combined model on the oversampled training data. The model has
# two inputs (one per branch), so the same padded sequences are passed twice.
history = final_model.fit(
    x=[pad_train_ovsample, pad_train_ovsample],
    y=y_train_ovsample,
    validation_data=([pad_val, pad_val], y_val),
    epochs=6,
    verbose=1,
)

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


The training result was reasonably good: the model reached an accuracy of about 87.6 percent.

In [None]:
# Evaluate the trained model on the held-out test data.
seq_test = tokenizer.texts_to_sequences(X_test)
# Pad (or truncate) to the training sequence length. The model does not mask
# padding, so padding the test set to its own maximum length would feed the
# pooling branch inputs of a different length than it was trained on.
pad_test = pad_sequences(seq_test, maxlen=pad_train.shape[1])

# The model has two inputs (one per branch); both receive the same sequences.
test_loss, test_acc = final_model.evaluate([pad_test,pad_test], y_test, verbose=1)
print('Final test accuracy: {:.4f}'.format(test_acc))

Final test accuracy: 0.8801


Conclusion:
We have developed an NLP machine learning model for movie review sentiment classification. The model combines the outputs of an LSTM model and a GlobalAveragePooling1D model. With this approach, we achieve a test accuracy of about 88%.