In [1]:
import pandas as pd 
import numpy as np 
import os 

In [2]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM    # these are the 3 layers
from tensorflow.keras.preprocessing.text import Tokenizer      # to convert text data to tokens
from tensorflow.keras.preprocessing.sequence import pad_sequences   # to make the no. of tokens as same

In [3]:
df=pd.read_csv('IMDB Dataset.csv')

In [4]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [7]:
# To check whether the classes are ditributed evenly or not
df['sentiment'].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [9]:
# converting the categorcial variable to numeric
df['sentiment']=df['sentiment'].map({'positive':1,'negative':0})

In [14]:
# To split the data
x=df.drop('sentiment',axis=1)
y=df['sentiment']

In [15]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=42)

In [16]:
# Data PreProceesing for convertig the review to  tockenize
tokenizer=Tokenizer(num_words=4000)
tokenizer.fit_on_texts(x_train['review'])
x_train=pad_sequences(tokenizer.texts_to_sequences(x_train['review']),maxlen=200)
x_test=pad_sequences(tokenizer.texts_to_sequences(x_test['review']),maxlen=200)

In [22]:
# LSTM Model- kind to RNN, used for sequqnatial dataset or timeseries or textual dataset

model=Sequential()
model.add(Embedding(input_dim=4000,output_dim=128))  # First layer of NLP 
model.add(LSTM(128,dropout=0.2,recurrent_dropout=0.2))  # The purpose of mentioning dropout is to generalize your model, so that it won;t overfit
# 128 is the no. of neurons

model.add(Dense(1,activation='sigmoid'))  # All the neurons in the previous layers are connected to this layer. This is the output layer
# Since it's binary classifcation problem, hence we are using sigmoid

In [23]:
model.summary()

In [24]:
# Compile teh model
model.compile(optimizer="adam",loss="binary_crossentropy",metrics=["accuracy"])

In [26]:
# training the model

model.fit(x_train,y_train,epochs=5,batch_size=64,validation_split=0.2)

Epoch 1/5
[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m131s[0m 299ms/step - accuracy: 0.7941 - loss: 0.4632 - val_accuracy: 0.7937 - val_loss: 0.4495
Epoch 2/5
[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 321ms/step - accuracy: 0.8365 - loss: 0.3843 - val_accuracy: 0.8591 - val_loss: 0.3379
Epoch 3/5
[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m153s[0m 349ms/step - accuracy: 0.8683 - loss: 0.3287 - val_accuracy: 0.8146 - val_loss: 0.4305
Epoch 4/5
[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m133s[0m 304ms/step - accuracy: 0.8653 - loss: 0.3296 - val_accuracy: 0.8606 - val_loss: 0.3372
Epoch 5/5
[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m132s[0m 302ms/step - accuracy: 0.8929 - loss: 0.2723 - val_accuracy: 0.8720 - val_loss: 0.3284


<keras.src.callbacks.history.History at 0x15d41a83e30>

In [27]:
loss,accuracy=model.evaluate(x_test,y_test)
print("loss is ", loss)
print("accuracy is",accuracy)

[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 59ms/step - accuracy: 0.8739 - loss: 0.3248
loss is  0.324657678604126
accuracy is 0.8755333423614502


In [46]:
# Building a predictive model
def prediction_model(review):
    sequence=tokenizer.texts_to_sequences([review])
    padded_seq=pad_sequences(sequence,maxlen=200)
    prediction=model.predict(padded_seq)
    sentiment='positive' if prediction[0][0]>0.5 else "negative"
    return sentiment,prediction
    

In [59]:
review='this movie is very bad'
ans,probab=prediction_model(review)
print("The sentiment of review",probab[0][0]*100,"%", ans)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
The sentiment of review 24.04603511095047 % negative


In [42]:
ans

'negative'

In [53]:
import pickle
pickle.dump(model,open('model.pkl','wb'))