In [100]:
# importing libraries
import pandas as pd
import numpy as np
import seaborn as sns
import re
import os
import string
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense, Dropout
import wordcloud
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


In [26]:
df=pd.read_csv('/content/IMDB Dataset.csv')

##EDA


In [6]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [9]:
df.shape

(50000, 2)

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [16]:
df.sentiment.value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [17]:
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [27]:
df['positive'] = (df['sentiment'] == 'positive').astype(int)
df['negative'] = (df['sentiment'] == 'negative').astype(int)
df=df.drop('sentiment', axis=1)
df.head()

Unnamed: 0,review,positive,negative
0,One of the other reviewers has mentioned that ...,1,0
1,A wonderful little production. <br /><br />The...,1,0
2,I thought this was a wonderful way to spend ti...,1,0
3,Basically there's a family where a little boy ...,0,1
4,"Petter Mattei's ""Love in the Time of Money"" is...",1,0


In [28]:
df.head(20)

Unnamed: 0,review,positive,negative
0,One of the other reviewers has mentioned that ...,1,0
1,A wonderful little production. <br /><br />The...,1,0
2,I thought this was a wonderful way to spend ti...,1,0
3,Basically there's a family where a little boy ...,0,1
4,"Petter Mattei's ""Love in the Time of Money"" is...",1,0
5,"Probably my all-time favorite movie, a story o...",1,0
6,I sure would like to see a resurrection of a u...,1,0
7,"This show was an amazing, fresh & innovative i...",0,1
8,Encouraged by the positive comments about this...,0,1
9,If you like original gut wrenching laughter yo...,1,0


In [30]:
#Splitting Features and Labels
X = df['review']
y = df[df.columns[1:]].values

In [32]:
X.head()

0    One of the other reviewers has mentioned that ...
1    A wonderful little production. <br /><br />The...
2    I thought this was a wonderful way to spend ti...
3    Basically there's a family where a little boy ...
4    Petter Mattei's "Love in the Time of Money" is...
Name: review, dtype: object

In [33]:
y

array([[1, 0],
       [1, 0],
       [1, 0],
       ...,
       [0, 1],
       [0, 1],
       [0, 1]])

In [39]:
#reviews_with_word = df[df['review'].str.contains(']', case=False)]


In [40]:
#Cleaning noise (links, html tags, symbols)

def remove_links(text):
    url_pattern = r'(https?://\S+|www\.\S+|\b\w+\.[a-z]{2,4}\b(?:/[^\s]*)?)'
    return re.sub(url_pattern, '', text)

#Removing the html strips
def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

#Removing the square brackets
def remove_between_square_brackets(text):
    return re.sub('\[[^]]*\]', '', text)

#Removing the noise from text
def denoise_text(text):
    text = strip_html(text)
    text = remove_between_square_brackets(text)
    text=remove_links(text)
    return text
#Apply function on review column of df
df['review']=df['review'].apply(denoise_text)


  soup = BeautifulSoup(text, "html.parser")


In [41]:
#Removing special characters
def remove_special_characters(text, remove_digits=True):
    pattern=r'[^a-zA-z0-9\s]'
    text=re.sub(pattern,'',text)
    return text
#Apply function on review column
df['review']=df['review'].apply(remove_special_characters)

In [44]:
nltk.download('stopwords')
nltk.download('punkt')
#Setting English stopwords
stopwords_en = set(stopwords.words('english'))

# function to remove stopwords from text
def remove_stopwords(text):
    words = word_tokenize(text)  # Tokenize text into words
    filtered_words = [word for word in words if word.lower() not in stopwords_en]  # Remove stopwords
    return ' '.join(filtered_words)  # Join filtered words back into a string

df['review'] = df['review'].apply(remove_stopwords)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [45]:
total_unique_word_count = df['review'].str.lower().str.split().explode().nunique()
total_unique_word_count

220795

In [47]:
max_sentence_length = df['review'].apply(lambda x: len(str(x).split())).max()
max_sentence_length

1429

In [48]:
max_features = 500000  #Defining the maximum number of unique words to keep
max_len = 1500  #Defining the maximum length for padding/ truncating sequences

#Map every word to an integer value
vectorizer = TextVectorization(max_tokens = max_features, output_sequence_length = max_len, output_mode = 'int')


In [49]:
#To analyze the text data, tokenize it, and build vocabulary
vectorizer.adapt(X.values)

In [50]:
vectorized_text = vectorizer(X.values)
vectorized_text

<tf.Tensor: shape=(50000, 1500), dtype=int64, numpy=
array([[  29,    5,    2, ...,    0,    0,    0],
       [   4,  385,  116, ...,    0,    0,    0],
       [  10,  195,   11, ...,    0,    0,    0],
       ...,
       [  10,  227,    4, ...,    0,    0,    0],
       [ 142,  162,    6, ...,    0,    0,    0],
       [  56,   29, 5827, ...,    0,    0,    0]])>

In [69]:
# MCSHBAP (map, cache, shuffle, batch, prefetch) for efficient and optimized handling of input data

dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y))
dataset = dataset.cache()
dataset = dataset.shuffle(160000)  #Buffer size
dataset = dataset.batch(256)       #Each batch represented as a series of 256 samples
dataset = dataset.prefetch(16)     #To prevent bottlenecks

In [70]:
a_batch = dataset.as_numpy_iterator().next()    # batch of 256 training example


In [71]:
#Checking length
len(a_batch[0])

256

In [72]:
no_of_batches=len(dataset)
no_of_batches

196

##Modelling

In [73]:
# Splitting training, test and validation set
train = dataset.take(int(len(dataset)*.9))   # 90%
val = dataset.skip(int(len(dataset)*.7)).take(int(len(dataset)*.05))   # 5%
test = dataset.skip(int(len(dataset)*.9)).take(int(len(dataset)*.05))  # 5%

In [74]:
len(train), len(val), len(test)

(176, 9, 9)

In [75]:

  model = Sequential()
  # embedding layer for nlp
  model.add(Embedding(max_features+1, 32)) #Each word is represented as a vector with 32 components
  model.add(Bidirectional(LSTM(32, activation='tanh'))) # Bidirectional LSTM Layer
  # Fully connected layers for feature extraction and representation
  model.add(Dense(128, activation='relu'))
  model.add(Dense(256, activation='relu'))
  model.add(Dense(128, activation='relu'))
  model.add(Dropout(0.5)) #To reduce overfitting
  model.add(Dense(2, activation='sigmoid')) #Layer for classification
  model.compile(optimizer='Adam', loss='BinaryCrossentropy', metrics=['accuracy'])

In [76]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, None, 32)          16000032  
                                                                 
 bidirectional_2 (Bidirecti  (None, 64)                16640     
 onal)                                                           
                                                                 
 dense_8 (Dense)             (None, 128)               8320      
                                                                 
 dense_9 (Dense)             (None, 256)               33024     
                                                                 
 dense_10 (Dense)            (None, 128)               32896     
                                                                 
 dropout_2 (Dropout)         (None, 128)               0         
                                                      

In [77]:
#Training the model with Bidirectional LSTM for Text Classification
hist = model.fit(train, epochs=3, validation_data=val, verbose=1)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [78]:
l,a= model.evaluate(test)
print('Test Loss: ', l)
print('Test Accuracy: ', a)


Test Loss:  0.05443241447210312
Test Accuracy:  0.9830729365348816


##Prediction


In [97]:
input_text = vectorizer("While 'Dune' boasts stunning visuals and an impressive cast, I couldn't help but feel underwhelmed by its overly complex plot and slow pacing. Despite the breathtaking landscapes and intricate world-building, the story often felt convoluted and difficult to follow. The performances, although commendable, failed to fully engage me, leaving me disconnected from the characters' struggles. Additionally, the film's lengthy runtime only added to the sense of monotony, making it a challenge to stay invested until the end. Overall, while 'Dune' has its moments, it falls short of delivering a truly captivating cinematic experience.")
res = model.predict(np.expand_dims(input_text, axis=0))
if res[0,0]>0.8: print("positive")
else: print("negative")

negative


In [99]:
input_text = vectorizer("I was utterly captivated by the epic scale and breathtaking visuals of 'Dune'. From the vast desert landscapes to the intricate details of the futuristic world, every scene was a visual treat. The performances were stellar, with each actor bringing depth and emotion to their characters. The storyline kept me on the edge of my seat, with its blend of political intrigue, action, and adventure. Hans Zimmer's haunting score added to the intensity of the film, elevating the viewing experience to new heights. Overall, 'Dune' is a cinematic masterpiece that will leave you craving for more.")
res = model.predict(np.expand_dims(input_text, axis=0))
if res[0,0]>0.8: print("positive")
else: print("negative")

positive
