In [2]:
import os
import json

from zipfile import ZipFile
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [3]:
# json.load() converts a JSON document into a Python dictionary.
# Read the credentials inside a context manager so the file handle is closed
# as soon as parsing finishes instead of being left open.
with open("kaggle.json") as credentials_file:
  kaggle_dictionary = json.load(credentials_file)

In [4]:
# List only the field names of the loaded credentials (not their values).
kaggle_dictionary.keys()

dict_keys(['username', 'key'])

In [5]:
# setup kaggle credentials as environment variables
# (presumably these are what the kaggle CLI invoked below reads for authentication - confirm against the Kaggle API docs)
os.environ["KAGGLE_USERNAME"] = kaggle_dictionary["username"]
os.environ["KAGGLE_KEY"] = kaggle_dictionary["key"]

# "!" hands the rest of the line to the shell: download the IMDB 50k movie reviews dataset archive
!kaggle datasets download -d lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

Dataset URL: https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
License(s): other
Downloading imdb-dataset-of-50k-movie-reviews.zip to /content
 35% 9.00M/25.7M [00:00<00:00, 40.0MB/s]
100% 25.7M/25.7M [00:00<00:00, 90.9MB/s]


In [6]:
# List the working directory to confirm the dataset archive was downloaded.
!ls

imdb-dataset-of-50k-movie-reviews.zip  kaggle.json  sample_data


In [7]:
# Unpack every member of the downloaded archive into the current working directory.
with ZipFile(file="imdb-dataset-of-50k-movie-reviews.zip", mode="r") as zip_ref:
  zip_ref.extractall()

In [8]:
# List the working directory again to confirm the CSV was extracted.
!ls

'IMDB Dataset.csv'   imdb-dataset-of-50k-movie-reviews.zip   kaggle.json   sample_data


In [9]:
# Load the reviews CSV. The archive is extracted into the current working
# directory, so read it with a relative path; the Colab-only "/content/..."
# prefix breaks the notebook on any other machine.
data = pd.read_csv("IMDB Dataset.csv")
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [10]:
# Count reviews per sentiment label to check class balance.
data["sentiment"].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [11]:
# Encode the labels as integers: positive -> 1, negative -> 0.
# The mapping is explicit and the result is cast to int64, so the column dtype
# does not depend on pandas' implicit object->int downcasting in replace()
# (deprecated behaviour). Values that are already encoded pass through
# unchanged, so re-running the cell is safe; any unexpected label makes the
# cast fail loudly instead of leaving a mixed-type column.
label_to_int = {"positive": 1, "negative": 0}
data["sentiment"] = (
  data["sentiment"]
  .map(lambda label: label_to_int.get(label, label))
  .astype("int64")
)

In [12]:
# Hold out 20% of the rows as the test set; the fixed random_state keeps the split repeatable.
train_data, test_data = train_test_split(
  data,
  test_size=0.2,
  random_state=42,
)

In [13]:
# Show (rows, columns) of the training split, then of the test split.
print(train_data.shape, test_data.shape, sep="\n")

(40000, 2)
(10000, 2)


# Data Preprocessing

In [14]:
# Tokenize the text data.
# The Tokenizer maps words to integer indices; num_words=5000 caps the vocabulary
# used when converting texts (most frequent words, per the Keras Tokenizer docs).
tokenizer = Tokenizer(num_words=5000)
# Build the vocabulary from the training reviews only.
tokenizer.fit_on_texts(train_data["review"])

# Convert each review into its sequence of word indices.
train_sequences = tokenizer.texts_to_sequences(train_data["review"])
test_sequences = tokenizer.texts_to_sequences(test_data["review"])

# pad_sequences brings every sequence to the same length (maxlen=200).
X_train = pad_sequences(train_sequences, maxlen=200)
X_test = pad_sequences(test_sequences, maxlen=200)


In [15]:
# Inspect the padded training sequences (NumPy abbreviates large arrays when printed).
print(X_train)

[[1935    1 1200 ...  205  351 3856]
 [   3 1651  595 ...   89  103    9]
 [   0    0    0 ...    2  710   62]
 ...
 [   0    0    0 ... 1641    2  603]
 [   0    0    0 ...  245  103  125]
 [   0    0    0 ...   70   73 2062]]


In [16]:
# Inspect the padded test sequences (NumPy abbreviates large arrays when printed).
print(X_test)

[[   0    0    0 ...  995  719  155]
 [  12  162   59 ...  380    7    7]
 [   0    0    0 ...   50 1088   96]
 ...
 [   0    0    0 ...  125  200 3241]
 [   0    0    0 ... 1066    1 2305]
 [   0    0    0 ...    1  332   27]]


In [17]:
# Target labels: the sentiment column of the training split and of the test split.
Y_train, Y_test = train_data["sentiment"], test_data["sentiment"]

# Model Building

In [18]:
# Embedding -> LSTM -> single sigmoid unit.
model = Sequential([
  # Maps each of the 5000 token ids to a 128-dimensional vector; inputs are 200 tokens long.
  Embedding(input_dim=5000, output_dim=128, input_length=200),
  # dropout / recurrent_dropout of 0.2 switch off 20% of the units as regularization.
  LSTM(128, dropout=0.2, recurrent_dropout=0.2),
  # Binary classification, hence one output with sigmoid activation.
  Dense(1, activation="sigmoid"),
])

## Embedding Layer :
An embedding layer in neural networks is used to convert categorical data, particularly text data (words), into dense vectors of fixed size. This transformation is crucial for NLP tasks as it allows the model to work with numerical representations of words, enabling better processing and learning.

An embedding layer converts high-dimensional categorical data into lower-dimensional continuous vectors, which makes the data more manageable and computationally efficient for the model to process.

Embedding layers learn to capture semantic relationships between words. Words with similar meanings or contexts are mapped to vectors that are close to each other in the embedding space.

Embedding layers help to address the sparsity problem in one-hot encoded vectors by providing dense representations.




In [19]:
# Print the layer-by-layer summary (output shapes and parameter counts).
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 200, 128)          640000    
                                                                 
 lstm (LSTM)                 (None, 128)               131584    
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 771713 (2.94 MB)
Trainable params: 771713 (2.94 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [20]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy as the reported metric.
model.compile(
  optimizer="adam",
  loss="binary_crossentropy",
  metrics=["accuracy"],
)

In [21]:
# Train for a single epoch in batches of 64, holding out 20% of the training data for validation.
model.fit(
  X_train,
  Y_train,
  epochs=1,
  batch_size=64,
  validation_split=0.2,
)



<keras.src.callbacks.History at 0x7ad6ee1522f0>

In [22]:
# Score the trained model on the held-out test split and report both numbers.
loss, accuracy = model.evaluate(X_test, Y_test)
print(f"Test Loss: {loss}", f"Test Accuracy: {accuracy}", sep="\n")

Test Loss: 0.2843415439128876
Test Accuracy: 0.8844000101089478


In [23]:
def predict_sentiment(review: str, threshold: float = 0.5) -> str:
  """Classify a single review as "positive" or "negative".

  The review is converted to word indices with the module-level ``tokenizer``,
  padded to 200 tokens (the same ``maxlen`` used for X_train / X_test) and
  scored by the module-level ``model``.

  Args:
    review: Raw review text (one string).
    threshold: Score strictly above which the review is labelled "positive".
      Defaults to 0.5, the previously hard-coded cut-off.

  Returns:
    "positive" if the model's score is greater than ``threshold``,
    otherwise "negative".
  """
  # texts_to_sequences works on a list of texts, so wrap the single review.
  sequence = tokenizer.texts_to_sequences([review])
  padded_sequence = pad_sequences(sequence, maxlen=200)
  prediction = model.predict(padded_sequence)
  # One review in, so the score of interest is the first entry of the first row.
  score = prediction[0][0]
  return "positive" if score > threshold else "negative"

In [24]:
# example usage: a clearly positive review
new_review = "This movie was fantastic. I loved it."
sentiment = predict_sentiment(new_review)
print(f"The sentiment of the review is: {sentiment}")

The sentiment of the review is: positive


In [25]:
# example usage: a negated phrase ("not that good")
new_review = "This movie was not that good"
sentiment = predict_sentiment(new_review)
print(f"The sentiment of the review is: {sentiment}")

The sentiment of the review is: negative


In [26]:
# example usage: a mixed review ("ok but not that good")
new_review = "This movie was ok but not that good."
sentiment = predict_sentiment(new_review)
print(f"The sentiment of the review is: {sentiment}")

The sentiment of the review is: negative
