In [2]:
# Install the Kaggle library to interact with Kaggle datasets
!pip install kaggle



# 1. Importing dependencies

In [3]:
import os # Used to set the Kaggle credential environment variables
import json # Used to parse the kaggle.json credentials file

from zipfile import ZipFile # Extracts the downloaded Kaggle zip archive
import pandas as pd # Loads the CSV into a pandas DataFrame
from sklearn.model_selection import train_test_split # Splits the data into training and testing sets
# Dependencies to build the neural network
from tensorflow.keras.models import Sequential # TensorFlow/Keras API for building a model as a linear stack of layers
from tensorflow.keras.layers import Dense, Embedding, LSTM # Layers used to build the neural network
from tensorflow.keras.preprocessing.text import Tokenizer # Converts text into sequences of integer word indices
from tensorflow.keras.preprocessing.sequence import pad_sequences # Pads/truncates sequences so all inputs have the same length

# 2. Data collection - Kaggle API

In [4]:
# Load Kaggle credentials from the kaggle.json file.
# The context manager closes the file handle as soon as the JSON is parsed
# (a bare open() inside json.load() leaves the handle open until garbage collection).
with open('kaggle.json') as credentials_file:
    kaggle_dictionary = json.load(credentials_file)

In [5]:
# Show only the key names of the loaded credentials (expected: 'username' and 'key');
# the values themselves are deliberately not displayed.
kaggle_dictionary.keys()

dict_keys(['username', 'key'])

In [6]:
# Export the credentials as the KAGGLE_USERNAME / KAGGLE_KEY environment variables
for env_name, credential_field in (("KAGGLE_USERNAME", "username"), ("KAGGLE_KEY", "key")):
    os.environ[env_name] = kaggle_dictionary[credential_field]

In [7]:
# Download the IMDB 50k movie-review dataset with the Kaggle CLI.
# -d selects the dataset by its "owner/dataset-name" identifier; the archive is saved
# as a zip file in the current working directory.
!kaggle datasets download -d lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

Dataset URL: https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
License(s): other
Downloading imdb-dataset-of-50k-movie-reviews.zip to /content
  0% 0.00/25.7M [00:00<?, ?B/s]
100% 25.7M/25.7M [00:00<00:00, 1.59GB/s]


In [8]:
# List the working directory to confirm the zip archive was downloaded
!ls

imdb-dataset-of-50k-movie-reviews.zip  kaggle.json  sample_data


In [9]:
# Extract every member of the downloaded archive into the current working directory
with ZipFile('imdb-dataset-of-50k-movie-reviews.zip') as archive:
    archive.extractall()

In [10]:
# List the working directory again to confirm the CSV file was extracted
!ls

'IMDB Dataset.csv'			 kaggle.json
 imdb-dataset-of-50k-movie-reviews.zip	 sample_data


# 3. Loading the dataset

In [12]:
# Read the extracted CSV file into a pandas DataFrame named `data`
data = pd.read_csv('IMDB Dataset.csv')

In [13]:
# Show the DataFrame's (rows, columns) shape as a quick check that the file loaded fully
data.shape

(50000, 2)

In [14]:
# Preview the first rows to see what the review text and sentiment labels look like
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [15]:
# Count the reviews per label in the "sentiment" column to check the class balance
data["sentiment"].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
positive,25000
negative,25000


In [16]:
# Encode the sentiment labels as integers: 'positive' -> 1 and 'negative' -> 0.
# Series.map replaces DataFrame.replace(..., inplace=True), which emits a pandas
# FutureWarning about silent downcasting when string values are replaced by ints.
# The identity entries (1 -> 1, 0 -> 0) keep the cell safe to re-run on already-encoded
# labels, and astype("int64") fails loudly if an unexpected label maps to NaN.
data["sentiment"] = (
    data["sentiment"]
    .map({"positive": 1, "negative": 0, 1: 1, 0: 0})
    .astype("int64")
)

  data.replace({"sentiment": {"positive": 1, "negative": 0}}, inplace=True)


In [17]:
# Preview the first rows again to confirm the sentiment column now holds 1 and 0
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [19]:
# Re-count the labels after encoding; the per-class totals should match the counts before encoding
data["sentiment"].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
1,25000
0,25000


# 4. Splitting data

In [20]:
# Split the data into training and testing sets (80% for training, 20% for testing).
# random_state=42 fixes the shuffle so the same split is produced on every run.
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

In [22]:
# Report the (rows, columns) shape of the training and testing DataFrames, one per line
print(train_data.shape, test_data.shape, sep="\n")

(40000, 2)
(10000, 2)


# 5. Data Preprocessing

In [23]:
# Build the word index from the training reviews only; num_words=5000 limits the
# encoding to the most frequent words.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train_data["review"])

# Encode each review as a sequence of word indices, then pad/truncate every sequence
# to exactly 200 entries (training split first, then the test split).
x_train, x_test = (
    pad_sequences(tokenizer.texts_to_sequences(split["review"]), maxlen=200)
    for split in (train_data, test_data)
)

In [25]:
# Show the (truncated) padded index arrays for the training and test splits
print(x_train, x_test, sep="\n")

[[1935    1 1200 ...  205  351 3856]
 [   3 1651  595 ...   89  103    9]
 [   0    0    0 ...    2  710   62]
 ...
 [   0    0    0 ... 1641    2  603]
 [   0    0    0 ...  245  103  125]
 [   0    0    0 ...   70   73 2062]]
[[   0    0    0 ...  995  719  155]
 [  12  162   59 ...  380    7    7]
 [   0    0    0 ...   50 1088   96]
 ...
 [   0    0    0 ...  125  200 3241]
 [   0    0    0 ... 1066    1 2305]
 [   0    0    0 ...    1  332   27]]


In [29]:
# Target labels (the encoded sentiment column) for the training and test splits
y_train, y_test = train_data["sentiment"], test_data["sentiment"]

In [31]:
# Inspect the training labels; the index holds the original row positions from `data`
print(y_train)

39087    0
30893    0
45278    1
16398    0
13653    0
        ..
11284    1
44732    1
38158    0
860      1
15795    1
Name: sentiment, Length: 40000, dtype: int64


# 6. LSTM - Long Short Term Memory

In [40]:
# Build a sequential model (a linear stack of layers)
model = Sequential()
# Embedding layer: maps each integer word index to a dense 128-dimensional vector.
# input_dim matches Tokenizer(num_words=5000) from the preprocessing step: the padded
# sequences only contain indices below 5000 (0 is the padding value), so a larger
# table would only add rows that are never trained.
# input_length is intentionally omitted: the former value (300) contradicted the padded
# sequence length of 200, and the argument is deprecated in Keras 3. The sequence length
# is taken from the data the model is first called on.
model.add(Embedding(input_dim=5000, output_dim=128))
# LSTM layer: reads the embedded sequence step by step and summarizes it in 192 units;
# dropout applies to the layer inputs and recurrent_dropout to the recurrent state.
model.add(LSTM(units=192, dropout=0.2, recurrent_dropout=0.2))
# Dense layer with sigmoid activation: outputs a single value between 0 and 1,
# interpreted as the probability that the review is positive.
model.add(Dense(units=1, activation="sigmoid"))



# 7. Compiling and Training the Model

In [41]:
# Configure training: Adam optimizer, binary cross-entropy loss for the 0/1 labels,
# and accuracy as the reported metric.
model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

In [42]:
# Fit the model on the padded training sequences and keep the per-epoch metrics in `history`
history = model.fit(
    x_train,               # training data
    y_train,               # training labels
    epochs=5,              # number of passes over the training data
    batch_size=64,         # samples per gradient update
    validation_split=0.2,  # fraction of the training data held out for validation
)

Epoch 1/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m327s[0m 649ms/step - accuracy: 0.7158 - loss: 0.5464 - val_accuracy: 0.8105 - val_loss: 0.4328
Epoch 2/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m320s[0m 639ms/step - accuracy: 0.8378 - loss: 0.3886 - val_accuracy: 0.8486 - val_loss: 0.3544
Epoch 3/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m312s[0m 623ms/step - accuracy: 0.8772 - loss: 0.3065 - val_accuracy: 0.8564 - val_loss: 0.3473
Epoch 4/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m309s[0m 618ms/step - accuracy: 0.8872 - loss: 0.2825 - val_accuracy: 0.8652 - val_loss: 0.3469
Epoch 5/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m306s[0m 613ms/step - accuracy: 0.9185 - loss: 0.2184 - val_accuracy: 0.8709 - val_loss: 0.3328


In [43]:
# Print the layer-by-layer summary of the model (run after training, so the model is already built)
model.summary()

# 8. Evaluating the Model

In [44]:
# Score the trained model on the held-out test split; verbose=0 hides the progress bar
loss, accuracy = model.evaluate(x_test, y_test, verbose=0)

# Report both metrics, one per line
print(f"Test Loss: {loss}", f"Test Accuracy: {accuracy}", sep="\n")

Test Loss: 0.3252622187137604
Test Accuracy: 0.8745999932289124


# 9. Building a predictive system

In [45]:
def predict_sentiment(review):
  """Classify a single review text as "positive" or "negative".

  The review is encoded with the notebook's fitted `tokenizer`, padded to the same
  length used for training (200), and scored by the trained `model`.

  Args:
    review: The raw review text (a single string).

  Returns:
    "positive" if the predicted probability is greater than 0.5, otherwise "negative".

  Raises:
    TypeError: If `review` is not a string.
  """
  if not isinstance(review, str):
    raise TypeError(f"review must be a string, got {type(review).__name__}")
  # Tokenize and pad the review; it is wrapped in a list because the tokenizer expects a batch of texts
  sequence = tokenizer.texts_to_sequences([review])
  padded_sequence = pad_sequences(sequence, maxlen=200)
  # Make a prediction. The result is a 2-D array (one row per review, one sigmoid output),
  # so pull out the single probability as a plain float instead of relying on the
  # truthiness of an array comparison.
  prediction = model.predict(padded_sequence)
  probability = float(prediction[0][0])
  return "positive" if probability > 0.5 else "negative"

In [46]:
# Example usage: classify one hand-written review and print the predicted label
new_review = "This movie was fantastic! I loved every minute of it."
sentiment = predict_sentiment(new_review)
print(f"The sentiment of the review is: {sentiment}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 615ms/step
The sentiment of the review is: positive
