In [1]:

import kagglehub
imdb_dataset_of_50k_movie_reviews_path = kagglehub.dataset_download('lakshmi25npathi/imdb-dataset-of-50k-movie-reviews')

print('Data source import complete.')


Data source import complete.


# Import Libraries

In [None]:
import numpy as np  
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import spacy
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Load Data & EDA

In [3]:
df = pd.read_csv('/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv')
df.head(5)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [5]:
df.isna().sum()

Unnamed: 0,0
review,0
sentiment,0


### Splitting Data

In [6]:
train_df, temp_df = train_test_split(
    df,
    train_size=5000,
    random_state=42,
    stratify=df['sentiment']
)

test_df, _ = train_test_split(
    temp_df,
    train_size=1000,
    random_state=42,
    stratify=temp_df['sentiment']
)

X_train = train_df['review'].tolist()
y_train = train_df['sentiment'].tolist()
X_test = test_df['review'].tolist()
y_test = test_df['sentiment'].tolist()

# Preprocessing

In [7]:
from bs4 import BeautifulSoup

#Removing the html strips
def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

#Removing the square brackets
def remove_between_square_brackets(text):
    return re.sub(r'\[[^]]*\]', '', text)

#Removing the noisy text
def denoise_text(text):
    text = strip_html(text)
    text = remove_between_square_brackets(text)
    return text

# Apply the denoising function to the training and testing data
X_train = [denoise_text(review) for review in X_train]
X_test = [denoise_text(review) for review in X_test]

### Tokenization & Normalization

In [8]:
import nltk
nltk.download('punkt_tab')
def normalize_and_tokenize(text):
    text = text.lower()
    text = re.sub(r'[^a-z0-9\s]', '', text)
    return word_tokenize(text)

X_train_tokens = [normalize_and_tokenize(doc) for doc in X_train]
X_test_tokens = [normalize_and_tokenize(doc) for doc in X_test]

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


### Stemming

In [9]:
stemmer = PorterStemmer()
X_train_stemmed = [[stemmer.stem(tok) for tok in doc] for doc in X_train_tokens]
X_test_stemmed = [[stemmer.stem(tok) for tok in doc] for doc in X_test_tokens]

### Lemmatization

In [10]:
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
def lemmatize(tokens):
    doc = nlp(' '.join(tokens))
    return [token.lemma_ for token in doc]

X_train_lemmatized = [lemmatize(doc) for doc in X_train_tokens]
X_test_lemmatized = [lemmatize(doc) for doc in X_test_tokens]

In [12]:
X_train_text = [' '.join(doc) for doc in X_train_lemmatized]
X_test_text = [' '.join(doc) for doc in X_test_lemmatized]

### One-Hot Encoding

In [13]:
ohe = CountVectorizer(binary=True)
X_train_ohe = ohe.fit_transform(X_train_text)
X_test_ohe = ohe.transform(X_test_text)

In [14]:
feature_names = ohe.get_feature_names_out()

print("Example processed text:", X_train_text[0])

print("\nWords present in first review (One-Hot Encoding):")
vector_values = X_train_ohe[0].toarray()[0]

present_words = [feature_names[i] for i, val in enumerate(vector_values) if val == 1]
print(present_words)

Example processed text: I get the dvd very cheap and I m a total drewbie and that s probably the only constellation where this movie could ever interest anyonean early draw movie she s look great and she get a quite lot of really cute scene of she like a shower scene a sexy dance scene quite a number of sexy outfit etc she do never show the friendly charm we know from her more recent moviesthe movie itself be pretty average or subaverage and much more look like be make for the tv than one for the cinema there be no real horror or tension build up and the dialog be often cheesythe most interesting part be probably the end because I honestly do not understand it but maybe there be nothing to understand about it anyway but at least you do not get the end you would be expect and it also come much soon than one would have expectedoverall I think this movie be exclusively for drewbie

Words present in first review (One-Hot Encoding):
['about', 'also', 'and', 'anyonean', 'anyway', 'at', 'aver

### Bag-of-Words

In [15]:
bow = CountVectorizer(binary=False)
X_train_bow = bow.fit_transform(X_train_text)
X_test_bow = bow.transform(X_test_text)

In [16]:
feature_names = bow.get_feature_names_out()

vector_values = X_train_bow[0].toarray()[0]

df_bow_first_review = pd.DataFrame({
    'word': feature_names,
    'count': vector_values
})

print("Example processed text:", X_train_text[0])
print("\nBOW vector (first review):")
print(df_bow_first_review)

Example processed text: I get the dvd very cheap and I m a total drewbie and that s probably the only constellation where this movie could ever interest anyonean early draw movie she s look great and she get a quite lot of really cute scene of she like a shower scene a sexy dance scene quite a number of sexy outfit etc she do never show the friendly charm we know from her more recent moviesthe movie itself be pretty average or subaverage and much more look like be make for the tv than one for the cinema there be no real horror or tension build up and the dialog be often cheesythe most interesting part be probably the end because I honestly do not understand it but maybe there be nothing to understand about it anyway but at least you do not get the end you would be expect and it also come much soon than one would have expectedoverall I think this movie be exclusively for drewbie

BOW vector (first review):
                     word  count
0                      00      0
1              

### TF-IDF

In [17]:
tfidf = TfidfVectorizer()
X_train_tfidf = tfidf.fit_transform(X_train_text)
X_test_tfidf = tfidf.transform(X_test_text)

In [18]:
feature_names = tfidf.get_feature_names_out()

vector_values = X_train_tfidf[0].toarray()[0]

df_tfidf_first_review = pd.DataFrame({
    'word': feature_names,
    'tfidf': vector_values
})

df_tfidf_first_review = df_tfidf_first_review.sort_values(by='tfidf', ascending=False)

print("Example processed text:", X_train_text[0])
print("\nTF-IDF vector (first review):")
print(df_tfidf_first_review)

Example processed text: I get the dvd very cheap and I m a total drewbie and that s probably the only constellation where this movie could ever interest anyonean early draw movie she s look great and she get a quite lot of really cute scene of she like a shower scene a sexy dance scene quite a number of sexy outfit etc she do never show the friendly charm we know from her more recent moviesthe movie itself be pretty average or subaverage and much more look like be make for the tv than one for the cinema there be no real horror or tension build up and the dialog be often cheesythe most interesting part be probably the end because I honestly do not understand it but maybe there be nothing to understand about it anyway but at least you do not get the end you would be expect and it also come much soon than one would have expectedoverall I think this movie be exclusively for drewbie

TF-IDF vector (first review):
                  word     tfidf
12228          drewbie  0.355016
36772       

In [19]:
import joblib
joblib.dump(tfidf, 'tfidf.pkl')

['tfidf.pkl']

In [20]:
print("OHE shape:", X_train_ohe.shape, X_test_ohe.shape)
print("BOW shape:", X_train_bow.shape, X_test_bow.shape)
print("TF-IDF shape:", X_train_tfidf.shape, X_test_tfidf.shape)

OHE shape: (5000, 46700) (1000, 46700)
BOW shape: (5000, 46700) (1000, 46700)
TF-IDF shape: (5000, 46700) (1000, 46700)


In [21]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# Define the model
model = Sequential()
model.add(Dense(128, input_shape=(X_train_tfidf.shape[1],), activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid')) # Sigmoid for binary classification

# Compile the model
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# Convert target variable to numerical
y_train_numerical = (train_df['sentiment'] == 'positive').astype(int)
y_test_numerical = (test_df['sentiment'] == 'positive').astype(int)

# Train the model
history = model.fit(X_train_tfidf, y_train_numerical,
                    epochs=10,
                    batch_size=32,
                    validation_data=(X_test_tfidf, y_test_numerical),
                    verbose=1)

# Evaluate the model
loss, accuracy = model.evaluate(X_test_tfidf, y_test_numerical, verbose=0)
print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m223s[0m 1s/step - accuracy: 0.6219 - loss: 0.6461 - val_accuracy: 0.8550 - val_loss: 0.3444
Epoch 2/10
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m240s[0m 1s/step - accuracy: 0.9299 - loss: 0.2089 - val_accuracy: 0.8610 - val_loss: 0.3382
Epoch 3/10
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m169s[0m 1s/step - accuracy: 0.9859 - loss: 0.0553 - val_accuracy: 0.8510 - val_loss: 0.4184
Epoch 4/10
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m198s[0m 1s/step - accuracy: 0.9957 - loss: 0.0207 - val_accuracy: 0.8430 - val_loss: 0.4763
Epoch 5/10
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m183s[0m 928ms/step - accuracy: 0.9989 - loss: 0.0083 - val_accuracy: 0.8470 - val_loss: 0.5189
Epoch 6/10
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m191s[0m 869ms/step - accuracy: 0.9998 - loss: 0.0043 - val_accuracy: 0.8420 - val_loss: 0.5622
Epoch 7/10
[1m1

In [22]:
model.save('imdb_sentiment_analysis_model.h5')

