## Imports

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, GlobalAveragePooling1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report


## Loading dataset

In [2]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("lakshmi25npathi/imdb-dataset-of-50k-movie-reviews")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews?dataset_version_number=1...


100%|██████████| 25.7M/25.7M [00:00<00:00, 68.2MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews/versions/1


In [3]:
csv_file = path + "/IMDB Dataset.csv"
df = pd.read_csv(csv_file)

## Text cleaning and Pre-processing 


In [4]:

import re
# Convert sentiments to binary values
df['sentiment'] = df['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)

# Clean text data
def clean_text(text):
    text = text.lower()
    text = re.sub(r'<br />', ' ', text)  # Remove HTML tags
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters
    return text

df['cleaned_review'] = df['review'].apply(clean_text)

# Tokenize and pad sequences
tokenizer = Tokenizer(num_words=10000, oov_token='<OOV>')
tokenizer.fit_on_texts(df['cleaned_review'])

sequences = tokenizer.texts_to_sequences(df['cleaned_review'])
max_length = 200
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')

# Split into training (80%), dev (10%), and test (10%) sets
X_train, X_temp, y_train, y_temp = train_test_split(padded_sequences, df['sentiment'], test_size=0.2, random_state=42)
X_dev, X_test, y_dev, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)


## Building and Training the LSTM model

In [5]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Build the model
model = Sequential()
model.add(Embedding(input_dim=10000, output_dim=128, input_length=max_length))
model.add(LSTM(units=64, return_sequences=False))
model.add(Dense(units=1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adamax', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=5, batch_size=64, validation_data=(X_dev, y_dev))


Epoch 1/5




[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m169s[0m 266ms/step - accuracy: 0.5890 - loss: 0.6424 - val_accuracy: 0.8500 - val_loss: 0.3494
Epoch 2/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m201s[0m 265ms/step - accuracy: 0.8715 - loss: 0.3188 - val_accuracy: 0.8816 - val_loss: 0.2919
Epoch 3/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m163s[0m 262ms/step - accuracy: 0.8991 - loss: 0.2593 - val_accuracy: 0.8830 - val_loss: 0.2726
Epoch 4/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m205s[0m 267ms/step - accuracy: 0.9152 - loss: 0.2248 - val_accuracy: 0.8800 - val_loss: 0.2932
Epoch 5/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m199s[0m 262ms/step - accuracy: 0.9241 - loss: 0.2078 - val_accuracy: 0.8790 - val_loss: 0.2862


##Evaluation

In [6]:
from sklearn.metrics import f1_score

# Predictions on the test set
y_pred = (model.predict(X_test) > 0.5).astype("int32")

# Calculate F1-score
f1 = f1_score(y_test, y_pred)
print(f"F1-score: {f1}")


[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 38ms/step
F1-score: 0.889360880478857
