# <div style="text-align:center; background: #03045e; padding: 7px; border-radius:10px 10px; font-size: 1.5em; color: #e3f2fd; cursor: pointer;font-family: cursive;"><b>  Importing Libraries </b></div>

In this section, we import all the libraries required for data processing, feature extraction, and model building. These libraries help us perform tasks such as loading and manipulating data, tokenizing text, and building neural networks.


In [None]:
# Core libraries
import numpy as np
import pandas as pd

# NLP preprocessing
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Fetch the NLTK data used by this notebook.
# 'punkt' is the tokenizer model for older NLTK releases; newer releases load
# the Punkt data from 'punkt_tab' instead, and word_tokenize raises LookupError
# when that resource is missing. Requesting both keeps the notebook working on
# either version.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('omw-1.4')
nltk.download('stopwords')

# Scikit-learn for splitting data, evaluation and models 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression


# TensorFlow/Keras for deep learning (BiLSTM)
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (
    LSTM, Bidirectional, Dense, Dropout, Masking
)
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import EarlyStopping


# <div style="text-align:center; background: #03045e; padding: 7px; border-radius:10px 10px; font-size: 1.5em; color: #e3f2fd; cursor: pointer;font-family: cursive;"><b> 3. Loading the Dataset </b></div>

In this step, we load the IMDB dataset, which contains 50,000 movie reviews labeled as either positive or negative.  
Each review is stored as a text entry along with its sentiment label.

We use `pandas.read_csv()` to load the dataset into a DataFrame for further processing.


In [None]:
# Location of the IMDB reviews CSV inside the Kaggle input directory
path = '/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv'

# Read the whole file into a DataFrame
df = pd.read_csv(filepath_or_buffer=path)

# Preview the top 5 rows
df.head(n=5)

# <div style="text-align:center; background: #03045e; padding: 7px; border-radius:10px 10px; font-size: 1.5em; color: #e3f2fd; cursor: pointer;font-family: cursive;"><b> 4. Data Preprocessing </b></div>



In [None]:
# Build a set of NLTK's English stopwords for fast membership checks
stop_words = {word for word in stopwords.words('english')}

# Define preprocessing function
def preprocess_text(text, stopword_set=None):
    """Normalize a raw review into a space-separated string of content words.

    Steps, in order: lowercase, replace HTML tags with a space, drop URLs,
    delete every character that is not a-z or whitespace, split on
    whitespace, and remove stopwords.

    Args:
        text: Raw review text. Non-string values (e.g. None or NaN) yield "".
        stopword_set: Optional collection of words to drop. When omitted, the
            module-level ``stop_words`` set is used.

    Returns:
        The cleaned text as a single string; may be empty.
    """
    # Missing values (None / NaN) have no text to clean
    if not isinstance(text, str):
        return ""
    if stopword_set is None:
        stopword_set = stop_words

    # Convert to lowercase
    text = text.lower()
    # Replace HTML tags with a space (not an empty string) so that words
    # separated only by markup, e.g. "movie.<br /><br />the", are not glued
    # together into a single bogus token
    text = re.sub(r'<.*?>', ' ', text)
    # Remove URLs
    text = re.sub(r'http\S+|www\S+', '', text)
    # Remove non-alphabetic characters
    text = re.sub(r'[^a-z\s]', '', text)
    # split() with no argument already collapses runs of whitespace and
    # ignores leading/trailing whitespace
    tokens = text.split()
    # Remove stopwords
    return " ".join(word for word in tokens if word not in stopword_set)

# Clean every raw review and store the result in a new 'clean_review' column
df['clean_review'] = df['review'].map(preprocess_text)

# Compare raw and cleaned text for the first few rows
df.loc[:, ['review', 'clean_review']].head()


In [None]:
# Encode the string sentiment labels as integers: negative -> 0, positive -> 1
SENTIMENT_LABELS = {'negative': 0, 'positive': 1}

# Re-running this cell must not remap labels that are already encoded
# (mapping 0/1 through the string-keyed dict would turn every label into NaN).
if not df['sentiment'].isin(list(SENTIMENT_LABELS.values())).all():
    encoded_sentiment = df['sentiment'].map(SENTIMENT_LABELS)
    # map() yields NaN for any value missing from the dict; fail loudly
    # instead of letting NaN labels flow into the split and the models
    if encoded_sentiment.isna().any():
        raise ValueError(
            "Unexpected values in 'sentiment' column; expected 'negative' or 'positive'."
        )
    df['sentiment'] = encoded_sentiment.astype(int)

In [None]:
# First cut: hold out 20% of the data as the test set, stratified by label
train_val_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['sentiment'],
)

# Second cut: carve 20% of the remaining 80% off as validation
# → 64% train, 16% validation, 20% test overall
train_df, val_df = train_test_split(
    train_val_df,
    test_size=0.2,
    random_state=42,
    stratify=train_val_df['sentiment'],
)

# Report how many rows ended up in each split
for split_name, split_df in (("Train", train_df), ("Validation", val_df), ("Test", test_df)):
    print(f"{split_name} size: {len(split_df)}")


# <div style="text-align:center; background: #03045e; padding: 7px; border-radius:10px 10px; font-size: 1.5em; color: #e3f2fd; cursor: pointer;font-family: cursive;"><b> 5. Tokenization</b></div>


In [None]:
# Tokenize the cleaned reviews of each split with NLTK's word_tokenize.
# `assign` returns a new DataFrame with the extra 'tokens' column instead of
# inserting a column into the frames produced by train_test_split, which can
# trigger pandas' SettingWithCopyWarning. Re-running the cell simply
# recomputes the column.
train_df = train_df.assign(tokens=train_df['clean_review'].apply(word_tokenize))
val_df = val_df.assign(tokens=val_df['clean_review'].apply(word_tokenize))
test_df = test_df.assign(tokens=test_df['clean_review'].apply(word_tokenize))

# Show sample tokens
train_df[['clean_review', 'tokens']].head()


# <div style="text-align:center; background: #03045e; padding: 7px; border-radius:10px 10px; font-size: 1.5em; color: #e3f2fd; cursor: pointer;font-family: cursive;"><b> 6. Text Representation Methods </b></div>

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF works on raw strings, so glue each token list back into one sentence
train_texts = train_df['tokens'].map(' '.join)
val_texts = val_df['tokens'].map(' '.join)
test_texts = test_df['tokens'].map(' '.join)

# Vectorizer capped at 5000 features
tfidf = TfidfVectorizer(max_features=5000)

# Learn the vocabulary and IDF weights from the training split only,
# then encode all three splits with that fitted vectorizer
tfidf.fit(train_texts)
X_train_tfidf = tfidf.transform(train_texts)
X_val_tfidf = tfidf.transform(val_texts)
X_test_tfidf = tfidf.transform(test_texts)


In [None]:
from gensim.models import Word2Vec

# Learn 200-dimensional word embeddings from the training tokens only
w2v_model = Word2Vec(
    sentences=train_df['tokens'],
    vector_size=200,
    window=6,
    min_count=2,
)


def tokens_to_sequence(tokens):
    """Return the Word2Vec vectors of `tokens`, in order.

    Tokens that are not in the trained model's vocabulary are skipped, so the
    result can be shorter than `tokens` (or empty).
    """
    embeddings = w2v_model.wv
    sequence = []
    for token in tokens:
        if token in embeddings:
            sequence.append(embeddings[token])
    return sequence


# Turn every review of each split into a list of word vectors
X_train_seq_w2v = [tokens_to_sequence(review_tokens) for review_tokens in train_df['tokens']]
X_val_seq_w2v = [tokens_to_sequence(review_tokens) for review_tokens in val_df['tokens']]
X_test_seq_w2v = [tokens_to_sequence(review_tokens) for review_tokens in test_df['tokens']]

# <div style="text-align:center; background: #03045e; padding: 7px; border-radius:10px 10px; font-size: 1.5em; color: #e3f2fd; cursor: pointer;font-family: cursive;"><b> 7. Model Training </b></div>

A model is a mathematical structure that learns patterns from data.  
Training a model means teaching it to make predictions by learning from labeled examples (inputs and outputs).


In [None]:
# L2-regularized logistic regression on the TF-IDF features
log_reg = LogisticRegression(penalty='l2', C=0.1, max_iter=250)
log_reg.fit(X_train_tfidf, train_df['sentiment'])

# Predicted labels for the training and validation splits
log_train_preds = log_reg.predict(X_train_tfidf)
log_val_preds = log_reg.predict(X_val_tfidf)

# Accuracy on each split
train_accuracy = accuracy_score(y_true=train_df['sentiment'], y_pred=log_train_preds)
val_accuracy = accuracy_score(y_true=val_df['sentiment'], y_pred=log_val_preds)

# Report the results, one per line
print(
    "Logistic Regression Accuracy:",
    f"Train Accuracy: {train_accuracy:.4f}",
    f"Validation Accuracy: {val_accuracy:.4f}",
    sep="\n",
)


In [None]:
# Sequence length fed to the network and embedding width
# (vector_size must equal the Word2Vec vector size)
max_len = 280
vector_size = 200

# Shared padding settings: pad and truncate at the end of each sequence,
# filling with 0.0, so every review becomes a (max_len x vector_size) array
pad_options = dict(
    maxlen=max_len,
    dtype='float32',
    padding='post',
    truncating='post',
    value=0.0,
)

X_train_seq_padded = pad_sequences(X_train_seq_w2v, **pad_options)
X_val_seq_padded = pad_sequences(X_val_seq_w2v, **pad_options)
X_test_seq_padded = pad_sequences(X_test_seq_w2v, **pad_options)


In [None]:
# Seed the Python, NumPy and TensorFlow RNGs so weight initialization,
# dropout and batch shuffling are repeatable between runs (the data splits
# above already use random_state=42)
tf.keras.utils.set_random_seed(42)

# Build the BiLSTM model
model = Sequential([
    # Skip timesteps whose features are all 0.0, i.e. the padding added by pad_sequences
    Masking(mask_value=0.0, input_shape=(max_len, vector_size)),
    Bidirectional(LSTM(256, return_sequences=True,
                       kernel_regularizer=regularizers.l2(0.0005))),
    Dropout(0.4),
    Bidirectional(LSTM(128, kernel_regularizer=regularizers.l2(0.0005))),
    Dropout(0.4),
    Dense(64, activation='relu', kernel_regularizer=regularizers.l2(0.0005)),
    Dropout(0.4),
    Dense(32, activation='relu', kernel_regularizer=regularizers.l2(0.0005)),
    Dropout(0.4),
    # Single sigmoid unit: probability of the positive class
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(loss='binary_crossentropy',
              optimizer=Adam(learning_rate=0.001),
              metrics=['accuracy'])

# Stop once validation loss has not improved for 3 epochs and roll the model
# back to its best epoch, so the weights used for the exported test
# predictions are not the (possibly overfit) final-epoch weights
early_stopping = EarlyStopping(monitor='val_loss',
                               patience=3,
                               restore_best_weights=True)

# Pass labels as plain NumPy arrays, matching the NumPy feature tensors
y_train = train_df['sentiment'].to_numpy()
y_val = val_df['sentiment'].to_numpy()

# Train the model (at most 12 epochs)
history = model.fit(X_train_seq_padded,
                    y_train,
                    epochs=12,
                    batch_size=64,
                    validation_data=(X_val_seq_padded, y_val),
                    callbacks=[early_stopping])


# <div style="text-align:center; background: #03045e; padding: 7px; border-radius:10px 10px; font-size: 1.5em; color: #e3f2fd; cursor: pointer;font-family: cursive;"><b> 8. Export Predictions for Evaluation </b></div>


In [None]:
# Logistic Regression: hard labels and class probabilities on the test set
logistic_preds = log_reg.predict(X_test_tfidf)
logistic_probs = log_reg.predict_proba(X_test_tfidf)

# One row per test review: text, true label, predicted label, and the
# probability of each class (column 0 = negative, column 1 = positive)
logistic_results_df = (
    test_df[['review', 'sentiment']]
    .rename(columns={'sentiment': 'true_label'})
    .reset_index(drop=True)
    .assign(
        predicted_label=logistic_preds,
        prob_negative=logistic_probs[:, 0],
        prob_positive=logistic_probs[:, 1],
    )
)

logistic_results_df.to_csv("logistic_preds.csv", index=False)
print("Saved logistic_preds.csv")


# BiLSTM: the sigmoid output is the probability of the positive class
bilstm_probs = model.predict(X_test_seq_padded)
positive_prob = bilstm_probs.flatten()

# Threshold at 0.5 to obtain hard 0/1 labels
bilstm_preds = (positive_prob > 0.5).astype(int)

# One row per test review, same column layout as the logistic regression export
bilstm_columns = {
    'review': test_df['review'].values,
    'true_label': test_df['sentiment'].values,
    'predicted_label': bilstm_preds,
    'prob_negative': 1 - positive_prob,
    'prob_positive': positive_prob,
}
bilstm_results_df = pd.DataFrame(bilstm_columns)

bilstm_results_df.to_csv("bilstm_preds.csv", index=False)
print("Saved bilstm_preds.csv")


# <div style="text-align:center; background: #03045e; padding: 7px; border-radius:10px 10px; font-size: 1.5em; color: #e3f2fd; cursor: pointer;font-family: cursive;"><b> 9. Conclusion </b></div>

