In [15]:

# Core data manipulation libraries
import pandas as pd  # For data manipulation and analysis
import numpy as np  # For numerical operations and array functionality
import re  # For regular expressions (text pattern matching)

# NLTK (Natural Language Toolkit) imports for text processing
import nltk
from nltk.tokenize import word_tokenize  # Split text into individual words
from nltk.corpus import stopwords  # Common words to filter out (e.g., 'the', 'a', 'is')
from nltk.stem import PorterStemmer  # Reduce words to their root form (stem)
from nltk.stem import WordNetLemmatizer  # Reduce words to their dictionary form (lemma)

# Scikit-learn Pipeline for chaining preprocessing and model steps
from sklearn.pipeline import Pipeline

# Scikit-learn feature extraction tools for text vectorization
from sklearn.feature_extraction.text import CountVectorizer  # Convert text to word count vectors
from sklearn.feature_extraction.text import TfidfTransformer  # Transform counts to TF-IDF representation
from sklearn.feature_extraction.text import TfidfVectorizer  # Combined CountVectorizer + TfidfTransformer

# Scikit-learn classification models
from sklearn.neighbors import KNeighborsClassifier  # K-Nearest Neighbors classifier
from sklearn.naive_bayes import MultinomialNB  # Naive Bayes classifier for text
from sklearn.svm import SVC  # Support Vector Machine classifier

# Scikit-learn model selection and validation tools
from sklearn.model_selection import StratifiedKFold  # Stratified K-fold cross-validation (maintains class distribution)
from sklearn.model_selection import KFold  # Standard K-fold cross-validation
from sklearn.model_selection import train_test_split  # Split data into train/test sets

# Scikit-learn evaluation metrics
from sklearn.metrics import accuracy_score  # Calculate accuracy percentage
from sklearn.metrics import precision_score  # Calculate precision (true positives / predicted positives)
from sklearn.metrics import recall_score  # Calculate recall (true positives / actual positives)
from sklearn.metrics import f1_score  # Calculate F1 score (harmonic mean of precision and recall)
from sklearn.metrics import classification_report  # Generate comprehensive classification metrics

# Scikit-learn base classes for creating custom transformers
from sklearn.base import BaseEstimator, TransformerMixin  # Base classes for custom pipeline components

# Gensim for word embeddings
from gensim.models import Word2Vec  # Train and use Word2Vec word embedding models

# TensorFlow/Keras for deep learning models
from tensorflow.keras.models import Sequential  # Sequential neural network model
from tensorflow.keras.layers import LSTM, Dense, Embedding  # Neural network layer types
from tensorflow.keras.preprocessing.text import Tokenizer  # Convert text to sequences of integers
from tensorflow.keras.preprocessing.sequence import pad_sequences  # Pad sequences to uniform length

# Download required NLTK data files
nltk.download('punkt')  # Tokenizer models for sentence and word splitting
nltk.download('stopwords')  # Lists of common stopwords in multiple languages
nltk.download('wordnet')
# Ignore warnings to make life easier
import warnings
warnings.filterwarnings('ignore')

from sklearn.base import BaseEstimator, TransformerMixin
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lapos\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lapos\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\lapos\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [16]:
df_train = pd.read_parquet("hf://datasets/stanfordnlp/imdb/plain_text/train-00000-of-00001.parquet")
df_test = pd.read_parquet("hf://datasets/stanfordnlp/imdb/plain_text/test-00000-of-00001.parquet")
df_unsupervised = pd.read_parquet("hf://datasets/stanfordnlp/imdb/plain_text/unsupervised-00000-of-00001.parquet")

In [17]:
df_train.shape

(25000, 2)

In [18]:
df_test.shape

(25000, 2)

In [19]:
df_train.columns


Index(['text', 'label'], dtype='object')

In [20]:
df_test.columns


Index(['text', 'label'], dtype='object')

In [21]:
df_train['label'].unique()

array([0, 1])

In [22]:
df_test['label'].unique()

array([0, 1])

In [23]:
df_train['text'].sample(10)

14149    Fr√§ulein Doktor is as good a demonstration as ...
20683    Magnificent and unforgettable, stunningly atmo...
9186     ... or maybe it just IS this bad. The plot is ...
15120    Finally, Timon and Pumbaa in their own film......
13334    I will admit, I thought this movie wasn't goin...
8944     This U.S soap opera, 'Knots Landing' has all t...
12386    I am sorry to fans of this film but it is the ...
3746     Err...this movie sucked. A LOT.<br /><br />I h...
20366    The Write Word<br /><br />What you see is what...
18116    I sat through both parts of Che last night, ba...
Name: text, dtype: object

In [24]:
df_test['text'].sample(10)

7319     What a bloody nuisance! You can't get on subje...
18728    For some perverse reason best known to themsel...
8340     I sat last night to see this film being played...
8839     This TV show is possibly the most pathetic dis...
10238    I don't even understand what they tried to acc...
13416    So keira knightly is in it...So automatically ...
23266    I started watching this expecting the worst, i...
10118    I think my summary says it all. This MTV-ish a...
3100     I think Homegrown is a bit of a misnomer for t...
4944     weak direction, weak plot, unimpressive music,...
Name: text, dtype: object

In [25]:
class pre_process_text(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        prep_sentences = []
        for text in X:
            # Remove HTML tags (good practice)
            text = re.sub(r'<.*?>', '', text)

            # Tokenize
            tokens = word_tokenize(text)

            words_to_keep = {"not", "no", "against", "down","no", "nor", "not", "don","very", "too", "more", "most", "so", "only"}


            for word in words_to_keep:
                self.stop_words.discard(word)

            # Process tokens
            processed = [
                self.lemmatizer.lemmatize(token.lower())
                for token in tokens
                if token.isalpha() and token.lower() not in self.stop_words
            ]

            # Join back to a string
            prep_sentences.append(" ".join(processed))

        return prep_sentences

class Word2VecAverager(BaseEstimator, TransformerMixin):
    def __init__(self, w2v_model):
        self.w2v_model = w2v_model
        self.vector_size = w2v_model.wv.vector_size

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        avg_vectors = []

        for doc in X:
            doc_vectors = []
            # We .split() the processed string
            for word in doc.split():
                if word in self.w2v_model.wv:
                    doc_vectors.append(self.w2v_model.wv[word])

            if not doc_vectors:
                avg_vectors.append(np.zeros(self.vector_size))
            else:
                avg_vectors.append(np.mean(doc_vectors, axis=0))

        return np.array(avg_vectors)


In [26]:
df_train = pd.DataFrame(df_train)
df_test = pd.DataFrame(df_test)

X_train = df_train['text']
y_train = df_train['label'].values
X_test = df_test['text']
y_test = df_test['label'].values

In [None]:
from keras.src.callbacks import EarlyStopping
from keras.src.layers import Bidirectional, Dropout

# --- Pre-process all text data ---
print("--- Preprocessing Text ---")
pre_processor = pre_process_text()

# Fit on training data and transform both train and test
X_train = pre_processor.fit_transform(X_train)
X_test = pre_processor.transform(X_test)
print(f"Example processed review: {X_train[0][:150]}...")

tokenized_processed_train = [review.split() for review in X_train]


embedding_dim = 100


min_word_count = 3


window_size = 5


num_workers = 4

word2vec_model = Word2Vec(
    sentences=tokenized_processed_train,
    vector_size=embedding_dim,
    window=window_size,
    min_count=min_word_count,
    workers=num_workers,
    sg=1,       # Use Skip-gram
    negative=5  # Use Negative Sampling
)

keras_tokenizer = Tokenizer()
keras_tokenizer.fit_on_texts(X_train) # Fit on the processed text
X_train_sequences = keras_tokenizer.texts_to_sequences(X_train)


all_lengths = [len(seq) for seq in X_train_sequences]
# max_length = int(np.percentile(all_lengths, 95))
max_length = max(len(sentence.split())
                for text in df_train['text']
                for sentence in text.split('.'))

print(max_length)

X_train_padded = pad_sequences(
    X_train_sequences,
    maxlen=max_length,
    padding='post'
)


vocab_size = len(keras_tokenizer.word_index) + 1

embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in keras_tokenizer.word_index.items():
    if word in word2vec_model.wv:
        embedding_matrix[i] = word2vec_model.wv[word]


def build_lstm_model():
    model = Sequential()
    model.add(Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        weights=[embedding_matrix],
        input_length=max_length,  # Use calculated max_length
        trainable=False  # Freeze the embeddings
    ))
    # Use Bidirectional LSTM
    model.add(Bidirectional(LSTM(
        units=100,
        recurrent_dropout=0.2  # Add recurrent dropout
    )))
    model.add(Dropout(0.3))
    model.add(Dense(units=1, activation='sigmoid'))

    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy']
    )
    return model

early_stopper = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True  # Critically important!
)
n_splits = 5
kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
acc_scores, f1_scores, precision_scores, recall_scores = [], [], [], []

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_padded, y_train)):
    print(f"\n--- Fold {fold + 1}/{n_splits} ---")
    X_fold_train = X_train_padded[train_idx]
    y_fold_train = y_train[train_idx]
    X_fold_val = X_train_padded[val_idx]
    y_fold_val = y_train[val_idx]

    model = build_lstm_model()
    print(f"Training on {len(X_fold_train)} samples, validating on {len(X_fold_val)} samples.")
    model.fit(
        X_fold_train, y_fold_train,
        epochs=20,
        callbacks=[early_stopper],
        validation_data=(X_fold_val, y_fold_val),
        verbose=0
    )

    y_pred_probs = model.predict(X_fold_val)
    y_pred_classes = (y_pred_probs > 0.5).astype(int).flatten()

    acc = accuracy_score(y_fold_val, y_pred_classes)
    precision = precision_score(y_fold_val, y_pred_classes, zero_division=0)
    recall = recall_score(y_fold_val, y_pred_classes, zero_division=0)
    f1 = f1_score(y_fold_val, y_pred_classes, zero_division=0)

    acc_scores.append(acc)
    f1_scores.append(f1)
    precision_scores.append(precision)
    recall_scores.append(recall)

    print(f"Fold {fold + 1} Accuracy: {acc:.4f}")
    print(classification_report(y_fold_val, y_pred_classes, zero_division=0))




final_model = build_lstm_model()
final_model.fit(
    X_train_padded,  # All processed, padded training data
    y_train,         # All training labels
    epochs=20,
    verbose=0
)


print("Evaluating on Test Set...")
X_test_sequences = keras_tokenizer.texts_to_sequences(X_test) # Use processed X_test
X_test_padded = pad_sequences(
    X_test_sequences,
    maxlen=max_length, # Use same max_length
    padding='post'
)

# Make predictions on the test set
test_pred_probs = final_model.predict(X_test_padded)
test_pred_classes = (test_pred_probs > 0.5).astype(int).flatten()

print("\n--- Test Set Performance ---")
test_acc = accuracy_score(y_test, test_pred_classes)
print(f"Test Accuracy: {test_acc:.4f}")
print(classification_report(y_test, test_pred_classes, zero_division=0))

--- Preprocessing Text ---
Example processed review: rented video store controversy surrounded first released also heard first seized custom ever tried enter country therefore fan film considered controv...
645

--- Fold 1/5 ---
Training on 20000 samples, validating on 5000 samples.
