In [1]:
# Prints a fixed greeting (presumably a quick check that the kernel runs).
print("hello")

hello


In [None]:
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from sklearn.metrics import accuracy_score, classification_report, f1_score, precision_score, recall_score
from sklearn.model_selection import StratifiedKFold, KFold
from tensorflow.keras import backend as keras_backend
from tensorflow.keras.layers import LSTM, Dense, Embedding
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.text import text_to_word_sequence

# --- 0. Load the IMDB Sentiment Dataset ---
# The train/test/unsupervised splits are read as parquet files through "hf://"
# URLs (this needs network access; the scheme presumably relies on the
# `huggingface_hub` package being installed -- TODO confirm).
# The rest of the script works with X_train, y_train, X_test and y_test.

train_data = pd.read_parquet("hf://datasets/stanfordnlp/imdb/plain_text/train-00000-of-00001.parquet")
test_data = pd.read_parquet("hf://datasets/stanfordnlp/imdb/plain_text/test-00000-of-00001.parquet")
# NOTE(review): this split is loaded but not used anywhere in the visible code;
# it is kept in case other cells rely on it.
df_unsupervised = pd.read_parquet("hf://datasets/stanfordnlp/imdb/plain_text/unsupervised-00000-of-00001.parquet")


# read_parquet already returns DataFrames, so they are used directly instead of
# being wrapped in pd.DataFrame(...) again.
df_train = train_data
df_test = test_data

X_train = df_train['text']
y_train = df_train['label'].to_numpy()  # labels as a NumPy array for positional fold indexing
X_test = df_test['text']
y_test = df_test['label'].to_numpy()

# --- 1. PRE-LOOP: Train Word2Vec (The "Dictionary") ---
# Word2Vec is trained once on ALL training text to get the widest vocabulary.
#
# The reviews are tokenized with Keras' `text_to_word_sequence`, i.e. the same
# lowercasing and punctuation filtering that a default `Tokenizer()` applies.
# A plain `str.split()` keeps case and attached punctuation ("Movie." is not
# "movie"), so the Word2Vec keys would not line up with the tokenizer's
# `word_index`: lookups for the embedding matrix would miss, or would hit
# vectors learned from only a fraction of a word's occurrences.
print("Training Word2Vec model...")
tokenized_train_text = [text_to_word_sequence(review) for review in X_train]

embedding_dim = 50   # size of each word vector
min_word_count = 1   # keep every word, even if it occurs only once
window_size = 2      # context window passed to Word2Vec

word2vec_model = Word2Vec(
    sentences=tokenized_train_text,
    vector_size=embedding_dim,
    window=window_size,
    min_count=min_word_count,
)
print("Word2Vec model trained.")

# --- 2. PRE-LOOP: Prepare Keras Tokenizer & Padding ---
# The tokenizer is fitted a single time on the complete training text and all
# training reviews are encoded up front, so every fold shares one
# word-to-integer mapping.

max_length = 125  # Max length of a review (in words)

keras_tokenizer = Tokenizer()
keras_tokenizer.fit_on_texts(X_train)

# One extra slot because index 0 is used for padding.
vocab_size = 1 + len(keras_tokenizer.word_index)

# Encode every training review as integers, then pad at the end ('post').
X_train_sequences = keras_tokenizer.texts_to_sequences(X_train)
X_train_padded = pad_sequences(X_train_sequences, padding='post', maxlen=max_length)

print(f"\nVocabulary Size: {vocab_size}")
print(f"Padded Training Sequences Shape: {X_train_padded.shape}")

# --- 3. PRE-LOOP: Create the Embedding Matrix ---
# Row i holds the Word2Vec vector of the word that the Keras tokenizer maps to
# integer i; words Word2Vec does not know keep an all-zero row.
# Built only ONCE and reused by every model.
embedding_matrix = np.zeros(shape=(vocab_size, embedding_dim))
for word, i in keras_tokenizer.word_index.items():
    if word not in word2vec_model.wv:
        continue  # no vector available -> leave the zero row
    embedding_matrix[i] = word2vec_model.wv[word]
print(f"Embedding Matrix shape: {embedding_matrix.shape}")


# --- 4. Model Building Function ---
# Every K-Fold iteration needs its own freshly initialised model, so model
# construction lives in a function.
def build_lstm_model():
    """Build and compile a fresh LSTM binary classifier.

    The model stacks a frozen Embedding layer initialised from the
    module-level ``embedding_matrix`` (``vocab_size`` x ``embedding_dim``),
    a 100-unit LSTM and a single sigmoid output unit. It is compiled with
    the Adam optimizer, binary cross-entropy loss and accuracy as metric.

    Returns:
        A compiled ``Sequential`` model.
    """
    model = Sequential()
    model.add(Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        weights=[embedding_matrix],
        # NOTE(review): `input_length` is reported as deprecated by Keras 3;
        # kept so behaviour on older Keras versions is unchanged.
        input_length=max_length,
        # Index 0 is the padding value. Masking it lets the LSTM skip the
        # zeros appended by post-padding, so its final state comes from the
        # last real token rather than from a run of padding steps.
        mask_zero=True,
        trainable=False,  # Freeze the embeddings
    ))
    model.add(LSTM(units=100))
    model.add(Dense(units=1, activation='sigmoid'))

    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return model

# --- 5. The K-Fold Cross-Validation Loop ---
print("\n--- Starting K-Fold Cross-Validation ---")

n_splits = 5
# Stratified, shuffled splits with a fixed seed so the folds are reproducible.
kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Lists to store the validation scores from each fold
acc_scores = []
f1_scores = []
precision_scores = []
recall_scores = []

# Note: We split the PADDED sequences and labels
for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_padded, y_train)):
    print(f"\n--- Fold {fold + 1}/{n_splits} ---")

    # 0. Release the Keras global state left behind by the previous fold's
    # model; otherwise memory use grows with every model built in this loop.
    keras_backend.clear_session()

    # 1. Get the data for this fold
    # We use the indices to slice our pre-padded data
    X_fold_train = X_train_padded[train_idx]
    y_fold_train = y_train[train_idx]
    X_fold_val = X_train_padded[val_idx]
    y_fold_val = y_train[val_idx]

    # 2. Build a new, fresh model
    model = build_lstm_model()

    # 3. Fit the model on the TRAINING fold
    # We use the validation fold for the 'validation_data' argument
    print(f"Training on {len(X_fold_train)} samples, validating on {len(X_fold_val)} samples.")
    model.fit(
        X_fold_train,
        y_fold_train,
        epochs=20,
        validation_data=(X_fold_val, y_fold_val),
        verbose=0  # Set to 1 to see epoch-by-epoch training
    )

    # 4. Evaluate on the VALIDATION fold
    # Get probabilities
    y_pred_probs = model.predict(X_fold_val)
    # Convert probabilities to classes (0 or 1) using a 0.5 threshold
    y_pred_classes = (y_pred_probs > 0.5).astype(int).flatten()

    # 5. Calculate and store metrics
    acc = accuracy_score(y_fold_val, y_pred_classes)
    precision = precision_score(y_fold_val, y_pred_classes, zero_division=0)
    recall = recall_score(y_fold_val, y_pred_classes, zero_division=0)
    f1 = f1_score(y_fold_val, y_pred_classes, zero_division=0)

    acc_scores.append(acc)
    f1_scores.append(f1)
    precision_scores.append(precision)
    recall_scores.append(recall)

    print(f"Fold {fold + 1} Accuracy: {acc:.4f}")
    print(classification_report(y_fold_val, y_pred_classes, zero_division=0))

# --- 6. Final Results ---
# Aggregate the per-fold metrics collected during cross-validation.
print("\n--- Cross-Validation Summary ---")
accuracy_mean, accuracy_std = np.mean(acc_scores), np.std(acc_scores)
print(f"Mean Accuracy:   {accuracy_mean:.4f} (+/- {accuracy_std:.4f})")
for metric_label, fold_values in (
    ("Mean F1-Score:   ", f1_scores),
    ("Mean Precision:  ", precision_scores),
    ("Mean Recall:     ", recall_scores),
):
    print(f"{metric_label}{np.mean(fold_values):.4f}")


# --- 7. Final Evaluation on TEST Set ---
# Cross-validation is finished: fit ONE last model on the complete training
# set, then score it on the held-out TEST set.

print("\n--- Training Final Model on All Training Data ---")
final_model = build_lstm_model()
final_model.fit(X_train_padded, y_train, epochs=20, verbose=0)

# Encode the test reviews with the SAME tokenizer and max_length as training
print("Evaluating on Test Set...")
X_test_sequences = keras_tokenizer.texts_to_sequences(X_test)
X_test_padded = pad_sequences(X_test_sequences, padding='post', maxlen=max_length)

# Predicted probabilities -> hard 0/1 labels via a 0.5 threshold
test_pred_probs = final_model.predict(X_test_padded)
test_pred_classes = (test_pred_probs > 0.5).astype(int).ravel()

print("\n--- Test Set Performance ---")
test_acc = accuracy_score(y_test, test_pred_classes)
print(f"Test Accuracy: {test_acc:.4f}")
print(classification_report(y_test, test_pred_classes, zero_division=0))

Training Word2Vec model...
