In [3]:
import fireducks.pandas as pd
import os

# Path to the extracted chunk files (from your Kaggle dataset structure)
extracted_chunks_path = "/kaggle/input/processed-chunks-1"  # Adjust if the path differs

# Combine all chunk files
all_chunks = []
for file_name in sorted(os.listdir(extracted_chunks_path)):  # Ensure files are combined in order
    if file_name.startswith("processed_chunk_") and file_name.endswith(".csv"):
        file_path = os.path.join(extracted_chunks_path, file_name)
        print(f"Loading {file_name}...")
        chunk = pd.read_csv(file_path)
        all_chunks.append(chunk)

# Concatenate all chunks into a single DataFrame
combined_data = pd.concat(all_chunks, ignore_index=True)

# Display combined data info
print("Combined data shape:", combined_data.shape)

Loading processed_chunk_0_50000.csv...
Loading processed_chunk_1000000_1050000.csv...
Loading processed_chunk_100000_150000.csv...
Loading processed_chunk_1050000_1100000.csv...
Loading processed_chunk_1100000_1150000.csv...
Loading processed_chunk_1150000_1200000.csv...
Loading processed_chunk_1200000_1250000.csv...
Loading processed_chunk_1250000_1300000.csv...
Loading processed_chunk_1300000_1350000.csv...
Loading processed_chunk_1350000_1400000.csv...
Loading processed_chunk_1400000_1450000.csv...
Loading processed_chunk_1450000_1500000.csv...
Loading processed_chunk_1500000_1550000.csv...
Loading processed_chunk_150000_200000.csv...
Loading processed_chunk_1550000_1600000.csv...
Loading processed_chunk_1600000_1650000.csv...
Loading processed_chunk_1650000_1700000.csv...
Loading processed_chunk_1700000_1750000.csv...
Loading processed_chunk_1750000_1800000.csv...
Loading processed_chunk_1800000_1850000.csv...
Loading processed_chunk_1850000_1900000.csv...
Loading processed_chunk_1

In [4]:
# Save the combined dataset as a CSV for future use
combined_data_path = "/kaggle/working/combined_processed_data.csv"
combined_data.to_csv(combined_data_path, index=False)
print(f"Combined data saved at: {combined_data_path}")
# Load the saved combined dataset
combined_data = pd.read_csv("/kaggle/working/combined_processed_data.csv")

# Check the dataset structure
print(combined_data.info())
print(combined_data.head())
# Check label distribution
print(combined_data['label'].value_counts())

Combined data saved at: /kaggle/working/combined_processed_data.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3600000 entries, 0 to 3599999
Data columns (total 3 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   review          object
 1   label           int64 
 2   cleaned_review  object
dtypes: int64(1), object(2)
memory usage: 82.4+ MB
None
                                              review  label  \
0  Stuning even for the non-gamer: This sound tra...      2   
1  The best soundtrack ever to anything.: I'm rea...      2   
2  Amazing!: This soundtrack is my favorite music...      2   
3  Excellent Soundtrack: I truly like this soundt...      2   
4  Remember, Pull Your Jaw Off The Floor After He...      2   

                                      cleaned_review  
0  stun non gamer sound track beautiful paint sen...  
1  good soundtrack read lot review say good game ...  
2  amazing soundtrack favorite music time hand in...  
3  excellent soundtrack t

In [5]:
from sklearn.model_selection import train_test_split

# Split the data into train and test sets
X = combined_data['cleaned_review']  # Features (cleaned reviews)
y = combined_data['label']           # Labels (1 for neutral, 2 for positive)

# Perform train-test split (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training samples: {X_train.shape[0]}")
print(f"Testing samples: {X_test.shape[0]}")

Training samples: 2880000
Testing samples: 720000


In [6]:
import os
import random
import psutil
import numpy as np
import pandas as pd
import multiprocessing
from tqdm import tqdm
from copy import deepcopy
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from sklearn.model_selection import KFold, RandomizedSearchCV
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.preprocessing import MinMaxScaler
from scipy.stats import ttest_rel, uniform, randint
import xgboost as xgb
import lightgbm as lgb

In [7]:
# Function to check memory usage
def check_memory():
    print(f"Memory usage: {psutil.virtual_memory().percent}%")

# --- Load Preprocessed Data ---
processed_dir = "/kaggle/input/processed-chunks-1/"
print("Loading preprocessed data from processed-chunks-1...")

csv_files = sorted([f for f in os.listdir(processed_dir) if f.endswith('.csv')])
print(f"Found {len(csv_files)} CSV files: {csv_files[:5]}...")

dataframes = []
for csv_file in tqdm(csv_files, desc="Loading CSV files"):
    file_path = os.path.join(processed_dir, csv_file)
    df = pd.read_csv(file_path)
    dataframes.append(df)

data = pd.concat(dataframes, ignore_index=True)
print(f"Total rows loaded: {len(data)}")

# Extract text and labels
text_column = 'cleaned_review'
label_column = 'label'
X = data[text_column].fillna("")
y = data[label_column].values

print("Label distribution:", np.bincount(y - 1))
print("Number of empty cleaned reviews:", (X == "").sum())

# Shuffle the data
print("Shuffling data...")
data_shuffled = data.sample(frac=1, random_state=42).reset_index(drop=True)
X_shuffled = data_shuffled[text_column].fillna("")
y_shuffled = data_shuffled[label_column].values

# Split into train (2.88M) and test (720K)
X_train_processed = X_shuffled[:2880000].tolist()
X_test_processed = X_shuffled[2880000:2880000 + 720000].tolist()
y_train = y_shuffled[:2880000]
y_test = y_shuffled[2880000:2880000 + 720000]

y_train_adjusted = y_train - 1
y_test_adjusted = y_test - 1

# Tokenize the text for Word2Vec
print("Tokenizing text data...")
X_train_tokenized = [simple_preprocess(text) for text in tqdm(X_train_processed, desc="Tokenizing X_train")]
X_test_tokenized = [simple_preprocess(text) for text in tqdm(X_test_processed, desc="Tokenizing X_test")]

# Train Word2Vec model
print("Training Word2Vec model...")
N_CORES = max(1, multiprocessing.cpu_count() // 2)
word2vec_model = Word2Vec(
    sentences=X_train_tokenized, 
    vector_size=50, 
    window=5, 
    min_count=5, 
    workers=N_CORES, 
    epochs=5, 
    sg=1
)
print("Word2Vec training complete!")

# Function to generate sentence embeddings
def get_word2vec_sentence_vector(tokens, model):
    vectors = [model.wv[word] for word in tokens if word in model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(model.vector_size)

# Generate embeddings
print("Converting tokenized text to Word2Vec embeddings...")
X_train_vectors = np.array([get_word2vec_sentence_vector(tokens, word2vec_model) for tokens in tqdm(X_train_tokenized, desc="Embedding X_train")])
X_test_vectors = np.array([get_word2vec_sentence_vector(tokens, word2vec_model) for tokens in tqdm(X_test_tokenized, desc="Embedding X_test")])
print("Word2Vec transformation complete!")
print(f"Train Word2Vec shape: {X_train_vectors.shape}")
print(f"Test Word2Vec shape: {X_test_vectors.shape}")

# Scale embeddings for Multinomial NB (requires non-negative values)
scaler = MinMaxScaler()
X_train_vectors_scaled = scaler.fit_transform(X_train_vectors)
X_test_vectors_scaled = scaler.transform(X_test_vectors)

# --- Model Evaluation with Hyperparameter Tuning ---
N_CORES = multiprocessing.cpu_count() - 1

# Define base models and hyperparameter grids
model_templates = {
    'SVM': LinearSVC(random_state=42),
    'Logistic Regression': LogisticRegression(random_state=42, n_jobs=N_CORES),
    'Multinomial NB': MultinomialNB(),
    'MLP Classifier': MLPClassifier(random_state=42, early_stopping=True, validation_fraction=0.1, n_iter_no_change=10),
    'XGBoost': xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', tree_method='hist', random_state=42, n_jobs=N_CORES),
    'LightGBM': lgb.LGBMClassifier(random_state=42, n_jobs=N_CORES, verbose=-1)
}

param_distributions = {
    'SVM': {'C': uniform(0.1, 10), 'max_iter': [5000]},
    'Logistic Regression': {'C': uniform(0.1, 10), 'max_iter': [5000]},
    'Multinomial NB': {'alpha': uniform(0.1, 2)},
    'MLP Classifier': {
        'hidden_layer_sizes': [(50,), (100,), (50, 50)],
        'learning_rate_init': uniform(0.0001, 0.01),
        'max_iter': [500]
    },
    'XGBoost': {
        'n_estimators': randint(100, 300),
        'max_depth': randint(3, 10),
        'learning_rate': uniform(0.01, 0.2)
    },
    'LightGBM': {
        'n_estimators': randint(100, 300),
        'max_depth': randint(3, 10),
        'learning_rate': uniform(0.01, 0.2)
    }
}

# Hyperparameter tuning on a small subset
best_models = {}
print("Performing hyperparameter tuning...")
subset_idx = random.sample(range(len(X_train_vectors)), 10000)  # Small subset for tuning
X_tune_subset = X_train_vectors[subset_idx]  # Use unscaled for most models
X_tune_subset_scaled = X_train_vectors_scaled[subset_idx]  # Scaled for Multinomial NB
y_tune_subset = y_train_adjusted[subset_idx]

for name, model in model_templates.items():
    print(f"Tuning {name}...")
    X_tune = X_tune_subset_scaled if name == 'Multinomial NB' else X_tune_subset
    search = RandomizedSearchCV(
        model, 
        param_distributions[name], 
        n_iter=10, 
        scoring='accuracy', 
        cv=3, 
        random_state=42, 
        n_jobs=N_CORES
    )
    search.fit(X_tune, y_tune_subset)
    best_models[name] = search.best_estimator_
    print(f"Best parameters for {name}: {search.best_params_}")

# Step 1: Train and evaluate on test set with tuned models
accuracies = {}
print("\nTraining and evaluating tuned models on test set...")
for name, model in best_models.items():
    X_train_subset = X_train_vectors_scaled if name == 'Multinomial NB' else X_train_vectors
    X_test_subset = X_test_vectors_scaled if name == 'Multinomial NB' else X_test_vectors
    subset_idx = random.sample(range(len(X_train_subset)), min(50000, len(X_train_subset)))
    model.fit(X_train_subset[subset_idx], y_train_adjusted[subset_idx])
    y_pred = model.predict(X_test_subset)
    acc = accuracy_score(y_test_adjusted, y_pred)
    accuracies[name] = acc
    print(f"{name} Accuracy: {acc:.4f}")
check_memory()

top_models = sorted(accuracies.items(), key=lambda x: x[1], reverse=True)[:2]
print("\nTop 2 models by accuracy:")
for name, acc in top_models:
    print(f"{name}: {acc:.4f}")

# Step 2: 10-Fold Cross-Validation with tuned models
kf = KFold(n_splits=10, shuffle=True, random_state=42)
metrics = ['accuracy', 'f1', 'roc_auc']
results = {name: {metric: [] for metric in metrics} for name in best_models.keys()}

print("\nPerforming 10-fold cross-validation with tuned models...")
for name, model in best_models.items():
    print(f"Cross-validating {name}...")
    X_cv = X_train_vectors_scaled if name == 'Multinomial NB' else X_train_vectors
    for train_idx, val_idx in tqdm(kf.split(X_cv), total=10, desc=f"Folds for {name}"):
        X_train_fold = X_cv[train_idx]
        y_train_fold = y_train_adjusted[train_idx]
        X_val_fold = X_cv[val_idx]
        y_val_fold = y_train_adjusted[val_idx]
        
        subset_idx = random.sample(list(train_idx), min(50000, len(train_idx)))
        X_train_subset = X_cv[subset_idx]
        y_train_subset = y_train_adjusted[subset_idx]
        
        model.fit(X_train_subset, y_train_subset)
        y_pred_fold = model.predict(X_val_fold)
        
        acc = accuracy_score(y_val_fold, y_pred_fold)
        f1 = f1_score(y_val_fold, y_pred_fold, average='weighted')
        if hasattr(model, "predict_proba"):
            y_prob_fold = model.predict_proba(X_val_fold)[:, 1]
            auc = roc_auc_score(y_val_fold, y_prob_fold)
        else:
            auc = np.nan
        
        results[name]['accuracy'].append(acc)
        results[name]['f1'].append(f1)
        results[name]['roc_auc'].append(auc)

# Display CV results
print("\n### 1. Perform K-Fold Cross-Validation and Report Detailed Performance Metrics")
print("Table: Performance of Tuned Models with Word2Vec Features (10-Fold Cross-Validation)")
print("| Model              | Accuracy (Mean ± SD) | F1-Score (Mean ± SD) | AUC-ROC (Mean ± SD) |")
print("|--------------------|---------------------|---------------------|--------------------|")
for name in best_models.keys():
    acc_mean = np.mean(results[name]['accuracy'])
    acc_std = np.std(results[name]['accuracy'])
    f1_mean = np.mean(results[name]['f1'])
    f1_std = np.std(results[name]['f1'])
    auc_mean = np.mean(results[name]['roc_auc'])
    auc_std = np.std(results[name]['roc_auc'])
    print(f"| {name:<18} | {acc_mean:.4f} ± {acc_std:.4f}   | {f1_mean:.4f} ± {f1_std:.4f}   | {auc_mean:.4f} ± {auc_std:.4f}   |")

# Step 3: Statistical Significance Tests
top_model_1_name, top_model_2_name = top_models[0][0], top_models[1][0]
top_model_1_scores = results[top_model_1_name]['accuracy']
top_model_2_scores = results[top_model_2_name]['accuracy']

print("\n### 2. Conduct Statistical Significance Tests and Present p-Values")
print(f"Table: Statistical Significance of {top_model_1_name} and {top_model_2_name} vs. Other Models (Word2Vec)")
print("| Comparison                  | p-value (Accuracy) | Significant? |")
print("|-----------------------------|-------------------|--------------|")
for name in best_models.keys():
    if name != top_model_1_name:
        other_scores = results[name]['accuracy']
        t_stat, p_value = ttest_rel(top_model_1_scores, other_scores)
        significant = "Yes" if p_value < 0.05 else "No"
        print(f"| {top_model_1_name} vs. {name:<15} | {p_value:.10f}         | {significant:<12} |")
for name in best_models.keys():
    if name != top_model_2_name and name != top_model_1_name:
        other_scores = results[name]['accuracy']
        t_stat, p_value = ttest_rel(top_model_2_scores, other_scores)  # Fixed typo here
        significant = "Yes" if p_value < 0.05 else "No"
        print(f"| {top_model_2_name} vs. {name:<15} | {p_value:.10f}         | {significant:<12} |")
check_memory()

Loading preprocessed data from processed-chunks-1...
Found 72 CSV files: ['processed_chunk_0_50000.csv', 'processed_chunk_1000000_1050000.csv', 'processed_chunk_100000_150000.csv', 'processed_chunk_1050000_1100000.csv', 'processed_chunk_1100000_1150000.csv']...


Loading CSV files: 100%|██████████| 72/72 [00:42<00:00,  1.68it/s]


Total rows loaded: 3600000
Label distribution: [1800000 1800000]
Number of empty cleaned reviews: 13
Shuffling data...
Tokenizing text data...


Tokenizing X_train: 100%|██████████| 2880000/2880000 [02:45<00:00, 17377.13it/s]
Tokenizing X_test: 100%|██████████| 720000/720000 [00:37<00:00, 19258.02it/s]


Training Word2Vec model...
Word2Vec training complete!
Converting tokenized text to Word2Vec embeddings...


Embedding X_train: 100%|██████████| 2880000/2880000 [05:03<00:00, 9503.66it/s] 
Embedding X_test: 100%|██████████| 720000/720000 [01:16<00:00, 9378.48it/s]


Word2Vec transformation complete!
Train Word2Vec shape: (2880000, 50)
Test Word2Vec shape: (720000, 50)
Performing hyperparameter tuning...
Tuning SVM...
Best parameters for SVM: {'C': 3.845401188473625, 'max_iter': 5000}
Tuning Logistic Regression...
Best parameters for Logistic Regression: {'C': 3.845401188473625, 'max_iter': 5000}
Tuning Multinomial NB...
Best parameters for Multinomial NB: {'alpha': 0.412037280884873}
Tuning MLP Classifier...
Best parameters for MLP Classifier: {'hidden_layer_sizes': (50,), 'learning_rate_init': 0.0062748150962771655, 'max_iter': 500}
Tuning XGBoost...
Best parameters for XGBoost: {'learning_rate': 0.11495493205167782, 'max_depth': 9, 'n_estimators': 269}
Tuning LightGBM...
Best parameters for LightGBM: {'learning_rate': 0.11495493205167782, 'max_depth': 9, 'n_estimators': 269}

Training and evaluating tuned models on test set...
SVM Accuracy: 0.8495
Logistic Regression Accuracy: 0.8494
Multinomial NB Accuracy: 0.6817
MLP Classifier Accuracy: 0.856

Folds for SVM: 100%|██████████| 10/10 [04:51<00:00, 29.19s/it]


Cross-validating Logistic Regression...


Folds for Logistic Regression: 100%|██████████| 10/10 [00:31<00:00,  3.13s/it]


Cross-validating Multinomial NB...


Folds for Multinomial NB: 100%|██████████| 10/10 [00:11<00:00,  1.18s/it]


Cross-validating MLP Classifier...


Folds for MLP Classifier: 100%|██████████| 10/10 [01:04<00:00,  6.49s/it]


Cross-validating XGBoost...


Folds for XGBoost: 100%|██████████| 10/10 [03:38<00:00, 21.84s/it]


Cross-validating LightGBM...


Folds for LightGBM: 100%|██████████| 10/10 [01:45<00:00, 10.51s/it]


### 1. Perform K-Fold Cross-Validation and Report Detailed Performance Metrics
Table: Performance of Tuned Models with Word2Vec Features (10-Fold Cross-Validation)
| Model              | Accuracy (Mean ± SD) | F1-Score (Mean ± SD) | AUC-ROC (Mean ± SD) |
|--------------------|---------------------|---------------------|--------------------|
| SVM                | 0.8493 ± 0.0008   | 0.8493 ± 0.0008   | nan ± nan   |
| Logistic Regression | 0.8492 ± 0.0007   | 0.8492 ± 0.0007   | 0.9250 ± 0.0005   |
| Multinomial NB     | 0.6949 ± 0.0174   | 0.6894 ± 0.0199   | 0.7862 ± 0.0017   |
| MLP Classifier     | 0.8561 ± 0.0025   | 0.8561 ± 0.0026   | 0.9321 ± 0.0020   |
| XGBoost            | 0.8487 ± 0.0007   | 0.8487 ± 0.0007   | 0.9269 ± 0.0005   |
| LightGBM           | 0.8493 ± 0.0008   | 0.8493 ± 0.0008   | 0.9271 ± 0.0005   |

### 2. Conduct Statistical Significance Tests and Present p-Values
Table: Statistical Significance of MLP Classifier and SVM vs. Other Models (Word2Vec)
| Compari


