In [2]:
import fireducks.pandas as pd
import os

# Path to the extracted chunk files (from your Kaggle dataset structure)
extracted_chunks_path = "/kaggle/input/processed-chunks-1"  # Adjust if the path differs

# Combine all chunk files
all_chunks = []
for file_name in sorted(os.listdir(extracted_chunks_path)):  # Ensure files are combined in order
    if file_name.startswith("processed_chunk_") and file_name.endswith(".csv"):
        file_path = os.path.join(extracted_chunks_path, file_name)
        print(f"Loading {file_name}...")
        chunk = pd.read_csv(file_path)
        all_chunks.append(chunk)

# Concatenate all chunks into a single DataFrame
combined_data = pd.concat(all_chunks, ignore_index=True)

# Display combined data info
print("Combined data shape:", combined_data.shape)

Loading processed_chunk_0_50000.csv...
Loading processed_chunk_1000000_1050000.csv...
Loading processed_chunk_100000_150000.csv...
Loading processed_chunk_1050000_1100000.csv...
Loading processed_chunk_1100000_1150000.csv...
Loading processed_chunk_1150000_1200000.csv...
Loading processed_chunk_1200000_1250000.csv...
Loading processed_chunk_1250000_1300000.csv...
Loading processed_chunk_1300000_1350000.csv...
Loading processed_chunk_1350000_1400000.csv...
Loading processed_chunk_1400000_1450000.csv...
Loading processed_chunk_1450000_1500000.csv...
Loading processed_chunk_1500000_1550000.csv...
Loading processed_chunk_150000_200000.csv...
Loading processed_chunk_1550000_1600000.csv...
Loading processed_chunk_1600000_1650000.csv...
Loading processed_chunk_1650000_1700000.csv...
Loading processed_chunk_1700000_1750000.csv...
Loading processed_chunk_1750000_1800000.csv...
Loading processed_chunk_1800000_1850000.csv...
Loading processed_chunk_1850000_1900000.csv...
Loading processed_chunk_1

In [3]:
# Save the combined dataset as a CSV for future use
combined_data_path = "/kaggle/working/combined_processed_data.csv"
combined_data.to_csv(combined_data_path, index=False)
print(f"Combined data saved at: {combined_data_path}")
# Load the saved combined dataset
combined_data = pd.read_csv("/kaggle/working/combined_processed_data.csv")

# Check the dataset structure
print(combined_data.info())
print(combined_data.head())
# Check label distribution
print(combined_data['label'].value_counts())

Combined data saved at: /kaggle/working/combined_processed_data.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3600000 entries, 0 to 3599999
Data columns (total 3 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   review          object
 1   label           int64 
 2   cleaned_review  object
dtypes: int64(1), object(2)
memory usage: 82.4+ MB
None
                                              review  label  \
0  Stuning even for the non-gamer: This sound tra...      2   
1  The best soundtrack ever to anything.: I'm rea...      2   
2  Amazing!: This soundtrack is my favorite music...      2   
3  Excellent Soundtrack: I truly like this soundt...      2   
4  Remember, Pull Your Jaw Off The Floor After He...      2   

                                      cleaned_review  
0  stun non gamer sound track beautiful paint sen...  
1  good soundtrack read lot review say good game ...  
2  amazing soundtrack favorite music time hand in...  
3  excellent soundtrack t

In [4]:
from sklearn.model_selection import train_test_split

# Split the data into train and test sets
X = combined_data['cleaned_review']  # Features (cleaned reviews)
y = combined_data['label']           # Labels (1 for neutral, 2 for positive)

# Perform train-test split (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training samples: {X_train.shape[0]}")
print(f"Testing samples: {X_test.shape[0]}")

Training samples: 2880000
Testing samples: 720000


# FASTTEXT

In [5]:
# Install DEAP and GPU-enabled libraries for Kaggle
!pip install deap
!pip install xgboost --upgrade  # GPU support included in recent versions
!pip install lightgbm  # Default Kaggle version may lack GPU; we'll configure it below

import fasttext
import pandas as pd
import numpy as np
from tqdm import tqdm
import multiprocessing
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
import lightgbm as lgb
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import MinMaxScaler
from scipy.stats import ttest_rel
from deap import base, creator, tools, algorithms
import psutil
from copy import deepcopy
import random
import os
import torch

In [None]:


# Function to check memory usage
def check_memory():
    print(f"Memory usage: {psutil.virtual_memory().percent}%")

# Check GPU availability
print("GPU available for PyTorch:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU device:", torch.cuda.get_device_name(0))

# --- Load Preprocessed Data ---
processed_dir = "/kaggle/input/processed-chunks-1/"
print("Loading preprocessed data from processed-chunks-1...")

csv_files = sorted([f for f in os.listdir(processed_dir) if f.endswith('.csv')])
print(f"Found {len(csv_files)} CSV files: {csv_files[:5]}...")

dataframes = []
for csv_file in tqdm(csv_files, desc="Loading CSV files"):
    file_path = os.path.join(processed_dir, csv_file)
    df = pd.read_csv(file_path)
    dataframes.append(df)

data = pd.concat(dataframes, ignore_index=True)
print(f"Total rows loaded: {len(data)}")

# Extract text and labels
text_column = 'cleaned_review'
label_column = 'label'
X = data[text_column].fillna("")
y = data[label_column].values

# Check data integrity
print("Label distribution:", np.bincount(y - 1))
print("Number of empty cleaned reviews:", (X == "").sum())

# Shuffle the dataset
print("Shuffling data...")
np.random.seed(42)
shuffle_indices = np.random.permutation(len(data))
X = X[shuffle_indices]
y = y[shuffle_indices]

# Filter out empty reviews
print("Filtering out empty reviews for FastText training...")
valid_mask = X != ""
X_valid = X[valid_mask]
y_valid = y[valid_mask]
print(f"Rows after filtering empty reviews: {len(X_valid)}")

# Split into train (2.88M) and test (720K)
X_train_processed = X_valid[:2880000].tolist()
X_test_processed = X_valid[2880000:2880000 + 720000].tolist()
y_train = y_valid[:2880000]
y_test = y_valid[2880000:2880000 + 720000]

# Adjust labels from {1, 2} to {0, 1}
y_train_adjusted = y_train - 1
y_test_adjusted = y_test - 1

# Sample subset for FastText training
print("Sampling subset for FastText training...")
subset_size = min(1000000, len(X_train_processed))
subset_idx = random.sample(range(len(X_train_processed)), subset_size)
X_train_subset = [X_train_processed[i] for i in subset_idx]

# Save training data for FastText unsupervised training
train_unsupervised_file = "/kaggle/working/train_unsupervised.txt"
print("Writing unsupervised training data to file...")
with open(train_unsupervised_file, "w", encoding="utf-8") as f:
    for text in X_train_subset:
        if text.strip():
            f.write(f"__label__dummy {text}\n")

# Train FastText model
print("Training FastText model...")
N_CORES = max(1, multiprocessing.cpu_count() // 2)
model = fasttext.train_unsupervised(
    train_unsupervised_file,
    model="skipgram",
    dim=50,
    epoch=5,
    minCount=5,
    thread=N_CORES,
    lr=0.2,
    bucket=500000
)
print("FastText training complete!")

# Generate embeddings
def get_fasttext_vector(text):
    return model.get_sentence_vector(text)

print("Converting text to FastText embeddings...")
X_train_fasttext = [get_fasttext_vector(text) for text in tqdm(X_train_processed, desc="Embedding X_train")]
X_test_fasttext = [get_fasttext_vector(text) for text in tqdm(X_test_processed, desc="Embedding X_test")]
print("FastText transformation complete!")

X_train_vectors = np.array(X_train_fasttext)
X_test_vectors = np.array(X_test_fasttext)
print(f"Train FastText shape: {X_train_vectors.shape}")
print(f"Test FastText shape: {X_test_vectors.shape}")

# Scale embeddings for Multinomial NB (requires non-negative values)
scaler = MinMaxScaler()
X_train_vectors_scaled = scaler.fit_transform(X_train_vectors)
X_test_vectors_scaled = scaler.transform(X_test_vectors)

# --- Hyperparameter Tuning with Evolutionary Algorithm ---
# Define a subset of data for tuning
subset_size = 100000  # Adjust based on resources
X_train_subset, X_test_subset, y_train_subset, y_test_subset = train_test_split(
    X_train_vectors[:subset_size], y_train_adjusted[:subset_size], test_size=0.2, random_state=42
)
X_train_subset_scaled, X_test_subset_scaled = train_test_split(
    X_train_vectors_scaled[:subset_size], y_train_adjusted[:subset_size], test_size=0.2, random_state=42
)

# Define evaluation function for a model
def evaluate_individual(individual, model_class, hyperparam_keys):
    """Evaluate a model's performance given an individual's hyperparameters."""
    params = dict(zip(hyperparam_keys, individual))
    
    # Instantiate and train the model
    if model_class == MultinomialNB:
        model = MultinomialNB(alpha=max(params['alpha'], 0.001))
    elif model_class == LinearSVC:
        model = LinearSVC(C=max(params['C'], 0.001), max_iter=5000)
    elif model_class == LogisticRegression:
        model = LogisticRegression(C=max(params['C'], 0.001), max_iter=1000)
    elif model_class == MLPClassifier:
        hidden_size = (int(max(params['hidden_size'], 10)),)
        model = MLPClassifier(hidden_layer_sizes=hidden_size, alpha=max(params['alpha'], 0.0001), max_iter=1000)
    elif model_class == XGBClassifier:
        model = XGBClassifier(
            n_estimators=int(max(params['n_estimators'], 50)),
            max_depth=int(max(params['max_depth'], 3)),
            use_label_encoder=False,
            eval_metric='logloss',
            tree_method='gpu_hist',
            predictor='gpu_predictor'
        )
    elif model_class == lgb.LGBMClassifier:
        model = lgb.LGBMClassifier(
            n_estimators=int(max(params['n_estimators'], 50)),
            num_leaves=int(max(params['num_leaves'], 20)),
            device='gpu',
            gpu_platform_id=0,
            gpu_device_id=0
        )
    
    # Use scaled data for Multinomial NB, original for others
    X_train_eval = X_train_subset_scaled if model_class == MultinomialNB else X_train_subset
    X_test_eval = X_test_subset_scaled if model_class == MultinomialNB else X_test_subset
    
    # Train and evaluate
    model.fit(X_train_eval, y_train_subset)
    y_pred = model.predict(X_test_eval)
    return accuracy_score(y_test_subset, y_pred),

# Define hyperparameter spaces
models_to_tune = {
    'Multinomial NB': {
        'class': MultinomialNB,
        'params': {'alpha': (0.001, 10.0)}
    },
    'SVM': {
        'class': LinearSVC,
        'params': {'C': (0.001, 10.0)}
    },
    'Logistic Regression': {
        'class': LogisticRegression,
        'params': {'C': (0.001, 10.0)}
    },
    'MLP Classifier': {
        'class': MLPClassifier,
        'params': {'hidden_size': (10, 200), 'alpha': (0.0001, 0.1)}
    },
    'XGBoost': {
        'class': XGBClassifier,
        'params': {'n_estimators': (50, 200), 'max_depth': (3, 10)}
    },
    'LightGBM': {
        'class': lgb.LGBMClassifier,
        'params': {'n_estimators': (50, 200), 'num_leaves': (20, 100)}
    }
}

# Set up DEAP for evolutionary optimization
creator.create("FitnessMax", base.Fitness, weights=(1.0,))
creator.create("Individual", list, fitness=creator.FitnessMax)

# Function to run EA for a given model
def optimize_model(model_name, model_class, param_ranges, n_gen=10, pop_size=20):
    toolbox = base.Toolbox()
    
    hyperparam_keys = list(param_ranges.keys())
    for i, key in enumerate(hyperparam_keys):
        min_val, max_val = param_ranges[key]
        toolbox.register(f"attr_{key}", np.random.uniform, min_val, max_val)
    
    toolbox.register("individual", tools.initCycle, creator.Individual,
                     [getattr(toolbox, f"attr_{key}") for key in hyperparam_keys], n=1)
    toolbox.register("population", tools.initRepeat, list, toolbox.individual)
    
    toolbox.register("evaluate", evaluate_individual, model_class=model_class, hyperparam_keys=hyperparam_keys)
    toolbox.register("mate", tools.cxBlend, alpha=0.5)
    toolbox.register("mutate", tools.mutGaussian, mu=0, sigma=0.5, indpb=0.2)
    toolbox.register("select", tools.selTournament, tournsize=3)
    
    def clamp_individual(ind):
        for i, (key, (min_val, max_val)) in enumerate(param_ranges.items()):
            ind[i] = max(min_val, min(max_val, ind[i]))
        return ind
    
    def mate_and_clamp(ind1, ind2):
        tools.cxBlend(ind1, ind2, alpha=0.5)
        clamp_individual(ind1)
        clamp_individual(ind2)
        return ind1, ind2
    
    def mutate_and_clamp(ind):
        tools.mutGaussian(ind, mu=0, sigma=0.5, indpb=0.2)
        clamp_individual(ind)
        return ind,
    
    toolbox.register("mate_clamped", mate_and_clamp)
    toolbox.register("mutate_clamped", mutate_and_clamp)
    
    population = toolbox.population(n=pop_size)
    hof = tools.HallOfFame(1)
    stats = tools.Statistics(lambda ind: ind.fitness.values)
    stats.register("max", np.max)
    
    population, logbook = algorithms.eaSimple(
        population, toolbox, cxpb=0.7, mutpb=0.3, ngen=n_gen, stats=stats, halloffame=hof, verbose=True
    )
    
    best_params = dict(zip(hyperparam_keys, hof[0]))
    best_score = hof[0].fitness.values[0]
    return best_params, best_score

# Optimize hyperparameters for each model
best_hyperparameters = {}
for model_name, info in models_to_tune.items():
    print(f"Optimizing hyperparameters for {model_name} using Evolutionary Algorithm...")
    best_params, best_score = optimize_model(model_name, info['class'], info['params'])
    if model_name in ['MLP Classifier', 'XGBoost', 'LightGBM']:
        for key in best_params:
            if key in ['hidden_size', 'n_estimators', 'max_depth', 'num_leaves']:
                best_params[key] = int(best_params[key])
    print(f"Best parameters for {model_name}: {best_params}")
    print(f"Best accuracy: {best_score}")
    best_hyperparameters[model_name] = best_params

# Instantiate best models with tuned hyperparameters
N_CORES = multiprocessing.cpu_count() - 1
best_models = {
    'SVM': LinearSVC(C=max(best_hyperparameters['SVM']['C'], 0.001), max_iter=5000, random_state=42),
    'Logistic Regression': LogisticRegression(C=max(best_hyperparameters['Logistic Regression']['C'], 0.001), max_iter=1000, n_jobs=N_CORES, random_state=42),
    'Multinomial NB': MultinomialNB(alpha=max(best_hyperparameters['Multinomial NB']['alpha'], 0.001)),
    'MLP Classifier': MLPClassifier(hidden_layer_sizes=(int(max(best_hyperparameters['MLP Classifier']['hidden_size'], 10)),), 
                                    alpha=max(best_hyperparameters['MLP Classifier']['alpha'], 0.0001), max_iter=1000, random_state=42),
    'XGBoost': XGBClassifier(n_estimators=int(max(best_hyperparameters['XGBoost']['n_estimators'], 50)), 
                             max_depth=int(max(best_hyperparameters['XGBoost']['max_depth'], 3)), 
                             use_label_encoder=False, eval_metric='logloss', tree_method='gpu_hist', predictor='gpu_predictor', n_jobs=N_CORES, random_state=42),
    'LightGBM': lgb.LGBMClassifier(n_estimators=int(max(best_hyperparameters['LightGBM']['n_estimators'], 50)), 
                                   num_leaves=int(max(best_hyperparameters['LightGBM']['num_leaves'], 20)), 
                                   device='gpu', gpu_platform_id=0, gpu_device_id=0, n_jobs=N_CORES, random_state=42, verbose=-1)
}

# Step 1: Train and evaluate on test set with tuned models
accuracies = {}
print("Training and evaluating tuned models on test set...")
for name, model in best_models.items():
    print(f"Training {name}...")
    subset_idx = random.sample(range(X_train_vectors.shape[0]), min(50000, X_train_vectors.shape[0]))
    X_train_subset = X_train_vectors[subset_idx] if name != 'Multinomial NB' else X_train_vectors_scaled[subset_idx]
    y_train_subset = y_train_adjusted[subset_idx]
    
    model.fit(X_train_subset, y_train_subset)
    X_test_input = X_test_vectors if name != 'Multinomial NB' else X_test_vectors_scaled
    y_pred = model.predict(X_test_input)
    y_pred_adjusted = y_pred + 1
    acc = accuracy_score(y_test, y_pred_adjusted)
    accuracies[name] = acc
    print(f"{name} Accuracy: {acc:.4f}")
check_memory()

top_models = sorted(accuracies.items(), key=lambda x: x[1], reverse=True)[:2]
print("\nTop 2 models by accuracy:")
for name, acc in top_models:
    print(f"{name}: {acc:.4f}")

# Step 2: 5-Fold Cross-Validation with tuned models
kf = KFold(n_splits=5, shuffle=True, random_state=42)
metrics = ['accuracy', 'f1', 'roc_auc']
results = {name: {metric: [] for metric in metrics} for name in best_models.keys()}

print("\nPerforming 5-fold cross-validation with tuned models...")
for name, model in best_models.items():
    print(f"Cross-validating {name}...")
    for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_vectors)):
        X_train_fold = X_train_vectors[train_idx] if name != 'Multinomial NB' else X_train_vectors_scaled[train_idx]
        y_train_fold = y_train_adjusted[train_idx]
        X_val_fold = X_train_vectors[val_idx] if name != 'Multinomial NB' else X_train_vectors_scaled[val_idx]
        y_val_fold = y_train_adjusted[val_idx]
        
        subset_idx = random.sample(range(X_train_fold.shape[0]), min(50000, X_train_fold.shape[0]))
        X_train_subset = X_train_fold[subset_idx]
        y_train_subset = y_train_fold[subset_idx]
        
        model_instance = deepcopy(model)
        model_instance.fit(X_train_subset, y_train_subset)
        y_pred_fold = model_instance.predict(X_val_fold)
        
        y_pred_fold_adjusted = y_pred_fold + 1
        y_val_fold_adjusted = y_val_fold + 1
        
        acc = accuracy_score(y_val_fold_adjusted, y_pred_fold_adjusted)
        f1 = f1_score(y_val_fold_adjusted, y_pred_fold_adjusted, average='weighted')
        
        if hasattr(model_instance, "predict_proba"):
            y_prob_fold = model_instance.predict_proba(X_val_fold)[:, 1]
            auc = roc_auc_score(y_val_fold, y_prob_fold)
        else:
            if name == 'SVM':
                y_scores_fold = model_instance.decision_function(X_val_fold)
                auc = roc_auc_score(y_val_fold, y_scores_fold)
            else:
                auc = np.nan
        
        results[name]['accuracy'].append(acc)
        results[name]['f1'].append(f1)
        results[name]['roc_auc'].append(auc)
        
        print(f"{name} Fold {fold+1}: Acc={acc:.6f}, F1={f1:.6f}, AUC={auc:.6f}" if not np.isnan(auc) else f"{name} Fold {fold+1}: Acc={acc:.6f}, F1={f1:.6f}, AUC=nan")

# Display CV results
print("\n### 1. Perform K-Fold Cross-Validation and Report Detailed Performance Metrics")
print("| Model              | Accuracy (Mean ± SD) | F1-Score (Mean ± SD) | AUC-ROC (Mean ± SD) |")
print("|--------------------|---------------------|---------------------|--------------------|")
for name in best_models.keys():
    acc_mean = np.mean(results[name]['accuracy'])
    acc_std = np.std(results[name]['accuracy'])
    f1_mean = np.mean(results[name]['f1'])
    f1_std = np.std(results[name]['f1'])
    auc_mean = np.mean(results[name]['roc_auc'])
    auc_std = np.std(results[name]['roc_auc'])
    auc_mean_str = f"{auc_mean:.4f}" if not np.isnan(auc_mean) else "nan"
    auc_std_str = f"{auc_std:.4f}" if not np.isnan(auc_std) else "nan"
    print(f"| {name:<18} | {acc_mean:.4f} ± {acc_std:.4f} | {f1_mean:.4f} ± {f1_std:.4f} | {auc_mean_str} ± {auc_std_str} |")

# Step 3: Statistical Significance Tests
top_model_1_name, top_model_2_name = top_models[0][0], top_models[1][0]
top_model_1_scores = results[top_model_1_name]['accuracy']
top_model_2_scores = results[top_model_2_name]['accuracy']

print("\n### 2. Conduct Statistical Significance Tests and Present p-Values")
print(f"Table: Statistical Significance of {top_model_1_name} and {top_model_2_name} vs. Other Models (FastText)")
print("| Comparison                  | p-value (Accuracy) | Significant? |")
print("|-----------------------------|-------------------|--------------|")
for name in best_models.keys():
    if name != top_model_1_name:
        other_scores = results[name]['accuracy']
        t_stat, p_value = ttest_rel(top_model_1_scores, other_scores)
        significant = "Yes" if p_value < 0.05 else "No"
        print(f"| {top_model_1_name} vs. {name:<15} | {p_value:.4f}             | {significant:<12} |")
for name in best_models.keys():
    if name != top_model_2_name and name != top_model_1_name:
        other_scores = results[name]['accuracy']
        t_stat, p_value = ttest_rel(top_model_2_scores, other_scores)
        significant = "Yes" if p_value < 0.05 else "No"
        print(f"| {top_model_2_name} vs. {name:<15} | {p_value:.4f}             | {significant:<12} |")
check_memory()

Loading preprocessed data from processed-chunks-1...
Found 72 CSV files: ['processed_chunk_0_50000.csv', 'processed_chunk_1000000_1050000.csv', 'processed_chunk_100000_150000.csv', 'processed_chunk_1050000_1100000.csv', 'processed_chunk_1100000_1150000.csv']...


Loading CSV files: 100%|██████████| 72/72 [00:57<00:00,  1.24it/s]


Total rows loaded: 3600000
Label distribution: [1800000 1800000]
Number of empty cleaned reviews: 13
Shuffling data...
Filtering out empty reviews for FastText training...
Rows after filtering empty reviews: 3599987
Sampling subset for FastText training...
Writing unsupervised training data to file...
Training FastText model...
