In [3]:
import fireducks.pandas as pd
import os

# Path to the extracted chunk files (from your Kaggle dataset structure)
extracted_chunks_path = "/kaggle/input/processed-chunks-1"  # Adjust if the path differs

# Combine all chunk files
all_chunks = []
for file_name in sorted(os.listdir(extracted_chunks_path)):  # Ensure files are combined in order
    if file_name.startswith("processed_chunk_") and file_name.endswith(".csv"):
        file_path = os.path.join(extracted_chunks_path, file_name)
        print(f"Loading {file_name}...")
        chunk = pd.read_csv(file_path)
        all_chunks.append(chunk)

# Concatenate all chunks into a single DataFrame
combined_data = pd.concat(all_chunks, ignore_index=True)

# Display combined data info
print("Combined data shape:", combined_data.shape)

Loading processed_chunk_0_50000.csv...
Loading processed_chunk_1000000_1050000.csv...
Loading processed_chunk_100000_150000.csv...
Loading processed_chunk_1050000_1100000.csv...
Loading processed_chunk_1100000_1150000.csv...
Loading processed_chunk_1150000_1200000.csv...
Loading processed_chunk_1200000_1250000.csv...
Loading processed_chunk_1250000_1300000.csv...
Loading processed_chunk_1300000_1350000.csv...
Loading processed_chunk_1350000_1400000.csv...
Loading processed_chunk_1400000_1450000.csv...
Loading processed_chunk_1450000_1500000.csv...
Loading processed_chunk_1500000_1550000.csv...
Loading processed_chunk_150000_200000.csv...
Loading processed_chunk_1550000_1600000.csv...
Loading processed_chunk_1600000_1650000.csv...
Loading processed_chunk_1650000_1700000.csv...
Loading processed_chunk_1700000_1750000.csv...
Loading processed_chunk_1750000_1800000.csv...
Loading processed_chunk_1800000_1850000.csv...
Loading processed_chunk_1850000_1900000.csv...
Loading processed_chunk_1

In [4]:
# Save the combined dataset as a CSV for future use
combined_data_path = "/kaggle/working/combined_processed_data.csv"
combined_data.to_csv(combined_data_path, index=False)
print(f"Combined data saved at: {combined_data_path}")
# Load the saved combined dataset
combined_data = pd.read_csv("/kaggle/working/combined_processed_data.csv")

# Check the dataset structure
print(combined_data.info())
print(combined_data.head())
# Check label distribution
print(combined_data['label'].value_counts())

Combined data saved at: /kaggle/working/combined_processed_data.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3600000 entries, 0 to 3599999
Data columns (total 3 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   review          object
 1   label           int64 
 2   cleaned_review  object
dtypes: int64(1), object(2)
memory usage: 82.4+ MB
None
                                              review  label  \
0  Stuning even for the non-gamer: This sound tra...      2   
1  The best soundtrack ever to anything.: I'm rea...      2   
2  Amazing!: This soundtrack is my favorite music...      2   
3  Excellent Soundtrack: I truly like this soundt...      2   
4  Remember, Pull Your Jaw Off The Floor After He...      2   

                                      cleaned_review  
0  stun non gamer sound track beautiful paint sen...  
1  good soundtrack read lot review say good game ...  
2  amazing soundtrack favorite music time hand in...  
3  excellent soundtrack t

In [5]:
from sklearn.model_selection import train_test_split

# Split the data into train and test sets
X = combined_data['cleaned_review']  # Features (cleaned reviews)
y = combined_data['label']           # Labels (1 for neutral, 2 for positive)

# Perform train-test split (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training samples: {X_train.shape[0]}")
print(f"Testing samples: {X_test.shape[0]}")

Training samples: 2880000
Testing samples: 720000


# TF-IDF

# All in one

In [9]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import multiprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
import lightgbm as lgb
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.model_selection import KFold, train_test_split
from scipy.stats import ttest_rel
from deap import base, creator, tools, algorithms
import psutil
from copy import deepcopy
import random
import os
import torch

In [None]:
# Install DEAP and GPU-enabled libraries for Kaggle

# Function to check memory usage
def check_memory():
    print(f"Memory usage: {psutil.virtual_memory().percent}%")

# Check GPU availability
print("GPU available for PyTorch:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU device:", torch.cuda.get_device_name(0))

# --- Load Preprocessed Data ---
processed_dir = "/kaggle/input/processed-chunks-1/"
print("Loading preprocessed data from processed-chunks-1...")

csv_files = sorted([f for f in os.listdir(processed_dir) if f.endswith('.csv')])
print(f"Found {len(csv_files)} CSV files: {csv_files[:5]}...")

dataframes = []
for csv_file in tqdm(csv_files, desc="Loading CSV files"):
    file_path = os.path.join(processed_dir, csv_file)
    df = pd.read_csv(file_path, usecols=['cleaned_review', 'label'])
    dataframes.append(df)

data = pd.concat(dataframes, ignore_index=True)
print(f"Total rows loaded: {len(data)}")

# Extract text and labels
text_column = 'cleaned_review'
label_column = 'label'
X = data[text_column].fillna("")
y = data[label_column].values

print("Label distribution before shuffling:", np.bincount(y - 1))

# Shuffle the dataset
print("Shuffling the dataset...")
np.random.seed(42)
shuffle_indices = np.random.permutation(len(data))
X = X[shuffle_indices]
y = y[shuffle_indices]
print("Label distribution after shuffling:", np.bincount(y - 1))

# Split into train (2.88M) and test (720K)
X_train_processed = X[:2880000].tolist()
X_test_processed = X[2880000:2880000 + 720000].tolist()
y_train = y[:2880000]
y_test = y[2880000:2880000 + 720000]

y_train_adjusted = y_train - 1
y_test_adjusted = y_test - 1

# Initialize TF-IDF Vectorizer
print("Initializing TF-IDF Vectorizer...")
tfidf_vectorizer = TfidfVectorizer(
    max_features=2000,
    min_df=5,
    max_df=0.95,
    ngram_range=(1, 2),
    sublinear_tf=True
)

print("Transforming text to TF-IDF features...")
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_processed)
X_test_tfidf = tfidf_vectorizer.transform(X_test_processed)
print("TF-IDF transformation complete!")
print(f"Train TF-IDF shape: {X_train_tfidf.shape}")
print(f"Test TF-IDF shape: {X_test_tfidf.shape}")

# --- Hyperparameter Tuning with Evolutionary Algorithm ---
# Define a subset of data for tuning
subset_size = 100000  # Adjust based on resources
X_train_subset, X_test_subset, y_train_subset, y_test_subset = train_test_split(
    X_train_tfidf[:subset_size], y_train_adjusted[:subset_size], test_size=0.2, random_state=42
)

# Define evaluation function for a model
def evaluate_individual(individual, model_class, hyperparam_keys):
    """Evaluate a model's performance given an individual's hyperparameters."""
    params = dict(zip(hyperparam_keys, individual))
    
    # Instantiate and train the model
    if model_class == MultinomialNB:
        model = MultinomialNB(alpha=max(params['alpha'], 0.001))
    elif model_class == LinearSVC:
        model = LinearSVC(C=max(params['C'], 0.001), max_iter=5000)
    elif model_class == LogisticRegression:
        model = LogisticRegression(C=max(params['C'], 0.001), max_iter=1000)
    elif model_class == MLPClassifier:
        hidden_size = (int(max(params['hidden_size'], 10)),)
        model = MLPClassifier(hidden_layer_sizes=hidden_size, alpha=max(params['alpha'], 0.0001), max_iter=1000)
    elif model_class == XGBClassifier:
        model = XGBClassifier(
            n_estimators=int(max(params['n_estimators'], 50)),
            max_depth=int(max(params['max_depth'], 3)),
            use_label_encoder=False,
            eval_metric='logloss',
            tree_method='gpu_hist',
            predictor='gpu_predictor'
        )
    elif model_class == lgb.LGBMClassifier:
        model = lgb.LGBMClassifier(
            n_estimators=int(max(params['n_estimators'], 50)),
            num_leaves=int(max(params['num_leaves'], 20)),
            device='gpu',
            gpu_platform_id=0,
            gpu_device_id=0
        )
    
    # Convert to dense for models requiring it
    X_train_eval = X_train_subset.toarray() if model_class in [MLPClassifier, XGBClassifier, lgb.LGBMClassifier] else X_train_subset
    X_test_eval = X_test_subset.toarray() if model_class in [MLPClassifier, XGBClassifier, lgb.LGBMClassifier] else X_test_subset
    
    # Train and evaluate
    model.fit(X_train_eval, y_train_subset)
    y_pred = model.predict(X_test_eval)
    return accuracy_score(y_test_subset, y_pred),

# Define hyperparameter spaces
models_to_tune = {
    'Multinomial NB': {
        'class': MultinomialNB,
        'params': {'alpha': (0.001, 10.0)}
    },
    'SVM': {
        'class': LinearSVC,
        'params': {'C': (0.001, 10.0)}
    },
    'Logistic Regression': {
        'class': LogisticRegression,
        'params': {'C': (0.001, 10.0)}
    },
    'MLP Classifier': {
        'class': MLPClassifier,
        'params': {'hidden_size': (10, 200), 'alpha': (0.0001, 0.1)}
    },
    'XGBoost': {
        'class': XGBClassifier,
        'params': {'n_estimators': (50, 200), 'max_depth': (3, 10)}
    },
    'LightGBM': {
        'class': lgb.LGBMClassifier,
        'params': {'n_estimators': (50, 200), 'num_leaves': (20, 100)}
    }
}

# Set up DEAP for evolutionary optimization
creator.create("FitnessMax", base.Fitness, weights=(1.0,))
creator.create("Individual", list, fitness=creator.FitnessMax)

# Function to run EA for a given model
def optimize_model(model_name, model_class, param_ranges, n_gen=10, pop_size=20):
    toolbox = base.Toolbox()
    
    hyperparam_keys = list(param_ranges.keys())
    for i, key in enumerate(hyperparam_keys):
        min_val, max_val = param_ranges[key]
        toolbox.register(f"attr_{key}", np.random.uniform, min_val, max_val)
    
    toolbox.register("individual", tools.initCycle, creator.Individual,
                     [getattr(toolbox, f"attr_{key}") for key in hyperparam_keys], n=1)
    toolbox.register("population", tools.initRepeat, list, toolbox.individual)
    
    toolbox.register("evaluate", evaluate_individual, model_class=model_class, hyperparam_keys=hyperparam_keys)
    toolbox.register("mate", tools.cxBlend, alpha=0.5)
    toolbox.register("mutate", tools.mutGaussian, mu=0, sigma=0.5, indpb=0.2)
    toolbox.register("select", tools.selTournament, tournsize=3)
    
    def clamp_individual(ind):
        for i, (key, (min_val, max_val)) in enumerate(param_ranges.items()):
            ind[i] = max(min_val, min(max_val, ind[i]))
        return ind
    
    def mate_and_clamp(ind1, ind2):
        tools.cxBlend(ind1, ind2, alpha=0.5)
        clamp_individual(ind1)
        clamp_individual(ind2)
        return ind1, ind2
    
    def mutate_and_clamp(ind):
        tools.mutGaussian(ind, mu=0, sigma=0.5, indpb=0.2)
        clamp_individual(ind)
        return ind,
    
    toolbox.register("mate_clamped", mate_and_clamp)
    toolbox.register("mutate_clamped", mutate_and_clamp)
    
    population = toolbox.population(n=pop_size)
    hof = tools.HallOfFame(1)
    stats = tools.Statistics(lambda ind: ind.fitness.values)
    stats.register("max", np.max)
    
    population, logbook = algorithms.eaSimple(
        population, toolbox, cxpb=0.7, mutpb=0.3, ngen=n_gen, stats=stats, halloffame=hof, verbose=True
    )
    
    best_params = dict(zip(hyperparam_keys, hof[0]))
    best_score = hof[0].fitness.values[0]
    return best_params, best_score

# Optimize hyperparameters for each model
best_hyperparameters = {}
for model_name, info in models_to_tune.items():
    print(f"Optimizing hyperparameters for {model_name} using Evolutionary Algorithm...")
    best_params, best_score = optimize_model(model_name, info['class'], info['params'])
    if model_name in ['MLP Classifier', 'XGBoost', 'LightGBM']:
        for key in best_params:
            if key in ['hidden_size', 'n_estimators', 'max_depth', 'num_leaves']:
                best_params[key] = int(best_params[key])
    print(f"Best parameters for {model_name}: {best_params}")
    print(f"Best accuracy: {best_score}")
    best_hyperparameters[model_name] = best_params

# Instantiate best models with tuned hyperparameters
N_CORES = multiprocessing.cpu_count() - 1
best_models = {
    'SVM': LinearSVC(C=max(best_hyperparameters['SVM']['C'], 0.001), max_iter=5000, random_state=42),
    'Logistic Regression': LogisticRegression(C=max(best_hyperparameters['Logistic Regression']['C'], 0.001), max_iter=1000, n_jobs=N_CORES, random_state=42),
    'Multinomial NB': MultinomialNB(alpha=max(best_hyperparameters['Multinomial NB']['alpha'], 0.001)),
    'MLP Classifier': MLPClassifier(hidden_layer_sizes=(int(max(best_hyperparameters['MLP Classifier']['hidden_size'], 10)),), 
                                    alpha=max(best_hyperparameters['MLP Classifier']['alpha'], 0.0001), max_iter=1000, random_state=42),
    'XGBoost': XGBClassifier(n_estimators=int(max(best_hyperparameters['XGBoost']['n_estimators'], 50)), 
                             max_depth=int(max(best_hyperparameters['XGBoost']['max_depth'], 3)), 
                             use_label_encoder=False, eval_metric='logloss', tree_method='gpu_hist', predictor='gpu_predictor', n_jobs=N_CORES, random_state=42),
    'LightGBM': lgb.LGBMClassifier(n_estimators=int(max(best_hyperparameters['LightGBM']['n_estimators'], 50)), 
                                   num_leaves=int(max(best_hyperparameters['LightGBM']['num_leaves'], 20)), 
                                   device='gpu', gpu_platform_id=0, gpu_device_id=0, n_jobs=N_CORES, random_state=42, verbose=-1)
}

# Step 1: Train and evaluate on test set with tuned models
accuracies = {}
print("Training and evaluating tuned models on test set...")
for name, model in best_models.items():
    print(f"Training {name}...")
    subset_idx = random.sample(range(X_train_tfidf.shape[0]), 20000)
    X_train_subset = X_train_tfidf[subset_idx]
    y_train_subset = y_train_adjusted[subset_idx]
    
    if name in ['MLP Classifier', 'XGBoost', 'LightGBM']:
        X_train_subset = X_train_subset.toarray()
        batch_size = 50000
        y_pred = []
        for i in range(0, X_test_tfidf.shape[0], batch_size):
            X_test_batch = X_test_tfidf[i:i + batch_size].toarray()
            y_pred.extend(model.fit(X_train_subset, y_train_subset).predict(X_test_batch))
        y_pred = np.array(y_pred)
    else:
        model.fit(X_train_subset, y_train_subset)
        y_pred = model.predict(X_test_tfidf)
    
    y_pred_adjusted = y_pred + 1
    acc = accuracy_score(y_test, y_pred_adjusted)
    accuracies[name] = acc
    print(f"{name} Accuracy: {acc:.4f}")
check_memory()

top_models = sorted(accuracies.items(), key=lambda x: x[1], reverse=True)[:2]
print("\nTop 2 models by accuracy:")
for name, acc in top_models:
    print(f"{name}: {acc:.4f}")

# Step 2: 5-Fold Cross-Validation with tuned models
kf = KFold(n_splits=5, shuffle=True, random_state=42)
metrics = ['accuracy', 'f1', 'roc_auc']
results = {name: {metric: [] for metric in metrics} for name in best_models.keys()}

print("\nPerforming 5-fold cross-validation with tuned models...")
for name, model in best_models.items():
    print(f"Cross-validating {name}...")
    for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_tfidf)):
        X_train_fold = X_train_tfidf[train_idx]
        y_train_fold = y_train_adjusted[train_idx]
        X_val_fold = X_train_tfidf[val_idx]
        y_val_fold = y_train_adjusted[val_idx]
        
        subset_idx = random.sample(range(X_train_fold.shape[0]), 20000)
        X_train_subset = X_train_fold[subset_idx]
        y_train_subset = y_train_fold[subset_idx]
        
        if name in ['MLP Classifier', 'XGBoost', 'LightGBM']:
            X_train_subset = X_train_subset.toarray()
            X_val_fold_converted = X_val_fold[:20000].toarray()
            y_val_fold_converted = y_val_fold[:20000]
        else:
            X_val_fold_converted = X_val_fold
            y_val_fold_converted = y_val_fold
        
        model_instance = deepcopy(model)
        model_instance.fit(X_train_subset, y_train_subset)
        y_pred_fold = model_instance.predict(X_val_fold_converted)
        
        y_pred_fold_adjusted = y_pred_fold + 1
        y_val_fold_adjusted = y_val_fold_converted + 1
        
        acc = accuracy_score(y_val_fold_adjusted, y_pred_fold_adjusted)
        f1 = f1_score(y_val_fold_adjusted, y_pred_fold_adjusted, average='weighted')
        
        if hasattr(model_instance, "predict_proba"):
            y_prob_fold = model_instance.predict_proba(X_val_fold_converted)[:, 1]
            auc = roc_auc_score(y_val_fold_converted, y_prob_fold)
        else:
            if name == 'SVM':
                y_scores_fold = model_instance.decision_function(X_val_fold_converted)
                auc = roc_auc_score(y_val_fold_converted, y_scores_fold)
            else:
                auc = np.nan
        
        results[name]['accuracy'].append(acc)
        results[name]['f1'].append(f1)
        results[name]['roc_auc'].append(auc)
        
        print(f"{name} Fold {fold+1}: Acc={acc:.6f}, F1={f1:.6f}, AUC={auc:.6f}" if not np.isnan(auc) else f"{name} Fold {fold+1}: Acc={acc:.6f}, F1={f1:.6f}, AUC=nan")

# Display CV results
print("\n### 1. Perform K-Fold Cross-Validation and Report Detailed Performance Metrics")
print("| Model              | Accuracy (Mean ± SD) | F1-Score (Mean ± SD) | AUC-ROC (Mean ± SD) |")
print("|--------------------|---------------------|---------------------|--------------------|")
for name in best_models.keys():
    acc_mean = np.mean(results[name]['accuracy'])
    acc_std = np.std(results[name]['accuracy'])
    f1_mean = np.mean(results[name]['f1'])
    f1_std = np.std(results[name]['f1'])
    auc_mean = np.mean(results[name]['roc_auc'])
    auc_std = np.std(results[name]['roc_auc'])
    auc_mean_str = f"{auc_mean:.4f}" if not np.isnan(auc_mean) else "nan"
    auc_std_str = f"{auc_std:.4f}" if not np.isnan(auc_std) else "nan"
    print(f"| {name:<18} | {acc_mean:.4f} ± {acc_std:.4f} | {f1_mean:.4f} ± {f1_std:.4f} | {auc_mean_str} ± {auc_std_str} |")

# Step 3: Statistical Significance Tests
top_model_1_name, top_model_2_name = top_models[0][0], top_models[1][0]
top_model_1_scores = results[top_model_1_name]['accuracy']
top_model_2_scores = results[top_model_2_name]['accuracy']

print("\n### 2. Conduct Statistical Significance Tests and Present p-Values")
print(f"Table: Statistical Significance of {top_model_1_name} and {top_model_2_name} vs. Other Models (TF-IDF)")
print("| Comparison                  | p-value (Accuracy) | Significant? |")
print("|-----------------------------|-------------------|--------------|")
for name in best_models.keys():
    if name != top_model_1_name:
        other_scores = results[name]['accuracy']
        t_stat, p_value = ttest_rel(top_model_1_scores, other_scores)
        significant = "Yes" if p_value < 0.05 else "No"
        print(f"| {top_model_1_name} vs. {name:<15} | {p_value:.4f}             | {significant:<12} |")
for name in best_models.keys():
    if name != top_model_2_name and name != top_model_1_name:
        other_scores = results[name]['accuracy']
        t_stat, p_value = ttest_rel(top_model_2_scores, other_scores)
        significant = "Yes" if p_value < 0.05 else "No"
        print(f"| {top_model_2_name} vs. {name:<15} | {p_value:.4f}             | {significant:<12} |")
check_memory()

GPU available for PyTorch: False
Loading preprocessed data from processed-chunks-1...
Found 72 CSV files: ['processed_chunk_0_50000.csv', 'processed_chunk_1000000_1050000.csv', 'processed_chunk_100000_150000.csv', 'processed_chunk_1050000_1100000.csv', 'processed_chunk_1100000_1150000.csv']...


Loading CSV files: 100%|██████████| 72/72 [00:22<00:00,  3.23it/s]


Total rows loaded: 3600000
Label distribution before shuffling: [1800000 1800000]
Shuffling the dataset...
Label distribution after shuffling: [1800000 1800000]
Initializing TF-IDF Vectorizer...
Transforming text to TF-IDF features...
TF-IDF transformation complete!
Train TF-IDF shape: (2880000, 2000)
Test TF-IDF shape: (720000, 2000)
Optimizing hyperparameters for Multinomial NB using Evolutionary Algorithm...
gen	nevals	max   
0  	20    	0.8302
1  	19    	0.8301
2  	16    	0.8301
3  	14    	0.8304
4  	15    	0.8304
5  	13    	0.8304
6  	16    	0.8304
7  	19    	0.8304
8  	15    	0.8304
9  	14    	0.8304
10 	18    	0.8304
Best parameters for Multinomial NB: {'alpha': 4.720440041760033}
Best accuracy: 0.8304
Optimizing hyperparameters for SVM using Evolutionary Algorithm...
gen	nevals	max    
0  	20    	0.86285
1  	10    	0.8629 
2  	16    	0.8629 
3  	13    	0.8629 
4  	13    	0.8629 
5  	10    	0.8629 
6  	13    	0.8629 
7  	15    	0.86305
8  	18    	0.86305
9  	17    	0.86305
10 	16

# LightGBM

In [None]:
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, accuracy_score
import psutil
import numpy as np

# Function to check memory usage
def check_memory():
    print(f"Memory usage: {psutil.virtual_memory().percent}%")

# Ensure labels are 0-based
y_train_adjusted = np.array(y_train) - 1
y_test_adjusted = np.array(y_test) - 1

# Define LightGBM hyperparameters with reduced iterations and depth
param_grid = {
    'learning_rate': [0.01, 0.05],
    'n_estimators': [50, 100],
    'max_depth': [3, 5],
    'num_leaves': [15, 31]
}

# Stratified K-Fold Cross-Validation (using 3 folds for efficiency)
kf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Iterate over hyperparameters
for lr in param_grid['learning_rate']:
    for iterations in param_grid['n_estimators']:
        for depth in param_grid['max_depth']:
            for num_leaves in param_grid['num_leaves']:
                print(f"Training LightGBM with learning_rate={lr}, iterations={iterations}, depth={depth}, num_leaves={num_leaves}...")
                
                # Initialize the LightGBM model
                lgbm_classifier = lgb.LGBMClassifier(
                    learning_rate=lr,
                    n_estimators=iterations,
                    max_depth=depth,
                    num_leaves=num_leaves,
                    verbose=-1,
                    n_jobs=-1  # Utilize all CPU cores
                )
                
                # Train on a subset for efficiency
                subset_size = min(10000, X_train_tfidf.shape[0])
                lgbm_classifier.fit(X_train_tfidf[:subset_size], y_train_adjusted[:subset_size])
                
                # Make predictions
                y_pred_lgbm = lgbm_classifier.predict(X_test_tfidf) + 1  # Adjust back to original label scale
                
                # Performance evaluation
                print(f"LightGBM Classification Report (learning_rate={lr}, iterations={iterations}, depth={depth}, num_leaves={num_leaves}):")
                print(classification_report(y_test, y_pred_lgbm))
                print(f"LightGBM Accuracy: {accuracy_score(y_test, y_pred_lgbm):.4f}\n")
                
                # Check memory usage
                check_memory()

# CatBoost

In [None]:
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import StratifiedKFold
import numpy as np
import psutil

# Function to check memory usage
def check_memory():
    print(f"Memory usage: {psutil.virtual_memory().percent}%")

# Ensure labels are 0-based
y_train_adjusted = np.array(y_train) - 1
y_test_adjusted = np.array(y_test) - 1

# Define hyperparameter grid for CatBoost
param_grid = {
    'learning_rate': [0.01, 0.05],
    'iterations': [50, 100],
    'depth': [3, 5],
    'l2_leaf_reg': [1, 3],
    'border_count': [32, 50]
}

# Stratified K-Fold Cross-Validation (using 3 folds for efficiency)
kf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Iterate over hyperparameters
for lr in param_grid['learning_rate']:
    for iterations in param_grid['iterations']:
        for depth in param_grid['depth']:
            for l2_leaf_reg in param_grid['l2_leaf_reg']:
                for border_count in param_grid['border_count']:
                    print(f"Training CatBoost with learning_rate={lr}, iterations={iterations}, depth={depth}, l2_leaf_reg={l2_leaf_reg}, border_count={border_count}...")
                    
                    # Initialize the CatBoost model
                    catboost_classifier = CatBoostClassifier(
                        learning_rate=lr,
                        iterations=iterations,
                        depth=depth,
                        l2_leaf_reg=l2_leaf_reg,
                        border_count=border_count,
                        verbose=0,
                        random_state=42
                    )
                    
                    # Train on a subset for efficiency
                    subset_size = min(10000, X_train_tfidf.shape[0])
                    catboost_classifier.fit(X_train_tfidf[:subset_size], y_train_adjusted[:subset_size])
                    
                    # Make predictions
                    y_pred_catboost = catboost_classifier.predict(X_test_tfidf) + 1  # Adjust back to original label scale
                    
                    # Performance evaluation
                    print(f"CatBoost Classification Report (learning_rate={lr}, iterations={iterations}, depth={depth}, l2_leaf_reg={l2_leaf_reg}, border_count={border_count}):")
                    print(classification_report(y_test, y_pred_catboost))
                    print(f"CatBoost Accuracy: {accuracy_score(y_test, y_pred_catboost):.4f}\n")
                    
                    # Check memory usage
                    check_memory()

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, accuracy_score
import psutil
import numpy as np

# Function to check memory usage
def check_memory():
    print(f"Memory usage: {psutil.virtual_memory().percent}%")

# Ensure labels are 0-based
y_train_adjusted = np.array(y_train) - 1
y_test_adjusted = np.array(y_test) - 1

# Define hyperparameter grid for RandomForest
param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [3, 5],
    'min_samples_split': [2, 5]
}

# Stratified K-Fold Cross-Validation (using 3 folds for efficiency)
kf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Iterate over hyperparameters
for n_estimators in param_grid['n_estimators']:
    for depth in param_grid['max_depth']:
        for min_samples_split in param_grid['min_samples_split']:
            print(f"Training Random Forest with n_estimators={n_estimators}, depth={depth}, min_samples_split={min_samples_split}...")
            
            # Initialize the RandomForest model
            rf_classifier = RandomForestClassifier(
                n_estimators=n_estimators,
                max_depth=depth,
                min_samples_split=min_samples_split,
                n_jobs=-1,  # Utilize all CPU cores
                random_state=42
            )
            
            # Train on a subset for efficiency
            subset_size = min(10000, X_train_tfidf.shape[0])
            rf_classifier.fit(X_train_tfidf[:subset_size], y_train_adjusted[:subset_size])
            
            # Make predictions
            y_pred_rf = rf_classifier.predict(X_test_tfidf) + 1  # Adjust back to original label scale
            
            # Performance evaluation
            print(f"Random Forest Classification Report (n_estimators={n_estimators}, depth={depth}, min_samples_split={min_samples_split}):")
            print(classification_report(y_test, y_pred_rf))
            print(f"Random Forest Accuracy: {accuracy_score(y_test, y_pred_rf):.4f}\n")
            
            # Check memory usage
            check_memory()

# AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, accuracy_score
import psutil
import numpy as np

# Function to check memory usage
def check_memory():
    print(f"Memory usage: {psutil.virtual_memory().percent}%")

# Ensure labels are 0-based
y_train_adjusted = np.array(y_train) - 1
y_test_adjusted = np.array(y_test) - 1

# Define hyperparameter grid for AdaBoost
param_grid = {
    'n_estimators': [50, 100],
    'learning_rate': [0.01, 0.1]
}

# Stratified K-Fold Cross-Validation (using 3 folds for efficiency)
kf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Iterate over hyperparameters
for n_estimators in param_grid['n_estimators']:
    for learning_rate in param_grid['learning_rate']:
        print(f"Training AdaBoost with n_estimators={n_estimators}, learning_rate={learning_rate}...")
        
        # Initialize the AdaBoost model
        adaboost_classifier = AdaBoostClassifier(
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            random_state=42
        )
        
        # Train on a subset for efficiency
        subset_size = min(10000, X_train_tfidf.shape[0])
        adaboost_classifier.fit(X_train_tfidf[:subset_size], y_train_adjusted[:subset_size])
        
        # Make predictions
        y_pred_adaboost = adaboost_classifier.predict(X_test_tfidf) + 1  # Adjust back to original label scale
        
        # Performance evaluation
        print(f"AdaBoost Classification Report (n_estimators={n_estimators}, learning_rate={learning_rate}):")
        print(classification_report(y_test, y_pred_adaboost))
        print(f"AdaBoost Accuracy: {accuracy_score(y_test, y_pred_adaboost):.4f}\n")
        
        # Check memory usage
        check_memory()


# Naive Bayes and SVM

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score
import numpy as np
import psutil

# Function to check memory usage
def check_memory():
    print(f"Memory usage: {psutil.virtual_memory().percent}%")

# Ensure labels are 0-based
y_train_adjusted = np.array(y_train) - 1
y_test_adjusted = np.array(y_test) - 1

# Naive Bayes hyperparameter tuning
alpha_values = [0.1, 0.5, 1.0, 2.0]
for alpha in alpha_values:
    print(f"Training Naive Bayes with alpha={alpha}...")
    nb_classifier = MultinomialNB(alpha=alpha)
    nb_classifier.fit(X_train_tfidf, y_train_adjusted)
    
    # Predict
    y_pred_nb = nb_classifier.predict(X_test_tfidf) + 1  # Adjust back to original labels
    
    # Performance evaluation
    print(f"Naive Bayes Classification Report (alpha={alpha}):")
    print(classification_report(y_test, y_pred_nb))
    print(f"Naive Bayes Accuracy: {accuracy_score(y_test, y_pred_nb):.4f}\n")
    check_memory()

# SVM hyperparameter tuning
c_values = [0.1, 0.5, 1.0, 2.0]
for c_value in c_values:
    print(f"Training SVM with C={c_value}...")
    svm_classifier = LinearSVC(C=c_value, max_iter=5000, random_state=42)
    svm_classifier.fit(X_train_tfidf, y_train_adjusted)
    
    # Predict
    y_pred_svm = svm_classifier.predict(X_test_tfidf) + 1  # Adjust back to original labels
    
    # Performance evaluation
    print(f"SVM Classification Report (C={c_value}):")
    print(classification_report(y_test, y_pred_svm))
    print(f"SVM Accuracy: {accuracy_score(y_test, y_pred_svm):.4f}\n")
    check_memory()

# MLP Classifier

In [None]:
# from sklearn.neural_network import MLPClassifier
# from sklearn.metrics import classification_report, accuracy_score
# import numpy as np
# import psutil

# # Function to check memory usage
# def check_memory():
#     print(f"Memory usage: {psutil.virtual_memory().percent}%")

# # Ensure labels are 0-based
# y_train_adjusted = np.array(y_train) - 1
# y_test_adjusted = np.array(y_test) - 1

# # Define hyperparameter grid for MLP
# hidden_layer_sizes_values = [(100,), (128, 64)]
# activation_values = ['relu', 'tanh']
# solver_values = ['adam', 'sgd']

# # Iterate over hyperparameters
# for hidden_layer_sizes in hidden_layer_sizes_values:
#     for activation in activation_values:
#         for solver in solver_values:
#             print(f"Training MLPClassifier with hidden_layer_sizes={hidden_layer_sizes}, activation={activation}, solver={solver}...")

#             # Set up the MLPClassifier model with the specified hyperparameters
#             mlp_classifier = MLPClassifier(
#                 hidden_layer_sizes=hidden_layer_sizes,
#                 activation=activation,
#                 solver=solver,
#                 max_iter=300,
#                 random_state=42
#             )

#             # Train the model
#             mlp_classifier.fit(X_train_tfidf[:50000], y_train_adjusted[:50000])  # Use a larger subset
#             y_pred_mlp = mlp_classifier.predict(X_test_tfidf) + 1  # Adjust back to original labels

#             # Performance reporting
#             print(f"MLPClassifier Classification Report for hidden_layer_sizes={hidden_layer_sizes}, activation={activation}, solver={solver}:")
#             print(classification_report(y_test, y_pred_mlp))
#             print(f"MLPClassifier Accuracy: {accuracy_score(y_test, y_pred_mlp):.4f}\n")

#             # Check memory usage after each iteration
#             check_memory()

# XG Boost

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import StratifiedKFold
import numpy as np
import psutil

# Function to check memory usage
def check_memory():
    print(f"Memory usage: {psutil.virtual_memory().percent}%")

# Ensure labels are 0-based
y_train_adjusted = np.array(y_train) - 1
y_test_adjusted = np.array(y_test) - 1

# Define hyperparameter grid for XGBoost
param_grid = {
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 200],
    'max_depth': [3, 5],
    'min_child_weight': [1, 5],
    'gamma': [0, 0.1]
}

# Stratified K-Fold Cross-Validation (using 3 folds for efficiency)
kf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Iterate over hyperparameters
for lr in param_grid['learning_rate']:
    for n_est in param_grid['n_estimators']:
        for max_depth in param_grid['max_depth']:
            for min_child_weight in param_grid['min_child_weight']:
                for gamma in param_grid['gamma']:
                    print(f"Training XGBoost with learning_rate={lr}, n_estimators={n_est}, max_depth={max_depth}, min_child_weight={min_child_weight}, gamma={gamma}...")
                    
                    # Initialize the XGBoost model
                    xgb_classifier = XGBClassifier(
                        learning_rate=lr,
                        n_estimators=n_est,
                        max_depth=max_depth,
                        min_child_weight=min_child_weight,
                        gamma=gamma,
                        use_label_encoder=False,
                        eval_metric='logloss',
                        tree_method='hist',
                        random_state=42
                    )
                    
                    # Train on a subset for efficiency
                    subset_size = min(10000, X_train_tfidf.shape[0])
                    xgb_classifier.fit(X_train_tfidf[:subset_size], y_train_adjusted[:subset_size])
                    
                    # Make predictions
                    y_pred_xgb = xgb_classifier.predict(X_test_tfidf) + 1  # Adjust back to original label scale
                    
                    # Performance evaluation
                    print(f"XGBoost Classification Report (learning_rate={lr}, n_estimators={n_est}, max_depth={max_depth}, min_child_weight={min_child_weight}, gamma={gamma}):")
                    print(classification_report(y_test, y_pred_xgb))
                    print(f"XGBoost Accuracy: {accuracy_score(y_test, y_pred_xgb):.4f}\n")
                    
                    # Check memory usage
                    check_memory()