In [2]:
import fireducks.pandas as pd
import os

# Path to the extracted chunk files (from your Kaggle dataset structure)
extracted_chunks_path = "/kaggle/input/processed-chunks-1"  # Adjust if the path differs

# Combine all chunk files
all_chunks = []
for file_name in sorted(os.listdir(extracted_chunks_path)):  # Ensure files are combined in order
    if file_name.startswith("processed_chunk_") and file_name.endswith(".csv"):
        file_path = os.path.join(extracted_chunks_path, file_name)
        print(f"Loading {file_name}...")
        chunk = pd.read_csv(file_path)
        all_chunks.append(chunk)

# Concatenate all chunks into a single DataFrame
combined_data = pd.concat(all_chunks, ignore_index=True)

# Display combined data info
print("Combined data shape:", combined_data.shape)


Loading processed_chunk_0_50000.csv...
Loading processed_chunk_1000000_1050000.csv...
Loading processed_chunk_100000_150000.csv...
Loading processed_chunk_1050000_1100000.csv...
Loading processed_chunk_1100000_1150000.csv...
Loading processed_chunk_1150000_1200000.csv...
Loading processed_chunk_1200000_1250000.csv...
Loading processed_chunk_1250000_1300000.csv...
Loading processed_chunk_1300000_1350000.csv...
Loading processed_chunk_1350000_1400000.csv...
Loading processed_chunk_1400000_1450000.csv...
Loading processed_chunk_1450000_1500000.csv...
Loading processed_chunk_1500000_1550000.csv...
Loading processed_chunk_150000_200000.csv...
Loading processed_chunk_1550000_1600000.csv...
Loading processed_chunk_1600000_1650000.csv...
Loading processed_chunk_1650000_1700000.csv...
Loading processed_chunk_1700000_1750000.csv...
Loading processed_chunk_1750000_1800000.csv...
Loading processed_chunk_1800000_1850000.csv...
Loading processed_chunk_1850000_1900000.csv...
Loading processed_chunk_1

In [3]:
# Save the combined dataset as a CSV for future use
combined_data_path = "/kaggle/working/combined_processed_data.csv"
combined_data.to_csv(combined_data_path, index=False)
print(f"Combined data saved at: {combined_data_path}")
# Load the saved combined dataset
combined_data = pd.read_csv("/kaggle/working/combined_processed_data.csv")

# Check the dataset structure
print(combined_data.info())
print(combined_data.head())
# Check label distribution
print(combined_data['label'].value_counts())

Combined data saved at: /kaggle/working/combined_processed_data.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3600000 entries, 0 to 3599999
Data columns (total 3 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   review          object
 1   label           int64 
 2   cleaned_review  object
dtypes: int64(1), object(2)
memory usage: 82.4+ MB
None
                                              review  label  \
0  Stuning even for the non-gamer: This sound tra...      2   
1  The best soundtrack ever to anything.: I'm rea...      2   
2  Amazing!: This soundtrack is my favorite music...      2   
3  Excellent Soundtrack: I truly like this soundt...      2   
4  Remember, Pull Your Jaw Off The Floor After He...      2   

                                      cleaned_review  
0  stun non gamer sound track beautiful paint sen...  
1  good soundtrack read lot review say good game ...  
2  amazing soundtrack favorite music time hand in...  
3  excellent soundtrack t

In [4]:
from sklearn.model_selection import train_test_split

# Split the data into train and test sets
X = combined_data['cleaned_review']  # Features (cleaned reviews)
y = combined_data['label']           # Labels (1 for neutral, 2 for positive)

# Perform train-test split (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training samples: {X_train.shape[0]}")
print(f"Testing samples: {X_test.shape[0]}")

Training samples: 2880000
Testing samples: 720000


# FASTTEXT

In [7]:
import pandas as pd
import numpy as np
from gensim.models import FastText
import multiprocessing
from tqdm import tqdm
import logging

# Disable gensim logging to reduce I/O overhead
logging.getLogger('gensim').setLevel(logging.ERROR)

# Parameters
N_CORES = multiprocessing.cpu_count() - 1
VECTOR_SIZE = 50  # Even smaller vectors for speed
WINDOW = 3  # Smaller window
MIN_COUNT = 10  # Higher threshold to reduce vocab size
EPOCHS = 3  # Fewer epochs
SAMPLE_SIZE = 100000  # Subsample if dataset is huge (adjust as needed)

# Assuming X_train and X_test are lists
print("Preparing data...")
# Replace None or NaN with empty string in lists
X_train = ["" if x is None or pd.isna(x) else x for x in X_train]
X_test = ["" if x is None or pd.isna(x) else x for x in X_test]

# Subsample if dataset is too large
if len(X_train) > SAMPLE_SIZE:
    print(f"Subsampling training data to {SAMPLE_SIZE} samples...")
    X_train = np.random.choice(X_train, SAMPLE_SIZE, replace=False).tolist()

# Simple preprocessing and tokenization in one step
print("Tokenizing data...")
X_train_tokenized = [str(text).lower().split() for text in tqdm(X_train)]
X_test_tokenized = [str(text).lower().split() for text in tqdm(X_test)]
print("Tokenization complete!")

# Train a minimal FastText model
print("Training FastText model...")
fasttext_model = FastText(
    vector_size=VECTOR_SIZE,
    window=WINDOW,
    min_count=MIN_COUNT,
    workers=N_CORES,
    sg=1,  # Skip-gram
    hs=0,  # Negative sampling
    negative=5,
    seed=42,
    sample=1e-4,  # Downsample frequent words
)

# Build vocab and train in one go
fasttext_model.build_vocab(corpus_iterable=X_train_tokenized)
fasttext_model.train(
    corpus_iterable=X_train_tokenized,
    total_examples=len(X_train_tokenized),
    epochs=EPOCHS,
)
print("FastText training complete!")

# Optimized vector averaging using numpy directly
def get_document_vector(tokens, model):
    try:
        return np.mean(model.wv[tokens], axis=0)
    except:
        return np.zeros(VECTOR_SIZE)

# Vectorize using list comprehension instead of joblib
print("Transforming data to vectors...")
X_train_vectors = np.array([get_document_vector(tokens, fasttext_model) 
                          for tokens in tqdm(X_train_tokenized)])
X_test_vectors = np.array([get_document_vector(tokens, fasttext_model) 
                         for tokens in tqdm(X_test_tokenized)])
print("Vector transformation complete!")

# Check shape
print(f"Train vectors shape: {X_train_vectors.shape}")
print(f"Test vectors shape: {X_test_vectors.shape}")

Preparing data...
Tokenizing data...


100%|██████████| 100000/100000 [00:01<00:00, 78179.33it/s]
100%|██████████| 720000/720000 [00:13<00:00, 51596.26it/s]


Tokenization complete!
Training FastText model...
FastText training complete!
Transforming data to vectors...


100%|██████████| 100000/100000 [00:18<00:00, 5269.74it/s]
100%|██████████| 720000/720000 [02:21<00:00, 5080.39it/s]


Vector transformation complete!
Train vectors shape: (100000, 50)
Test vectors shape: (720000, 50)


# LightGBM

In [8]:
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, accuracy_score
import psutil
import numpy as np
import multiprocessing

# Function to check memory usage
def check_memory():
    print(f"Memory usage: {psutil.virtual_memory().percent}%")

# Ensure labels are numeric (assuming y_train, y_test are lists from FastText pipeline)
y_train = np.array([int(label) for label in y_train])
y_test = np.array([int(label) for label in y_test])

# If labels need to be 0-based (e.g., originally 1-based), adjust them
# Comment out if your labels are already 0-based
y_train_adjusted = y_train - 1 if np.min(y_train) > 0 else y_train
y_test_adjusted = y_test - 1 if np.min(y_test) > 0 else y_test

# Define LightGBM hyperparameters with reduced iterations and depth
param_grid = {
    'learning_rate': [0.01, 0.05],
    'n_estimators': [50, 100],
    'max_depth': [3, 5],
    'num_leaves': [15, 31]
}

# Stratified K-Fold Cross-Validation (3 folds for efficiency)
kf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Use N_CORES from FastText code for consistency
N_CORES = multiprocessing.cpu_count() - 1

# Iterate over hyperparameters
for lr in param_grid['learning_rate']:
    for iterations in param_grid['n_estimators']:
        for depth in param_grid['max_depth']:
            for num_leaves in param_grid['num_leaves']:
                print(f"Training LightGBM with learning_rate={lr}, iterations={iterations}, depth={depth}, num_leaves={num_leaves}...")
                
                # Initialize the LightGBM model
                lgbm_classifier = lgb.LGBMClassifier(
                    learning_rate=lr,
                    n_estimators=iterations,
                    max_depth=depth,
                    num_leaves=num_leaves,
                    verbose=-1,  # Suppress output
                    n_jobs=N_CORES  # Use same core count as FastText
                )
                
                # Train on full FastText vectors (or subset if too large)
                subset_size = min(10000, X_train_vectors.shape[0])  # Align with FastText output
                lgbm_classifier.fit(X_train_vectors[:subset_size], y_train_adjusted[:subset_size])
                
                # Make predictions
                y_pred_lgbm = lgbm_classifier.predict(X_test_vectors)
                
                # Adjust predictions back to original scale if labels were shifted
                if np.min(y_train) > 0:
                    y_pred_lgbm = y_pred_lgbm + 1
                
                # Performance evaluation
                print(f"LightGBM Classification Report (learning_rate={lr}, iterations={iterations}, depth={depth}, num_leaves={num_leaves}):")
                print(classification_report(y_test, y_pred_lgbm))
                print(f"LightGBM Accuracy: {accuracy_score(y_test, y_pred_lgbm):.4f}\n")
                
                # Check memory usage
                check_memory()

Training LightGBM with learning_rate=0.01, iterations=50, depth=3, num_leaves=15...
LightGBM Classification Report (learning_rate=0.01, iterations=50, depth=3, num_leaves=15):
              precision    recall  f1-score   support

           1       0.50      0.66      0.57    359811
           2       0.49      0.33      0.39    360189

    accuracy                           0.50    720000
   macro avg       0.50      0.50      0.48    720000
weighted avg       0.50      0.50      0.48    720000

LightGBM Accuracy: 0.4959

Memory usage: 45.2%
Training LightGBM with learning_rate=0.01, iterations=50, depth=3, num_leaves=31...
LightGBM Classification Report (learning_rate=0.01, iterations=50, depth=3, num_leaves=31):
              precision    recall  f1-score   support

           1       0.50      0.66      0.57    359811
           2       0.49      0.33      0.39    360189

    accuracy                           0.50    720000
   macro avg       0.50      0.50      0.48    720000
we

# Catboost

In [10]:
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import StratifiedKFold
import numpy as np
import psutil
import multiprocessing

# Function to check memory usage
def check_memory():
    print(f"Memory usage: {psutil.virtual_memory().percent}%")

# Ensure labels are numeric
y_train = np.array([int(label) for label in y_train])
y_test = np.array([int(label) for label in y_test])

# Conditional label adjustment (assuming original labels are 1-based [1, 2])
y_train_adjusted = y_train - 1 if np.min(y_train) > 0 else y_train
y_test_adjusted = y_test - 1 if np.min(y_test) > 0 else y_test

# Define hyperparameter grid
param_grid = {
    'learning_rate': [0.05, 0.1],  # Slightly higher learning rates
    'iterations': [100, 200],  # More iterations
    'depth': [5, 7],  # Deeper trees
    'l2_leaf_reg': [1, 3],
    'border_count': [50, 100]  # More quantization levels
}

# Stratified K-Fold (increase to 5 folds for better validation)
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

N_CORES = multiprocessing.cpu_count() - 1

# Iterate over hyperparameters
for lr in param_grid['learning_rate']:
    for iterations in param_grid['iterations']:
        for depth in param_grid['depth']:
            for l2_leaf_reg in param_grid['l2_leaf_reg']:
                for border_count in param_grid['border_count']:
                    print(f"Training CatBoost with learning_rate={lr}, iterations={iterations}, depth={depth}, l2_leaf_reg={l2_leaf_reg}, border_count={border_count}...")
                    
                    catboost_classifier = CatBoostClassifier(
                        learning_rate=lr,
                        iterations=iterations,
                        depth=depth,
                        l2_leaf_reg=l2_leaf_reg,
                        border_count=border_count,
                        verbose=0,
                        random_state=42,
                        thread_count=N_CORES
                    )
                    
                    # Train on more data (e.g., 50,000 samples or full dataset if memory allows)
                    subset_size = min(50000, X_train_vectors.shape[0])
                    catboost_classifier.fit(X_train_vectors[:subset_size], y_train_adjusted[:subset_size])
                    
                    # Make predictions
                    y_pred_catboost = catboost_classifier.predict(X_test_vectors)
                    if np.min(y_train) > 0:
                        y_pred_catboost = y_pred_catboost + 1
                    
                    # Performance evaluation
                    print(f"CatBoost Classification Report (learning_rate={lr}, iterations={iterations}, depth={depth}, l2_leaf_reg={l2_leaf_reg}, border_count={border_count}):")
                    print(classification_report(y_test, y_pred_catboost))
                    print(f"CatBoost Accuracy: {accuracy_score(y_test, y_pred_catboost):.4f}\n")
                    
                    check_memory()

Training CatBoost with learning_rate=0.05, iterations=100, depth=5, l2_leaf_reg=1, border_count=50...
CatBoost Classification Report (learning_rate=0.05, iterations=100, depth=5, l2_leaf_reg=1, border_count=50):
              precision    recall  f1-score   support

           1       0.48      0.38      0.42    359811
           2       0.48      0.58      0.53    360189

    accuracy                           0.48    720000
   macro avg       0.48      0.48      0.48    720000
weighted avg       0.48      0.48      0.48    720000

CatBoost Accuracy: 0.4803

Memory usage: 46.5%
Training CatBoost with learning_rate=0.05, iterations=100, depth=5, l2_leaf_reg=1, border_count=100...
CatBoost Classification Report (learning_rate=0.05, iterations=100, depth=5, l2_leaf_reg=1, border_count=100):
              precision    recall  f1-score   support

           1       0.46      0.37      0.41    359811
           2       0.47      0.57      0.52    360189

    accuracy                        

# Random forest

In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, accuracy_score
import psutil
import numpy as np
import multiprocessing

# Function to check memory usage
def check_memory():
    print(f"Memory usage: {psutil.virtual_memory().percent}%")

# Ensure labels are numeric (assuming y_train, y_test are lists from FastText pipeline)
y_train = np.array([int(label) for label in y_train])
y_test = np.array([int(label) for label in y_test])

# Conditional label adjustment (assuming original labels are 1-based [1, 2] as per CatBoost output)
y_train_adjusted = y_train - 1 if np.min(y_train) > 0 else y_train
y_test_adjusted = y_test - 1 if np.min(y_test) > 0 else y_test

# Define expanded hyperparameter grid for RandomForest
param_grid = {
    'n_estimators': [100, 200],  # More trees for better learning
    'max_depth': [5, 10],  # Deeper trees
    'min_samples_split': [2, 5]
}

# Stratified K-Fold Cross-Validation (increased to 5 folds for better validation)
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Use N_CORES from FastText code for consistency
N_CORES = multiprocessing.cpu_count() - 1

# Iterate over hyperparameters
for n_estimators in param_grid['n_estimators']:
    for depth in param_grid['max_depth']:
        for min_samples_split in param_grid['min_samples_split']:
            print(f"Training Random Forest with n_estimators={n_estimators}, depth={depth}, min_samples_split={min_samples_split}...")
            
            # Initialize the RandomForest model
            rf_classifier = RandomForestClassifier(
                n_estimators=n_estimators,
                max_depth=depth,
                min_samples_split=min_samples_split,
                n_jobs=N_CORES,  # Use same core count as FastText
                random_state=42
            )
            
            # Train on more data (e.g., 50,000 samples or full dataset if memory allows)
            subset_size = min(50000, X_train_vectors.shape[0])  # Increased from 10,000
            rf_classifier.fit(X_train_vectors[:subset_size], y_train_adjusted[:subset_size])
            
            # Make predictions
            y_pred_rf = rf_classifier.predict(X_test_vectors)
            if np.min(y_train) > 0:
                y_pred_rf = y_pred_rf + 1
            
            # Performance evaluation
            print(f"Random Forest Classification Report (n_estimators={n_estimators}, depth={depth}, min_samples_split={min_samples_split}):")
            print(classification_report(y_test, y_pred_rf))
            print(f"Random Forest Accuracy: {accuracy_score(y_test, y_pred_rf):.4f}\n")
            
            # Check memory usage
            check_memory()

Training Random Forest with n_estimators=100, depth=5, min_samples_split=2...
Random Forest Classification Report (n_estimators=100, depth=5, min_samples_split=2):
              precision    recall  f1-score   support

           1       0.46      0.29      0.35    359811
           2       0.48      0.66      0.56    360189

    accuracy                           0.48    720000
   macro avg       0.47      0.48      0.46    720000
weighted avg       0.47      0.48      0.46    720000

Random Forest Accuracy: 0.4761

Memory usage: 47.4%
Training Random Forest with n_estimators=100, depth=5, min_samples_split=5...
Random Forest Classification Report (n_estimators=100, depth=5, min_samples_split=5):
              precision    recall  f1-score   support

           1       0.45      0.29      0.35    359811
           2       0.48      0.66      0.55    360189

    accuracy                           0.47    720000
   macro avg       0.47      0.47      0.45    720000
weighted avg       0.

# AdaBoost

In [12]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, accuracy_score
import psutil
import numpy as np
import multiprocessing

# Function to check memory usage
def check_memory():
    print(f"Memory usage: {psutil.virtual_memory().percent}%")

# Ensure labels are numeric (assuming y_train, y_test are lists from FastText pipeline)
y_train = np.array([int(label) for label in y_train])
y_test = np.array([int(label) for label in y_test])

# Conditional label adjustment (assuming original labels are 1-based [1, 2] as per previous outputs)
y_train_adjusted = y_train - 1 if np.min(y_train) > 0 else y_train
y_test_adjusted = y_test - 1 if np.min(y_test) > 0 else y_test

# Define expanded hyperparameter grid for AdaBoost
param_grid = {
    'n_estimators': [100, 200],  # More estimators for better boosting
    'learning_rate': [0.1, 0.5]  # Higher learning rates for faster adaptation
}

# Stratified K-Fold Cross-Validation (increased to 5 folds for better validation)
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Use N_CORES from FastText code for consistency (though AdaBoost doesn’t use n_jobs directly)
N_CORES = multiprocessing.cpu_count() - 1

# Iterate over hyperparameters
for n_estimators in param_grid['n_estimators']:
    for learning_rate in param_grid['learning_rate']:
        print(f"Training AdaBoost with n_estimators={n_estimators}, learning_rate={learning_rate}...")
        
        # Initialize the AdaBoost model
        adaboost_classifier = AdaBoostClassifier(
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            random_state=42
        )
        
        # Train on more data (e.g., 50,000 samples or full dataset if memory allows)
        subset_size = min(50000, X_train_vectors.shape[0])  # Increased from 10,000
        adaboost_classifier.fit(X_train_vectors[:subset_size], y_train_adjusted[:subset_size])
        
        # Make predictions
        y_pred_adaboost = adaboost_classifier.predict(X_test_vectors)
        if np.min(y_train) > 0:
            y_pred_adaboost = y_pred_adaboost + 1
        
        # Performance evaluation
        print(f"AdaBoost Classification Report (n_estimators={n_estimators}, learning_rate={learning_rate}):")
        print(classification_report(y_test, y_pred_adaboost))
        print(f"AdaBoost Accuracy: {accuracy_score(y_test, y_pred_adaboost):.4f}\n")
        
        # Check memory usage
        check_memory()

Training AdaBoost with n_estimators=100, learning_rate=0.1...
AdaBoost Classification Report (n_estimators=100, learning_rate=0.1):
              precision    recall  f1-score   support

           1       0.48      0.32      0.38    359811
           2       0.49      0.65      0.56    360189

    accuracy                           0.49    720000
   macro avg       0.49      0.49      0.47    720000
weighted avg       0.49      0.49      0.47    720000

AdaBoost Accuracy: 0.4871

Memory usage: 47.9%
Training AdaBoost with n_estimators=100, learning_rate=0.5...
AdaBoost Classification Report (n_estimators=100, learning_rate=0.5):
              precision    recall  f1-score   support

           1       0.46      0.41      0.43    359811
           2       0.47      0.52      0.49    360189

    accuracy                           0.47    720000
   macro avg       0.46      0.47      0.46    720000
weighted avg       0.46      0.47      0.46    720000

AdaBoost Accuracy: 0.4653

Memory u

# Naive Bayes and SVM

In [14]:
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score
import numpy as np
import psutil
import multiprocessing

# Function to check memory usage
def check_memory():
    print(f"Memory usage: {psutil.virtual_memory().percent}%")

# Ensure labels are numeric (assuming y_train, y_test are lists from FastText pipeline)
y_train = np.array([int(label) for label in y_train])
y_test = np.array([int(label) for label in y_test])

# Conditional label adjustment (assuming original labels are 1-based [1, 2] as per previous outputs)
y_train_adjusted = y_train - 1 if np.min(y_train) > 0 else y_train
y_test_adjusted = y_test - 1 if np.min(y_test) > 0 else y_test

# Use N_CORES from FastText code for consistency (not used here, kept for reference)
N_CORES = multiprocessing.cpu_count() - 1

# Naive Bayes (GaussianNB) training
print("Training Gaussian Naive Bayes...")
nb_classifier = GaussianNB()
# Train on a subset for efficiency
subset_size = min(50000, X_train_vectors.shape[0])
nb_classifier.fit(X_train_vectors[:subset_size], y_train_adjusted[:subset_size])

# Predict
y_pred_nb = nb_classifier.predict(X_test_vectors)
if np.min(y_train) > 0:
    y_pred_nb = y_pred_nb + 1

# Performance evaluation
print("Gaussian Naive Bayes Classification Report:")
print(classification_report(y_test, y_pred_nb))
print(f"Gaussian Naive Bayes Accuracy: {accuracy_score(y_test, y_pred_nb):.4f}\n")
check_memory()

# SVM hyperparameter tuning
c_values = [0.1, 0.5, 1.0, 2.0]
for c_value in c_values:
    print(f"Training SVM with C={c_value}...")
    svm_classifier = LinearSVC(
        C=c_value,
        max_iter=5000,
        random_state=42,
        dual=False  # Faster for dense data with n_samples > n_features
    )
    
    # Train on a subset for efficiency
    subset_size = min(50000, X_train_vectors.shape[0])
    svm_classifier.fit(X_train_vectors[:subset_size], y_train_adjusted[:subset_size])
    
    # Predict
    y_pred_svm = svm_classifier.predict(X_test_vectors)
    if np.min(y_train) > 0:
        y_pred_svm = y_pred_svm + 1
    
    # Performance evaluation
    print(f"SVM Classification Report (C={c_value}):")
    print(classification_report(y_test, y_pred_svm))
    print(f"SVM Accuracy: {accuracy_score(y_test, y_pred_svm):.4f}\n")
    check_memory()

Training Gaussian Naive Bayes...
Gaussian Naive Bayes Classification Report:
              precision    recall  f1-score   support

           1       0.46      0.53      0.49    359811
           2       0.44      0.38      0.41    360189

    accuracy                           0.45    720000
   macro avg       0.45      0.45      0.45    720000
weighted avg       0.45      0.45      0.45    720000

Gaussian Naive Bayes Accuracy: 0.4525

Memory usage: 47.8%
Training SVM with C=0.1...
SVM Classification Report (C=0.1):
              precision    recall  f1-score   support

           1       0.46      0.40      0.43    359811
           2       0.47      0.53      0.50    360189

    accuracy                           0.47    720000
   macro avg       0.47      0.47      0.46    720000
weighted avg       0.47      0.47      0.46    720000

SVM Accuracy: 0.4667

Memory usage: 47.7%
Training SVM with C=0.5...
SVM Classification Report (C=0.5):
              precision    recall  f1-score 

# XG Boost

In [15]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import StratifiedKFold
import numpy as np
import psutil
import multiprocessing

# Function to check memory usage
def check_memory():
    print(f"Memory usage: {psutil.virtual_memory().percent}%")

# Ensure labels are numeric (assuming y_train, y_test are lists from FastText pipeline)
y_train = np.array([int(label) for label in y_train])
y_test = np.array([int(label) for label in y_test])

# Conditional label adjustment (assuming original labels are 1-based [1, 2] as per previous outputs)
y_train_adjusted = y_train - 1 if np.min(y_train) > 0 else y_train
y_test_adjusted = y_test - 1 if np.min(y_test) > 0 else y_test

# Define hyperparameter grid for XGBoost
param_grid = {
    'learning_rate': [0.05, 0.1],  # Focus on moderate to higher rates
    'n_estimators': [100, 200],    # More trees for better learning
    'max_depth': [3, 5],           # Moderate depths
    'min_child_weight': [1, 5],    # Control overfitting
    'gamma': [0, 0.1]             # Minimum loss reduction
}

# Stratified K-Fold Cross-Validation (increased to 5 folds for better validation)
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Use N_CORES from FastText code for consistency
N_CORES = multiprocessing.cpu_count() - 1

# Iterate over hyperparameters
for lr in param_grid['learning_rate']:
    for n_est in param_grid['n_estimators']:
        for max_depth in param_grid['max_depth']:
            for min_child_weight in param_grid['min_child_weight']:
                for gamma in param_grid['gamma']:
                    print(f"Training XGBoost with learning_rate={lr}, n_estimators={n_est}, max_depth={max_depth}, min_child_weight={min_child_weight}, gamma={gamma}...")
                    
                    # Initialize the XGBoost model
                    xgb_classifier = XGBClassifier(
                        learning_rate=lr,
                        n_estimators=n_est,
                        max_depth=max_depth,
                        min_child_weight=min_child_weight,
                        gamma=gamma,
                        use_label_encoder=False,  # Avoid deprecation warning
                        eval_metric='logloss',    # For binary classification
                        tree_method='hist',       # Faster histogram-based method
                        n_jobs=N_CORES,           # Parallelize across cores
                        random_state=42
                    )
                    
                    # Train on more data (e.g., 50,000 samples or full dataset if memory allows)
                    subset_size = min(50000, X_train_vectors.shape[0])  # Increased from 10,000
                    xgb_classifier.fit(X_train_vectors[:subset_size], y_train_adjusted[:subset_size])
                    
                    # Make predictions
                    y_pred_xgb = xgb_classifier.predict(X_test_vectors)
                    if np.min(y_train) > 0:
                        y_pred_xgb = y_pred_xgb + 1
                    
                    # Performance evaluation
                    print(f"XGBoost Classification Report (learning_rate={lr}, n_estimators={n_est}, max_depth={max_depth}, min_child_weight={min_child_weight}, gamma={gamma}):")
                    print(classification_report(y_test, y_pred_xgb))
                    print(f"XGBoost Accuracy: {accuracy_score(y_test, y_pred_xgb):.4f}\n")
                    
                    # Check memory usage
                    check_memory()

Training XGBoost with learning_rate=0.05, n_estimators=100, max_depth=3, min_child_weight=1, gamma=0...
XGBoost Classification Report (learning_rate=0.05, n_estimators=100, max_depth=3, min_child_weight=1, gamma=0):
              precision    recall  f1-score   support

           1       0.48      0.39      0.43    359811
           2       0.48      0.57      0.53    360189

    accuracy                           0.48    720000
   macro avg       0.48      0.48      0.48    720000
weighted avg       0.48      0.48      0.48    720000

XGBoost Accuracy: 0.4805

Memory usage: 47.9%
Training XGBoost with learning_rate=0.05, n_estimators=100, max_depth=3, min_child_weight=1, gamma=0.1...
XGBoost Classification Report (learning_rate=0.05, n_estimators=100, max_depth=3, min_child_weight=1, gamma=0.1):
              precision    recall  f1-score   support

           1       0.48      0.39      0.43    359811
           2       0.48      0.57      0.53    360189

    accuracy               

# Logistic Regression

# MLP Classifier