In [2]:
import fireducks.pandas as pd
import os

# Path to the extracted chunk files (from your Kaggle dataset structure)
extracted_chunks_path = "/kaggle/input/processed-chunks-1"  # Adjust if the path differs

# Combine all chunk files
all_chunks = []
for file_name in sorted(os.listdir(extracted_chunks_path)):  # Ensure files are combined in order
    if file_name.startswith("processed_chunk_") and file_name.endswith(".csv"):
        file_path = os.path.join(extracted_chunks_path, file_name)
        print(f"Loading {file_name}...")
        chunk = pd.read_csv(file_path)
        all_chunks.append(chunk)

# Concatenate all chunks into a single DataFrame
combined_data = pd.concat(all_chunks, ignore_index=True)

# Display combined data info
print("Combined data shape:", combined_data.shape)


Loading processed_chunk_0_50000.csv...
Loading processed_chunk_1000000_1050000.csv...
Loading processed_chunk_100000_150000.csv...
Loading processed_chunk_1050000_1100000.csv...
Loading processed_chunk_1100000_1150000.csv...
Loading processed_chunk_1150000_1200000.csv...
Loading processed_chunk_1200000_1250000.csv...
Loading processed_chunk_1250000_1300000.csv...
Loading processed_chunk_1300000_1350000.csv...
Loading processed_chunk_1350000_1400000.csv...
Loading processed_chunk_1400000_1450000.csv...
Loading processed_chunk_1450000_1500000.csv...
Loading processed_chunk_1500000_1550000.csv...
Loading processed_chunk_150000_200000.csv...
Loading processed_chunk_1550000_1600000.csv...
Loading processed_chunk_1600000_1650000.csv...
Loading processed_chunk_1650000_1700000.csv...
Loading processed_chunk_1700000_1750000.csv...
Loading processed_chunk_1750000_1800000.csv...
Loading processed_chunk_1800000_1850000.csv...
Loading processed_chunk_1850000_1900000.csv...
Loading processed_chunk_1

In [3]:
# Save the combined dataset as a CSV for future use
combined_data_path = "/kaggle/working/combined_processed_data.csv"
combined_data.to_csv(combined_data_path, index=False)
print(f"Combined data saved at: {combined_data_path}")
# Load the saved combined dataset
combined_data = pd.read_csv("/kaggle/working/combined_processed_data.csv")

# Check the dataset structure
print(combined_data.info())
print(combined_data.head())
# Check label distribution
print(combined_data['label'].value_counts())

Combined data saved at: /kaggle/working/combined_processed_data.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3600000 entries, 0 to 3599999
Data columns (total 3 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   review          object
 1   label           int64 
 2   cleaned_review  object
dtypes: int64(1), object(2)
memory usage: 82.4+ MB
None
                                              review  label  \
0  Stuning even for the non-gamer: This sound tra...      2   
1  The best soundtrack ever to anything.: I'm rea...      2   
2  Amazing!: This soundtrack is my favorite music...      2   
3  Excellent Soundtrack: I truly like this soundt...      2   
4  Remember, Pull Your Jaw Off The Floor After He...      2   

                                      cleaned_review  
0  stun non gamer sound track beautiful paint sen...  
1  good soundtrack read lot review say good game ...  
2  amazing soundtrack favorite music time hand in...  
3  excellent soundtrack t

In [4]:
from sklearn.model_selection import train_test_split

# Split the data into train and test sets
X = combined_data['cleaned_review']  # Features (cleaned reviews)
y = combined_data['label']           # Labels (1 for neutral, 2 for positive)

# Perform train-test split (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training samples: {X_train.shape[0]}")
print(f"Testing samples: {X_test.shape[0]}")

Training samples: 2880000
Testing samples: 720000


# FASTTEXT

In [5]:
import numpy as np
import pandas as pd
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
import lightgbm as lgb
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.model_selection import KFold
from scipy.stats import ttest_rel
import psutil
import multiprocessing
from copy import deepcopy  # To ensure fresh model instances

In [10]:
import numpy as np
import pandas as pd
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
import lightgbm as lgb
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.model_selection import KFold
from scipy.stats import ttest_rel
import psutil
import multiprocessing

# Function to check memory usage
def check_memory():
    print(f"Memory usage: {psutil.virtual_memory().percent}%")

# Ensure labels are numeric and 0-based
y_train = np.array([int(label) for label in y_train])
y_test = np.array([int(label) for label in y_test])
y_train_adjusted = y_train - 1 if np.min(y_train) > 0 else y_train
y_test_adjusted = y_test - 1 if np.min(y_test) > 0 else y_test

# Define models
N_CORES = multiprocessing.cpu_count() - 1
models = {
    'SVM': LinearSVC(C=1.0, max_iter=5000, dual=False, random_state=42),
    'Logistic Regression': LogisticRegression(C=1.0, max_iter=5000, n_jobs=N_CORES, random_state=42),
    'Gaussian NB': GaussianNB(),
    'MLP Classifier': MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, learning_rate_init=0.01, random_state=42),
    'XGBoost': XGBClassifier(n_estimators=200, max_depth=5, learning_rate=0.1, n_jobs=N_CORES, use_label_encoder=False, eval_metric='logloss', tree_method='hist', random_state=42),
    'LightGBM': lgb.LGBMClassifier(n_estimators=200, max_depth=5, n_jobs=N_CORES, random_state=42, verbose=-1)
}

# Step 1: Train models and find top 2 by accuracy
accuracies = {}
print("Training and evaluating models on test set...")
for name, model in models.items():
    subset_size = min(50000, X_train_vectors.shape[0])
    model.fit(X_train_vectors[:subset_size], y_train_adjusted[:subset_size])
    y_pred = model.predict(X_test_vectors)
    if np.min(y_train) > 0:
        y_pred = y_pred + 1
    acc = accuracy_score(y_test, y_pred)
    accuracies[name] = acc
    print(f"{name} Accuracy: {acc:.4f}")
check_memory()

# Identify top 2 models
top_models = sorted(accuracies.items(), key=lambda x: x[1], reverse=True)[:2]
print("\nTop 2 models by accuracy:")
for name, acc in top_models:
    print(f"{name}: {acc:.4f}")

# Step 2: 10-Fold Cross-Validation
kf = KFold(n_splits=10, shuffle=True, random_state=42)
metrics = ['accuracy', 'f1', 'roc_auc']
results = {name: {metric: [] for metric in metrics} for name in models.keys()}

print("\nPerforming 10-fold cross-validation...")
for name, model in models.items():
    print(f"Cross-validating {name}...")
    fold_results = {metric: [] for metric in metrics}  # Reset per model
    for train_idx, val_idx in kf.split(X_train_vectors):
        X_train_fold, X_val_fold = X_train_vectors[train_idx], X_train_vectors[val_idx]
        y_train_fold, y_val_fold = y_train_adjusted[train_idx], y_train_adjusted[val_idx]
        subset_size = min(50000, len(train_idx))  # Adjust subset per fold
        model.fit(X_train_fold[:subset_size], y_train_fold[:subset_size])
        y_pred_fold = model.predict(X_val_fold)
        if np.min(y_train) > 0:
            y_pred_fold = y_pred_fold + 1
            y_val_fold = y_val_fold + 1
        fold_results['accuracy'].append(accuracy_score(y_val_fold, y_pred_fold))
        fold_results['f1'].append(f1_score(y_val_fold, y_pred_fold, average='weighted'))
        if hasattr(model, "predict_proba"):
            y_prob_fold = model.predict_proba(X_val_fold)[:, 1]
            fold_results['roc_auc'].append(roc_auc_score(y_val_fold, y_prob_fold))
        else:
            fold_results['roc_auc'].append(np.nan)
    # Assign fold results to main results dict
    for metric in metrics:
        results[name][metric] = fold_results[metric]

# Format cross-validation results
print("\n### 1. Perform K-Fold Cross-Validation and Report Detailed Performance Metrics")
print("Table: Performance of Models with FastText Features (10-Fold Cross-Validation)")
print("| Model              | Accuracy (Mean ± SD) | F1-Score (Mean ± SD) | AUC-ROC (Mean ± SD) |")
print("|--------------------|---------------------|---------------------|--------------------|")
for name in models.keys():
    acc_mean, acc_std = np.mean(results[name]['accuracy']), np.std(results[name]['accuracy'])
    f1_mean, f1_std = np.mean(results[name]['f1']), np.std(results[name]['f1'])
    auc_mean, auc_std = np.mean(results[name]['roc_auc']), np.std(results[name]['roc_auc'])
    print(f"| {name:<18} | {acc_mean:.2f} ± {acc_std:.2f}         | {f1_mean:.2f} ± {f1_std:.2f}         | {auc_mean:.2f} ± {auc_std:.2f}        |")

# Step 3: Statistical Significance Tests
top_model_1_name, top_model_2_name = top_models[0][0], top_models[1][0]
top_model_1_scores = results[top_model_1_name]['accuracy']
top_model_2_scores = results[top_model_2_name]['accuracy']

print("\n### 2. Conduct Statistical Significance Tests and Present p-Values")
print(f"Table: Statistical Significance of {top_model_1_name} and {top_model_2_name} vs. Other Models (FastText)")
print("| Comparison                  | p-value (Accuracy) | Significant? |")
print("|-----------------------------|-------------------|--------------|")
for name in models.keys():
    if name != top_model_1_name:
        other_scores = results[name]['accuracy']
        t_stat, p_value = ttest_rel(top_model_1_scores, other_scores)
        significant = "Yes" if p_value < 0.05 else "No"
        print(f"| {top_model_1_name} vs. {name:<15} | {p_value:.3f}             | {significant:<12} |")
for name in models.keys():
    if name != top_model_2_name and name != top_model_1_name:
        other_scores = results[name]['accuracy']
        t_stat, p_value = ttest_rel(top_model_2_scores, other_scores)
        significant = "Yes" if p_value < 0.05 else "No"
        print(f"| {top_model_2_name} vs. {name:<15} | {p_value:.3f}             | {significant:<12} |")
check_memory()

Training and evaluating models on test set...
SVM Accuracy: 0.8511
Logistic Regression Accuracy: 0.8504
Gaussian NB Accuracy: 0.7619
MLP Classifier Accuracy: 0.8537
XGBoost Accuracy: 0.8498
LightGBM Accuracy: 0.8500
Memory usage: 54.6%

Top 2 models by accuracy:
MLP Classifier: 0.8537
SVM: 0.8511

Performing 10-fold cross-validation...
Cross-validating SVM...
Cross-validating Logistic Regression...
Cross-validating Gaussian NB...
Cross-validating MLP Classifier...
Cross-validating XGBoost...
Cross-validating LightGBM...

### 1. Perform K-Fold Cross-Validation and Report Detailed Performance Metrics
Table: Performance of Models with FastText Features (10-Fold Cross-Validation)
| Model              | Accuracy (Mean ± SD) | F1-Score (Mean ± SD) | AUC-ROC (Mean ± SD) |
|--------------------|---------------------|---------------------|--------------------|
| SVM                | 0.85 ± 0.00         | 0.85 ± 0.00         | nan ± nan        |
| Logistic Regression | 0.85 ± 0.00         | 0.85

In [15]:
import fasttext
import pandas as pd
import numpy as np
from tqdm import tqdm
import multiprocessing
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
import lightgbm as lgb
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.model_selection import KFold
from scipy.stats import ttest_rel
import psutil
from copy import deepcopy
import random
import os

# Function to check memory usage
def check_memory():
    print(f"Memory usage: {psutil.virtual_memory().percent}%")

# --- Load Preprocessed Data from processed-chunks-1 ---
processed_dir = "/kaggle/input/processed-chunks-1/"
print("Loading preprocessed data from processed-chunks-1...")

# List all CSV files in the directory
csv_files = sorted([f for f in os.listdir(processed_dir) if f.endswith('.csv')])
print(f"Found {len(csv_files)} CSV files: {csv_files[:5]}...")  # Show first 5 for verification

# Load and concatenate chunks
dataframes = []
for csv_file in tqdm(csv_files, desc="Loading CSV files"):
    file_path = os.path.join(processed_dir, csv_file)
    df = pd.read_csv(file_path)
    dataframes.append(df)

# Concatenate all chunks into one DataFrame
data = pd.concat(dataframes, ignore_index=True)
print(f"Total rows loaded: {len(data)}")

# Extract text and labels
text_column = 'cleaned_review'  # Use preprocessed text
label_column = 'label'         # Use label column
X = data[text_column].fillna("")  # Cleaned review text
y = data[label_column].values     # Labels (1 or 2)

# Split into train (2.88M) and test (720K)
X_train_processed = X[:2880000].tolist()
X_test_processed = X[2880000:2880000 + 720000].tolist()
y_train = y[:2880000]
y_test = y[2880000:2880000 + 720000]
print(f"Train samples: {len(X_train_processed)}, Test samples: {len(X_test_processed)}")

# Save training data for FastText unsupervised training
train_unsupervised_file = "/kaggle/working/train_unsupervised.txt"
print("Writing unsupervised training data to file...")
with open(train_unsupervised_file, "w", encoding="utf-8") as f:
    f.write("\n".join(f"__label__dummy {text}" for text in X_train_processed) + "\n")

# Train FastText model
print("Training FastText model...")
N_CORES = max(1, multiprocessing.cpu_count() // 2)
model = fasttext.train_unsupervised(
    train_unsupervised_file, model="skipgram", dim=50, epoch=1, minCount=5, thread=N_CORES, lr=0.2, bucket=500000
)
print("FastText training complete!")

# Generate embeddings
def get_fasttext_vector(text):
    return model.get_sentence_vector(text)

print("Converting text to FastText embeddings...")
X_train_fasttext = [get_fasttext_vector(text) for text in tqdm(X_train_processed, desc="Embedding X_train")]
X_test_fasttext = [get_fasttext_vector(text) for text in tqdm(X_test_processed, desc="Embedding X_test")]
print("FastText transformation complete!")

X_train_vectors = np.array(X_train_fasttext)
X_test_vectors = np.array(X_test_fasttext)
print(f"Train FastText shape: {X_train_vectors.shape}")
print(f"Test FastText shape: {X_test_vectors.shape}")

# --- Model Evaluation ---
# Adjust labels from {1, 2} to {0, 1}
y_train_adjusted = y_train - 1  # amazonreviews uses 1=negative, 2=positive
y_test_adjusted = y_test - 1

N_CORES = multiprocessing.cpu_count() - 1

model_templates = {
    'SVM': LinearSVC(C=1.0, max_iter=5000, dual=False, random_state=42),
    'Logistic Regression': LogisticRegression(C=1.0, max_iter=5000, n_jobs=N_CORES, random_state=42),
    'Gaussian NB': GaussianNB(),
    'MLP Classifier': MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, learning_rate_init=0.01, random_state=42),
    'XGBoost': XGBClassifier(n_estimators=200, max_depth=5, learning_rate=0.1, n_jobs=N_CORES, use_label_encoder=False, eval_metric='logloss', tree_method='hist', random_state=42),
    'LightGBM': lgb.LGBMClassifier(n_estimators=200, max_depth=5, n_jobs=N_CORES, random_state=42, verbose=-1)
}

# Step 1: Train and evaluate on test set
accuracies = {}
print("Training and evaluating models on test set...")
for name, model_template in model_templates.items():
    model = deepcopy(model_template)
    subset_idx = random.sample(range(len(X_train_vectors)), min(50000, len(X_train_vectors)))
    X_train_subset = X_train_vectors[subset_idx]
    y_train_subset = y_train_adjusted[subset_idx]
    model.fit(X_train_subset, y_train_subset)
    y_pred = model.predict(X_test_vectors)
    y_pred_adjusted = y_pred + 1  # Adjust back to {1, 2} for evaluation
    acc = accuracy_score(y_test, y_pred_adjusted)
    accuracies[name] = acc
    print(f"{name} Accuracy: {acc:.4f}")
check_memory()

top_models = sorted(accuracies.items(), key=lambda x: x[1], reverse=True)[:2]
print("\nTop 2 models by accuracy:")
for name, acc in top_models:
    print(f"{name}: {acc:.4f}")

# Step 2: 10-Fold Cross-Validation
kf = KFold(n_splits=10, shuffle=True, random_state=42)
metrics = ['accuracy', 'f1', 'roc_auc']
results = {name: {metric: [] for metric in metrics} for name in model_templates.keys()}

print("\nPerforming 10-fold cross-validation...")
for name, model_template in model_templates.items():
    print(f"Cross-validating {name}...")
    for train_idx, val_idx in kf.split(X_train_vectors):
        X_train_fold = X_train_vectors[train_idx]
        y_train_fold = y_train_adjusted[train_idx]
        X_val_fold = X_train_vectors[val_idx]
        y_val_fold = y_train_adjusted[val_idx]
        
        subset_idx = random.sample(list(train_idx), min(50000, len(train_idx)))
        X_train_subset = X_train_vectors[subset_idx]
        y_train_subset = y_train_adjusted[subset_idx]
        
        model = deepcopy(model_template)
        model.fit(X_train_subset, y_train_subset)
        y_pred_fold = model.predict(X_val_fold)
        
        y_pred_fold_adjusted = y_pred_fold + 1
        y_val_fold_adjusted = y_val_fold + 1
        
        results[name]['accuracy'].append(accuracy_score(y_val_fold_adjusted, y_pred_fold_adjusted))
        results[name]['f1'].append(f1_score(y_val_fold_adjusted, y_pred_fold_adjusted, average='weighted'))
        if hasattr(model, "predict_proba"):
            y_prob_fold = model.predict_proba(X_val_fold)[:, 1]
            results[name]['roc_auc'].append(roc_auc_score(y_val_fold, y_prob_fold))
        else:
            results[name]['roc_auc'].append(np.nan)

# Display CV results
print("\n### 1. Perform K-Fold Cross-Validation and Report Detailed Performance Metrics")
print("Table: Performance of Models with FastText Features (10-Fold Cross-Validation)")
print("| Model              | Accuracy (Mean ± SD) | F1-Score (Mean ± SD) | AUC-ROC (Mean ± SD) |")
print("|--------------------|---------------------|---------------------|--------------------|")
for name in model_templates.keys():
    acc_mean, acc_std = np.mean(results[name]['accuracy']), np.std(results[name]['accuracy'])
    f1_mean, f1_std = np.mean(results[name]['f1']), np.std(results[name]['f1'])
    auc_mean, auc_std = np.mean(results[name]['roc_auc']), np.std(results[name]['roc_auc'])
    print(f"| {name:<18} | {acc_mean:.2f} ± {acc_std:.2f}         | {f1_mean:.2f} ± {f1_std:.2f}         | {auc_mean:.2f} ± {auc_std:.2f}        |")

# Step 3: Statistical Significance Tests
top_model_1_name, top_model_2_name = top_models[0][0], top_models[1][0]
top_model_1_scores = results[top_model_1_name]['accuracy']
top_model_2_scores = results[top_model_2_name]['accuracy']

print("\n### 2. Conduct Statistical Significance Tests and Present p-Values")
print(f"Table: Statistical Significance of {top_model_1_name} and {top_model_2_name} vs. Other Models (FastText)")
print("| Comparison                  | p-value (Accuracy) | Significant? |")
print("|-----------------------------|-------------------|--------------|")
for name in model_templates.keys():
    if name != top_model_1_name:
        other_scores = results[name]['accuracy']
        t_stat, p_value = ttest_rel(top_model_1_scores, other_scores)
        significant = "Yes" if p_value < 0.05 else "No"
        print(f"| {top_model_1_name} vs. {name:<15} | {p_value:.3f}             | {significant:<12} |")
for name in model_templates.keys():
    if name != top_model_2_name and name != top_model_1_name:
        other_scores = results[name]['accuracy']
        t_stat, p_value = ttest_rel(top_model_2_scores, other_scores)
        significant = "Yes" if p_value < 0.05 else "No"
        print(f"| {top_model_2_name} vs. {name:<15} | {p_value:.3f}             | {significant:<12} |")
check_memory()

Loading preprocessed data from processed-chunks-1...
Found 72 CSV files: ['processed_chunk_0_50000.csv', 'processed_chunk_1000000_1050000.csv', 'processed_chunk_100000_150000.csv', 'processed_chunk_1050000_1100000.csv', 'processed_chunk_1100000_1150000.csv']...


Loading CSV files: 100%|██████████| 72/72 [00:31<00:00,  2.31it/s]


Total rows loaded: 3600000
Train samples: 2880000, Test samples: 720000
Writing unsupervised training data to file...
Training FastText model...
FastText training complete!
Converting text to FastText embeddings...


Embedding X_train: 100%|██████████| 2880000/2880000 [06:18<00:00, 7618.70it/s]
Embedding X_test: 100%|██████████| 720000/720000 [01:34<00:00, 7631.50it/s]


FastText transformation complete!
Train FastText shape: (2880000, 50)
Test FastText shape: (720000, 50)
Training and evaluating models on test set...
SVM Accuracy: 0.8427
Logistic Regression Accuracy: 0.8419
Gaussian NB Accuracy: 0.7440
MLP Classifier Accuracy: 0.8484
XGBoost Accuracy: 0.8418
LightGBM Accuracy: 0.8421
Memory usage: 79.2%

Top 2 models by accuracy:
MLP Classifier: 0.8484
SVM: 0.8427

Performing 10-fold cross-validation...
Cross-validating SVM...
Cross-validating Logistic Regression...
Cross-validating Gaussian NB...
Cross-validating MLP Classifier...
Cross-validating XGBoost...
Cross-validating LightGBM...

### 1. Perform K-Fold Cross-Validation and Report Detailed Performance Metrics
Table: Performance of Models with FastText Features (10-Fold Cross-Validation)
| Model              | Accuracy (Mean ± SD) | F1-Score (Mean ± SD) | AUC-ROC (Mean ± SD) |
|--------------------|---------------------|---------------------|--------------------|
| SVM                | 0.84 ± 0.