In [2]:
import fireducks.pandas as pd
import os

# Path to the extracted chunk files (from your Kaggle dataset structure)
extracted_chunks_path = "/kaggle/input/processed-chunks-1"  # Adjust if the path differs

# Combine all chunk files
all_chunks = []
for file_name in sorted(os.listdir(extracted_chunks_path)):  # Ensure files are combined in order
    if file_name.startswith("processed_chunk_") and file_name.endswith(".csv"):
        file_path = os.path.join(extracted_chunks_path, file_name)
        print(f"Loading {file_name}...")
        chunk = pd.read_csv(file_path)
        all_chunks.append(chunk)

# Concatenate all chunks into a single DataFrame
combined_data = pd.concat(all_chunks, ignore_index=True)

# Display combined data info
print("Combined data shape:", combined_data.shape)


Loading processed_chunk_0_50000.csv...
Loading processed_chunk_1000000_1050000.csv...
Loading processed_chunk_100000_150000.csv...
Loading processed_chunk_1050000_1100000.csv...
Loading processed_chunk_1100000_1150000.csv...
Loading processed_chunk_1150000_1200000.csv...
Loading processed_chunk_1200000_1250000.csv...
Loading processed_chunk_1250000_1300000.csv...
Loading processed_chunk_1300000_1350000.csv...
Loading processed_chunk_1350000_1400000.csv...
Loading processed_chunk_1400000_1450000.csv...
Loading processed_chunk_1450000_1500000.csv...
Loading processed_chunk_1500000_1550000.csv...
Loading processed_chunk_150000_200000.csv...
Loading processed_chunk_1550000_1600000.csv...
Loading processed_chunk_1600000_1650000.csv...
Loading processed_chunk_1650000_1700000.csv...
Loading processed_chunk_1700000_1750000.csv...
Loading processed_chunk_1750000_1800000.csv...
Loading processed_chunk_1800000_1850000.csv...
Loading processed_chunk_1850000_1900000.csv...
Loading processed_chunk_1

In [3]:
# Save the combined dataset as a CSV for future use
combined_data_path = "/kaggle/working/combined_processed_data.csv"
combined_data.to_csv(combined_data_path, index=False)
print(f"Combined data saved at: {combined_data_path}")
# Load the saved combined dataset
combined_data = pd.read_csv("/kaggle/working/combined_processed_data.csv")

# Check the dataset structure
print(combined_data.info())
print(combined_data.head())
# Check label distribution
print(combined_data['label'].value_counts())

Combined data saved at: /kaggle/working/combined_processed_data.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3600000 entries, 0 to 3599999
Data columns (total 3 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   review          object
 1   label           int64 
 2   cleaned_review  object
dtypes: int64(1), object(2)
memory usage: 82.4+ MB
None
                                              review  label  \
0  Stuning even for the non-gamer: This sound tra...      2   
1  The best soundtrack ever to anything.: I'm rea...      2   
2  Amazing!: This soundtrack is my favorite music...      2   
3  Excellent Soundtrack: I truly like this soundt...      2   
4  Remember, Pull Your Jaw Off The Floor After He...      2   

                                      cleaned_review  
0  stun non gamer sound track beautiful paint sen...  
1  good soundtrack read lot review say good game ...  
2  amazing soundtrack favorite music time hand in...  
3  excellent soundtrack t

In [4]:
from sklearn.model_selection import train_test_split

# Split the data into train and test sets
X = combined_data['cleaned_review']  # Features (cleaned reviews)
y = combined_data['label']           # Labels (1 for neutral, 2 for positive)

# Perform train-test split (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training samples: {X_train.shape[0]}")
print(f"Testing samples: {X_test.shape[0]}")


Training samples: 2880000
Testing samples: 720000


# Transformer model

In [5]:
# from gensim.models import Word2Vec, FastText
# from sklearn.model_selection import train_test_split
# import numpy as np
# import multiprocessing
# import gc

# # Ensure all reviews are strings before splitting and handle missing values
# combined_data['cleaned_review'] = combined_data['cleaned_review'].fillna('').astype(str)
# X = combined_data['cleaned_review']
# y = combined_data['label']

# # Reduce dataset size to prevent memory issues
# sample_size = min(500000, len(X))  # Limit to 500k samples if dataset is larger
# X_sampled, _, y_sampled, _ = train_test_split(X, y, train_size=sample_size, random_state=42)

# # Split into train and test sets
# X_train, X_test, y_train, y_test = train_test_split(X_sampled, y_sampled, test_size=0.2, random_state=42)

# # Tokenize the text data
# sentences_train = [review.split() for review in X_train]
# sentences_test = [review.split() for review in X_test]

# # Optimize training parameters for speed and memory
# num_workers = multiprocessing.cpu_count() // 2  # Use half of available CPU cores
# vector_size = 50  # Reduce size to optimize speed and memory
# min_count = 5  # Ignore words that appear less frequently
# window = 4  # Slightly smaller context window
# epochs = 3  # Reduce epochs to save memory

# # Train Word2Vec Model
# w2v_model = Word2Vec(sentences_train, vector_size=vector_size, window=window, min_count=min_count, workers=num_workers)
# w2v_model.train(sentences_train, total_examples=len(sentences_train), epochs=epochs)

# # Train FastText Model
# ft_model = FastText(sentences_train, vector_size=vector_size, window=window, min_count=min_count, workers=num_workers)
# ft_model.train(sentences_train, total_examples=len(sentences_train), epochs=epochs)

# # Function to convert reviews to vectors by averaging word embeddings
# def get_avg_word_vector(words, model, vector_size):
#     vectors = [model.wv[word] for word in words if word in model.wv]
#     return np.mean(vectors, axis=0) if vectors else np.zeros(vector_size)

# # Convert train and test sets into vectors
# X_train_w2v = np.array([get_avg_word_vector(review, w2v_model, vector_size) for review in sentences_train], dtype=np.float32)
# X_test_w2v = np.array([get_avg_word_vector(review, w2v_model, vector_size) for review in sentences_test], dtype=np.float32)

# X_train_ft = np.array([get_avg_word_vector(review, ft_model, vector_size) for review in sentences_train], dtype=np.float32)
# X_test_ft = np.array([get_avg_word_vector(review, ft_model, vector_size) for review in sentences_test], dtype=np.float32)

# # Free memory
# gc.collect()

# print("Feature engineering complete.")

from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
import numpy as np
import gc

# Load SBERT model (optimized for speed and accuracy)
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')  # Small, fast model

# Ensure all reviews are strings before splitting and handle missing values
combined_data['cleaned_review'] = combined_data['cleaned_review'].fillna('').astype(str)
X = combined_data['cleaned_review']
y = combined_data['label']

# Reduce dataset size to prevent memory issues
sample_size = min(500000, len(X))  # Limit to 500k samples if dataset is larger
X_sampled, _, y_sampled, _ = train_test_split(X, y, train_size=sample_size, random_state=42)

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_sampled, y_sampled, test_size=0.2, random_state=42)

# Compute SBERT embeddings for train and test sets
X_train_sbert = np.array(sbert_model.encode(X_train.tolist(), batch_size=32, convert_to_numpy=True, normalize_embeddings=True))
X_test_sbert = np.array(sbert_model.encode(X_test.tolist(), batch_size=32, convert_to_numpy=True, normalize_embeddings=True))

# Free memory
gc.collect()

print("SBERT feature engineering complete.")


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/12500 [00:00<?, ?it/s]

Batches:   0%|          | 0/3125 [00:00<?, ?it/s]

SBERT feature engineering complete.


# XG Boost

# LightGBM

In [6]:
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, accuracy_score
import psutil
import numpy as np

# Function to check memory usage
def check_memory():
    print(f"Memory usage: {psutil.virtual_memory().percent}%")

# Adjust target labels to be 0-based
y_train_adjusted = y_train - 1
y_test_adjusted = y_test - 1

# Convert labels to numpy arrays
y_train_adjusted = np.array(y_train_adjusted)
y_test_adjusted = np.array(y_test_adjusted)

# Define hyperparameter grid for LightGBM (with smaller iterations)
learning_rates = [0.01, 0.05]
iterations_values = [50, 100]  # Reduced iterations
depth_values = [3, 5]
num_leaves_values = [15, 31]

# Cross-validation setup (reduced folds to save memory)
kf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)  # 3 folds

# Iterate over hyperparameters
for lr in learning_rates:
    for iterations in iterations_values:
        for depth in depth_values:
            for num_leaves in num_leaves_values:
                print(f"Training LightGBM classifier with learning_rate={lr}, iterations={iterations}, depth={depth}, num_leaves={num_leaves}...")

                # Set up the LightGBM model with the specified hyperparameters
                lgbm_classifier = lgb.LGBMClassifier(
                    learning_rate=lr,
                    n_estimators=iterations,
                    max_depth=depth,
                    num_leaves=num_leaves,
                    verbose=-1,
                    n_jobs=-1  # Use all available CPU cores
                )

                # Cross-validation setup
                lgbm_classifier.fit(X_train_sbert[:10000], y_train_adjusted[:10000])  # Use a smaller subset
                y_pred_lgbm = lgbm_classifier.predict(X_test_sbert)

                # Adjust predictions back to the original label scale for reporting
                y_pred_lgbm_original = y_pred_lgbm + 1

                # Performance reporting
                print(f"LightGBM Classification Report for learning_rate={lr}, iterations={iterations}, depth={depth}, num_leaves={num_leaves}:")
                print(classification_report(y_test, y_pred_lgbm_original))
                accuracy = accuracy_score(y_test, y_pred_lgbm_original)
                print(f"LightGBM Accuracy: {accuracy:.4f}\n")

                # Check memory usage after each iteration
                check_memory()


Training LightGBM classifier with learning_rate=0.01, iterations=50, depth=3, num_leaves=15...
LightGBM Classification Report for learning_rate=0.01, iterations=50, depth=3, num_leaves=15:
              precision    recall  f1-score   support

           1       0.68      0.66      0.67     49906
           2       0.67      0.69      0.68     50094

    accuracy                           0.68    100000
   macro avg       0.68      0.68      0.68    100000
weighted avg       0.68      0.68      0.68    100000

LightGBM Accuracy: 0.6778

Memory usage: 38.2%
Training LightGBM classifier with learning_rate=0.01, iterations=50, depth=3, num_leaves=31...
LightGBM Classification Report for learning_rate=0.01, iterations=50, depth=3, num_leaves=31:
              precision    recall  f1-score   support

           1       0.68      0.66      0.67     49906
           2       0.67      0.69      0.68     50094

    accuracy                           0.68    100000
   macro avg       0.68      0

# Adaboost

In [8]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, accuracy_score
import psutil
import numpy as np

# Function to check memory usage
def check_memory():
    print(f"Memory usage: {psutil.virtual_memory().percent}%")

# Adjust target labels to be 0-based
y_train_adjusted = y_train - 1
y_test_adjusted = y_test - 1

# Convert labels to numpy arrays
y_train_adjusted = np.array(y_train_adjusted)
y_test_adjusted = np.array(y_test_adjusted)

# Define hyperparameter grid for AdaBoost
n_estimators_values = [50, 100]
learning_rate_values = [0.01, 0.1]

# Cross-validation setup (reduced folds to save memory)
kf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)  # 3 folds

# Iterate over hyperparameters
for n_estimators in n_estimators_values:
    for learning_rate in learning_rate_values:
        print(f"Training AdaBoost classifier with n_estimators={n_estimators}, learning_rate={learning_rate}...")

        # Set up the AdaBoost model with the specified hyperparameters
        adaboost_classifier = AdaBoostClassifier(
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            random_state=42
        )

        # Train the model
        adaboost_classifier.fit(X_train_sbert[:10000], y_train_adjusted[:10000])  # Use a smaller subset
        y_pred_adaboost = adaboost_classifier.predict(X_test_sbert)

        # Adjust predictions back to the original label scale for reporting
        y_pred_adaboost_original = y_pred_adaboost + 1

        # Performance reporting
        print(f"AdaBoost Classification Report for n_estimators={n_estimators}, learning_rate={learning_rate}:")
        print(classification_report(y_test, y_pred_adaboost_original))
        accuracy = accuracy_score(y_test, y_pred_adaboost_original)
        print(f"AdaBoost Accuracy: {accuracy:.4f}\n")

        # Check memory usage after each iteration
        check_memory()


Training AdaBoost classifier with n_estimators=50, learning_rate=0.01...
AdaBoost Classification Report for n_estimators=50, learning_rate=0.01:
              precision    recall  f1-score   support

           1       0.63      0.66      0.65     49906
           2       0.65      0.61      0.63     50094

    accuracy                           0.64    100000
   macro avg       0.64      0.64      0.64    100000
weighted avg       0.64      0.64      0.64    100000

AdaBoost Accuracy: 0.6375

Memory usage: 42.5%
Training AdaBoost classifier with n_estimators=50, learning_rate=0.1...
AdaBoost Classification Report for n_estimators=50, learning_rate=0.1:
              precision    recall  f1-score   support

           1       0.72      0.71      0.72     49906
           2       0.72      0.73      0.72     50094

    accuracy                           0.72    100000
   macro avg       0.72      0.72      0.72    100000
weighted avg       0.72      0.72      0.72    100000

AdaBoost Ac

# Catboost

In [12]:
from catboost import CatBoostClassifier, cv, Pool
from sklearn.metrics import classification_report, accuracy_score
import numpy as np
from sklearn.model_selection import StratifiedKFold
import psutil
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split

# Load SBERT model
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')

# Ensure all reviews are strings before splitting and handle missing values
combined_data['cleaned_review'] = combined_data['cleaned_review'].fillna('').astype(str)
X = combined_data['cleaned_review']
y = combined_data['label']

# Reduce dataset size to prevent memory issues
sample_size = min(500000, len(X))  # Limit to 500k samples if dataset is larger
X_sampled, _, y_sampled, _ = train_test_split(X, y, train_size=sample_size, random_state=42)

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_sampled, y_sampled, test_size=0.2, random_state=42)

# Compute SBERT embeddings for train and test sets
X_train_sbert = np.array(sbert_model.encode(X_train.tolist(), batch_size=32, convert_to_numpy=True, normalize_embeddings=True))
X_test_sbert = np.array(sbert_model.encode(X_test.tolist(), batch_size=32, convert_to_numpy=True, normalize_embeddings=True))

# Function to check memory usage
def check_memory():
    print(f"Memory usage: {psutil.virtual_memory().percent}%")

# Adjust target labels to be 0-based
y_train_adjusted = y_train - 1
y_test_adjusted = y_test - 1

# Convert labels to numpy arrays
y_train_adjusted = np.array(y_train_adjusted)
y_test_adjusted = np.array(y_test_adjusted)

# Define hyperparameter grid for CatBoost (with smaller iterations)
learning_rates = [0.01, 0.05]
iterations_values = [50, 100]  # Reduced iterations
depth_values = [3, 5]
l2_leaf_reg_values = [1, 3]
border_count_values = [32, 50]

# Cross-validation setup (reduced folds to save memory)
kf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)  # 3 folds

# Iterate over hyperparameters
for lr in learning_rates:
    for iterations in iterations_values:
        for depth in depth_values:
            for l2_leaf_reg in l2_leaf_reg_values:
                for border_count in border_count_values:
                    print(f"Training CatBoost classifier with learning_rate={lr}, iterations={iterations}, depth={depth}, l2_leaf_reg={l2_leaf_reg}, border_count={border_count}...")

                    # Set up the CatBoost model with the specified hyperparameters
                    catboost_classifier = CatBoostClassifier(
                        learning_rate=lr,
                        iterations=iterations,
                        depth=depth,
                        l2_leaf_reg=l2_leaf_reg,
                        border_count=border_count,
                        verbose=0,  # To suppress verbose output
                        thread_count=-1  # Use all available CPU cores
                    )

                    # Prepare data as Pool objects for CatBoost
                    train_pool = Pool(X_train_sbert[:10000], label=y_train_adjusted[:10000])  # Use a smaller subset
                    test_pool = Pool(X_test_sbert, label=y_test_adjusted)

                    # Cross-validation to monitor the model's generalization
                    params = catboost_classifier.get_params()
                    params['eval_metric'] = 'Accuracy'  # Specify metric for evaluation
                    params['loss_function'] = 'Logloss'  # Specify the loss function

                    cv_results = cv(
                        pool=train_pool,  # Provide the training data in Pool format
                        params=params,  # Pass model parameters
                        num_boost_round=iterations, 
                        nfold=3,  # Reduced folds to save memory
                        early_stopping_rounds=10,  # Early stopping if performance doesn't improve
                        as_pandas=True,
                        seed=42
                    )

                    # Best iteration (round) from cross-validation results
                    best_iter = cv_results['test-Accuracy-mean'].idxmax()  # Best round based on accuracy
                    print(f"Best round from CV: {best_iter}")

                    # Train the model with the best number of rounds
                    catboost_classifier.fit(X_train_sbert, y_train_adjusted, verbose=0)

                    print("Predicting with CatBoost...")
                    y_pred_catboost = catboost_classifier.predict(X_test_sbert)

                    # Adjust predictions back to the original label scale for reporting
                    y_pred_catboost_original = y_pred_catboost + 1

                    # Performance reporting
                    print(f"CatBoost Classification Report for learning_rate={lr}, iterations={iterations}, depth={depth}, l2_leaf_reg={l2_leaf_reg}, border_count={border_count}:")
                    print(classification_report(y_test, y_pred_catboost_original))
                    accuracy = accuracy_score(y_test, y_pred_catboost_original)
                    print(f"CatBoost Accuracy with learning_rate={lr}, iterations={iterations}, depth={depth}, l2_leaf_reg={l2_leaf_reg}, border_count={border_count}: {accuracy:.4f}\n")

                    # Check memory usage after each iteration
                    check_memory()


Batches:   0%|          | 0/12500 [00:00<?, ?it/s]

Batches:   0%|          | 0/3125 [00:00<?, ?it/s]

Training CatBoost classifier with learning_rate=0.01, iterations=50, depth=3, l2_leaf_reg=1, border_count=32...
Training on fold [0/3]

bestTest = 0.700659868
bestIteration = 49

Training on fold [1/3]

bestTest = 0.6898620276
bestIteration = 49

Training on fold [2/3]

bestTest = 0.68727491
bestIteration = 48

Best round from CV: 49
Predicting with CatBoost...
CatBoost Classification Report for learning_rate=0.01, iterations=50, depth=3, l2_leaf_reg=1, border_count=32:
              precision    recall  f1-score   support

           1       0.69      0.69      0.69     49906
           2       0.69      0.69      0.69     50094

    accuracy                           0.69    100000
   macro avg       0.69      0.69      0.69    100000
weighted avg       0.69      0.69      0.69    100000

CatBoost Accuracy with learning_rate=0.01, iterations=50, depth=3, l2_leaf_reg=1, border_count=32: 0.6906

Memory usage: 57.2%
Training CatBoost classifier with learning_rate=0.01, iterations=50, de

# SVM

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
import psutil

# Load SBERT model
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')

# Ensure all reviews are strings before splitting and handle missing values
combined_data['cleaned_review'] = combined_data['cleaned_review'].fillna('').astype(str)
X = combined_data['cleaned_review']
y = combined_data['label']

# Reduce dataset size to prevent memory issues
sample_size = min(500000, len(X))  # Limit to 500k samples if dataset is larger
X_sampled, _, y_sampled, _ = train_test_split(X, y, train_size=sample_size, random_state=42)

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_sampled, y_sampled, test_size=0.2, random_state=42)

# Compute SBERT embeddings for train and test sets
X_train_sbert = np.array(sbert_model.encode(X_train.tolist(), batch_size=32, convert_to_numpy=True, normalize_embeddings=True))
X_test_sbert = np.array(sbert_model.encode(X_test.tolist(), batch_size=32, convert_to_numpy=True, normalize_embeddings=True))

# Function to check memory usage
def check_memory():
    print(f"Memory usage: {psutil.virtual_memory().percent}%")

# Adjust target labels to be 0-based
y_train_adjusted = y_train - 1
y_test_adjusted = y_test - 1

# Convert labels to numpy arrays
y_train_adjusted = np.array(y_train_adjusted)
y_test_adjusted = np.array(y_test_adjusted)

# Define hyperparameter grid for SVM
C_values = [0.1, 1, 10]
kernel_values = ['linear', 'rbf']
gamma_values = ['scale', 'auto']

# Cross-validation setup (reduced folds to save memory)
kf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)  # 3 folds

# Iterate over hyperparameters
for C in C_values:
    for kernel in kernel_values:
        for gamma in gamma_values:
            print(f"Training SVM classifier with C={C}, kernel={kernel}, gamma={gamma}...")

            # Set up the SVM model with the specified hyperparameters
            svm_classifier = SVC(
                C=C,
                kernel=kernel,
                gamma=gamma,
                random_state=42
            )

            # Train the model
            svm_classifier.fit(X_train_sbert[:10000], y_train_adjusted[:10000])  # Use a smaller subset
            y_pred_svm = svm_classifier.predict(X_test_sbert)

            # Adjust predictions back to the original label scale for reporting
            y_pred_svm_original = y_pred_svm + 1

            # Performance reporting
            print(f"SVM Classification Report for C={C}, kernel={kernel}, gamma={gamma}:")
            print(classification_report(y_test, y_pred_svm_original))
            accuracy = accuracy_score(y_test, y_pred_svm_original)
            print(f"SVM Accuracy with C={C}, kernel={kernel}, gamma={gamma}: {accuracy:.4f}\n")

            # Check memory usage after each iteration
            check_memory()


# Naive bayes

In [17]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
import psutil

# Load SBERT model
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')

# Ensure all reviews are strings before splitting and handle missing values
combined_data['cleaned_review'] = combined_data['cleaned_review'].fillna('').astype(str)
X = combined_data['cleaned_review']
y = combined_data['label']

# Reduce dataset size to prevent memory issues
sample_size = min(500000, len(X))  # Limit to 500k samples if dataset is larger
X_sampled, _, y_sampled, _ = train_test_split(X, y, train_size=sample_size, random_state=42)

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_sampled, y_sampled, test_size=0.2, random_state=42)

# Compute SBERT embeddings for train and test sets
X_train_sbert = np.array(sbert_model.encode(X_train.tolist(), batch_size=32, convert_to_numpy=True, normalize_embeddings=True))
X_test_sbert = np.array(sbert_model.encode(X_test.tolist(), batch_size=32, convert_to_numpy=True, normalize_embeddings=True))

# Apply ReLU activation to make all values non-negative
X_train_sbert = np.maximum(0, X_train_sbert)
X_test_sbert = np.maximum(0, X_test_sbert)

# Function to check memory usage
def check_memory():
    print(f"Memory usage: {psutil.virtual_memory().percent}%")

# Adjust target labels to be 0-based
y_train_adjusted = y_train - 1
y_test_adjusted = y_test - 1

# Convert labels to numpy arrays
y_train_adjusted = np.array(y_train_adjusted)
y_test_adjusted = np.array(y_test_adjusted)

# Define hyperparameter grid for Naive Bayes
alpha_values = [0.01, 0.1, 1.0]  # Regularization parameter for Naive Bayes

# Cross-validation setup (reduced folds to save memory)
kf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)  # 3 folds

# Iterate over hyperparameters
for alpha in alpha_values:
    print(f"Training Naive Bayes classifier with alpha={alpha}...")

    # Set up the Naive Bayes model with the specified hyperparameters
    nb_classifier = MultinomialNB(alpha=alpha)

    # Train the model
    nb_classifier.fit(X_train_sbert[:10000], y_train_adjusted[:10000])  # Use a smaller subset
    y_pred_nb = nb_classifier.predict(X_test_sbert)

    # Adjust predictions back to the original label scale for reporting
    y_pred_nb_original = y_pred_nb + 1

    # Performance reporting
    print(f"Naive Bayes Classification Report for alpha={alpha}:")
    print(classification_report(y_test, y_pred_nb_original))
    accuracy = accuracy_score(y_test, y_pred_nb_original)
    print(f"Naive Bayes Accuracy with alpha={alpha}: {accuracy:.4f}\n")

    # Check memory usage after each iteration
    check_memory()


Batches:   0%|          | 0/12500 [00:00<?, ?it/s]

Batches:   0%|          | 0/3125 [00:00<?, ?it/s]

Training Naive Bayes classifier with alpha=0.01...
Naive Bayes Classification Report for alpha=0.01:
              precision    recall  f1-score   support

           1       0.78      0.71      0.75     49906
           2       0.74      0.80      0.77     50094

    accuracy                           0.76    100000
   macro avg       0.76      0.76      0.76    100000
weighted avg       0.76      0.76      0.76    100000

Naive Bayes Accuracy with alpha=0.01: 0.7589

Memory usage: 74.4%
Training Naive Bayes classifier with alpha=0.1...
Naive Bayes Classification Report for alpha=0.1:
              precision    recall  f1-score   support

           1       0.78      0.71      0.75     49906
           2       0.74      0.80      0.77     50094

    accuracy                           0.76    100000
   macro avg       0.76      0.76      0.76    100000
weighted avg       0.76      0.76      0.76    100000

Naive Bayes Accuracy with alpha=0.1: 0.7589

Memory usage: 74.4%
Training Naive 