# Revised Phase 4: Mitigation Experiments (Month 3–Early Month 4)

**Objective**: Improve 3-class (Negative, Neutral, Positive) sentiment classification on the Bangla Sentiment Dataset by applying imbalanced-learning strategies: data-level methods (SMOTE, Random Undersampling, NearMiss) and an algorithm-level method (weighted loss). Train and tune Logistic Regression, SVM, Naive Bayes, and Random Forest, and evaluate each with visualizations (confusion matrices, ROC-AUC curves, precision-recall curves, F1 comparisons) to assess how effective each mitigation is.

In [1]:
import pandas as pd
import numpy as np
import scipy.sparse as sp
import os
import logging

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


# Define paths: one TF-IDF matrix (.npz) and one label file (.csv) per split
data_dir = "text_representation/"
files = {
    'tfidf_train': f"{data_dir}tfidf_train.npz",
    'tfidf_val': f"{data_dir}tfidf_val.npz",
    'tfidf_test': f"{data_dir}tfidf_test.npz",
    'labels_train': f"{data_dir}labels_train.csv",
    'labels_val': f"{data_dir}labels_val.csv",
    'labels_test': f"{data_dir}labels_test.csv",
}

# Check file existence up front and report every missing file at once,
# so a broken data directory is diagnosed in a single run.
missing_paths = [path for path in files.values() if not os.path.exists(path)]
if missing_paths:
    for path in missing_paths:
        logging.error(f"Missing file: {path}")
    raise FileNotFoundError(f"Missing file(s): {', '.join(missing_paths)}")

# Load TF-IDF matrices (scipy sparse matrices saved with save_npz)
tfidf_train = sp.load_npz(files['tfidf_train'])
tfidf_val = sp.load_npz(files['tfidf_val'])
tfidf_test = sp.load_npz(files['tfidf_test'])
logging.info("TF-IDF matrices loaded successfully")

# Load labels from the 'Label' column of each CSV
y_train = pd.read_csv(files['labels_train'], encoding='utf-8')['Label'].to_numpy()
y_val = pd.read_csv(files['labels_val'], encoding='utf-8')['Label'].to_numpy()
y_test = pd.read_csv(files['labels_test'], encoding='utf-8')['Label'].to_numpy()
logging.info("Labels loaded successfully")

# Validate shapes. Explicit raises are used instead of `assert` because
# assertions are stripped when Python runs with -O.
for split_name, n_rows, n_labels in (
    ("Train", tfidf_train.shape[0], len(y_train)),
    ("Validation", tfidf_val.shape[0], len(y_val)),
    ("Test", tfidf_test.shape[0], len(y_test)),
):
    if n_rows != n_labels:
        raise ValueError(
            f"{split_name} data mismatch: {n_rows} feature rows vs {n_labels} labels"
        )
logging.info("Data shapes validated")

# Print shapes and the class distribution (in percent) of the training labels
print("TF-IDF Train Shape:", tfidf_train.shape)
print("Labels Train Shape:", y_train.shape)
print("Label Distribution (Train):\n", pd.Series(y_train).value_counts(normalize=True) * 100)

2025-06-23 17:54:49,806 - INFO - TF-IDF matrices loaded successfully
2025-06-23 17:54:49,821 - INFO - Labels loaded successfully
2025-06-23 17:54:49,823 - INFO - Data shapes validated


TF-IDF Train Shape: (6193, 5000)
Labels Train Shape: (6193,)
Label Distribution (Train):
 0    47.359922
2    29.081221
1    23.558857
Name: proportion, dtype: float64


### Step 1: Apply Data-Level Mitigation Techniques

- **Objective**: Generate mitigated datasets using SMOTE (oversampling), Random Undersampling, and NearMiss (undersampling).

In [3]:
# Install imbalanced-learn, which provides the SMOTE, RandomUnderSampler and
# NearMiss resamplers imported in the next cell.
# NOTE(review): the install log shows pip uninstalling scikit-learn 1.7.0 as a
# side effect — restart the kernel afterwards so the replaced version is loaded.
%pip install imbalanced-learn

Collecting imbalanced-learn
  Downloading imbalanced_learn-0.13.0-py3-none-any.whl.metadata (8.8 kB)
Collecting sklearn-compat<1,>=0.1 (from imbalanced-learn)
  Downloading sklearn_compat-0.1.3-py3-none-any.whl.metadata (18 kB)
Collecting scikit-learn<2,>=1.3.2 (from imbalanced-learn)
  Using cached scikit_learn-1.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Downloading imbalanced_learn-0.13.0-py3-none-any.whl (238 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m238.4/238.4 kB[0m [31m818.8 kB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0mm
[?25hDownloading sklearn_compat-0.1.3-py3-none-any.whl (18 kB)
Using cached scikit_learn-1.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.5 MB)
Installing collected packages: scikit-learn, sklearn-compat, imbalanced-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.7.0
    Uninstalling scikit-learn-1.7.0:
      Successfully uninstalled sci

In [5]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler, NearMiss
import scipy.sparse as sp
import pandas as pd
import logging

# Local import so this cell does not depend on an earlier cell having imported os.
import os

# Set up logging (a no-op if the root logger was already configured)
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Create the output directory first: save_npz / to_csv fail if it is missing.
output_dir = "mitigated_datasets"
os.makedirs(output_dir, exist_ok=True)

# SMOTE: oversample the training split with synthetic samples
smote = SMOTE(random_state=42)
tfidf_train_smote, y_train_smote = smote.fit_resample(tfidf_train, y_train)
sp.save_npz(f"{output_dir}/tfidf_train_smote.npz", tfidf_train_smote)
pd.DataFrame({'Label': y_train_smote}).to_csv(f"{output_dir}/labels_train_smote.csv", index=False)
logging.info("SMOTE dataset saved")

# Random Undersampling: randomly drop samples to balance the classes
undersampler = RandomUnderSampler(random_state=42)
tfidf_train_under, y_train_under = undersampler.fit_resample(tfidf_train, y_train)
sp.save_npz(f"{output_dir}/tfidf_train_undersampled.npz", tfidf_train_under)
pd.DataFrame({'Label': y_train_under}).to_csv(f"{output_dir}/labels_train_undersampled.csv", index=False)
logging.info("Undersampled dataset saved")

# NearMiss (version 1)
nearmiss = NearMiss(version=1, n_neighbors=3)
tfidf_train_nearmiss, y_train_nearmiss = nearmiss.fit_resample(tfidf_train, y_train)
sp.save_npz(f"{output_dir}/tfidf_train_nearmiss.npz", tfidf_train_nearmiss)
pd.DataFrame({'Label': y_train_nearmiss}).to_csv(f"{output_dir}/labels_train_nearmiss.csv", index=False)
logging.info("NearMiss dataset saved")

# Class weights for algorithm-level mitigation: inverse class frequency,
# rescaled so the weights sum to the number of classes. The counts are computed
# once and the class set is taken from the data rather than hard-coded; for
# labels 0, 1, 2 this gives the same weights as the fixed 3-class formula.
label_counts = pd.Series(y_train).value_counts().sort_index()
inverse_frequency = {label: 1.0 / count for label, count in label_counts.items()}
total = sum(inverse_frequency.values())
class_weights = {
    label: weight / total * len(inverse_frequency)
    for label, weight in inverse_frequency.items()
}
logging.info("Class weights computed")

2025-06-23 17:59:26,166 - INFO - SMOTE dataset saved
2025-06-23 17:59:26,227 - INFO - Undersampled dataset saved
2025-06-23 17:59:26,529 - INFO - NearMiss dataset saved
2025-06-23 17:59:26,540 - INFO - Class weights computed


### Step 2: Retrain Logistic Regression, SVM, Naive Bayes, and Random Forest

- **Objective**: Train models on the mitigated datasets (SMOTE, Random Undersampling, NearMiss) and apply weighted loss to all models.

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
import joblib
from tqdm import tqdm
import os
import logging

# Local imports so this cell does not depend on names imported by earlier cells.
import numpy as np
from sklearn.base import clone

# Set up logging (a no-op if the root logger was already configured)
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Create output directory
os.makedirs("models/mitigated_models", exist_ok=True)
logging.info("Output directory created: models/mitigated_models")

# Guard against stale notebook state: if y_train was overwritten by a resampled
# label array, the original features and labels no longer line up.
if tfidf_train.shape[0] != len(y_train):
    raise ValueError(
        "tfidf_train and y_train have different lengths; re-run the data loading cell"
    )

# Initialize models (unfitted templates; each run trains a fresh clone)
models = {
    'LogisticRegression': LogisticRegression(max_iter=1000, penalty='l2', random_state=42),  # Removed deprecated multi_class
    'SVM': SVC(probability=True, random_state=42),
    'NaiveBayes': MultinomialNB(),
    'RandomForest': RandomForestClassifier(random_state=42)
}

# Define datasets
datasets = {
    'smote': (tfidf_train_smote, y_train_smote),
    'undersampled': (tfidf_train_under, y_train_under),
    'nearmiss': (tfidf_train_nearmiss, y_train_nearmiss),
    # Algorithm-level mitigation uses the ORIGINAL imbalanced data: class_weights
    # were derived from its label distribution, so applying them to SMOTE-balanced
    # data would correct for the imbalance twice.
    'weighted': (tfidf_train, y_train)
}

# Models whose constructor accepts class_weight
supports_class_weight = {'LogisticRegression', 'SVM', 'RandomForest'}

# Train models. The loop targets are deliberately NOT named X_train / y_train,
# so the notebook-global original training labels are not overwritten.
for dataset_name, (X_fit, y_fit) in tqdm(datasets.items(), desc="Training Datasets"):
    for name, model in models.items():
        try:
            # Fresh unfitted copy so runs never share fitted state
            model_copy = clone(model)
            fit_kwargs = {}

            if dataset_name == 'weighted':
                if name in supports_class_weight:
                    model_copy.set_params(class_weight=class_weights)
                else:
                    # No class_weight parameter (e.g. MultinomialNB): apply the
                    # same weights per sample through fit(sample_weight=...).
                    fit_kwargs['sample_weight'] = np.array(
                        [class_weights[label] for label in y_fit]
                    )
            elif 'class_weight' in model_copy.get_params():
                model_copy.set_params(class_weight=None)

            model_copy.fit(X_fit, y_fit, **fit_kwargs)

            model_path = f"models/mitigated_models/{name}_{dataset_name}.joblib"
            joblib.dump(model_copy, model_path)
            logging.info(f"{name} ({dataset_name}) trained and saved")

        except Exception:
            # Best effort: keep training the remaining combinations, but log the
            # full traceback instead of only the message.
            logging.exception(f"Error training {name} ({dataset_name})")


2025-06-23 18:08:17,021 - INFO - Output directory created: models/mitigated_models
Training Datasets:   0%|          | 0/4 [00:00<?, ?it/s]

2025-06-23 18:08:18,250 - INFO - LogisticRegression (smote) trained and saved
2025-06-23 18:09:46,155 - INFO - SVM (smote) trained and saved
2025-06-23 18:09:46,164 - INFO - NaiveBayes (smote) trained and saved
2025-06-23 18:10:02,435 - INFO - RandomForest (smote) trained and saved
Training Datasets:  25%|██▌       | 1/4 [01:45<05:16, 105.41s/it]2025-06-23 18:10:02,739 - INFO - LogisticRegression (undersampled) trained and saved
2025-06-23 18:10:25,960 - INFO - SVM (undersampled) trained and saved
2025-06-23 18:10:25,966 - INFO - NaiveBayes (undersampled) trained and saved
2025-06-23 18:10:35,551 - INFO - RandomForest (undersampled) trained and saved
Training Datasets:  50%|█████     | 2/4 [02:18<02:05, 62.88s/it] 2025-06-23 18:10:36,411 - INFO - LogisticRegression (nearmiss) trained and saved
2025-06-23 18:11:00,987 - INFO - SVM (nearmiss) trained and saved
2025-06-23 18:11:00,993 - INFO - NaiveBayes (nearmiss) trained and saved
2025-06-23 18:11:12,010 - INFO - RandomForest (nearmiss)

### Step 3: Hyperparameter Tuning

- **Objective**: Tune Logistic Regression, SVM, Naive Bayes, and Random Forest on mitigated datasets.

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
from tqdm import tqdm
import joblib
import pandas as pd
import logging


# Local imports so this cell does not depend on names imported by earlier cells.
import os
import numpy as np
from sklearn.base import clone

# Hyperparameter search space per model; keys match the keys of `models`.
param_grids = {
    'LogisticRegression': {
        'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
        'solver': ['lbfgs', 'newton-cg', 'sag', 'saga'],
        'penalty': ['l2'],
        'max_iter': [1000]
    },
    'SVM': {
        'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
        'kernel': ['linear', 'rbf']
    },
    'NaiveBayes': {
        'alpha': [0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 5.0]
    },
    'RandomForest': {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],  # Prevent overfitting
    }
}

# joblib.dump / to_csv below fail if the output directory is missing.
os.makedirs("models/mitigated_models", exist_ok=True)

# Tune models. The loop targets are deliberately NOT named X_train / y_train,
# so the notebook-global original training labels are not overwritten.
tuned_results = {}
for dataset_name, (X_fit, y_fit) in tqdm(datasets.items(), desc="Tuning Datasets"):
    for name, model in models.items():
        try:
            # Work on a clone so the shared templates in `models` are never
            # mutated and class weights cannot leak into other runs.
            estimator = clone(model)
            fit_params = {}
            has_class_weight = 'class_weight' in estimator.get_params()

            if dataset_name == 'weighted':
                if has_class_weight:
                    # Every model exposing class_weight, including RandomForest
                    estimator.set_params(class_weight=class_weights)
                else:
                    # No class_weight parameter (e.g. MultinomialNB): pass
                    # per-sample weights; GridSearchCV splits array-like fit
                    # parameters of length n_samples along with X and y.
                    fit_params['sample_weight'] = np.array(
                        [class_weights[label] for label in y_fit]
                    )
            elif has_class_weight:
                estimator.set_params(class_weight=None)

            grid = GridSearchCV(
                estimator=estimator,
                param_grid=param_grids[name],
                scoring='f1_weighted',
                cv=5,
                n_jobs=-1,
                verbose=1
            )
            grid.fit(X_fit, y_fit, **fit_params)
            tuned_results[f"{name}_{dataset_name}"] = {
                'best_params': grid.best_params_,
                'best_score': grid.best_score_
            }
            joblib.dump(grid.best_estimator_, f"models/mitigated_models/{name}_{dataset_name}_tuned.joblib")
            logging.info(f"{name} ({dataset_name}) tuned and saved: {grid.best_params_}")
        except Exception:
            # Best effort: keep tuning the remaining combinations, but log the
            # full traceback instead of only the message.
            logging.exception(f"Error tuning {name} ({dataset_name})")

# Save tuning results (one column per model/dataset combination)
pd.DataFrame(tuned_results).to_csv("models/mitigated_models/tuned_results.csv")
logging.info("Tuning results saved: tuned_results.csv")

Tuning Datasets:   0%|          | 0/4 [00:00<?, ?it/s]

Fitting 5 folds for each of 20 candidates, totalling 100 fits


2025-06-23 18:14:03,794 - INFO - LogisticRegression (smote) tuned and saved: {'C': 10, 'max_iter': 1000, 'multi_class': 'multinomial', 'penalty': 'l2', 'solver': 'lbfgs'}


Fitting 5 folds for each of 10 candidates, totalling 50 fits
