# Phase 5: Evaluation and Comparison

**Objective**: Compare baseline models (Phase 3) and mitigated models (Phase 4) to assess the effectiveness of imbalance mitigation strategies (SMOTE, Random Undersampling, NearMiss, Weighted Loss) for 3-class (Negative, Neutral, Positive) sentiment classification on the Bangla Sentiment Dataset. Evaluate performance on the test set, analyze source-specific performance (newspapers, social media, blogs) to test hypothesis H3 (source-specific differences in sentiment classification), and perform statistical tests to determine significant improvements.


### Step 1: Load Test Data and Models

- **Objective**: Load test TF-IDF matrix, labels, source metadata, and all models (baseline and mitigated)

In [3]:
import pandas as pd
import scipy.sparse as sp
import joblib
import os
import logging
from tqdm import tqdm

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Define paths
data_dir = "text_representation/"

files = {
    'tfidf_test': f"{data_dir}tfidf_test.npz",
    'labels_test': f"{data_dir}labels_test.csv",
    'tfidf_train': f"{data_dir}tfidf_train.npz",
    'labels_train': f"{data_dir}labels_train.csv",
}

# Check file existence
for name, path in files.items():
    if not os.path.exists(path):
        logging.error(f"Missing file: {path}")
        raise FileNotFoundError(f"Missing file: {path}")

# Load test data
tfidf_test = sp.load_npz(files['tfidf_test'])
y_test = pd.read_csv(files['labels_test'], encoding='utf-8')['Label'].values

# load the train data for statistical testing
# Placeholder: Replace with original training data
X_full = sp.load_npz(files['tfidf_train'])
y_full = pd.read_csv(files['labels_train'], encoding='utf-8')['Label'].values


logging.info("Test data & Train Data loaded successfully")


2025-06-26 18:36:04,834 - INFO - Test data & Train Data loaded successfully


In [4]:
# Load models
model_dir_baseline = "models/baseline_models/"
model_dir_mitigated = "models/mitigated_models/"

model_configs = [
    ('baseline', model_dir_baseline, ['LogisticRegression_tuned_grid', 'SVM_tuned_grid', 'NaiveBayes_tuned_grid', 'RandomForest_tuned_grid']),
    ('mitigated', model_dir_mitigated, [
        f"{model}_{mitigation}_tuned"
        for model in ['LogisticRegression', 'SVM', 'NaiveBayes', 'RandomForest']
        for mitigation in ['smote', 'undersampled', 'nearmiss', 'weighted']
    ])
]
models = {}
for config_type, model_dir, model_names in model_configs:
    for name in tqdm(model_names, desc=f"Loading {config_type} models"):
        try:
            models[f"{config_type}_{name}"] = joblib.load(f"{model_dir}{name}.joblib")
            logging.info(f"Loaded model: {config_type}_{name}")
        except Exception as e:
            logging.error(f"Error loading {name}: {str(e)}")

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
2025-06-26 18:36:04,930 - INFO - Loaded model: baseline_LogisticRegression_tuned_grid
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
2025-06-26 18:36:04,935 - INFO - Loaded model: baseline_SVM_tuned_grid
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
2025-06-26 18:36:04,941 - INFO - Loaded model: baseline_NaiveBayes_tuned_grid
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
2025-06-26 18:36:05,430 - INFO - Loaded model: baseline_RandomForest_tuned_grid
Loading baseline models: 100%|██████████| 4/4 [00:00<00:00,  6.92it/s]
Loading mitigated models:   0%|          | 0/16 [00:00<?, ?it/s]2025-06-26 18:36:05,436 - INFO - Loaded model: mitigated_LogisticRegression_smote_tuned
20

### Step 2: Evaluate Models on Test Set

- **Objective**: Compute accuracy, precision, recall, F1-score (weighted and per-class), and ROC-AUC for all models on the test set.

In [5]:
import os
import logging
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import (
    accuracy_score,
    precision_recall_fscore_support,
    roc_auc_score
)
from sklearn.preprocessing import LabelBinarizer

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Ensure output directory exists
os.makedirs("evaluation", exist_ok=True)

# Binarize true labels for multiclass ROC AUC
lb = LabelBinarizer()
y_test_bin = lb.fit_transform(y_test)

# Prepare results container
results = []

# Evaluate all models
for model_key, model in tqdm(models.items(), desc="Evaluating models"):
    try:
        # Parse model info
        config_type, name = model_key.split('_', 1)
        mitigation = name.split('_')[-1] if config_type == 'mitigated' else 'none'
        model_name = '_'.join(name.split('_')[:-1]) if config_type == 'mitigated' else name.replace('_tuned_grid', '')

        # Make predictions
        y_pred = model.predict(tfidf_test)
        y_proba = model.predict_proba(tfidf_test)

        # Evaluate metrics
        acc = accuracy_score(y_test, y_pred)
        f1_weighted = precision_recall_fscore_support(y_test, y_pred, average='weighted')[2]
        f1_per_class = precision_recall_fscore_support(y_test, y_pred, average=None)[2]
        roc_auc = roc_auc_score(y_test_bin, y_proba, multi_class='ovr')

        # Append results
        results.append({
            'Model': model_name,
            'Type': config_type,
            'Mitigation': mitigation,
            'Accuracy': acc,
            'F1_Weighted': f1_weighted,
            'F1_Negative': f1_per_class[0],
            'F1_Positive': f1_per_class[1],
            'F1_Neutral': f1_per_class[2],
            'ROC_AUC': roc_auc
        })

        logging.info(f"Evaluated: {model_key}")

    except Exception as e:
        logging.error(f"Failed to evaluate {model_key}: {e}")

# Save results to CSV
results_df = pd.DataFrame(results)
csv_path = "evaluation/comparative_results.csv"
results_df.to_csv(csv_path, index=False)
logging.info(f"Saved comparative results to: {csv_path}")

Evaluating models:   0%|          | 0/20 [00:00<?, ?it/s]2025-06-26 18:36:06,472 - INFO - Evaluated: baseline_LogisticRegression_tuned_grid
2025-06-26 18:36:07,455 - INFO - Evaluated: baseline_SVM_tuned_grid
Evaluating models:  10%|█         | 2/20 [00:00<00:08,  2.01it/s]2025-06-26 18:36:07,465 - INFO - Evaluated: baseline_NaiveBayes_tuned_grid
2025-06-26 18:36:07,691 - INFO - Evaluated: baseline_RandomForest_tuned_grid
Evaluating models:  20%|██        | 4/20 [00:01<00:04,  3.64it/s]2025-06-26 18:36:07,699 - INFO - Evaluated: mitigated_LogisticRegression_smote_tuned
2025-06-26 18:36:07,707 - INFO - Evaluated: mitigated_LogisticRegression_undersampled_tuned
2025-06-26 18:36:07,715 - INFO - Evaluated: mitigated_LogisticRegression_nearmiss_tuned
2025-06-26 18:36:07,724 - INFO - Evaluated: mitigated_LogisticRegression_weighted_tuned
2025-06-26 18:36:08,700 - INFO - Evaluated: mitigated_SVM_smote_tuned
Evaluating models:  45%|████▌     | 9/20 [00:02<00:02,  4.44it/s]2025-06-26 18:36:09,31

In [8]:
# Display as table in notebook output
from tabulate import tabulate
from IPython.display import display, HTML

print("\n=== Evaluation Results ===\n")
print(tabulate(results_df, headers='keys', tablefmt='github', showindex=False))


=== Evaluation Results ===

| Model                           | Type      | Mitigation   |   Accuracy |   F1_Weighted |   F1_Negative |   F1_Positive |   F1_Neutral |   ROC_AUC |
|---------------------------------|-----------|--------------|------------|---------------|---------------|---------------|--------------|-----------|
| LogisticRegression              | baseline  | none         |   0.629677 |      0.612208 |      0.715596 |      0.521739 |     0.51715  |  0.753958 |
| SVM                             | baseline  | none         |   0.625806 |      0.612925 |      0.707692 |      0.530351 |     0.52551  |  0.734816 |
| NaiveBayes                      | baseline  | none         |   0.621935 |      0.595792 |      0.710098 |      0.503597 |     0.48433  |  0.744348 |
| RandomForest                    | baseline  | none         |   0.618065 |      0.593155 |      0.711602 |      0.478571 |     0.493151 |  0.743115 |
| LogisticRegression_smote        | mitigated | tuned        |   

### Step 4: Statistical Tests

- **Objective**: Perform paired t-tests to compare baseline vs. mitigated models’ F1-scores (weighted and Positive class).

In [9]:
import os
import logging
import numpy as np
import pandas as pd
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import f1_score
from sklearn.base import clone
from scipy.stats import ttest_rel

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Ensure output directory exists
os.makedirs("evaluation", exist_ok=True)

# Define setup
rskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=5, random_state=42)
metrics = {
    'F1_Weighted': lambda y, y_pred: f1_score(y, y_pred, average='weighted'),
    'F1_Positive': lambda y, y_pred: f1_score(y, y_pred, average=None)[1]
}
model_names = ['LogisticRegression', 'SVM', 'NaiveBayes', 'RandomForest']
mitigations = ['smote', 'undersampled', 'nearmiss', 'weighted']

# Results container
stat_results = {
    'Model': [],
    'Mitigation': [],
    'Metric': [],
    'P_Value': [],
    'Significant': []
}

# Perform statistical tests
for model_name in model_names:
    for mitigation in mitigations:
        try:
            baseline_key = f'baseline_{model_name}_tuned_grid'
            mitigated_key = f'mitigated_{model_name}_{mitigation}_tuned'

            base_model = models[baseline_key]
            mitigated_model = models[mitigated_key]

            # Store scores per metric
            scores = {metric: {'base': [], 'mit': []} for metric in metrics}

            for train_idx, test_idx in rskf.split(X_full, y_full):
                X_train, X_test = X_full[train_idx], X_full[test_idx]
                y_train, y_test = y_full[train_idx], y_full[test_idx]

                # Train fresh clones of the models
                base_clf = clone(base_model).fit(X_train, y_train)
                mit_clf = clone(mitigated_model).fit(X_train, y_train)

                y_pred_base = base_clf.predict(X_test)
                y_pred_mit = mit_clf.predict(X_test)

                for metric, func in metrics.items():
                    scores[metric]['base'].append(func(y_test, y_pred_base))
                    scores[metric]['mit'].append(func(y_test, y_pred_mit))

            # Perform paired t-tests
            for metric in metrics:
                base_scores = scores[metric]['base']
                mit_scores = scores[metric]['mit']
                t_stat, p_value = ttest_rel(base_scores, mit_scores)

                stat_results['Model'].append(model_name)
                stat_results['Mitigation'].append(mitigation)
                stat_results['Metric'].append(metric)
                stat_results['P_Value'].append(p_value)
                stat_results['Significant'].append(p_value < 0.05)

                logging.info(f"T-test | {model_name} | {mitigation} | {metric}: p={p_value:.4f}")

        except Exception as e:
            logging.error(f"Error processing {model_name} with {mitigation}: {str(e)}")

# Save results to CSV
stat_results_df = pd.DataFrame(stat_results)
output_path = "evaluation/statistical_tests.csv"
stat_results_df.to_csv(output_path, index=False)
logging.info(f"Statistical test results saved: {output_path}")

2025-06-26 18:38:40,420 - INFO - T-test | LogisticRegression | smote | F1_Weighted: p=0.0000
2025-06-26 18:38:40,425 - INFO - T-test | LogisticRegression | smote | F1_Positive: p=0.0001
2025-06-26 18:39:07,532 - INFO - T-test | LogisticRegression | undersampled | F1_Weighted: p=0.0000
2025-06-26 18:39:07,535 - INFO - T-test | LogisticRegression | undersampled | F1_Positive: p=0.0000
2025-06-26 18:39:35,015 - INFO - T-test | LogisticRegression | nearmiss | F1_Weighted: p=0.0000
2025-06-26 18:39:35,018 - INFO - T-test | LogisticRegression | nearmiss | F1_Positive: p=0.0000
2025-06-26 18:40:20,538 - INFO - T-test | LogisticRegression | weighted | F1_Weighted: p=0.0000
2025-06-26 18:40:20,540 - INFO - T-test | LogisticRegression | weighted | F1_Positive: p=0.0000
2025-06-26 19:10:59,604 - INFO - T-test | SVM | smote | F1_Weighted: p=0.1203
2025-06-26 19:10:59,608 - INFO - T-test | SVM | smote | F1_Positive: p=0.2854
2025-06-26 19:41:05,716 - INFO - T-test | SVM | undersampled | F1_Weighted

In [10]:
stat_results_df.head()

Unnamed: 0,Model,Mitigation,Metric,P_Value,Significant
0,LogisticRegression,smote,F1_Weighted,6.853281e-13,True
1,LogisticRegression,smote,F1_Positive,9.648053e-05,True
2,LogisticRegression,undersampled,F1_Weighted,4.1589700000000005e-25,True
3,LogisticRegression,undersampled,F1_Positive,1.842795e-24,True
4,LogisticRegression,nearmiss,F1_Weighted,6.742074e-25,True
