In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, label_binarize
from sklearn.metrics import f1_score, classification_report, confusion_matrix, ConfusionMatrixDisplay

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import HistGradientBoostingClassifier

from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler, TomekLinks
from imblearn.combine import SMOTETomek

from sklearn.metrics import log_loss
from sklearn.preprocessing import label_binarize

from sklearn.preprocessing import OneHotEncoder

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC

import warnings
# Blanket filter: every warning category is ignored from here on in this
# kernel. That keeps the training output short, but it also hides
# deprecation and convergence warnings raised by the estimators below.
# NOTE(review): consider narrowing this to specific categories/modules.
warnings.filterwarnings('ignore')

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
# Read the training features as a plain ndarray and the target as a Series.
X = pd.read_csv("X_train.csv").values
y = pd.read_csv("y_train.csv").squeeze()

# TODO: pipelining (https://scikit-learn.org/stable/modules/compose.html)
# NOTE(review): the scaler is fitted on every row, i.e. before the
# train/test split performed later in the notebook, so held-out rows
# contribute to the mean/variance estimates. Fitting on the training
# partition only (or inside a Pipeline, per the TODO) would avoid that.
scaler = StandardScaler().fit(X)
X = scaler.transform(X)

# Disabled experiment: exclude selected classes before encoding.
# classes_to_remove = [5, 6, 14, 17]
# mask = ~np.isin(y, classes_to_remove)
# X = X[mask]
# y = y[mask]

# Map the raw labels to integer codes; `le` is kept so the original label
# values (le.classes_) remain available for plots.
le = LabelEncoder()
y = le.fit_transform(y)

In [None]:
# Sorted distinct class labels together with how many samples each one has.
# The counts come straight from the label vector rather than from the
# one-hot matrix below, because label_binarize collapses two-class targets
# into a single column, which would yield one count instead of two.
classes, class_counts = np.unique(y, return_counts=True)

# One-hot (indicator) view of the targets, kept for downstream use.
y_one_hot = label_binarize(y, classes=classes) # TODO: investigate TransformedTargetRegressor? https://scikit-learn.org/stable/modules/compose.html#transforming-target-in-regression

# Inverse-frequency class weights, normalised so that they sum to 1.
class_weights = 1.0 / class_counts
class_weights /= np.sum(class_weights)

In [None]:
# Hold out 30% of the rows for evaluation. Stratifying on y keeps the class
# proportions equal in both partitions; the fixed seed makes the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=4,
    stratify=y,
)

In [None]:
from collections import namedtuple

# One experiment = an optional resampler applied to the training split,
# whether the classifier should use balanced class weights, and the key of
# the classifier family to train.
Experiment = namedtuple("Experiment", ["resampler", "use_weights", "model_type"])

# (display prefix, model_type key), in the order the experiments are run.
MODEL_FAMILIES = [
    ("ADA", "ada"),  # AdaBoost
    ("HGB", "hgb"),  # HistGradientBoosting
    ("LR", "lr"),    # Logistic Regression
    ("RF", "rf"),    # Random Forest
    ("SVM", "svm"),  # Support Vector Machine
]


def make_imbalance_strategies():
    """Return the imbalance-handling strategies compared for every model.

    Returns
    -------
    list of (label, resampler, use_weights) tuples. `resampler` is either
    None or a freshly constructed imbalanced-learn sampler, so each call
    yields independent sampler instances (samplers keep fitted state and
    must not be shared between experiments).
    """
    return [
        ("Baseline", None, False),
        ("SMOTE", SMOTE(random_state=4, k_neighbors=3), False),
        ("ADASYN", ADASYN(random_state=42, n_neighbors=3), False),
        ("Undersampling", RandomUnderSampler(random_state=42), False),
        ("Class Weighting", None, True),
        (
            "SMOTE + Tomek",
            SMOTETomek(
                # The inner SMOTE is seeded explicitly: a user-supplied
                # `smote` estimator is used as given, so the random_state
                # set on SMOTETomek itself does not reach it.
                smote=SMOTE(k_neighbors=3, random_state=42),
                tomek=TomekLinks(sampling_strategy='majority'),
                random_state=42,
            ),
            False,
        ),
    ]


# Full grid: every model family crossed with every strategy. Keys look like
# "HGB - SMOTE + Tomek"; insertion order (family, then strategy) determines
# the order of training output and of the result plots.
experiments = {
    f"{prefix} - {label}": Experiment(resampler, use_weights, model_type)
    for prefix, model_type in MODEL_FAMILIES
    for label, resampler, use_weights in make_imbalance_strategies()
}

# Filled in by the training / evaluation cells, keyed by experiment name.
results = {}
predictions = {}
probabilities = {}

In [None]:
def build_model(model_type, use_weights):
    """Instantiate an unfitted classifier for one experiment.

    Parameters
    ----------
    model_type : str
        One of 'hgb', 'lr', 'rf', 'svm' or 'ada'.
    use_weights : bool
        When True the estimator (for 'ada': its logistic-regression base
        learner) is created with class_weight='balanced'; otherwise with
        class_weight=None.

    Returns
    -------
    An unfitted scikit-learn classifier exposing fit / predict /
    predict_proba.

    Raises
    ------
    ValueError
        If `model_type` is not one of the supported keys.
    """
    # TODO: hyperparameter optimisation? (Grid search).. + cross_validation?
    class_weight = 'balanced' if use_weights else None

    if model_type == 'hgb':
        return HistGradientBoostingClassifier(
            loss='log_loss',
            class_weight=class_weight,
            # Seeded like the other stochastic estimators so that any
            # internal randomness (e.g. an early-stopping validation
            # split) is repeatable between runs.
            random_state=42,
        )

    if model_type == 'lr':
        # NOTE(review): `multi_class` may be deprecated in recent
        # scikit-learn releases; confirm against the installed version.
        return LogisticRegression(
            max_iter=1000,
            multi_class='multinomial',
            solver='lbfgs',
            class_weight=class_weight,
        )

    if model_type == 'rf':
        return RandomForestClassifier(
            n_estimators=100,
            random_state=42,
            class_weight=class_weight,
        )

    if model_type == 'svm':
        # probability=True is required for predict_proba.
        return SVC(
            kernel='rbf',
            probability=True,
            random_state=42,
            class_weight=class_weight,
        )

    if model_type == 'ada':
        # AdaBoost over a multinomial logistic-regression base learner;
        # class weighting, when requested, is applied to the base learner.
        base = LogisticRegression(
            max_iter=1000,
            multi_class='multinomial',
            solver='lbfgs',
            class_weight=class_weight,
        )
        return AdaBoostClassifier(
            estimator=base,
            n_estimators=50,
            algorithm='SAMME',
            random_state=42,
        )

    raise ValueError(f"Unknown model type: {model_type}")


# Train every configured experiment and keep its hard predictions and class
# probabilities on the held-out split, keyed by experiment name.
for name, config in experiments.items():
    print(f"Training: {name}")

    # Resampling is applied to the training partition only; the test
    # partition is always evaluated as-is.
    if config.resampler is None:
        X_resampled, y_resampled = X_train, y_train
    else:
        X_resampled, y_resampled = config.resampler.fit_resample(X_train, y_train)

    model = build_model(config.model_type, config.use_weights)
    model.fit(X_resampled, y_resampled)

    predictions[name] = model.predict(X_test)
    probabilities[name] = model.predict_proba(X_test)


Training: ADA - Baseline
Training: ADA - SMOTE
Training: ADA - ADASYN
Training: ADA - Undersampling
Training: ADA - Class Weighting
Training: ADA - SMOTE + Tomek
Training: HGB - Baseline
Training: HGB - SMOTE
Training: HGB - ADASYN
Training: HGB - Undersampling
Training: HGB - Class Weighting
Training: HGB - SMOTE + Tomek
Training: LR - Baseline
Training: LR - SMOTE
Training: LR - ADASYN
Training: LR - Undersampling
Training: LR - Class Weighting
Training: LR - SMOTE + Tomek
Training: RF - Baseline
Training: RF - SMOTE
Training: RF - ADASYN
Training: RF - Undersampling
Training: RF - Class Weighting
Training: RF - SMOTE + Tomek
Training: SVM - Baseline
Training: SVM - SMOTE


In [None]:
def weighted_log_loss(y_true, y_pred_proba, labels=None):
    """Inverse-frequency weighted multiclass log loss.

    Each sample's negative log-likelihood is multiplied by the weight of its
    true class, and the products are averaged over all samples. Class
    weights are 1 / (class count in `y_true`), normalised to sum to 1 over
    the classes that actually occur in `y_true`.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        True class labels.
    y_pred_proba : array-like of shape (n_samples, n_classes)
        Predicted class probabilities, one column per class.
    labels : array-like of shape (n_classes,), optional
        The label that each probability column refers to. When omitted:
        if `y_true` contains exactly `n_classes` distinct values they are
        used in sorted order; otherwise, if `y_true` holds integer codes in
        [0, n_classes), column j is taken to be class j.

    Returns
    -------
    float
        The weighted log loss (lower is better).

    Raises
    ------
    ValueError
        If the inputs are empty or mis-shaped, if the column labels cannot
        be determined, or if `y_true` contains a value missing from them.
    """
    y_true = np.asarray(y_true).ravel()
    proba = np.asarray(y_pred_proba, dtype=float)

    if proba.ndim != 2:
        raise ValueError("y_pred_proba must be 2-D (n_samples, n_classes)")
    n_samples, n_classes = proba.shape
    if n_samples == 0 or n_classes == 0:
        raise ValueError("y_pred_proba must not be empty")
    if y_true.shape[0] != n_samples:
        raise ValueError(
            f"y_true has {y_true.shape[0]} samples but y_pred_proba has {n_samples}"
        )

    if labels is None:
        present = np.unique(y_true)
        if present.size == n_classes:
            # Every class occurs in y_true: sorted unique values line up
            # with the probability columns.
            labels = present
        elif (
            np.issubdtype(present.dtype, np.integer)
            and present[0] >= 0
            and present[-1] < n_classes
        ):
            # Some class is absent from y_true; integer codes identify the
            # column directly.
            labels = np.arange(n_classes)
        else:
            raise ValueError(
                "cannot align y_true with the columns of y_pred_proba; "
                "pass `labels` explicitly"
            )
    else:
        labels = np.asarray(labels).ravel()
        if labels.shape[0] != n_classes:
            raise ValueError(
                f"labels has {labels.shape[0]} entries but y_pred_proba has "
                f"{n_classes} columns"
            )

    # One-hot encode y_true against the column labels.
    y_true_bin = (y_true[:, None] == labels[None, :]).astype(float)
    if not np.all(y_true_bin.sum(axis=1) == 1):
        raise ValueError("y_true contains values that do not match exactly one label")

    # Inverse-frequency class weights, normalised over the classes present.
    # Classes with no samples get weight 0 instead of a division by zero.
    class_counts = np.sum(y_true_bin, axis=0)
    occurs = class_counts > 0
    class_weights = np.zeros(n_classes)
    class_weights[occurs] = 1.0 / class_counts[occurs]
    class_weights /= np.sum(class_weights)

    # Weight of each sample = weight of its true class.
    sample_weights = np.sum(y_true_bin * class_weights, axis=1)

    eps = 1e-15  # To avoid log(0)
    log_likelihood = np.sum(y_true_bin * np.log(np.clip(proba, eps, 1)), axis=1)
    return -np.mean(sample_weights * log_likelihood)

In [None]:
# --- Scalar metrics -------------------------------------------------------
# Macro F1 and the inverse-frequency weighted log loss for every experiment
# that produced predictions; stored in `results` as (f1, wll).
print("\nEvaluation Metrics for Each Method:")
for method in predictions:
    y_pred = predictions[method]
    y_proba = probabilities[method]

    f1 = f1_score(y_test, y_pred, average='macro')
    wll = weighted_log_loss(y_test, y_proba)

    results[method] = (f1, wll)
    print(f"{method:25s} | Macro F1: {f1:.4f} | Weighted Log Loss: {wll:.4f}")

# --- Per-experiment figures -----------------------------------------------
# One row per experiment: classification-report heatmap (left) and confusion
# matrix (right). Skipped entirely when nothing has been trained, because a
# zero-row subplot grid cannot be created.
if predictions:
    # Integer codes of every class known to the label encoder, so that the
    # confusion matrix always has one row/column per entry of le.classes_,
    # even if a class is missing from both y_test and the predictions.
    all_codes = np.arange(len(le.classes_))

    # squeeze=False keeps `axes` two-dimensional even for a single row.
    fig, axes = plt.subplots(
        len(predictions), 2,
        figsize=(16, 6 * len(predictions)),
        squeeze=False,
    )

    for (method, y_pred_vals), (ax_report, ax_cm) in zip(predictions.items(), axes):
        report = classification_report(y_test, y_pred_vals, output_dict=True, zero_division=0)
        report_df = pd.DataFrame(report).transpose()
        # Drop the last row and the last column of the report table before
        # plotting it.
        sns.heatmap(report_df.iloc[:-1, :-1], annot=True, fmt=".2f", cmap="Blues", ax=ax_report)
        ax_report.set_title(f"{method} - Classification Report")

        cm = confusion_matrix(y_test, y_pred_vals, labels=all_codes)
        disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=le.classes_)
        disp.plot(ax=ax_cm, cmap="Blues", values_format='d')
        ax_cm.set_title(f"{method} - Confusion Matrix")

    # Layout is computed once everything is drawn so that titles, tick
    # labels and colour bars are taken into account.
    fig.tight_layout(pad=5.0)
    plt.show()
