# In [None]:
"""
Cluster-Based Undersampling for Imbalanced Classification

This script applies an incremental KMeans-based undersampling technique where
majority class samples are clustered, and one representative sample
is selected from each cluster (the one closest to the centroid).

Author: Duygu Selin Turan
GitHub: https://github.com/duyguselinballi-creator
Date: 2025-10-16
"""

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    f1_score, recall_score, roc_auc_score,
    classification_report
)
from imblearn.metrics import geometric_mean_score

# --- Load Dataset ---
# Space-separated file without a header row; the last column is used as the
# class label and every column before it as a feature.
df = pd.read_csv("yeast6.dat", sep=" ", header=None)
X, y = df.iloc[:, :-1].values, df.iloc[:, -1].values

# --- Identify Class Labels ---
minority_class = 1  # Update if needed
majority_class = 0

# --- Split into Train/Test ---
# Stratified hold-out split (33% test) with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, stratify=y, test_size=0.33
)

# --- Separate Training Samples by Class ---
is_minority = y_train == minority_class
is_majority = y_train == majority_class
X_train_min, X_train_maj = X_train[is_minority], X_train[is_majority]

# --- Cluster Majority Class Samples ---
# One cluster per minority sample, so that keeping a single representative
# from each cluster yields at most as many majority samples as minority ones.
n_clusters = len(X_train_min)
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
kmeans.fit(X_train_maj)
labels = kmeans.labels_

# --- Select One Representative per Cluster ---
# For every cluster keep the member closest (Euclidean distance) to its
# centroid, recorded as a row position in X_train_maj.
selected_indices = []
for i in range(n_clusters):
    # Row positions of the members of cluster i. Working with positions
    # means the index never has to be recovered by comparing row values.
    member_idx = np.flatnonzero(labels == i)
    if member_idx.size == 0:
        # A cluster with no assigned points (possible when the data has
        # fewer distinct rows than clusters) has nothing to pick from.
        continue
    center = kmeans.cluster_centers_[i]
    distances = np.linalg.norm(X_train_maj[member_idx] - center, axis=1)
    selected_indices.append(int(member_idx[np.argmin(distances)]))

X_maj_selected = X_train_maj[selected_indices]

# --- Create Balanced Training Set ---
# Stack all minority rows on top of the selected majority rows and build the
# matching label vector in the same order.
n_min, n_maj = len(X_train_min), len(X_maj_selected)
X_train_bal = np.concatenate([X_train_min, X_maj_selected], axis=0)
y_train_bal = np.concatenate([
    np.full(n_min, minority_class),
    np.full(n_maj, majority_class),
])

# --- Train Classifier ---
# Random forest with a fixed seed, fitted on the balanced set only.
clf = RandomForestClassifier(random_state=42).fit(X_train_bal, y_train_bal)

# --- Evaluate on Test Set ---
# The test set keeps its original (imbalanced) class distribution.
y_pred = clf.predict(X_test)

# ROC AUC is a ranking metric and needs a continuous score; feeding it hard
# 0/1 predictions collapses the curve to a single operating point. Use the
# predicted probability of the minority (positive) class instead.
minority_col = np.flatnonzero(clf.classes_ == minority_class)[0]
y_score = clf.predict_proba(X_test)[:, minority_col]

print("Classification Report:")
print(classification_report(y_test, y_pred))
# Tie the positive class to `minority_class` explicitly so these metrics stay
# correct if the label configuration is changed.
print("F1 Score:", f1_score(y_test, y_pred, pos_label=minority_class))
print("Recall:", recall_score(y_test, y_pred, pos_label=minority_class))
print("ROC AUC:", roc_auc_score(y_test == minority_class, y_score))
print("G-Mean:", geometric_mean_score(y_test, y_pred))
