In [1]:
from google.colab import drive
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_validate, train_test_split
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (make_scorer, accuracy_score, precision_score,
                             recall_score, f1_score, roc_auc_score, confusion_matrix)
from scipy.stats import zscore

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
import xgboost as xgb


# 1. Mount Google Drive and load dataset

drive.mount('/content/drive')
file_path = '/content/drive/MyDrive/Colab Notebooks/Balanced_Energy_Classification.csv'
df = pd.read_csv(file_path)

# ───────────────────────────────────────────────────────────────
# 2. Remove any leaky or irrelevant columns (if applicable)
# ───────────────────────────────────────────────────────────────
target_col = 'EnergyClass'
X = df.drop(columns=[target_col])
y = df[target_col]

# ───────────────────────────────────────────────────────────────
# 3. Identify column types
# ───────────────────────────────────────────────────────────────
num_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
cat_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

# ───────────────────────────────────────────────────────────────
# 4. Impute missing values
# ───────────────────────────────────────────────────────────────
num_imputer = SimpleImputer(strategy='median')
cat_imputer = SimpleImputer(strategy='most_frequent')
X[num_cols] = num_imputer.fit_transform(X[num_cols])
X[cat_cols] = cat_imputer.fit_transform(X[cat_cols])

# ───────────────────────────────────────────────────────────────
# 5. Simple outlier capping (z-score > 3) on numeric columns
# ───────────────────────────────────────────────────────────────
z_scores = np.abs(zscore(X[num_cols]))
for i, col in enumerate(num_cols):
    col_z = z_scores[:, i]
    upper = X[col].mean() + 3 * X[col].std()
    lower = X[col].mean() - 3 * X[col].std()
    X.loc[col_z > 3,  col] = upper
    X.loc[col_z < -3, col] = lower

# ───────────────────────────────────────────────────────────────
# 6. Basic filter-style feature selection (|corr| > 0.1)
# ───────────────────────────────────────────────────────────────
corr_with_target = X[num_cols].corrwith(y.astype(int))
selected_num_features = corr_with_target[abs(corr_with_target) > 0.1].index.tolist()
selected_cat_features = cat_cols
selected_features = selected_num_features + selected_cat_features
X = X[selected_features]

num_cols = [c for c in selected_num_features if c in num_cols]
cat_cols = [c for c in selected_cat_features if c in cat_cols]

# ───────────────────────────────────────────────────────────────
# 7. Preprocessing pipelines
# ───────────────────────────────────────────────────────────────
num_pipeline = Pipeline([('scaler', MinMaxScaler())])
cat_pipeline = Pipeline([('onehot', OneHotEncoder(handle_unknown='ignore'))])
preprocessor = ColumnTransformer([
    ('num', num_pipeline, num_cols),
    ('cat', cat_pipeline, cat_cols)
])

# ───────────────────────────────────────────────────────────────
# 8. Candidate models (Logistic & SVM removed)
# ───────────────────────────────────────────────────────────────
models = {
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier(random_state=42, class_weight='balanced'),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced'),
    'AdaBoost': AdaBoostClassifier(n_estimators=100, random_state=42),
    'XGBoost': xgb.XGBClassifier(eval_metric='logloss', random_state=42)
}

# ───────────────────────────────────────────────────────────────
# 9. Scoring metrics
# ───────────────────────────────────────────────────────────────
scoring = {
    'accuracy':  make_scorer(accuracy_score),
    'precision': make_scorer(precision_score, zero_division=0),
    'recall':    make_scorer(recall_score,   zero_division=0),
    'f1':        make_scorer(f1_score,       zero_division=0),
    'roc_auc':   'roc_auc'
}

# ───────────────────────────────────────────────────────────────
# 10. Cross-validation performance
# ───────────────────────────────────────────────────────────────
results = []
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for name, clf in models.items():
    pipe = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier',   clf)])
    cv_results = cross_validate(pipe, X, y, cv=cv, scoring=scoring, n_jobs=-1)
    results.append({
        'Model': name,
        **{metric: cv_results[f'test_{metric}'].mean() for metric in scoring}
    })

results_df = pd.DataFrame(results)
print("\nCross-validated model performance summary:\n")
print(results_df)

# ───────────────────────────────────────────────────────────────
# 11. Confusion matrices on a hold-out test split
# ───────────────────────────────────────────────────────────────
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

confusion_data = []

print("\nConfusion Matrix Values for All Models:\n")

for name, clf in models.items():
    pipe = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier',   clf)])
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    confusion_data.append({
        'Model':       name,
        'TP':          tp,
        'FP':          fp,
        'TN':          tn,
        'FN':          fn,
        'Accuracy (%)':  round(accuracy_score (y_test, y_pred) * 100, 2),
        'Precision (%)': round(precision_score(y_test, y_pred, zero_division=0) * 100, 2),
        'Recall (%)':    round(recall_score   (y_test, y_pred, zero_division=0) * 100, 2),
        'F1-Score (%)':  round(f1_score       (y_test, y_pred, zero_division=0) * 100, 2)
    })

confusion_df = pd.DataFrame(confusion_data)
print(confusion_df)


Mounted at /content/drive

Cross-validated model performance summary:

                 Model  accuracy  precision    recall        f1   roc_auc
0  K-Nearest Neighbors  0.781344   0.786896  0.780735  0.779952  0.837714
1        Decision Tree  0.781323   0.780680  0.789061  0.781459  0.781102
2        Random Forest  0.789384   0.797857  0.776816  0.784851  0.852976
3             AdaBoost  0.789384   0.795179  0.780898  0.785917  0.868551
4              XGBoost  0.749000   0.781718  0.699755  0.734653  0.832195

Confusion Matrix Values for All Models:

                 Model  TP  FP  TN  FN  Accuracy (%)  Precision (%)  \
0  K-Nearest Neighbors  37   9  41  12         78.79          80.43   
1        Decision Tree  41  11  39   8         80.81          78.85   
2        Random Forest  36   9  41  13         77.78          80.00   
3             AdaBoost  36   9  41  13         77.78          80.00   
4              XGBoost  33   8  42  16         75.76          80.49   

   Recall (%)  F