In [1]:
# 11 - Cross-Validation and Model Performance Comparison

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

# Load dataset
df = pd.read_csv("bank_numeric.csv")

# Define features and target
target_column = "deposit"
X = df.drop(columns=[target_column])
y = df[target_column]

# Handle missing values (if any)
X.fillna(X.mean(), inplace=True)

# Define models
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
    'CatBoost': CatBoostClassifier(verbose=0, random_seed=42),
    'KNN': KNeighborsClassifier(),
    'AdaBoost': AdaBoostClassifier(random_state=42),
    'LightGBM': LGBMClassifier(random_state=42),
    'Ridge Classifier': RidgeClassifier(random_state=42),
}

# Create a stacking classifier
stacked_model = StackingClassifier(
    estimators=[
        ('log_reg', LogisticRegression(max_iter=1000, random_state=42)),
        ('random_forest', RandomForestClassifier(n_estimators=100, random_state=42)),
        ('xgboost', XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)),
        ('svm', SVC(probability=True, random_state=42))
    ],
    final_estimator=RidgeClassifier(random_state=42),
    cv=5
)

models['Stacking Classifier'] = stacked_model

# Cross-validation and performance collection
from sklearn.model_selection import cross_val_score

cv_scores = {}

for name, model in models.items():
    scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
    cv_scores[name] = np.mean(scores)
    print(f"{name} Accuracy (CV): {np.mean(scores):.4f}")

# Convert results to a DataFrame for easy plotting
cv_results = pd.DataFrame(list(cv_scores.items()), columns=['Model', 'Accuracy'])

# Plotting comparison graph
plt.figure(figsize=(12, 6))
cv_results = cv_results.sort_values(by='Accuracy', ascending=False)
plt.barh(cv_results['Model'], cv_results['Accuracy'], color='skyblue')
plt.xlabel('Cross-Validation Accuracy')
plt.title('Model Performance Comparison')
plt.gca().invert_yaxis()
plt.show()


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Logistic Regression Accuracy (CV): 0.8010
Random Forest Accuracy (CV): 0.8134
XGBoost Accuracy (CV): 0.8107
CatBoost Accuracy (CV): 0.8250
KNN Accuracy (CV): 0.7805




AdaBoost Accuracy (CV): 0.7934
[LightGBM] [Info] Number of positive: 1690, number of negative: 2515
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000463 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 721
[LightGBM] [Info] Number of data points in the train set: 4205, number of used features: 16
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.401902 -> initscore=-0.397544
[LightGBM] [Info] Start training from score -0.397544
[LightGBM] [Info] Number of positive: 1690, number of negative: 2515
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000614 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 720
[LightGBM] [Info] Number of data points in the train set: 4205, number of used features

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

KeyboardInterrupt: 