# Cross Validation With Multiple Algorithms


We will use credit card default data from https://www.kaggle.com/datasets/uciml/default-of-credit-card-clients-dataset


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Render floats with thousands separators and two decimals in pandas output.
pd.set_option('display.float_format', '{:,.2f}'.format)

## Step-1: Download Data

In [None]:
import os
import urllib.request

# Remote CSV, and the local file name taken from the last component of the URL.
data_url = 'https://raw.githubusercontent.com/elephantscale/datasets/master/credit-card-default/default2.csv'
data_location = os.path.basename(data_url)

# Download only when there is no local copy yet, so re-running the cell
# does not fetch the file again.
if not os.path.exists(data_location):
    print(f"Downloading :  {data_url}")
    urllib.request.urlretrieve(data_url, data_location)
print(f"data_location: {data_location}")

In [None]:
# Load the downloaded CSV into a DataFrame and show ten randomly chosen rows.
data = pd.read_csv(filepath_or_buffer=data_location)
data.sample(n=10)

## Step-2: EDA

In [None]:
## Check data skew: number of rows for each value of the 'default' column
data.loc[:, 'default'].value_counts()

In [None]:
## Share of rows (fraction of the total) for each value of the 'default' column
data.loc[:, 'default'].value_counts(normalize=True)

## Step-3: Clean up 

TODO

## Step-4: Shape data

In [None]:
## Column the models will be trained to predict
label_col = 'default'

## Features are all columns except 'ID' and the label column
feature_columns = data.columns.drop(['ID', label_col])
#print (feature_columns)

In [None]:
## Feature matrix, and the label kept as a one-column DataFrame
X = data.loc[:, feature_columns]
y = data.loc[:, [label_col]]

## Print both shapes, one per line
print(X.shape, y.shape, sep='\n')

## Step-5: Cross Validation

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
import numpy as np
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate

## Choose the models we want to compare, as (short name, estimator) pairs
models = [
    ('LR', LogisticRegression(max_iter=500)),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier()),
    ('RF', RandomForestClassifier()),
    ('GB', GradientBoostingClassifier()),
    ('SVM', SVC()),
    #('NB', GaussianNB()),
]


## Per-model fold accuracies and the matching model names (used for plotting)
results = []
names = []
## for classification tasks
## NOTE: no trailing comma here -- it would wrap the list in a tuple, which
## is not a valid `scoring` value
scoring_metrics = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']
## for regression tasks
#scoring_metrics = ['r2', 'neg_root_mean_squared_error']

## y is a one-column DataFrame; flatten it to the 1-D array sklearn expects
y2 = np.ravel(y)
for name, model in models:
    ## cross_val_score only accepts a single metric, so use cross_validate,
    ## which takes a list of metrics and returns a dict of per-fold arrays
    ## keyed 'test_<metric>'.
    ## cv can be an int or a splitter such as KFold; per the scikit-learn docs
    ## an int with a classifier uses stratified folds, which helps when the
    ## label is imbalanced.
    cv_scores = cross_validate(model, X, y2, cv=5, scoring=scoring_metrics, n_jobs=-1)

    ## Accuracy is the metric collected for the comparison plot
    cv_results = cv_scores['test_accuracy']
    results.append(cv_results)
    names.append(name)
    print("{}: accuracy : avg={:,.2f}, median={:,.2f},  std={:,.2f},  cv_results={}".format (
        name, np.mean(cv_results), np.median(cv_results), np.std(cv_results), cv_results))
    ## Mean across folds of every requested metric
    print("{}: mean scores : {}".format(
        name,
        ", ".join("{}={:,.2f}".format(metric, np.mean(cv_scores['test_' + metric]))
                  for metric in scoring_metrics)))

In [None]:
## Show the scores left over from the final loop iteration (the last model in `models`)
cv_results

## Step-6: Visualize Results

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Open a 10x6 figure; seaborn draws onto its current axes.
plt.figure(figsize=(10, 6))

# One box per entry in `results`, labelled with the matching entry in `names`.
ax = sns.boxplot(data=results)
ax.set(title='Algorithm Comparison', ylabel='accuracy')
ax.set_xticklabels(names)
plt.show()