In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
from sklearn import svm
from sklearn import metrics
from sklearn import linear_model
from sklearn import datasets
from sklearn import preprocessing, utils
from sklearn.metrics import  make_scorer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, precision_score, recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

In [2]:
df_bankrupt = pd.read_csv("bankrupt.csv")
df_bankrupt.rename(columns={ df_bankrupt.columns[-1]: "y" }, inplace = True)


#divide data into x and y
x = df_bankrupt.iloc[:,:-1]
y = df_bankrupt.y

#seeing %pos in the data
pos = y.sum()
neg = len(y) - pos

percent_pos = round(pos/len(y)*100, 2)
percent_neg = round(neg/len(y)*100, 2)

percent_pos

#since it is approximately normal, we will standardize using z dist
x_std = preprocessing.scale(x).astype(np.float32)
y_std = y


In [3]:
result_dt = []
for trial in range(5):
    #splitting into training and testing
    x_train, x_test, y_train, y_test = train_test_split(x_std, y_std, train_size = 5000, random_state=trial,
                                                    stratify = y_std, shuffle=True)

    #encode training and testing data != continuous
    encode_ytr = preprocessing.LabelEncoder()
    ytr_encoded = encode_ytr.fit_transform(y_train)
    
    encode_yt = preprocessing.LabelEncoder()
    yt_encoded = encode_yt.fit_transform(y_test)
    
    criterion_list = ["gini", "entropy"]
    alg = DecisionTreeClassifier(splitter="best")
    param_grid = {'criterion':criterion_list}
    scoring_list = ['accuracy', 'precision', 'f1', 'roc_auc', 'recall']
    scoring_func_dict = {'acc':accuracy_score, 'AUC':roc_auc_score,
                        'prec':precision_score, 'recall':recall_score, 'f1':f1_score}
    scoring_dict = {k:make_scorer(v) for k,v in scoring_func_dict.items()}
    
    search_result = GridSearchCV (alg, param_grid, scoring=scoring_dict, refit=False, cv=5, n_jobs=-1)
    search_result.fit(x_train, ytr_encoded)
    
    res_one_trial = []
    for k in scoring_dict.keys():
        
        mean_test_metric = search_result.cv_results_['mean_test_'+k]
        best_idx = np.argmax(mean_test_metric)
        best_params = search_result.cv_results_['params'][best_idx]
        alg_new_5000 = DecisionTreeClassifier(splitter="best", **best_params)
        alg_new_5000.fit(x_train, ytr_encoded)
        
        res_test = alg_new_5000.predict(x_test)
        
        res_one_model = []
        for k_metric, func_metric in scoring_func_dict.items():
            metric_value = func_metric(res_test, yt_encoded)
            res_one_model.append(metric_value)

        res_one_trial.append(res_one_model)
    
    result_dt.append(res_one_trial)
result_dt = np.array(result_dt)
print(result_dt.shape)
print(np.mean(result_dt, axis=0))

(5, 5, 5)
[[0.97096    0.63421578 0.27864492 0.28333059 0.28019633]
 [0.96916    0.60896454 0.22915939 0.23384488 0.23092622]
 [0.96908    0.60978819 0.23112017 0.23545386 0.23228159]
 [0.96868    0.60779666 0.23317802 0.23143785 0.23150688]
 [0.96956    0.61502724 0.24102116 0.24572714 0.24289283]]


In [4]:
print([k for  k in scoring_dict.keys()])
print([k for k in scoring_func_dict.keys()])

['acc', 'AUC', 'prec', 'recall', 'f1']
['acc', 'AUC', 'prec', 'recall', 'f1']
