In [None]:
%%capture
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import tree
import pandas.api.types as ptypes

from matplotlib import pyplot as plt
from imblearn.over_sampling import ADASYN 

import pydotplus

# Let pandas display up to 999 rows, 999 columns and 999 characters per cell
# before truncating the output.
pd.options.display.max_rows = 999
pd.options.display.max_columns = 999
pd.options.display.max_colwidth = 999


from tqdm.autonotebook import tqdm
# Register the `progress_apply` family on pandas objects. `pandas` is a
# classmethod, so it is called on the class itself: `tqdm().pandas()` would
# first create a throw-away progress bar that is never closed.
tqdm.pandas()

### Apply ADASYN oversampling to the imbalanced dataset and export the decision tree to a PDF figure

In [None]:
# Oversample with ADASYN; the fixed seed keeps the resampled set reproducible.
ada = ADASYN(random_state=42)
X_res, y_res = ada.fit_resample(df_data, df_target)

# Build the classifier in this cell instead of relying on a `clf` left over in
# the kernel, so the notebook survives "Restart Kernel -> Run All".
# NOTE(review): max_depth=5 mirrors the trees built in the cross-validation
# cells of this notebook -- confirm it matches the tree originally exported.
clf = tree.DecisionTreeClassifier(max_depth=5)
clf = clf.fit(X_res, y_res)

# Render the fitted tree as Graphviz DOT source and convert it to a graph.
# `feature_names` is passed as a plain list because some scikit-learn releases
# reject a pandas Index here.
# NOTE(review): `class_names` are matched to the class labels in ascending
# order, so this presumes the lower label means GOOD and the higher one BAD.
dot_data = tree.export_graphviz(
    clf,
    out_file=None,
    feature_names=list(df_data.columns),
    filled=True,
    special_characters=True,
    class_names=['GOOD', 'BAD'],
)
graph = pydotplus.graph_from_dot_data(dot_data)

# NOTE(review): absolute, machine-specific output path -- consider moving it
# to a configurable location.
graph.write_pdf("D:/Poatek/Blinc/Risk_Project/DT.pdf")

# Cross-validation method applied to Decision Tree Models

Below, some methods for performing cross-validation and evaluating decision tree models are presented.

In [1]:
def crossvalidation(indices, K):
    """Split ``indices`` into ``K`` contiguous folds for K-fold cross-validation.

    The indices are partitioned in their given order (no shuffling) into
    exactly ``K`` test chunks whose sizes differ by at most one; the first
    ``len(indices) % K`` chunks receive the extra element. Each chunk is used
    as the test set once, with all remaining indices as the training set.

    Parameters
    ----------
    indices : sequence
        Sample identifiers to split.
    K : int
        Number of folds; must satisfy ``1 <= K <= len(indices)``.

    Returns
    -------
    list of tuple(list, list)
        ``K`` pairs ``(train, test)``.

    Raises
    ------
    ValueError
        If ``K`` is smaller than 1 or larger than ``len(indices)``.
    """
    indices = list(indices)
    n_samples = len(indices)
    if not 1 <= K <= n_samples:
        raise ValueError(
            f"K must be between 1 and the number of indices ({n_samples}), got {K}"
        )

    # A fixed chunk size of round(n_samples / K) can produce fewer than K
    # chunks (IndexError when picking the test chunk) or more than K chunks
    # (trailing samples never tested). Spreading the remainder over the first
    # folds always gives exactly K chunks that cover every sample once.
    base_size, n_larger = divmod(n_samples, K)

    Kfolds = []
    start = 0
    for fold in range(K):
        stop = start + base_size + (1 if fold < n_larger else 0)
        test = indices[start:stop]
        # Select the training part by position, not by comparing chunk
        # contents, so chunks that happen to hold equal values are kept.
        train = indices[:start] + indices[stop:]
        Kfolds.append((train, test))
        start = stop

    return Kfolds

In [None]:
import seaborn as sns
from sklearn.metrics import confusion_matrix


# The folds are consumed with `.iloc` below, so build them from row positions
# rather than from index labels.
# NOTE(review): the folds are contiguous and unshuffled; if the resampled data
# is ordered by class, some folds may contain a single class -- consider
# shuffling the positions first.
folds = crossvalidation(list(range(len(X_res))), 7)

# Pin the label order so every per-fold confusion matrix has the same shape,
# even when a fold happens to contain only one class.
# Assumes a binary target: the unpacking below expects exactly four cells.
class_labels = np.unique(y_res)

# Running totals of the confusion-matrix cells, accumulated over all folds.
tn, fp, fn, tp = 0, 0, 0, 0
for train_idx, test_idx in folds:

    X_train, y_train = X_res.iloc[train_idx], y_res.iloc[train_idx]
    X_test, y_test = X_res.iloc[test_idx], y_res.iloc[test_idx]

    clf = tree.DecisionTreeClassifier(max_depth=5)
    clf = clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    # For two classes, ravel() yields the cells in the order tn, fp, fn, tp.
    tn_temp, fp_temp, fn_temp, tp_temp = confusion_matrix(
        y_test, y_pred, labels=class_labels
    ).ravel()

    tn, fp, fn, tp = tn+tn_temp, fp+fp_temp, fn+fn_temp, tp+tp_temp

precision = tp/(tp+fp)
recall = tp/(tp+fn)
accuracy = (tp+tn)/(tp+tn+fp+fn)

print(f'Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}')

# Rows are predictions, columns are true classes. The bottom-left cell is the
# false-negative count (the earlier version repeated `fp` there). A separate
# name keeps the imported `confusion_matrix` function usable on re-runs.
cm_total = np.array([[tp, fp], [fn, tn]])
sns.heatmap(
    cm_total,
    annot=True,
    fmt="d",
    xticklabels=['actual positive', 'actual negative'],
    yticklabels=['predicted positive', 'predicted negative'],
);

### Evaluation with scikit-learn's `cross_validate`: accuracy, precision, recall and F1-score

In [None]:
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate

# Metrics computed on every fold; each key appears in the result as `test_<key>`.
scoring = {'accuracy' : make_scorer(accuracy_score), 
           'precision' : make_scorer(precision_score),
           'recall' : make_scorer(recall_score), 
           'f1_score' : make_scorer(f1_score)}

# `random_state` only takes effect together with `shuffle=True`; recent
# scikit-learn releases raise a ValueError when it is set without shuffling.
kfold = KFold(n_splits=7, shuffle=True, random_state=42)
clf = tree.DecisionTreeClassifier(max_depth=5)

results = cross_validate(estimator=clf, X=X_res, y=y_res, cv=kfold, scoring=scoring)
# Show the per-fold timings and scores as a table: one row per fold.
display(pd.DataFrame(results))

### Checking tree depth influence on model accuracy

In [None]:
from sklearn.model_selection import cross_val_score

# Collect one (max_depth, mean cross-validation score) pair per tree depth
# from 3 to 49.
depth = []
for i in range(3, 50):
    clf = tree.DecisionTreeClassifier(max_depth=i)
    # 10-fold cross-validation, spread over 4 parallel jobs
    scores = cross_val_score(clf, X_res, y_res, cv=10, n_jobs=4)
    depth += [(i, scores.mean())]
print(depth)

In [None]:
# Split the (depth, score) pairs into the two series to plot.
x = [tree_depth for tree_depth, _ in depth]
y = [mean_score for _, mean_score in depth]

fig, ax = plt.subplots(figsize=(12, 8))
ax.plot(x, y, 'o-')
ax.grid()
ax.set_xlabel('Decision Tree Depth')
ax.set_ylabel('Accuracy')
ax.set_title('Accuracy x Decision Tree depth', fontsize=20);
