# 3. Classification

---

### Setup

In [5]:
import sys
sys.path.insert(1, '../../utils')

import pandas as pd

NUM_SPLITS = 10

trn_splits = [None] * NUM_SPLITS
tst_splits = [None] * NUM_SPLITS
    
for spli in range(NUM_SPLITS):  
    trn_splits[spli] = pd.read_csv('data/prepared/%d.csv' % spli, sep=';')
    tst_splits[spli] = pd.read_csv('data/prepared_test/%d.csv' % spli, sep=';')

## 3.5. Gradient Boosting

---

### Run algorithm

---

In [6]:
import numpy as np
import matplotlib.pyplot as plt
import sklearn.metrics as metrics
from sklearn.ensemble import GradientBoostingClassifier
import ds_functions as ds

labels = [False, True]
target = 'DEATH_EVENT'
cols = len(max_depths)

n_estimators = [5, 10, 25, 50, 75, 100, 150, 200, 250, 300]
max_depths = [5, 10, 25]
learning_rate = [.1, .3, .5, .7, .9]

best_trees = []
bests = []
accuracies = [[] for _ in range(len(max_depths))]

trnY_list = []
trnX_list = []
tstY_list = []
tstX_list = []


for spli in range(NUM_SPLITS):
    print("Split %d" % spli)
    
    trn_split = trn_splits[spli]
    tst_split = tst_splits[spli]   

    trnY = trn_split.pop(target).values
    trnX = trn_split.values
    tstY = tst_split.pop(target).values
    tstX = tst_split.values
    
    trnY_list.append(trnY)
    trnX_list.append(trnX)
    tstY_list.append(tstY)
    tstX_list.append(tstX)
    
    
    best = ('', 0, 0)
    last_best = 0
    best_tree = None

    
    for k in range(cols):
        d = max_depths[k]
        values = {}
        for lr in learning_rate:
            yvalues = []
            for n in n_estimators:
                gb = GradientBoostingClassifier(n_estimators=n, max_depth=d, learning_rate=lr)
                gb.fit(trnX, trnY)
                prdY = gb.predict(tstX)
                yvalues.append(metrics.accuracy_score(tstY, prdY))
                if yvalues[-1] > last_best:
                    best = (d, lr, n)
                    last_best = yvalues[-1]
                    best_tree = gb
            values[lr] = yvalues
        
        accuracies[k].append(values)
    
    best_trees.append(best_tree)
    bests.append(best)
    
    print('Best results with depth=%d, learning rate=%1.2f and %d estimators, with accuracy=%1.2f' %
          (*best, last_best))


Split 0
Split 0: Best results with depth=5, learning rate=0.70 and 10 estimators, with accuracy=0.87
Split 1
Split 1: Best results with depth=5, learning rate=0.70 and 150 estimators, with accuracy=0.83
Split 2
Split 2: Best results with depth=5, learning rate=0.70 and 5 estimators, with accuracy=0.87
Split 3
Split 3: Best results with depth=5, learning rate=0.50 and 300 estimators, with accuracy=0.90
Split 4
Split 4: Best results with depth=5, learning rate=0.50 and 25 estimators, with accuracy=0.93
Split 5
Split 5: Best results with depth=5, learning rate=0.30 and 10 estimators, with accuracy=0.93
Split 6
Split 6: Best results with depth=5, learning rate=0.70 and 5 estimators, with accuracy=0.87
Split 7
Split 7: Best results with depth=5, learning rate=0.10 and 25 estimators, with accuracy=0.90
Split 8
Split 8: Best results with depth=10, learning rate=0.10 and 25 estimators, with accuracy=0.97
Split 9
Split 9: Best results with depth=5, learning rate=0.70 and 50 estimators, with acc

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import sklearn.metrics as metrics
from sklearn.ensemble import GradientBoostingClassifier
import ds_functions as ds

labels = [False, True]
target = 'DEATH_EVENT'
cols = len(max_depths)

#min_impurity_decrease = [0.025, 0.01, 0.005, 0.0025, 0.001]
#max_depths = [2, 5, 10, 15, 20, 25]
#criteria = ['entropy', 'gini']
n_estimators = [5, 10, 25, 50, 75, 100, 150, 200, 250, 300]
max_depths = [5, 10, 25]
learning_rate = [.1, .3, .5, .7, .9]

best = ('',  0, 0.0) # (criteria, max depth, min impurity decrease)
#best_tree = None
last_best = 0 # accuracy
#best_split = None
#best_spli = 0

split_best = [('',  0, 0.0)] * NUM_SPLITS
split_best_tree = [None] * NUM_SPLITS
split_last_best = [0] * NUM_SPLITS

plt.figure()
fig, axs = plt.subplots(1, 2, figsize=(16, 4), squeeze=False)
for k in range(len(criteria)):
    f = criteria[k]
    values = {}
    for d in max_depths:
        values[d] = []
        for i in range(len(min_impurity_decrease)):
            values[d].append(0)
    for spli in range(NUM_SPLITS):
        split = splits[spli]
        for d in max_depths:
            yvalues = []
            for imp in min_impurity_decrease:
                tree = DecisionTreeClassifier(max_depth=d, criterion=f, min_impurity_decrease=imp)
                tree.fit(split['X_train'], split['y_train'])
                split['prdY'] = tree.predict(split['X_test'])
                yvalues.append(metrics.accuracy_score(split['y_test'], split['prdY']))
                # Check if accuracy is better than best overall
                if yvalues[-1] > last_best:
                    best = (f, d, imp)
                    last_best = yvalues[-1]
                    best_tree = tree
                    best_split = split
                    best_spli = spli
                # Check if accuracy is better than best in current split
                if yvalues[-1] > split_last_best[spli]:
                    split_best[spli] = (f, d, imp)
                    split_last_best[spli] = yvalues[-1]
                    split_best_tree[spli] = tree
            # Increment total accuracy for current (max_depth, min_impurity decrease)
            for i in range(len(yvalues)):
                values[d][i] += yvalues[i]
    # Normalize
    for v in values:
        for i in range(len(values[v])):
            values[v][i] /= NUM_SPLITS
            
    ds.multiple_line_chart(min_impurity_decrease, values, ax=axs[0, k], title='Decision Trees with %s criteria'%f,
                           xlabel='min_impurity_decrease', ylabel='accuracy', percentage=True)
plt.show()
print('Best results achieved in split %d with %s criteria, depth=%d and min_impurity_decrease=%f ==> accuracy=%f'%(best_spli, best[0], best[1], best[2], last_best))

print('Best results per split:')
for i in range(len(splits)):
    print('\tSplit %d: %s criteria, depth=%d and min_impurity_decrease=%f ==> accuracy=%f' % (i, *split_best[i], split_last_best[i]))

### Average results

---

In [14]:
type(accuracies[0][0][0.1])

list

In [None]:
total_accuracies = [{lr: 0 for lr in learning_rate} for _ in cols]

for k in range(cols):
    for spli in range(NUM_SPLITS):
        for 
    total_accuracies



plt.figure()
fig, axs = plt.subplots(1, cols, figsize=(cols*ds.HEIGHT, ds.HEIGHT), squeeze=False)

for k in range(cols):   
    ds.multiple_line_chart(n_estimators, values, ax=axs[0, k], title='Gradient Boorsting with max_depth=%d'%d,
                               xlabel='nr estimators', ylabel='accuracy', percentage=True)
plt.show()

In [7]:
prd_train_all = []
prd_test_all = []

for spli in range(NUM_SPLITS):
    prd_train_all.extend(best_trees[spli].predict(trnX_list[spli]))
    prd_test_all.extend(best_trees[spli].predict(tstX_list[spli]))
    
ds.plot_evaluation_results(labels, trnY_list, prd_train_all, tstY_list, prd_test_all, showXTickLabels=True)

ValueError: Found input variables with inconsistent numbers of samples: [10, 2691]

In [None]:
for spli in range(NUM_SPLITS):
    d = split_best[spli][1]
    plt.figure()
    fig, axs = plt.subplots(1, len(criteria), figsize=(16, 4), squeeze=False)
    for k in range(len(criteria)):
        f = criteria[k]
        values = {}
        yvalues = []
        yvalues_train = []
        for imp in min_impurity_decrease:
            tree = DecisionTreeClassifier(max_depth=d, criterion=f, min_impurity_decrease=imp)
            tree.fit(trnX_list[spli], trnY_list[spli])
            prdY = tree.predict(tstX_list[spli])
            prdY_train = tree.predict(trnX_list[spli])
            yvalues.append(metrics.accuracy_score(tstY_list[spli], prdY))
            yvalues_train.append(metrics.accuracy_score(trnY_list[spli], prdY_train))
        values['test'] = yvalues
        values['train'] = yvalues_train
        ds.multiple_line_chart(min_impurity_decrease, values, ax=axs[0, k], title='Decision Trees with %s criteria (split %d)'%(f, spli),
                           xlabel='min_impurity_decrease', ylabel='accuracy', percentage=True)

    plt.show()