by wedad\
Hey team this file contains the optimisation for Decision Tree Optimzation and experimentations\
I'll be comparing them to the baseline model found in baseline_model.ipynb\
The columns dropped are selected from '4_FeatureEngineeringRegression' file

# Decision Tree Optimisation
Approaches I took were feature selection, Hyperparam tuning, Different Tree Model testing\
And Ensemble of different models\
Overall the best performing model was the resampled Ensemble with reduced feature set, as see below:\
Resampled Ensemble Decision Tree
--------------------------------------------
Accuracy: 0.7652395514780836\
Precision:  0.3573594606585052\
recall:  0.48192771084337355\
f1_score:  0.40980134174773336

--------------------------------------------
In terms of improvement from baseline, removing features allowed for minor improvements as seen here:

--------------------------------------------
Accuracy: 0.7726049735018344\
Precision:  0.34347531906688905\
recall:  0.3773493975903614\
f1_score:  0.35957584643135104
--------------------------------------------

Read in all the relevant Libraries


In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
import warnings
warnings.filterwarnings('ignore')

In [2]:
seed_list = [1,2,3,4,5]

In [3]:
aki_df = pd.read_csv('../../data/df_final_AKI.csv')
aki_df = aki_df.drop(columns=['Unnamed: 0', 'subject_id'])
aki_df

Unnamed: 0,dod,gender,age,Albumin,Creatinine,Hemoglobin,INR(PT),PT,Sodium,Urea Nitrogen,Arterial Blood Pressure diastolic,Arterial Blood Pressure systolic,Heart Rate,Respiratory Rate,hypertension,chronic_kidney_disease,sepsis,Intercept
0,0,1,73,3.6,1.2,9.4,3.2,13.8,138.0,16.0,48.0,101.0,84.0,20.0,1,0,0,1
1,0,0,24,4.1,0.5,10.2,1.2,15.7,143.0,6.0,56.0,114.0,59.0,16.0,0,0,0,1
2,0,0,54,4.1,0.4,10.2,1.0,14.4,142.0,22.0,48.0,130.0,59.0,24.0,1,0,0,1
3,0,1,84,3.5,1.1,10.2,1.6,18.7,144.0,17.0,56.0,114.0,89.0,21.0,1,0,0,1
4,0,0,59,3.6,0.6,10.2,1.7,12.4,153.0,8.0,70.0,91.0,99.0,21.0,1,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9804,0,1,61,3.3,3.6,10.2,1.5,13.1,138.0,23.0,62.0,103.0,89.0,20.0,0,1,0,1
9805,0,1,74,3.5,0.6,10.2,1.2,13.6,138.0,6.0,56.0,114.0,107.0,22.0,1,0,0,1
9806,0,1,58,4.1,0.6,10.2,0.9,9.3,141.0,8.0,56.0,114.0,78.0,10.0,0,0,0,1
9807,1,1,84,4.0,3.4,12.2,1.1,17.9,141.0,88.0,56.0,114.0,101.0,25.0,1,1,0,1


In [4]:
model_desc = []
model_comparison_DF = pd.DataFrame(model_desc, columns=['model', 'accuracy', 'precision', 'recall', 'F1 score'])
model_comparison_DF

Unnamed: 0,model,accuracy,precision,recall,F1 score


### Baseline model

In [5]:
x = aki_df.drop(columns=['dod'])
y = aki_df['dod']

def TreeTester(model_desc, model, x, y, title='Tree', printer=False):
    total_acc = 0
    total_pre = 0
    total_rec = 0
    total_f1 = 0
    total = len(seed_list)
    for rand in seed_list:
        x_train, x_test, y_train, y_test = train_test_split(x, y,stratify=y, test_size=0.25, random_state=rand)
        model.fit(x_train, y_train)
        y_pred_log = model.predict(x_test)
        total_acc += accuracy_score(y_test, y_pred_log)
        total_pre += precision_score(y_test, y_pred_log)
        total_rec += recall_score(y_test, y_pred_log)
        total_f1 += f1_score(y_test, y_pred_log)

    model_desc.append([
        title,
        total_acc/total,
        total_pre/total,
        total_rec/total,
        total_f1/total
        
                      ])
    if printer:
        # Evaluate
        print(f"{title}")
        print('--------------------------------------------')
        print("Accuracy:", total_acc/total,)
        print("Precision: ", total_pre/total,)
        print("recall: ", total_rec/total,)
        print("f1_score: ", total_f1/total,)
        print('--------------------------------------------')
        # tree.plot_tree(model)
        # print("Classification Report:\n", classification_report(y_test, y_pred_bin))
        plt.show()


In [6]:
dTree = DecisionTreeClassifier()
print("Baseline model for Decision Tree")
TreeTester(model_desc, dTree, x, y,'Decision Tree baseline', True)

Baseline model for Decision Tree
Decision Tree baseline
--------------------------------------------
Accuracy: 0.7679576029351814
Precision:  0.32624558591065894
recall:  0.3489156626506024
f1_score:  0.3370595970853561
--------------------------------------------


In [7]:
x = aki_df.drop(columns=['dod','Sodium','Hemoglobin','gender','hypertension','Intercept'])
y = aki_df['dod']

print("Modified Feature Baseline model for Decision Tree")
TreeTester(model_desc, dTree, x, y,'Decision Tree reduced-feature baseline', True)

Modified Feature Baseline model for Decision Tree
Decision Tree reduced-feature baseline
--------------------------------------------
Accuracy: 0.7726049735018344
Precision:  0.34347531906688905
recall:  0.3773493975903614
f1_score:  0.35957584643135104
--------------------------------------------


### Hyper Parameter Tuning

In [9]:
# This code takes some time to run, approx 10 -15 te

criterions =  ['gini', 'entropy']
splitters = ['best', 'random']
max_depths = [None, 10, 20, 30, 40]
min_samples_splits = [2, 5, 10]
min_samples_leafs = [1, 2, 4, 8]
max_featuress = [None, 'auto', 'sqrt', 'log2']
min_impurity_decreases = [0.0, 0.1, 0.2, 0.3]
class_weights = ['balanced', None]

for crit in criterions:
    for split in splitters:
        for depth in max_depths:
            for min_split in min_samples_splits:
                for min_leafs in min_samples_leafs:
                    for max_feature in max_featuress:
                        for impurity_dec in min_impurity_decreases:
                            for weight in class_weights:
                                dt = DecisionTreeClassifier(
                                    criterion=crit, 
                                    splitter=split, 
                                    max_depth=depth, 
                                    min_samples_split=min_split, 
                                    min_samples_leaf=min_leafs, 
                                    max_features=max_feature, 
                                    random_state=0, 
                                    min_impurity_decrease=impurity_dec, 
                                    class_weight=weight
                                )
                                TreeTester(model_desc, dt, x, y,f'{dt}', False)
                                
                                




In [11]:
model_comparison_DF = pd.DataFrame(model_desc, columns=['model', 'accuracy', 'precision', 'recall', 'F1 score'])
model_comparison_DF

Unnamed: 0,model,accuracy,precision,recall,F1 score
0,Decision Tree baseline,0.767958,0.326246,0.348916,0.337060
1,Decision Tree reduced-feature baseline,0.772605,0.343475,0.377349,0.359576
2,DecisionTreeClassifier(class_weight='balanced'...,0.769914,0.316905,0.312771,0.314681
3,DecisionTreeClassifier(random_state=0),0.774562,0.347693,0.379759,0.362945
4,DecisionTreeClassifier(class_weight='balanced'...,0.169181,0.169181,1.000000,0.289400
...,...,...,...,...,...
7677,"DecisionTreeClassifier(criterion='entropy', ma...",0.830819,0.000000,0.000000,0.000000
7678,DecisionTreeClassifier(class_weight='balanced'...,0.169181,0.169181,1.000000,0.289400
7679,"DecisionTreeClassifier(criterion='entropy', ma...",0.830819,0.000000,0.000000,0.000000
7680,DecisionTreeClassifier(class_weight='balanced'...,0.169181,0.169181,1.000000,0.289400


In [16]:
model_comparison_DF.sort_values(by=['F1 score'], ascending=False).head(3)

Unnamed: 0,model,accuracy,precision,recall,F1 score
6338,DecisionTreeClassifier(class_weight='balanced'...,0.70583,0.30644,0.581687,0.401071
6210,DecisionTreeClassifier(class_weight='balanced'...,0.70583,0.30644,0.581687,0.401071
2498,DecisionTreeClassifier(class_weight='balanced'...,0.696698,0.301701,0.594699,0.399547


In [17]:
model_comparison_DF.sort_values(by=['accuracy'], ascending=False).head(3)

Unnamed: 0,model,accuracy,precision,recall,F1 score
2499,"DecisionTreeClassifier(max_depth=10, min_sampl...",0.833184,0.526923,0.184578,0.27195
2371,"DecisionTreeClassifier(max_depth=10, min_sampl...",0.833184,0.526923,0.184578,0.27195
6299,"DecisionTreeClassifier(criterion='entropy', ma...",0.833021,0.535954,0.106506,0.175351


In [18]:
model_comparison_DF.sort_values(by=['precision'], ascending=False).head(3)

Unnamed: 0,model,accuracy,precision,recall,F1 score
6283,"DecisionTreeClassifier(criterion='entropy', ma...",0.833021,0.535954,0.106506,0.175351
6299,"DecisionTreeClassifier(criterion='entropy', ma...",0.833021,0.535954,0.106506,0.175351
6291,"DecisionTreeClassifier(criterion='entropy', ma...",0.833021,0.535954,0.106506,0.175351


In [19]:
model_comparison_DF.sort_values(by=['recall'], ascending=False).head(3)

Unnamed: 0,model,accuracy,precision,recall,F1 score
3486,DecisionTreeClassifier(class_weight='balanced'...,0.169181,0.169181,1.0,0.2894
3636,DecisionTreeClassifier(class_weight='balanced'...,0.169181,0.169181,1.0,0.2894
2016,DecisionTreeClassifier(class_weight='balanced'...,0.169181,0.169181,1.0,0.2894


It appears that decision trees with the highest recall and precision classifies all the test data as either dod or alive, however models focused on F1 score shows promise given a balanced precision and recall, however these scores are little low

In [38]:
print('best F1 Score 1: ',model_comparison_DF.sort_values(by=['F1 score'], ascending=False).head(3).iloc[0]['model'])
print('best F1 Score 2: ',model_comparison_DF.sort_values(by=['F1 score'], ascending=False).head(3).iloc[1]['model'])
print('best F1 Score 3: ',model_comparison_DF.sort_values(by=['F1 score'], ascending=False).head(3).iloc[2]['model'])
print('best Accuracy Score 1: ',model_comparison_DF.sort_values(by=['accuracy'], ascending=False).head(3).iloc[0]['model'])
print('best recall Score 1: ',model_comparison_DF.sort_values(by=['recall'], ascending=False).head(3).iloc[0]['model'])

best F1 Score 1:  DecisionTreeClassifier(class_weight='balanced', criterion='entropy',
                       max_depth=10, min_samples_leaf=4, min_samples_split=5,
                       random_state=0, splitter='random')
best F1 Score 2:  DecisionTreeClassifier(class_weight='balanced', criterion='entropy',
                       max_depth=10, min_samples_leaf=4, random_state=0,
                       splitter='random')
best F1 Score 3:  DecisionTreeClassifier(class_weight='balanced', max_depth=10,
                       min_samples_leaf=4, min_samples_split=5, random_state=0,
                       splitter='random')
best Accuracy Score 1:  DecisionTreeClassifier(max_depth=10, min_samples_leaf=4, min_samples_split=5,
                       random_state=0, splitter='random')
best recall Score 1:  DecisionTreeClassifier(class_weight='balanced', max_depth=40,
                       max_features='log2', min_impurity_decrease=0.2,
                       random_state=0, splitter='random')


### Ensemble

In [57]:
def TreeEnsemble(model_list, model_desc, x, y, title='Combined Ensemble'):
    prediction_list = []
    total_acc = 0
    total_pre = 0
    total_rec = 0
    total_f1 = 0
    total = len(seed_list)
    
    for rand in seed_list:
        prediction_list = []
        for model in model_list:
            x_train, x_test, y_train, y_test = train_test_split(x, y,stratify=y, test_size=0.25, random_state=rand)
            model.fit(x_train, y_train)
            y_pred_log = model.predict(x_test)
            prediction_list.append(y_pred_log)

        prediction_list = [sum(x) for x in zip(*prediction_list)]
        final_prediction = []
        for pred in prediction_list:
            if pred > 0:
                final_prediction.append(1)
            else:
                final_prediction.append(0)
                
        total_acc += accuracy_score(y_test, final_prediction)
        total_pre += precision_score(y_test, final_prediction)
        total_rec += recall_score(y_test, final_prediction)
        total_f1 += f1_score(y_test, final_prediction)


    model_desc.append([
        "Decision Tree Ensemble",
        total_acc/total,
        total_pre/total,
        total_rec/total,
        total_f1/total
                      ])
    
    # Evaluate
    print(f"{title}")
    print('--------------------------------------------')
    print("Accuracy:", total_acc/total)
    print("Precision: ", total_pre/total)
    print("recall: ", total_rec/total)
    print("f1_score: ", total_f1/total)
    print('--------------------------------------------')
    # print("Classification Report:\n", classification_report(y_test, y_pred_bin))
    plt.show()

In [58]:
x = aki_df.drop(columns=['dod','Sodium','Hemoglobin','gender','hypertension','Intercept'])
y = aki_df['dod']

f1_best1 = DecisionTreeClassifier(class_weight='balanced', criterion='entropy',
                       max_depth=10, min_samples_leaf=4, min_samples_split=5,
                       random_state=0, splitter='random')

f1_best2 = DecisionTreeClassifier(class_weight='balanced', criterion='entropy',
                       max_depth=10, min_samples_leaf=4, random_state=0,
                       splitter='random')

f1_best3 = DecisionTreeClassifier(class_weight='balanced', max_depth=10,
                       min_samples_leaf=4, min_samples_split=5, random_state=0,
                       splitter='random')

recall_best = DecisionTreeClassifier(max_depth=10, min_samples_leaf=4, min_samples_split=5,
                       random_state=0, splitter='random')


accuracy_best = DecisionTreeClassifier(max_depth=10, min_samples_leaf=4, min_samples_split=5,
                       random_state=0, splitter='random')

model_list = [f1_best1, f1_best2, f1_best2, recall_best, accuracy_best]

TreeEnsemble(model_list, model_desc, x, y, title='Reduced Feature Set Decision Tree Combined Ensemble')


x = aki_df.drop(columns=['dod'])
y = aki_df['dod']

TreeEnsemble(model_list, model_desc, x, y, title='Full Feature Set Decision Tree Combined Ensemble')


Reduced Feature Set Decision Tree Combined Ensemble
--------------------------------------------
Accuracy: 0.7020790868324501
Precision:  0.30876473096301466
recall:  0.6120481927710844
f1_score:  0.41020830539102954
--------------------------------------------
Full Feature Set Decision Tree Combined Ensemble
--------------------------------------------
Accuracy: 0.7151243375458622
Precision:  0.31300839147187953
recall:  0.567710843373494
f1_score:  0.40271091455799385
--------------------------------------------


### Sampling Methods

In [68]:
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler, NearMiss
from imblearn.combine import SMOTEENN, SMOTETomek

In [69]:
x = aki_df.drop(columns=['dod', 'Sodium','Hemoglobin','gender','hypertension','Intercept'])
y = aki_df['dod']

x_train, x_test, y_train, y_test = train_test_split(x, y, stratify=y, test_size=0.2, random_state=0)
# Define models for each approach
models = [f1_best1, f1_best2, f1_best2, recall_best, accuracy_best]
# Define sampling methods
sampling_methods = [None,  RandomOverSampler(),  SMOTE(), ADASYN(),RandomUnderSampler(), SMOTEENN(), SMOTETomek(), NearMiss()]

In [70]:
# Define a function to train and evaluate models
def resampled_eval(model, x_train, y_train, x_test, y_test, sampling_method, seed_list, model_desc, printer = False):
    
    total_acc = 0
    total_pre = 0
    total_rec = 0
    total_f1 = 0
    total = len(seed_list)
    
    for j in seed_list:
        if sampling_method:
            x_train_resampled, y_train_resampled = sampling_method.fit_resample(x_train, y_train)
        else:
            x_train_resampled = x_train
            y_train_resampled = y_train
            
        model.fit(x_train_resampled, y_train_resampled)
        y_pred = model.predict(x_test)

        # Evaluate
        total_acc += accuracy_score(y_test, y_pred)
        total_pre += precision_score(y_test, y_pred)
        total_rec += recall_score(y_test, y_pred)
        total_f1 += f1_score(y_test, y_pred)
        
        
    model_desc.append([
    str(sampling_method) + str(model),
    total_acc/total,
    total_pre/total,
    total_rec/total,
    total_f1/total
    ])
        
    if printer:
        print(f"{str(sampling_method)}")
        print('--------------------------------------------')
        print("Accuracy:", total_acc/total,)
        print("Precision: ", total_pre/total,)
        print("recall: ", total_rec/total,)
        print("f1_score: ", total_f1/total,)
        print('--------------------------------------------')
    
    return

In [71]:
print("Sampling Methods for Decision Trees")
sampling_desc = []
for samp in sampling_methods:
    for mod in models:
        resampled_eval(mod, x_train, y_train, x_test, y_test, samp, seed_list, sampling_desc)
    
sample_comparison_DF = pd.DataFrame(sampling_desc, columns=['model', 'accuracy', 'precision', 'recall', 'F1 score'])
sample_comparison_DF

Sampling Methods for Decision Trees


Unnamed: 0,model,accuracy,precision,recall,F1 score
0,NoneDecisionTreeClassifier(class_weight='balan...,0.675331,0.279942,0.584337,0.378537
1,NoneDecisionTreeClassifier(class_weight='balan...,0.675331,0.279942,0.584337,0.378537
2,NoneDecisionTreeClassifier(class_weight='balan...,0.675331,0.279942,0.584337,0.378537
3,"NoneDecisionTreeClassifier(max_depth=10, min_s...",0.82263,0.432203,0.153614,0.226667
4,"NoneDecisionTreeClassifier(max_depth=10, min_s...",0.82263,0.432203,0.153614,0.226667
5,RandomOverSampler()DecisionTreeClassifier(clas...,0.735984,0.320915,0.496988,0.389623
6,RandomOverSampler()DecisionTreeClassifier(clas...,0.723751,0.313328,0.526506,0.392248
7,RandomOverSampler()DecisionTreeClassifier(clas...,0.705199,0.298362,0.539759,0.383561
8,RandomOverSampler()DecisionTreeClassifier(max_...,0.725076,0.312082,0.516867,0.388376
9,RandomOverSampler()DecisionTreeClassifier(max_...,0.70683,0.302218,0.551205,0.389247


In [72]:
sample_comparison_DF.sort_values(by=['F1 score'], ascending=False).head(2)

Unnamed: 0,model,accuracy,precision,recall,F1 score
33,SMOTETomek()DecisionTreeClassifier(max_depth=1...,0.725178,0.317476,0.542771,0.400482
26,SMOTEENN()DecisionTreeClassifier(class_weight=...,0.708869,0.307389,0.574096,0.400178


In [73]:
sample_comparison_DF.sort_values(by=['accuracy'], ascending=False).head(2)

Unnamed: 0,model,accuracy,precision,recall,F1 score
3,"NoneDecisionTreeClassifier(max_depth=10, min_s...",0.82263,0.432203,0.153614,0.226667
4,"NoneDecisionTreeClassifier(max_depth=10, min_s...",0.82263,0.432203,0.153614,0.226667


In [74]:
sample_comparison_DF.sort_values(by=['precision'], ascending=False).head(2)

Unnamed: 0,model,accuracy,precision,recall,F1 score
3,"NoneDecisionTreeClassifier(max_depth=10, min_s...",0.82263,0.432203,0.153614,0.226667
4,"NoneDecisionTreeClassifier(max_depth=10, min_s...",0.82263,0.432203,0.153614,0.226667


In [75]:
sample_comparison_DF.sort_values(by=['recall'], ascending=False).head(2)

Unnamed: 0,model,accuracy,precision,recall,F1 score
39,"NearMiss()DecisionTreeClassifier(max_depth=10,...",0.529562,0.239188,0.816265,0.369966
38,"NearMiss()DecisionTreeClassifier(max_depth=10,...",0.529562,0.239188,0.816265,0.369966


In [76]:
print('best recall: ', sample_comparison_DF.sort_values(by=['recall'], ascending=False).head(1).iloc[0]['model'])
print('best precision: ',sample_comparison_DF.sort_values(by=['precision'], ascending=False).head(1).iloc[0]['model'])
print('best accuracy: ',sample_comparison_DF.sort_values(by=['accuracy'], ascending=False).head(1).iloc[0]['model'])
print('best F1 Score: ',sample_comparison_DF.sort_values(by=['F1 score'], ascending=False).head(1).iloc[0]['model'])

best recall:  NearMiss()DecisionTreeClassifier(max_depth=10, min_samples_leaf=4, min_samples_split=5,
                       random_state=0, splitter='random')
best precision:  NoneDecisionTreeClassifier(max_depth=10, min_samples_leaf=4, min_samples_split=5,
                       random_state=0, splitter='random')
best accuracy:  NoneDecisionTreeClassifier(max_depth=10, min_samples_leaf=4, min_samples_split=5,
                       random_state=0, splitter='random')
best F1 Score:  SMOTETomek()DecisionTreeClassifier(max_depth=10, min_samples_leaf=4, min_samples_split=5,
                       random_state=0, splitter='random')


## Ensemble of the best resampled models


In [80]:

final_prediction = []
total_acc = 0
total_pre = 0
total_rec = 0
total_f1 = 0
total = len(seed_list)

#Near Miss Sampled
best_recall = DecisionTreeClassifier(max_depth=10, min_samples_leaf=4, min_samples_split=5,
                       random_state=0, splitter='random')
# No sampling
best_accuracy = DecisionTreeClassifier(max_depth=10, min_samples_leaf=4, min_samples_split=5,
                       random_state=0, splitter='random')

#SMOTEK sampling
best_f1 = DecisionTreeClassifier(max_depth=10, min_samples_leaf=4, min_samples_split=5,
                       random_state=0, splitter='random')

    
for rand in seed_list:
    prediction_list = []
    
    x_train, x_test, y_train, y_test = train_test_split(x, y,stratify=y, test_size=0.2, random_state=rand)
        
    x_train_nearMiss, y_train_nearMiss = NearMiss().fit_resample(x_train, y_train)
    x_train_smotek, y_train_smotek = SMOTETomek().fit_resample(x_train, y_train)
    
    best_f1.fit(x_train_smotek, y_train_smotek)
        
    best_recall.fit(x_train_nearMiss, y_train_nearMiss)
        
    best_accuracy.fit(x_train, y_train)
        
        
    y_pred_log1 = best_f1.predict(x_test)
    y_pred_log2 = best_recall.predict(x_test)
    y_pred_log3 = best_accuracy.predict(x_test) 

    prediction_list.append(y_pred_log1)
    prediction_list.append(y_pred_log2)
    prediction_list.append(y_pred_log3)
                
    prediction_list = [sum(x) for x in zip(*prediction_list)]
    # print(set(prediction_list))
    
    final_prediction = []
    for pred in prediction_list:
        if pred >1:
            final_prediction.append(1)
        else:
            final_prediction.append(0)
                
        # print(set(prediction_list))
    total_acc += accuracy_score(y_test, final_prediction)
    total_pre += precision_score(y_test, final_prediction)
    total_rec += recall_score(y_test, final_prediction)
    total_f1 += f1_score(y_test, final_prediction)

    # Evaluate
print(f"Resampled Ensemble Decision Tree")
print('--------------------------------------------')
print("Accuracy:", total_acc/total)
print("Precision: ", total_pre/total)
print("recall: ", total_rec/total)
print("f1_score: ", total_f1/total)
print('--------------------------------------------')

    

Resampled Ensemble Decision Tree
--------------------------------------------
Accuracy: 0.7652395514780836
Precision:  0.3573594606585052
recall:  0.48192771084337355
f1_score:  0.40980134174773336
--------------------------------------------
