by wedad\
Hey team this file contains the best combined ensemble models from for Decision Tree and Regression model experimentations\
I'll be comparing them to the baseline model found in baseline_model.ipynb\
The columns dropped are selected from '4_FeatureEngineeringRegression' file

### Ensemble Models

#### Importing Relevant Libraries and data

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score

from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler, NearMiss
from imblearn.combine import SMOTEENN, SMOTETomek

import warnings
warnings.filterwarnings('ignore')

In [2]:
aki_df = pd.read_csv('../../data/df_final_AKI.csv')
aki_df = aki_df.drop(columns=['Unnamed: 0', 'subject_id'])
aki_df

Unnamed: 0,dod,gender,age,Albumin,Creatinine,Hemoglobin,INR(PT),PT,Sodium,Urea Nitrogen,Arterial Blood Pressure diastolic,Arterial Blood Pressure systolic,Heart Rate,Respiratory Rate,hypertension,chronic_kidney_disease,sepsis,Intercept
0,0,1,73,3.6,1.2,9.4,3.2,13.8,138.0,16.0,48.0,101.0,84.0,20.0,1,0,0,1
1,0,0,24,4.1,0.5,10.2,1.2,15.7,143.0,6.0,56.0,114.0,59.0,16.0,0,0,0,1
2,0,0,54,4.1,0.4,10.2,1.0,14.4,142.0,22.0,48.0,130.0,59.0,24.0,1,0,0,1
3,0,1,84,3.5,1.1,10.2,1.6,18.7,144.0,17.0,56.0,114.0,89.0,21.0,1,0,0,1
4,0,0,59,3.6,0.6,10.2,1.7,12.4,153.0,8.0,70.0,91.0,99.0,21.0,1,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9804,0,1,61,3.3,3.6,10.2,1.5,13.1,138.0,23.0,62.0,103.0,89.0,20.0,0,1,0,1
9805,0,1,74,3.5,0.6,10.2,1.2,13.6,138.0,6.0,56.0,114.0,107.0,22.0,1,0,0,1
9806,0,1,58,4.1,0.6,10.2,0.9,9.3,141.0,8.0,56.0,114.0,78.0,10.0,0,0,0,1
9807,1,1,84,4.0,3.4,12.2,1.1,17.9,141.0,88.0,56.0,114.0,101.0,25.0,1,1,0,1


In [3]:
seed_list = [0]

### training Models

In [4]:
x = aki_df.drop(columns=['dod', 'Sodium','Hemoglobin','gender','hypertension','Intercept'])
y = aki_df['dod']

x_train, x_test, y_train, y_test = train_test_split(x, y, stratify=y, test_size=0.2, random_state=seed_list[0])
model_desc = []

#### Linear Ensemble

In [5]:

linear_sampling_methods = [None,  RandomOverSampler(),  SMOTE(), ADASYN(),RandomUnderSampler(), SMOTEENN(), SMOTETomek(), NearMiss()]

lin_reg = linear_model.LinearRegression()
ridge_model = linear_model.Ridge(alpha=0, solver='cholesky')
lasso_model = linear_model.Lasso(alpha=0.00001)
EN_model = linear_model.ElasticNet(alpha=0.1)
bayridge_model = linear_model.BayesianRidge()

linear_model_list = [lin_reg, ridge_model, lasso_model, EN_model, bayridge_model]

In [6]:
def linearEnsemble_Resampled(model_list, model_desc, sampling_methods, x_train, y_train, x_test, y_test,seed_list, majority_vote, title='Combined Resampled Ensemble'):
    final_prediction = []
    total_acc = 0
    total_pre = 0
    total_rec = 0
    total_f1 = 0
    total = len(seed_list)
    
   
    for model in model_list: 
        prediction_list = []
        for samp in sampling_methods:
            x_train_temp = x_train
            y_train_temp = y_train
            if samp != None:
                x_train_temp, y_train_temp = samp.fit_resample(x_train, y_train)
            model.fit(x_train_temp, y_train_temp)
            y_pred_lin = model.predict(x_test)
            y_pred_bin = [1 if y > 0.5 else 0 for y in y_pred_lin]
            prediction_list.append(y_pred_bin)
#                 prediction_list = [a + b for a, b in zip(prediction_list, y_pred_bin)]
# #                 prediction_list.append(y_pred_bin)
                
    prediction_list = [sum(x) for x in zip(*prediction_list)]
    final_prediction = []
    print("Number of votes")
    print(set(prediction_list))
    for pred in prediction_list:
        if pred > majority_vote:
            final_prediction.append(1)
        else:
            final_prediction.append(0)
                
    # print(set(prediction_list))
    total_acc += accuracy_score(y_test, final_prediction)
    total_pre += precision_score(y_test, final_prediction)
    total_rec += recall_score(y_test, final_prediction)
    total_f1 += f1_score(y_test, final_prediction)
    
    model_desc.append([
        "linear Regression Ensemble",
        total_acc/total,
        total_pre/total,
        total_rec/total,
        total_f1/total
                      ])

    # Evaluate
    print(f"{title}")
    print('--------------------------------------------')
    print("Accuracy:", total_acc/total)
    print("Precision: ", total_pre/total)
    print("recall: ", total_rec/total)
    print("f1_score: ", total_f1/total)
    print('--------------------------------------------')
    # print("Classification Report:\n", classification_report(y_test, y_pred_bin))
    plt.show()
    
    return final_prediction

In [7]:
linear_prediction =  linearEnsemble_Resampled(linear_model_list, model_desc, linear_sampling_methods, x_train, y_train, x_test, y_test, seed_list, 6, title=f'Linear Sampled Ensemble')

Number of votes
{0, 1, 2, 3, 4, 5, 6, 7, 8}
Linear Sampled Ensemble
--------------------------------------------
Accuracy: 0.7864424057084608
Precision:  0.3893129770992366
recall:  0.4608433734939759
f1_score:  0.4220689655172413
--------------------------------------------


#### Logistic Ensemble

In [8]:
log_lbgfs = linear_model.LogisticRegression(penalty='l2', solver= 'lbfgs', class_weight = 'balanced', max_iter=1000, C=0.1)
log_liblinear = linear_model.LogisticRegression(penalty='l1', solver= 'liblinear', class_weight = None, max_iter=1000, C=0.1)
log_saga = linear_model.LogisticRegression(penalty='l2', solver= 'saga', class_weight = None, max_iter=1000, C=0.001)

model_list = [log_lbgfs, log_liblinear, log_saga]

In [9]:
def logisticEnsemble(model_list, model_desc, x_train, y_train, x_test, y_test, title='Combined Ensemble'):
    prediction_list = []
    total_acc = 0
    total_pre = 0
    total_rec = 0
    total_f1 = 0
    total = len(seed_list)
    
    prediction_list = []
    for model in model_list:
        x_train_temp = x_train
        y_train_temp = y_train
        model.fit(x_train_temp, y_train_temp)
        y_pred_log = model.predict(x_test)
        prediction_list.append(y_pred_log)

    prediction_list = [sum(x) for x in zip(*prediction_list)]
    final_prediction = []
    for pred in prediction_list:
        if pred >1:
            final_prediction.append(1)
        else:
            final_prediction.append(0)
            
    total_acc += accuracy_score(y_test, final_prediction)
    total_pre += precision_score(y_test, final_prediction)
    total_rec += recall_score(y_test, final_prediction)
    total_f1 += f1_score(y_test, final_prediction)


    model_desc.append([
        "logistic Regression Ensemble",
        total_acc/total,
        total_pre/total,
        total_rec/total,
        total_f1/total
                      ])
    
    # Evaluate
    print(f"{title}")
    print('--------------------------------------------')
    print("Accuracy:", total_acc/total)
    print("Precision: ", total_pre/total)
    print("recall: ", total_rec/total)
    print("f1_score: ", total_f1/total)
    print('--------------------------------------------')
    # print("Classification Report:\n", classification_report(y_test, y_pred_bin))
    plt.show()
    return final_prediction

In [10]:
logistic_prediction = logisticEnsemble(model_list, model_desc, x_train, y_train, x_test, y_test, title='Reduced Feature Set Logistic Combined Ensemble')

Reduced Feature Set Logistic Combined Ensemble
--------------------------------------------
Accuracy: 0.8430173292558614
Precision:  0.6132075471698113
recall:  0.19578313253012047
f1_score:  0.2968036529680365
--------------------------------------------


### Decision Tree Ensemble

In [11]:
final_prediction = []
total_acc = 0
total_pre = 0
total_rec = 0
total_f1 = 0
total = len(seed_list)

#Near Miss Sampled
best_recall = DecisionTreeClassifier(max_depth=10, min_samples_leaf=4, min_samples_split=5,
                       random_state=0, splitter='random')
# No sampling
best_accuracy = DecisionTreeClassifier(max_depth=10, min_samples_leaf=4, min_samples_split=5,
                       random_state=0, splitter='random')

#SMOTEK sampling
best_f1 = DecisionTreeClassifier(max_depth=10, min_samples_leaf=4, min_samples_split=5,
                       random_state=0, splitter='random')

In [12]:

prediction_list = []
    
x_train_nearMiss, y_train_nearMiss = NearMiss().fit_resample(x_train, y_train)
x_train_smotek, y_train_smotek = SMOTETomek().fit_resample(x_train, y_train)

best_f1.fit(x_train_smotek, y_train_smotek)
    
best_recall.fit(x_train_nearMiss, y_train_nearMiss)
    
best_accuracy.fit(x_train, y_train)
    
    
y_pred_log1 = best_f1.predict(x_test)
y_pred_log2 = best_recall.predict(x_test)
y_pred_log3 = best_accuracy.predict(x_test) 

prediction_list.append(y_pred_log1)
prediction_list.append(y_pred_log2)
prediction_list.append(y_pred_log3)
            
prediction_list = [sum(x) for x in zip(*prediction_list)]
# print(set(prediction_list))

final_prediction = []
for pred in prediction_list:
    if pred >1:
        final_prediction.append(1)
    else:
        final_prediction.append(0)
            
    # print(set(prediction_list))
total_acc += accuracy_score(y_test, final_prediction)
total_pre += precision_score(y_test, final_prediction)
total_rec += recall_score(y_test, final_prediction)
total_f1 += f1_score(y_test, final_prediction)

model_desc.append([
    "linear Regression Ensemble",
    total_acc/total,
    total_pre/total,
    total_rec/total,
    total_f1/total
                    ])

    # Evaluate
print(f"Resampled Ensemble Decision Tree")
print('--------------------------------------------')
print("Accuracy:", total_acc/total)
print("Precision: ", total_pre/total)
print("recall: ", total_rec/total)
print("f1_score: ", total_f1/total)
print('--------------------------------------------')

decisionTree_prediction = final_prediction

    

Resampled Ensemble Decision Tree
--------------------------------------------
Accuracy: 0.7777777777777778
Precision:  0.37962962962962965
recall:  0.4939759036144578
f1_score:  0.42931937172774864
--------------------------------------------


In [13]:
ensemble_prediction = [linear_prediction, logistic_prediction, decisionTree_prediction]
ensemble_prediction = [sum(x) for x in zip(*ensemble_prediction)]
final_prediction = []
for ens in ensemble_prediction:
    if ens >1:
        final_prediction.append(1)
    else:
        final_prediction.append(0)

model_desc.append([
    "Ensemble Logistic+Linear+Decision Tree",
    accuracy_score(y_test, final_prediction),
    precision_score(y_test, final_prediction),
    recall_score(y_test, final_prediction),
    f1_score(y_test, final_prediction)
                      ])        
        

print(f"Ensemble of best Logistic, Linear and Decision Tree Classifiers")
print('--------------------------------------------')
print("Accuracy:", accuracy_score(y_test, final_prediction))
print("Precision: ", precision_score(y_test, final_prediction))
print("recall: ", recall_score(y_test, final_prediction))
print("f1_score: ", f1_score(y_test, final_prediction))
print('--------------------------------------------')
            

Ensemble of best Logistic, Linear and Decision Tree Classifiers
--------------------------------------------
Accuracy: 0.8256880733944955
Precision:  0.4807692307692308
recall:  0.37650602409638556
f1_score:  0.42229729729729726
--------------------------------------------


In [14]:
model_desc
model_comparison_DF = pd.DataFrame(model_desc, columns=['model', 'accuracy', 'precision', 'recall', 'F1 score'])
model_comparison_DF

Unnamed: 0,model,accuracy,precision,recall,F1 score
0,linear Regression Ensemble,0.786442,0.389313,0.460843,0.422069
1,logistic Regression Ensemble,0.843017,0.613208,0.195783,0.296804
2,linear Regression Ensemble,0.777778,0.37963,0.493976,0.429319
3,Ensemble Logistic+Linear+Decision Tree,0.825688,0.480769,0.376506,0.422297


Looking at the dataframe below, It appears that the ensembled models\
have ea tendency to label and mislabel in a similar manner.
most guessed labels appears to have 2 or more votes, and although this sometimes
captures the dod, it also equally misses these labels in a ratio to 3:1

In [15]:
prediction_comparison = pd.DataFrame(
    {'guessed label': ensemble_prediction,
     'actual label': y_test,
    })
pd.set_option("display.max_rows", None)
prediction_comparison


Unnamed: 0,guessed label,actual label
8403,3,0
8207,1,0
1333,3,1
6758,0,0
4936,2,0
8511,0,0
1551,0,0
7828,0,0
8524,2,0
3859,0,0
