by wedad
Hey team this file contains the optimisation for Logistic Regressiona and Linear regressions
I'll be comparing them to the baseline model found in baseline_model.ipynb

# LinearRegression Optimisation

Read in all the relevant Libraries


In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn import linear_model
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
import warnings
warnings.filterwarnings('ignore')

In [2]:
seed_list = [1,2,3,4,5,6,7,8,9,10]

Read In the data

In [3]:
aki_df = pd.read_csv('../data/df_final_AKI.csv')
aki_df

Unnamed: 0.1,Unnamed: 0,subject_id,dod,gender,age,Albumin,Creatinine,Hemoglobin,INR(PT),PT,Sodium,Urea Nitrogen,Arterial Blood Pressure diastolic,Arterial Blood Pressure systolic,Heart Rate,Respiratory Rate,hypertension,chronic_kidney_disease,sepsis,Intercept
0,0,12642263,0,1,73,3.6,1.2,9.4,3.2,13.8,138.0,16.0,48.0,101.0,84.0,20.0,1,0,0,1
1,1,12155939,0,0,24,4.1,0.5,10.2,1.2,15.7,143.0,6.0,56.0,114.0,59.0,16.0,0,0,0,1
2,2,17735780,0,0,54,4.1,0.4,10.2,1.0,14.4,142.0,22.0,48.0,130.0,59.0,24.0,1,0,0,1
3,3,13677167,0,1,84,3.5,1.1,10.2,1.6,18.7,144.0,17.0,56.0,114.0,89.0,21.0,1,0,0,1
4,4,11259141,0,0,59,3.6,0.6,10.2,1.7,12.4,153.0,8.0,70.0,91.0,99.0,21.0,1,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9804,9804,12118394,0,1,61,3.3,3.6,10.2,1.5,13.1,138.0,23.0,62.0,103.0,89.0,20.0,0,1,0,1
9805,9805,18392647,0,1,74,3.5,0.6,10.2,1.2,13.6,138.0,6.0,56.0,114.0,107.0,22.0,1,0,0,1
9806,9806,12459376,0,1,58,4.1,0.6,10.2,0.9,9.3,141.0,8.0,56.0,114.0,78.0,10.0,0,0,0,1
9807,9807,14262251,1,1,84,4.0,3.4,12.2,1.1,17.9,141.0,88.0,56.0,114.0,101.0,25.0,1,1,0,1


In [4]:
model_desc = []
model_comparison_DF = pd.DataFrame(model_desc, columns=['model', 'accuracy', 'precision', 'recall', 'F1 score'])
model_comparison_DF

Unnamed: 0,model,accuracy,precision,recall,F1 score


In [5]:
x = aki_df.drop(columns=['dod', 'Unnamed: 0', 'subject_id'])
y = aki_df['dod']

def linearTester(model_desc, model, x, y, title='alg'):
    total_acc = 0
    total_pre = 0
    total_rec = 0
    total_f1 = 0
    total = len(seed_list)
    for rand in seed_list:
        x_train, x_test, y_train, y_test = train_test_split(x, y,stratify=y, test_size=0.25, random_state=rand)
        model.fit(x_train, y_train)
        y_pred_lin = model.predict(x_test)
        y_pred_bin = [1 if y > 0.5 else 0 for y in y_pred_lin]
        total_acc += accuracy_score(y_test, y_pred_bin)
        total_pre += precision_score(y_test, y_pred_bin)
        total_rec += recall_score(y_test, y_pred_bin)
        total_f1 += f1_score(y_test, y_pred_bin)

    model_desc.append([
        str(model),
        total_acc/total,
        total_pre/total,
        total_rec/total,
        total_f1/total
        
                      ])
    # Evaluate
    print(f"{title}")
    print('--------------------------------------------')
    print("Accuracy:", total_acc/total,)
    print("Precision: ", total_pre/total,)
    print("recall: ", total_rec/total,)
    print("f1_score: ", total_f1/total,)
    print('--------------------------------------------')
    # print("Classification Report:\n", classification_report(y_test, y_pred_bin))
    plt.show()
    

In [6]:
lin_reg = linear_model.LinearRegression()
linearTester(model_desc, lin_reg, x, y,'Linear Regression baseline')

Linear Regression baseline
--------------------------------------------
Accuracy: 0.84027721157766
Precision:  0.7283479660626826
recall:  0.09012048192771084
f1_score:  0.16009707044429883
--------------------------------------------


Baseline model, accuracy of approximately 0.8445

In [7]:
x = aki_df.drop(columns=['dod','Sodium','Hemoglobin','gender','hypertension','Intercept'])
y = aki_df['dod']
x_train, x_test, y_train, y_test = train_test_split(x, y,stratify=y, test_size=0.2, random_state=seed_list[0])

In [8]:
linearTester(model_desc,lin_reg, x, y,'Linear Regression modified feature')

Linear Regression modified feature
--------------------------------------------
Accuracy: 0.84097024052181
Precision:  0.752637728909679
recall:  0.08987951807228914
f1_score:  0.16033765861952176
--------------------------------------------


New Feature Baseline model, accuracy of approximately 0.8460

In [9]:
alpha = [0,0.01,0.1,0.2,0.5,1,5,10,100]
solver = ['svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga', 'lbfgs']
ridge_grid = {
    'alpha': alpha,
    'solver': solver,
}

In [10]:
ridge_grid = GridSearchCV(estimator = linear_model.Ridge(), param_grid = ridge_grid, cv = 3, verbose =2, n_jobs = 4)
ridge_grid.fit(x_train, y_train)
ridge_grid.best_params_

Fitting 3 folds for each of 63 candidates, totalling 189 fits


{'alpha': 10, 'solver': 'svd'}

In [11]:
ridge_model = linear_model.Ridge(alpha=10, solver='svd')
linearTester(model_desc, ridge_model, x, y,'Linear Ridge modified feature')

Linear Ridge modified feature
--------------------------------------------
Accuracy: 0.84097024052181
Precision:  0.7524346181802525
recall:  0.08987951807228914
f1_score:  0.1603119140111063
--------------------------------------------


In [12]:
alpha = [0.00001,0.001,0.01,0.1,0.2,0.5,1,5,10,50,100]
solver = ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga', 'lbfgs']
lasso_grid = {
    'alpha': alpha,
}
lasso_grid = GridSearchCV(estimator = linear_model.Lasso(), param_grid = lasso_grid, cv = 3, verbose =2, n_jobs = 4)
lasso_grid.fit(x_train, y_train)
lasso_grid.best_params_

Fitting 3 folds for each of 11 candidates, totalling 33 fits


{'alpha': 1e-05}

In [13]:
lasso_model = linear_model.Lasso(alpha=0.00001)
linearTester(model_desc, lasso_model, x, y,'Linear lasso modified feature')

Linear lasso modified feature
--------------------------------------------
Accuracy: 0.84097024052181
Precision:  0.752637728909679
recall:  0.08987951807228914
f1_score:  0.16033765861952176
--------------------------------------------


In [14]:
EN_model = linear_model.ElasticNet(alpha=0.1)
EN_model.fit(x_train, y_train)
linearTester(model_desc, EN_model, x, y,'Linear ElasticNet modified feature')

Linear ElasticNet modified feature
--------------------------------------------
Accuracy: 0.8333061557276803
Precision:  0.6841302118933698
recall:  0.028915662650602414
f1_score:  0.05536895936216544
--------------------------------------------


In [15]:
bayridge_model = linear_model.BayesianRidge()
bayridge_model.fit(x_train, y_train)
linearTester(model_desc, bayridge_model, x, y,'Linear Baysian Ridge modified feature')

Linear Baysian Ridge modified feature
--------------------------------------------
Accuracy: 0.8406033428454954
Precision:  0.7556715923723959
recall:  0.08578313253012047
f1_score:  0.1537272180504024
--------------------------------------------


Create an Ensemble of all linear models to create a majority ruling model

In [16]:
def linearEnsemble(model_list, model_desc, x, y, title='Combined Ensemble'):
    prediction_list = []
    total_acc = 0
    total_pre = 0
    total_rec = 0
    total_f1 = 0
    total = len(seed_list)
    
    for rand in seed_list:
        prediction_list = []
        for model in model_list:
            x_train, x_test, y_train, y_test = train_test_split(x, y,stratify=y, test_size=0.25, random_state=seed_list[0])
            model.fit(x_train, y_train)
            y_pred_lin = model.predict(x_test)
            y_pred_bin = [1 if y > 0.5 else 0 for y in y_pred_lin]
            prediction_list.append(y_pred_bin)

        prediction_list = [sum(x) for x in zip(*prediction_list)]
        final_prediction = []
        for pred in prediction_list:
            if pred >0:
                final_prediction.append(1)
            else:
                final_prediction.append(0)
                
        total_acc += accuracy_score(y_test, final_prediction)
        total_pre += precision_score(y_test, final_prediction)
        total_rec += recall_score(y_test, final_prediction)
        total_f1 += f1_score(y_test, final_prediction)


    model_desc.append([
        "Linear Regression Ensemble",
        total_acc/total,
        total_pre/total,
        total_rec/total,
        total_f1/total
                      ])
    
    # Evaluate
    print(f"{title}")
    print('--------------------------------------------')
    print("Accuracy:", accuracy_score(y_test, final_prediction))
    print("Precision: ", precision_score(y_test, final_prediction))
    print("recall: ", recall_score(y_test, final_prediction))
    print("f1_score: ", f1_score(y_test, final_prediction))
    print('--------------------------------------------')
    # print("Classification Report:\n", classification_report(y_test, y_pred_bin))
    plt.show()

In [17]:
model_list = [lin_reg, ridge_model, lasso_model, EN_model, bayridge_model]
linearEnsemble(model_list, model_desc, x, y, title='Ensemble')

Ensemble
--------------------------------------------
Accuracy: 0.8446799836934366
Precision:  0.7931034482758621
recall:  0.1108433734939759
f1_score:  0.19450317124735728
--------------------------------------------


In [18]:
model_comparison_DF = pd.DataFrame(model_desc, columns=['model', 'accuracy', 'precision', 'recall', 'F1 score'])
model_comparison_DF

Unnamed: 0,model,accuracy,precision,recall,F1 score
0,LinearRegression(),0.840277,0.728348,0.09012,0.160097
1,LinearRegression(),0.84097,0.752638,0.08988,0.160338
2,"Ridge(alpha=10, solver='svd')",0.84097,0.752435,0.08988,0.160312
3,Lasso(alpha=1e-05),0.84097,0.752638,0.08988,0.160338
4,ElasticNet(alpha=0.1),0.833306,0.68413,0.028916,0.055369
5,BayesianRidge(),0.840603,0.755672,0.085783,0.153727
6,Linear Regression Ensemble,0.84468,0.793103,0.110843,0.194503


It appears that the base linear Regression performs identical to Ridge and Lasso normalised iterations\
This is for iterations that utilised the reduced feature set. \
Creating an Ensemble of these models provides a small increase in accuracy and F1 scores