In [9]:
import sqlite3
import pandas as pd
import yaml
import seaborn as sb
from sklearn.preprocessing import StandardScaler

#### Connect to the SQLite database

In [2]:
# If this cell fails, run the preprocessing notebook first.
with open("config.yml", "r") as config_file:
    config = yaml.safe_load(config_file)

con = sqlite3.connect(
    f"{config['SQLITE_DATABASE_DIR']}/ICU.db"
)

In [4]:
# Read every row of the ICU table into a DataFrame, then display it.
data = pd.read_sql_query(sql='select * from ICU', con=con)
data

Unnamed: 0,ID,Survive,Age,AgeGroup,Sex,Infection,SysBP,Pulse,Emergency
0,4,0,87,3,1,1,80,96,1
1,8,1,27,1,1,1,142,88,1
2,12,1,59,2,0,0,112,80,1
3,14,1,77,3,0,0,100,70,0
4,27,0,76,3,1,1,128,90,1
...,...,...,...,...,...,...,...,...,...
195,921,0,50,2,1,0,256,64,1
196,923,1,20,1,0,0,104,83,1
197,924,1,73,3,1,0,162,100,1
198,925,1,59,2,0,0,100,88,1


### Preprocessing

In [15]:
target_colname = "Survive"

# Pick the features by name instead of by position: every column except the
# row identifier ("ID") and the target, kept in the table's column order.
# A positional slice would silently pull the identifier or the target into
# the feature set if the table's column order ever changed.
feature_colname = [
    col for col in data.columns if col not in ("ID", target_colname)
]

# Columns that get standardized vs. columns that are treated as categories.
numerical_cols = ["Age", "SysBP", "Pulse"]
categorical_cols = ["Sex", "Infection", "Emergency", "AgeGroup"]

In [26]:
# Count missing values per column; the output shows none, so there are no NAs to handle.
data.isnull().sum()

ID           0
Survive      0
Age          0
AgeGroup     0
Sex          0
Infection    0
SysBP        0
Pulse        0
Emergency    0
dtype: int64

In [18]:
def normalize(df, cols):
    """Standardize the selected columns with scikit-learn's ``StandardScaler``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to scale; it is not modified.
    cols : list of str
        Names of the columns to scale.

    Returns
    -------
    pandas.DataFrame
        A new frame containing the scaled values, with ``cols`` as its
        column names and the same row index as ``df``.
    """
    ss = StandardScaler()
    scaled_values = ss.fit_transform(df[cols])
    # Reuse df's index. A fresh 0..n-1 index would misalign rows (and create
    # NaNs) when this frame is concatenated column-wise with other slices of
    # df whose index is not the default one, e.g. after filtering rows.
    output = pd.DataFrame(scaled_values, columns=cols, index=df.index)
    return output
def set_categoric(df, cols):
    """Return the selected columns cast to ``object`` dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the categorical columns; it is not modified.
    cols : list of str
        Names of the columns to convert.

    Returns
    -------
    pandas.DataFrame
        A new frame containing only ``cols``, each with ``object`` dtype.
    """
    # Convert a selection instead of assigning back into df, so calling this
    # function no longer changes the caller's frame as a side effect.
    return df[cols].astype(object)

In [77]:
# Scale the numeric columns, cast the categorical ones, and join both
# column-wise with the target column.
normalized_df = normalize(data, cols=numerical_cols)
categoric_df = set_categoric(data, cols=categorical_cols)
preprocessed_data = pd.concat(
    [normalized_df, categoric_df, data[[target_colname]]],
    axis="columns",
)

In [78]:
# Display the assembled frame: scaled numeric columns, categorical columns, then the target.
preprocessed_data

Unnamed: 0,Age,SysBP,Pulse,Sex,Infection,Emergency,AgeGroup,Survive
0,1.472422,-1.590527,-0.109295,1,1,1,3,0
1,-1.526910,0.295714,-0.408221,1,1,1,1,1
2,0.072734,-0.616983,-0.707147,0,0,1,2,1
3,0.972534,-0.982062,-1.080805,0,0,0,3,1
4,0.922545,-0.130211,-0.333489,1,1,1,3,0
...,...,...,...,...,...,...,...,...
195,-0.377166,3.763963,-1.305000,1,0,1,2,0
196,-1.876833,-0.860369,-0.595050,0,0,1,1,1
197,0.772578,0.904179,0.040168,1,0,1,3,1
198,0.072734,-0.982062,-0.408221,0,0,1,2,1


# Randomized Search CV

In [81]:
# Accumulator for the model comparison: one entry per fitted model in each list.
results = {key: [] for key in ("model", "recall", "best_param")}

In [68]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from scipy import stats

In [47]:
# Split the preprocessed frame into the feature matrix and the integer target.
X = preprocessed_data.loc[:, feature_colname]
y = preprocessed_data.loc[:, target_colname].astype(int)

### Random Forest

In [53]:
# Candidate hyperparameter values the randomized search samples from.
# max_features stops one short of the total number of feature columns.
param_grid = dict(
    n_estimators=range(30, 101),
    max_depth=range(2, 10),
    max_features=range(2, X.shape[1]),
    class_weight=[None, "balanced", "balanced_subsample"],
)

In [54]:
# Base estimator handed to the search; the seed is fixed for repeatable runs.
model = RandomForestClassifier(random_state=0, bootstrap=True)

In [73]:
# Randomized hyperparameter search for the random forest: 10 sampled
# candidates, cv=5, ranked by recall, seeded for repeatable sampling.
# Alternative kept for reference:
#   scoring = ["accuracy", "recall", "precision", "roc_auc"]
gs_cv = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_grid,
    n_iter=10,
    scoring="recall",
    cv=5,
    n_jobs=-1,
    random_state=0,
)
gs_cv_output = gs_cv.fit(X, y)

In [82]:
# Record the random-forest outcome: model label, best recall score from the
# search, and the parameter combination that achieved it.
for column, value in (
    ("model", "Random Forest"),
    ("recall", gs_cv_output.best_score_),
    ("best_param", gs_cv_output.best_params_),
):
    results[column].append(value)

### Logistic Regression

### Decision Trees