In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from split import acquire_train_test_data
import warnings
warnings.filterwarnings("ignore")

In [43]:
train,test=acquire_train_test_data()
train,validate=train_test_split(train, train_size=.8, random_state=123, stratify = train.hospital_death)

# Build a function without cross validation

In [6]:
y_train = train['hospital_death']
X_train = train.drop(columns='hospital_death')
y_validate = validate['hospital_death']
X_validate = validate.drop(columns='hospital_death')

In [14]:
dt = DecisionTreeClassifier(max_depth=10,criterion='gini')
dt.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=10,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [15]:
dt_probs_tra = dt.predict_proba(X_train)
dt_probs_val = dt.predict_proba(X_validate)
dt_probs_tra = dt_probs_tra[:, 1]
dt_probs_val = dt_probs_val[:, 1]
score_tra= roc_auc_score(y_train, dt_probs_tra)
score_val= roc_auc_score(y_validate, dt_probs_val)
score = (score_tra, score_val)

In [16]:
score

(0.8997676263476869, 0.8212178565436399)

# Grid search cross valiation

## Use the grid method to get the auc score.

In [3]:
from sklearn.model_selection import GridSearchCV

In [4]:
dt= DecisionTreeClassifier()

In [29]:
# keys are names of hyperparams, values are a list of values to try for that hyper parameter
params = {
    'max_depth': range(10, 15),
    'criterion': ['gini', 'entropy']
}

# cv=4 means 4-fold cross-validation, i.e. k = 4
grid = GridSearchCV(dt, params, cv=3,scoring='roc_auc')
grid.fit(X_train, y_train)


GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=DecisionTreeClassifier(class_weight=None,
                                              criterion='gini', max_depth=10,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort=False, random_state=None,
                                              splitter='best'),
             iid='warn', n_jobs=None,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': range(10, 15)},
             pre_dispatch

In [30]:
grid.best_score_

0.7824709832312784

In [31]:
grid.best_params_

{'criterion': 'gini', 'max_depth': 10}

In [32]:
model = grid.best_estimator_
model.score(X_validate, y_validate)

0.9167916041979011

## Use prediction to calculate the auc score

In [37]:
pred_prob = model.predict_proba(X_train)
pred_prob = pred_prob[:, 1]
y_predicted = model.predict(X_train)

In [34]:
roc_auc_score(y_train, pred_prob)

0.8996327499935772

In [39]:
confusion_matrix(y_train,y_predicted)

array([[53264,   366],
       [ 2814,  2252]])

In [36]:
pred_prob = model.predict_proba(X_validate)
pred_prob = pred_prob[:, 1]
roc_auc_score(y_validate, pred_prob)

0.8178734925648596

In [42]:
confusion_matrix(y_validate,y_predicted)

array([[13122,   286],
       [  935,   331]])

In [19]:
from sklearn.metrics import confusion_matrix

In [10]:
y_test = test['hospital_death']
X_test = test.drop(columns='hospital_death')

In [11]:
model.score(X_test, y_test)

0.9177342855585237

> Here is the problem, the predicted score from gridsearch is not the same as the calculated socre.

# Tuning the model with different models and parameters

In [45]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [44]:
y_train = train['hospital_death']
X_train = train.drop(columns='hospital_death')
y_validate = validate['hospital_death']
X_validate = validate.drop(columns='hospital_death')

In [57]:
model_params={
    'logistic_regression':{
        'model': LogisticRegression(),
        'params': {
            'C': range(1, 5)
        }
    },
    'DecisionTree':{
        'model': DecisionTreeClassifier(),
        'params': {
            'max_depth': range(8, 12),
            'criterion': ['gini', 'entropy']
        }
    },
    'RandomFores':{
        'model': RandomForestClassifier(),
        'params': {
            'n_estimators':[1,5,10]
        }
    }
}

In [58]:
pd.DataFrame(model_params.items())

Unnamed: 0,0,1
0,logistic_regression,"{'model': LogisticRegression(C=1.0, class_weig..."
1,DecisionTree,{'model': DecisionTreeClassifier(class_weight=...
2,RandomFores,{'model': RandomForestClassifier(bootstrap=Tru...


In [59]:
scores = []
for model_name,mp in model_params.items():
    clf=GridSearchCV(mp['model'],mp['params'],cv=5, scoring='roc_auc')
    clf.fit(X_train, y_train)
    scores.append({
        'model':model_name,
        'best_score': clf.best_score_,
        'best_params':clf.best_params_
    })

In [60]:
pd.DataFrame(scores)

Unnamed: 0,model,best_score,best_params
0,logistic_regression,0.874105,{'C': 1}
1,DecisionTree,0.833928,"{'criterion': 'entropy', 'max_depth': 8}"
2,RandomFores,0.828993,{'n_estimators': 10}


# Build a model based on best model and params

In [62]:
dt = LogisticRegression(C= 1)
dt.fit(X_train, y_train)
dt_probs_tra = dt.predict_proba(X_train)
dt_probs_val = dt.predict_proba(X_validate)
dt_probs_tra = dt_probs_tra[:, 1]
dt_probs_val = dt_probs_val[:, 1]
score_tra= roc_auc_score(y_train, dt_probs_tra)
score_val= roc_auc_score(y_validate, dt_probs_val)
score = (score_tra, score_val)

In [63]:
score

(0.8781289146238145, 0.8723256988353374)