In [28]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_curve
from sklearn.metrics import make_scorer, roc_auc_score
import matplotlib.pyplot as plt
from split import acquire_train_test_data
import warnings
warnings.filterwarnings("ignore")

In [29]:
train,test=acquire_train_test_data()

In [30]:
y = train['hospital_death']
X = train.drop(columns='hospital_death')

# Train test evaluation

## Train a model

In [31]:
dt = DecisionTreeClassifier(max_depth=10, random_state=123)
dt.fit(X, y)
dt_probs = dt.predict_proba(X)
# keep probabilities for the positive outcome only
dt_probs = dt_probs[:, 1]
roc_auc_score(y, dt_probs)

0.894899045057501

## Evaluate on the test dataset

In [32]:
y_test = test['hospital_death']
X_test = test.drop(columns='hospital_death')

In [33]:
lr_probs = dt.predict_proba(X_test)
# keep probabilities for the positive outcome only
lr_probs = lr_probs[:, 1]
roc_auc_score(y_test, lr_probs)

0.8280829502606001

# Cross validation

In [34]:
dtree = DecisionTreeClassifier()

In [35]:
roc_auc_scorer = make_scorer(roc_auc_score)

In [36]:
from sklearn.model_selection import GridSearchCV

In [37]:
# keys are names of hyperparams, values are a list of values to try for that hyper parameter
params = {
    'max_depth': range(9, 11),
    'criterion': ['gini', 'entropy']
}

# cv=4 means 4-fold cross-validation, i.e. k = 4
grid = GridSearchCV(dtree, params, cv=4,scoring=roc_auc_scorer)
grid.fit(X, y)

grid.best_params_

{'criterion': 'gini', 'max_depth': 9}

In [38]:
grid.best_score_

0.6229205046290265

**Conclusion**: So the Score of the crossvalidation is much lower, the aussuming reason is the split of k fold is not stratified.

# StratifiedKFold

In [39]:
from sklearn.model_selection import StratifiedKFold

In [40]:
y = train['hospital_death']
X = train.drop(columns='hospital_death')

In [41]:
skf = StratifiedKFold(n_splits=2)
skf.get_n_splits(X, y)

2

In [42]:
print(skf)

StratifiedKFold(n_splits=2, random_state=None, shuffle=False)


In [43]:
dtree = DecisionTreeClassifier(max_depth = 5)

In [44]:
auc = []
for train_index, validate_index in skf.split(X, y):
    X_train, X_validate = X.iloc[train_index], X.iloc[validate_index]
    y_train, y_validate = y.iloc[train_index], y.iloc[validate_index]
    
    dt.fit(X_train, y_train)
    dt_probs_tra = dt.predict_proba(X_validate)
    dt_probs_val = dt.predict_proba(X_validate)
    dt_probs_tra = dt_probs_tra[:, 1]
    dt_probs_val = dt_probs_val[:, 1]
    score_tra= roc_auc_score(y_train, dt_probs_tra)
    score_val= roc_auc_score(y_validate, dt_probs_val)
    score = (score_tra, score_val)
    roc_auc.append(score)

roc_auc

[0.784137774265063,
 0.7945458876182216,
 (0.5027758885848528, 0.784137774265063),
 (0.4976102926660597, 0.7945458876182216)]

**Conclusion**: I guess when use stratified K fold. the training dataset becom much smaller which wouldn't train a good model.

# Train validate test.

In [19]:
from sklearn.model_selection import train_test_split

In [20]:
train,validate=train_test_split(train, train_size=.8, random_state=42, stratify = train.hospital_death)

In [21]:
train.shape, validate.shape

((58696, 134), (14674, 134))

## Train a model

In [22]:
y_train = train['hospital_death']
X_train = train.drop(columns='hospital_death')

In [23]:
dt = DecisionTreeClassifier(max_depth=10, random_state=123)
dt.fit(X_train, y_train)
dt_probs = dt.predict_proba(X_train)
# keep probabilities for the positive outcome only
dt_probs = dt_probs[:, 1]
roc_auc_score(y_train, dt_probs)

0.8901716878505241

## Validate the model

In [24]:
y_validate = validate['hospital_death']
X_validate = validate.drop(columns='hospital_death')

In [25]:
dt_probs = dt.predict_proba(X_validate)
# keep probabilities for the positive outcome only
dt_probs = dt_probs[:, 1]
roc_auc_score(y_validate, dt_probs)

0.8217469434201645

## test the model

In [26]:
y_test = test['hospital_death']
X_test = test.drop(columns='hospital_death')

In [27]:
dt_probs = dt.predict_proba(X_test)
# keep probabilities for the positive outcome only
dt_probs = dt_probs[:, 1]
roc_auc_score(y_test, dt_probs)

0.805888810406512

# Conclusion

Compared above metrics, the best way to evaluate a model performace is the train validate test