In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_curve
from sklearn.metrics import make_scorer, roc_auc_score
import matplotlib.pyplot as plt
from split import acquire_train_test_data
import warnings
warnings.filterwarnings("ignore")

In [2]:
train,test=acquire_train_test_data()

In [3]:
y = train['hospital_death']
X = train.drop(columns='hospital_death')

# Train test evaluation

## Train a model

In [4]:
dt = DecisionTreeClassifier(max_depth=10, random_state=123)
dt.fit(X, y)
dt_probs = dt.predict_proba(X)
# keep probabilities for the positive outcome only
dt_probs = dt_probs[:, 1]
roc_auc_score(y, dt_probs)

0.894899045057501

## Evaluate on the test dataset

In [5]:
y_test = test['hospital_death']
X_test = test.drop(columns='hospital_death')

In [6]:
lr_probs = dt.predict_proba(X_test)
# keep probabilities for the positive outcome only
lr_probs = lr_probs[:, 1]
roc_auc_score(y_test, lr_probs)

0.8280829502606001

# Cross validation

In [7]:
dtree = DecisionTreeClassifier()

In [8]:
roc_auc_scorer = make_scorer(roc_auc_score)

In [9]:
from sklearn.model_selection import GridSearchCV

In [10]:
# keys are names of hyperparams, values are a list of values to try for that hyper parameter
params = {
    'max_depth': range(9, 11),
    'criterion': ['gini', 'entropy']
}

# cv=4 means 4-fold cross-validation, i.e. k = 4
grid = GridSearchCV(dtree, params, cv=4,scoring=roc_auc_scorer)
grid.fit(X, y)

grid.best_params_

{'criterion': 'gini', 'max_depth': 10}

In [11]:
grid.best_score_

0.6226256401989342

**Conclusion**: So the Score of the crossvalidation is much lower, the aussuming reason is the split of k fold is not stratified.

**Correction**: Acually the split in the crossvalidation is stratified, and the training and test in each fold dependents on the K for example if cv=5 so 1 for test 4 for train.

# StratifiedKFold

In [12]:
from sklearn.model_selection import StratifiedKFold

In [13]:
y = train['hospital_death']
X = train.drop(columns='hospital_death')

In [14]:
skf = StratifiedKFold(n_splits=5)
skf.get_n_splits(X, y)

5

In [15]:
print(skf)

StratifiedKFold(n_splits=5, random_state=None, shuffle=False)


In [16]:
dtree = DecisionTreeClassifier(max_depth = 5)

In [19]:
auc = []
for train_index, validate_index in skf.split(X, y):
    X_train, X_validate = X.iloc[train_index], X.iloc[validate_index]
    y_train, y_validate = y.iloc[train_index], y.iloc[validate_index]
    
    dt.fit(X_train, y_train)
    dt_probs_tra = dt.predict_proba(X_train)
    dt_probs_val = dt.predict_proba(X_validate)
    dt_probs_tra = dt_probs_tra[:, 1]
    dt_probs_val = dt_probs_val[:, 1]
    score_tra= roc_auc_score(y_train, dt_probs_tra)
    score_val= roc_auc_score(y_validate, dt_probs_val)
    score = (score_tra, score_val)
    auc.append(score)

auc

[(0.8985978512785219, 0.812300211161615),
 (0.8965592864272935, 0.8066027267821118),
 (0.8974413004723995, 0.8147306658541551),
 (0.8980365939930962, 0.805613617464928),
 (0.8955054086711741, 0.798932285379204)]

**Conclusion**: I guess when use stratified K fold. the training dataset becom much smaller which wouldn't train a good model.

# Train validate test.

In [20]:
from sklearn.model_selection import train_test_split

In [21]:
train,validate=train_test_split(train, train_size=.8, random_state=42, stratify = train.hospital_death)

In [22]:
train.shape, validate.shape

((58696, 134), (14674, 134))

## Train a model

In [23]:
y_train = train['hospital_death']
X_train = train.drop(columns='hospital_death')

In [24]:
dt = DecisionTreeClassifier(max_depth=10, random_state=123)
dt.fit(X_train, y_train)
dt_probs = dt.predict_proba(X_train)
# keep probabilities for the positive outcome only
dt_probs = dt_probs[:, 1]
roc_auc_score(y_train, dt_probs)

0.8901716878505241

## Validate the model

In [25]:
y_validate = validate['hospital_death']
X_validate = validate.drop(columns='hospital_death')

In [26]:
dt_probs = dt.predict_proba(X_validate)
# keep probabilities for the positive outcome only
dt_probs = dt_probs[:, 1]
roc_auc_score(y_validate, dt_probs)

0.8217469434201645

## test the model

In [27]:
y_test = test['hospital_death']
X_test = test.drop(columns='hospital_death')

In [28]:
dt_probs = dt.predict_proba(X_test)
# keep probabilities for the positive outcome only
dt_probs = dt_probs[:, 1]
roc_auc_score(y_test, dt_probs)

0.805888810406512

# Conclusion

Compared above metrics, the best way to evaluate a model performace is the train validate test

**Correction** : We are still going to use crossvalidation to evaluate our model and find the best hyperparameters