In [None]:
import warnings
# Install an 'ignore' filter so FutureWarning messages are not shown.
warnings.filterwarnings('ignore', category=FutureWarning)

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

from sklearn import (
    ensemble,
    preprocessing,
    tree,
)
from sklearn.metrics import (
    auc,
    confusion_matrix,
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import (
    train_test_split,
    StratifiedKFold,
)
from yellowbrick.classifier import (
    ConfusionMatrix,
    ROCAUC,
    PRCurve,
)
from yellowbrick.model_selection import (
    LearningCurve,
)

In [None]:
# Set matplotlib's default font size to 16 for every figure drawn below.
plt.rcParams['font.size'] = 16

## Data

[Vanderbilt Datasets](http://biostat.mc.vanderbilt.edu/wiki/Main/DataSets)

### Gather data

In [None]:
# Read the pre-split feature and target tables from the local data/ directory
# (features first, then targets; train before test in each pair).
X_train, X_test = (
    pd.read_csv('data/X_train.csv'),
    pd.read_csv('data/X_test.csv'),
)
y_train, y_test = (
    pd.read_csv('data/y_train.csv'),
    pd.read_csv('data/y_test.csv'),
)

In [None]:
# Preview the first three rows of the training features.
X_train.head(3)

In [None]:
# Preview the first three rows of the test features.
X_test.head(3)

In [None]:
# Preview the first three rows of the training target.
y_train.head(3)

In [None]:
# Preview the first three rows of the test target.
y_test.head(3)

### Baseline model

In [None]:
from sklearn.dummy import DummyClassifier

In [None]:
# Baseline to beat: per scikit-learn's documentation, strategy='prior' ignores
# the features and always predicts the most frequent class seen during fit.
dummy = DummyClassifier(strategy='prior')

In [None]:
# Fit the baseline on the training split, then display its score on the
# held-out test split (fit() returns the estimator, so the calls chain).
dummy.fit(X_train, y_train).score(X_test, y_test)

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Accuracy of the baseline, computed explicitly from its test-set predictions.
accuracy_score(y_true=y_test, y_pred=dummy.predict(X_test))

### Evaluate model families

In [None]:
# Recombine the train and test splits so the cross-validation below can draw
# its own folds from the full data set. ignore_index=True gives the combined
# frames a fresh 0..n-1 index; without it the row labels of the two splits
# (each numbered from 0 by read_csv) would be duplicated.
X = pd.concat([X_train, X_test], ignore_index=True)
y = pd.concat([y_train, y_test], ignore_index=True)

#### `sklearn` does not like column vectors for the target

In [None]:
# Flatten each target to a 1-D array. Wrapping in pd.DataFrame first makes the
# cell safe to re-run: it accepts both the original single-column frame and an
# already-flattened array, whereas calling `.values` on an ndarray would raise.
y = pd.DataFrame(y).values.ravel()
y_train = pd.DataFrame(y_train).values.ravel()
y_test = pd.DataFrame(y_test).values.ravel()

## Classification

In [None]:
# ! python3 -m pip install --quiet xgboost

In [None]:
from sklearn import model_selection
from sklearn.linear_model import (
    LogisticRegression,
)
from sklearn.tree import (
    DecisionTreeClassifier,
)
from sklearn.neighbors import (
    KNeighborsClassifier,
)
from sklearn.naive_bayes import (
    GaussianNB,
)
from sklearn.svm import (
    SVC,
)
from sklearn.ensemble import (
    RandomForestClassifier,
)
import xgboost

In [None]:
# Candidate model families to compare. These are estimator *classes*, not
# instances: the cells below call each one with no arguments, so every
# family is evaluated with its default hyperparameters.
models = [    
    DummyClassifier,
    LogisticRegression,
    DecisionTreeClassifier,
    KNeighborsClassifier,
    GaussianNB,
    SVC,
    RandomForestClassifier,
    xgboost.XGBRFClassifier,
]

In [None]:
# One shuffled, seeded 10-fold splitter shared by every candidate, so all
# models are scored on the same folds. It is built once, outside the loop,
# and is reused by the stacking evaluation further down.
kfold = model_selection.KFold(
    n_splits=10,
    shuffle=True,
    random_state=123,
)

# Cross-validate each model family with default hyperparameters on the full
# data set and print the mean and standard deviation of its ROC AUC.
for model in models:
    estimator = model()
    scores = model_selection.cross_val_score(
        estimator, X, y, scoring='roc_auc', cv=kfold,
    )
    print(
        f'{model.__name__:22} AUC:'
        f'{scores.mean():.3f} STD: {scores.std():.2f}'
    )

### Stacking

Source: https://miro.medium.com/max/2044/1*5O5_Men2op_sZsK6TTjD9g.png

<img src="https://miro.medium.com/max/2044/1*5O5_Men2op_sZsK6TTjD9g.png" 
     alt="Stacking" style="width: 600px;"/>


In [None]:
from sklearn.ensemble import StackingClassifier

In [None]:
# Base learners for the stack: one default-configured instance of every
# candidate family, each labelled with its class name.
stack_estimators = [
    (m.__name__, m())
    for m in models
]
stack = StackingClassifier(estimators=stack_estimators)

In [None]:
# Score the stacked ensemble with the same ROC AUC metric and the same
# `kfold` splitter that were used for the individual model families.
s = model_selection.cross_val_score(
    estimator=stack,
    X=X,
    y=y,
    scoring='roc_auc',
    cv=kfold,
)
print(
    f'{stack.__class__.__name__:22} AUC:'
    f'{s.mean():.3f} STD: {s.std():.2f}'
)

### Create a model

In [None]:
# Train a default-configured XGBRFClassifier on the training split. fit()
# returns the estimator itself; the bare name displays the fitted model.
clf = xgboost.XGBRFClassifier().fit(X_train, y_train)
clf

In [None]:
# Score of the default-configured model on the held-out test split.
clf.score(X_test, y_test)

In [None]:
# ROC AUC ranks samples by a continuous score, so pass the predicted
# probability of the positive class (column 1; assumes a binary target)
# rather than the hard labels from predict(), which collapse the curve to a
# single operating point and do not match the 'roc_auc' scoring used in the
# cross-validation above.
roc_auc_score(
    y_test, clf.predict_proba(X_test)[:, 1]
)

In [None]:
# Pair every feature name with its importance and list them from most to
# least important.
sorted(
    zip(X_train.columns, clf.feature_importances_),
    key=lambda pair: pair[1],
    reverse=True,
)

### Optimize model

In [None]:
# Fresh, unfitted estimator used as the template for the grid search below.
clf_ = xgboost.XGBRFClassifier()

In [None]:
# Hyperparameter grid for the search: 3 * 6 * 3 * 4 = 216 combinations,
# each of which is cross-validated by GridSearchCV.
params = {
    'min_child_weight': [1, 5, 10],
    'gamma': [0, 0.5, 1, 1.5, 2, 5],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'max_depth': [4, 5, 6, 7],
}

In [None]:
# Exhaustive search over `params` on the training split, parallelised with
# n_jobs=-1. Note that `clf` is rebound here to the fitted GridSearchCV object.
clf = model_selection.GridSearchCV(
    estimator=clf_,
    param_grid=params,
    n_jobs=-1,
)
clf.fit(X_train, y_train);

In [None]:
# Hyperparameter combination selected by the grid search.
clf.best_params_

In [None]:
# Test-split score of the grid search object; per scikit-learn's docs this
# delegates to the best estimator refit on the whole training split.
clf.score(X_test, y_test)

In [None]:
# New, still-unfitted classifier configured with the winning hyperparameters.
clf_best = xgboost.XGBRFClassifier(**clf.best_params_)

### Confusion matrix

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
# Train the tuned classifier on the training split.
clf_best.fit(X_train, y_train)

In [None]:
# Raw confusion-matrix counts for the tuned model on the test split.
confusion_matrix(y_true=y_test, y_pred=clf_best.predict(X_test))

In [None]:
import warnings
# Ignore FutureWarning before the plotting cells. This repeats the filter
# already installed in the notebook's first cell.
warnings.simplefilter('ignore', FutureWarning)

In [None]:
# Plot the confusion matrix for the tuned estimator. `clf_best` is wrapped
# directly: wrapping `clf` (the GridSearchCV object) would make cm_viz.fit()
# repeat the entire hyperparameter search, and `clf_best` is the model the
# numeric confusion matrix above was computed from.
cm_viz = ConfusionMatrix(clf_best, classes=['died', 'survived'])
cm_viz.fit(X_train, y_train)
cm_viz.score(X_test, y_test)
cm_viz.show();

### ROC curve

In [None]:
# ROC AUC of the tuned model. The metric ranks samples by a continuous score,
# so pass the predicted probability of the positive class (column 1; assumes
# a binary target) instead of the hard labels returned by predict().
roc_auc_score(y_test, clf_best.predict_proba(X_test)[:, 1])

In [None]:
# ROC curve for the tuned estimator. `clf_best` is used rather than `clf`
# (the GridSearchCV object) so that roc_viz.fit() trains one model instead of
# re-running the whole hyperparameter search.
roc_viz = ROCAUC(clf_best)
roc_viz.fit(X_train, y_train)
roc_viz.score(X_test, y_test)
roc_viz.show();

### Precision-recall curve

In [None]:
# Precision-recall curve for the tuned estimator. `clf_best` is used rather
# than `clf` (the GridSearchCV object) so that prc_viz.fit() trains one model
# instead of re-running the whole hyperparameter search.
prc_viz = PRCurve(clf_best)
prc_viz.fit(X_train, y_train)
prc_viz.score(X_test, y_test)
prc_viz.show();

### Learning curve

In [None]:
# Learning curve for the tuned estimator. LearningCurve does its own
# cross-validated training inside fit() and draws from those results, so no
# score() step is needed: the previous score(X_test, y_test) call had its
# result discarded and did not affect the plot.
lc_viz = LearningCurve(clf_best)
lc_viz.fit(X_train, y_train)
lc_viz.show();

### Model persistence (and deployment)

In [None]:
import joblib

In [None]:
# Serialize the tuned model to clf_best.pickle in the working directory.
joblib.dump(clf_best, 'clf_best.pickle')

In [None]:
# Reload the persisted model, rebinding `clf` to it.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load model files that come from a trusted source.
clf = joblib.load('clf_best.pickle')

In [None]:
# Score the reloaded model on the test split to confirm it survived the
# save/load round trip.
clf.score(X_test, y_test)