In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import (
    ensemble,
    preprocessing,
    tree,
)
from sklearn.metrics import (
    auc,
    confusion_matrix,
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import (
    train_test_split,
    StratifiedKFold,
)
from yellowbrick.classifier import (
    ConfusionMatrix,
    ROCAUC,
)
from yellowbrick.model_selection import (
    LearningCurve,
)

## Data

[Vanderbilt Datasets](http://biostat.mc.vanderbilt.edu/wiki/Main/DataSets)

### Gather data

In [None]:
# Location of the Titanic spreadsheet (titanic3.xls) on the Vanderbilt
# Biostatistics datasets page.
url = (
    'http://biostat.mc.vanderbilt.edu'
    '/wiki/pub/Main/DataSets'
    '/titanic3.xls'
)

In [None]:
# Download the spreadsheet once; keep `orig_df` as the pristine reference and
# do all further work on an independent copy bound to `df`.
orig_df = pd.read_excel(url)
df = orig_df.copy()

In [None]:
# Show the dtype pandas inferred for every column of the loaded sheet.
df.dtypes

### Clean data

In [None]:
import pandas_profiling

In [None]:
# Build a profiling report for the frame; as the last expression of the cell
# it is rendered as the cell output.
pandas_profiling.ProfileReport(df)

In [None]:
# (number of rows, number of columns)
df.shape

In [None]:
# Number of non-missing values in each column.
df.count()

In [None]:
# Missing values per column.
df.isna().sum()

In [None]:
# Missing values per row (summing across the columns of each row).
df.isna().sum(axis='columns')

In [None]:
# Boolean mask: True for every row that has at least one missing value.
df.isna().any(axis='columns')

In [None]:
# Frequency of each `sex` value, with missing entries counted as their own bucket.
df['sex'].value_counts(dropna=False)

In [None]:
# Frequency of each `embarked` value, with missing entries counted as their own bucket.
df['embarked'].value_counts(dropna=False)

### Create features

In [None]:
# Column names available before any feature selection.
df.columns

In [None]:
# Columns that are not used as model features.
# The result is derived from `orig_df` (the untouched copy taken right after
# loading) instead of from `df` itself, so this cell is idempotent: running it
# a second time no longer fails with a KeyError for already-dropped columns.
df = orig_df.drop(
    columns=[
        'name',
        'ticket',
        'home.dest',
        'boat',
        'body',
        'cabin',
    ]
)

In [None]:
# One-hot encode the remaining non-numeric columns (e.g. `sex` becomes
# `sex_male`); drop_first=True omits the first level of each encoded column.
df = pd.get_dummies(df, drop_first=True)

In [None]:
# Column names after dropping unused columns and one-hot encoding.
df.columns

In [None]:
# Separate the target (`survived`) from the predictor columns.
y = df['survived']
X = df.drop(columns=['survived'])

### Sample data

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
splits = train_test_split(X, y, test_size=0.3, random_state=123)
X_train, X_test, y_train, y_test = splits

### Impute data

In [None]:
from sklearn.experimental import (
    enable_iterative_imputer
)
from sklearn import impute

In [None]:
# Columns passed to the imputer.
num_cols = [
    'pclass',
    'age',
    'sibsp',
    'parch',
    'fare',
    'sex_male',
]

In [None]:
# Turn off pandas' chained-assignment (SettingWithCopy) check for the rest of
# the session. NOTE(review): this is a global switch and also hides genuine
# chained-assignment mistakes in later cells.
pd.set_option('mode.chained_assignment', None)

In [None]:
# Iterative imputer with default settings; it is fitted on the training
# rows and then reused for the test rows.
imputer = impute.IterativeImputer()

In [None]:
# Fit the imputer on the training rows and fill their missing values.
# `X_train` comes out of train_test_split, so take an explicit copy before
# writing to it instead of relying on the chained-assignment warning being
# silenced. Whole columns are replaced (frame[cols] = ...) rather than written
# in place through .loc[:, cols]: the imputer returns a float array, and an
# in-place write into integer/boolean columns can trigger pandas'
# incompatible-dtype setitem warning (an error in newer pandas releases).
X_train = X_train.copy()
X_train[num_cols] = imputer.fit_transform(X_train[num_cols])

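**Note**: The imputer is fitted on the training set only. The test set is then filled in using those already-fitted parameters, so no information from the test set leaks into preprocessing.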

In [None]:
# Fill missing values in the test rows with the imputer that was fitted on the
# training rows (transform only, no refit).
# As `X_test` comes out of train_test_split, copy it before writing, and
# replace whole columns instead of writing in place through .loc[:, cols]: the
# imputer returns a float array, and an in-place write into integer/boolean
# columns can trigger pandas' incompatible-dtype setitem warning (an error in
# newer pandas releases).
X_test = X_test.copy()
X_test[num_cols] = imputer.transform(X_test[num_cols])

### Normalize data

In [None]:
# Columns to standardise (the one-hot `sex_male` flag is left as is).
std_cols = [
    'pclass',
    'age',
    'sibsp',
    'parch',
    'fare',
]

In [None]:
# Standardise the numeric features. The scaler learns its statistics from the
# training rows only and is then applied unchanged to the test rows.
scaler = preprocessing.StandardScaler()

# Work on explicit copies and replace whole columns (frame[cols] = ...).
# Writing the scaled, non-integer floats in place through .loc[:, cols] into
# integer columns such as `pclass`, `sibsp` and `parch` relies on silent
# upcasting, which recent pandas versions warn about and newer ones reject.
X_train = X_train.copy()
X_test = X_test.copy()
X_train[std_cols] = scaler.fit_transform(X_train[std_cols])
X_test[std_cols] = scaler.transform(X_test[std_cols])

### Baseline model

In [None]:
from sklearn.dummy import DummyClassifier

In [None]:
# Baseline that predicts classes at random, following the class distribution
# of the training labels. The 'stratified' strategy is stochastic, so it is
# seeded (same seed as the train/test split) to make the baseline scores
# reproducible across runs.
dummy = DummyClassifier(strategy='stratified', random_state=123)

In [None]:
# Fit the baseline on the training rows and report its accuracy on the
# held-out test rows (fit() returns the estimator, so the calls can be chained).
dummy.fit(X_train, y_train).score(X_test, y_test)

In [None]:
from sklearn.metrics import precision_score

In [None]:
# Precision of the baseline on the held-out test rows.
dummy_pred = dummy.predict(X_test)
precision_score(y_test, dummy_pred)

### Evaluate model families

In [None]:
# Stack the preprocessed train and test partitions back together (row-wise)
# so the model families can be compared with cross-validation on all rows.
X = pd.concat([X_train, X_test], axis=0)
y = pd.concat([y_train, y_test], axis=0)

In [None]:
from sklearn import model_selection
from sklearn.linear_model import (
    LogisticRegression,
)
from sklearn.tree import (
    DecisionTreeClassifier,
)
from sklearn.neighbors import (
    KNeighborsClassifier,
)
from sklearn.naive_bayes import (
    GaussianNB,
)
from sklearn.svm import (
    SVC,
)
from sklearn.ensemble import (
    RandomForestClassifier,
)
import xgboost

In [None]:
# Candidate model families, stored as classes (not instances) so that each
# evaluation can build a fresh estimator with default hyperparameters.
models = [
    LogisticRegression, DecisionTreeClassifier, KNeighborsClassifier,
    GaussianNB, SVC, RandomForestClassifier,
    xgboost.XGBRFClassifier,
]

In [None]:
# Compare the model families by 10-fold cross-validated ROC AUC.
# The splitter is created once, outside the loop: with shuffle=True and a
# fixed integer random_state every split() call yields the same folds, so all
# families are scored on identical partitions. `kfold` also stays bound for
# later cells that reuse it.
kfold = model_selection.KFold(
    n_splits=10,
    shuffle=True,
    random_state=123,
)
for model in models:
    estimator = model()  # fresh instance with default hyperparameters
    scores = model_selection.cross_val_score(
        estimator, X, y, scoring='roc_auc', cv=kfold,
    )
    # Mean and spread of the per-fold AUC values.
    print(
        f'{model.__name__:22} AUC:'
        f'{scores.mean():.3f} STD: {scores.std():.2f}'
    )

### Stacking

In [None]:
from sklearn.ensemble import StackingClassifier

In [None]:
# Base layer of the stack: one default-configured instance of every candidate
# family, labelled with its class name.
base_estimators = [(family.__name__, family()) for family in models]
stack = StackingClassifier(estimators=base_estimators)

In [None]:
# Score the stacked ensemble with the same splitter and metric that were used
# for the individual model families.
stack_scores = model_selection.cross_val_score(
    stack, X, y, cv=kfold, scoring='roc_auc',
)
stack_name = type(stack).__name__
print(
    f'{stack_name:22} AUC:'
    f'{stack_scores.mean():.3f} STD: {stack_scores.std():.2f}'
)

### Create a model

In [None]:
# Train an XGBoost random-forest classifier with default hyperparameters on
# the training partition.
clf = xgboost.XGBRFClassifier()
clf.fit(X_train, y_train)

In [None]:
# Score of the default model on the held-out test rows.
clf.score(X_test, y_test)

In [None]:
# Precision of the default model on the held-out test rows.
test_pred = clf.predict(X_test)
precision_score(y_test, test_pred)

In [None]:
# Pair every feature name with its importance and list the pairs from most to
# least important.
sorted(
    zip(X_train.columns, clf.feature_importances_),
    key=lambda pair: pair[1],
    reverse=True,
)

### Optimize model

In [None]:
# Unfitted estimator that serves as the template for the grid search.
clf_ = xgboost.XGBRFClassifier()

In [None]:
# Hyperparameter grid: every combination of these values is evaluated.
params = dict(
    min_child_weight=[1, 5, 10],
    gamma=[0, 0.5, 1, 1.5, 2, 5],
    colsample_bytree=[0.6, 0.8, 1.0],
    max_depth=[4, 5, 6, 7],
)

In [None]:
# Exhaustive search over `params` on the training partition, using all CPU
# cores. fit() returns the search object itself, so from here on `clf` refers
# to the fitted GridSearchCV rather than to the earlier default classifier.
grid_search = model_selection.GridSearchCV(clf_, params, n_jobs=-1)
clf = grid_search.fit(X_train, y_train)

In [None]:
# Hyperparameter combination selected by the grid search.
clf.best_params_

In [None]:
# Score of the grid search (`clf` is the fitted GridSearchCV here) on the
# held-out test rows.
clf.score(X_test, y_test)

In [None]:
# Fresh, still unfitted classifier configured with the winning hyperparameters.
clf_best = xgboost.XGBRFClassifier(**clf.best_params_)

### Confusion matrix

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
# Train the tuned classifier on the training partition.
clf_best.fit(X_train, y_train)

In [None]:
# Confusion matrix of the tuned classifier on the held-out test rows
# (true labels first, predictions second).
best_pred = clf_best.predict(X_test)
confusion_matrix(y_test, best_pred)

In [None]:
import warnings
warnings.simplefilter('ignore', FutureWarning)

In [None]:
# Confusion-matrix plot for the tuned model on the held-out test rows.
# The visualizer wraps `clf_best` (the tuned XGBRFClassifier) rather than
# `clf`: at this point `clf` is the GridSearchCV object, so calling fit() on a
# visualizer that wraps it would repeat the entire hyperparameter search just
# to draw one plot.
cm_viz = ConfusionMatrix(clf_best, classes=['died', 'survived'])
cm_viz.fit(X_train, y_train)
cm_viz.score(X_test, y_test)
cm_viz.show();

### ROC curve

In [None]:
# ROC AUC measures how well a continuous score ranks the positive class above
# the negative class. Hard 0/1 labels from predict() reduce the curve to a
# single operating point and understate the AUC, so score with the predicted
# probability of the positive class (second column of predict_proba).
roc_auc_score(y_test, clf_best.predict_proba(X_test)[:, 1])

In [None]:
# ROC curve plot for the tuned model on the held-out test rows.
# The visualizer wraps `clf_best` instead of `clf`: `clf` is the GridSearchCV
# object at this point, and fitting a visualizer around it would run the whole
# hyperparameter search again.
roc_viz = ROCAUC(clf_best)
roc_viz.fit(X_train, y_train)
roc_viz.score(X_test, y_test)
roc_viz.show();

### Learning curve

In [None]:
# Learning curve for the tuned model: fit() performs the cross-validated
# training runs at increasing training-set sizes and collects the scores that
# are plotted by show().
# No score() call is made here. LearningCurve is a model-selection visualizer
# and score() is not part of its workflow; the previous call was only forwarded
# to the wrapped estimator and its result was discarded.
lc_viz = LearningCurve(clf_best)
lc_viz.fit(X_train, y_train)
lc_viz.show();

### Model persistence (and deployment)

In [None]:
import joblib

In [None]:
# Serialise the tuned classifier to disk in the current working directory.
model_path = 'clf_best.pickle'
joblib.dump(clf_best, model_path)

In [None]:
# Reload the persisted classifier; `clf` is rebound to the loaded object.
# Only load files you created yourself: the format is pickle-based, and
# unpickling untrusted data can execute arbitrary code.
clf = joblib.load('clf_best.pickle')

In [None]:
# Score the reloaded model on the held-out test rows as a check that it
# survived the save/load round trip.
clf.score(X_test, y_test)