# Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

---
# Data cleaning

In [None]:
# Load the raw no-show CSV; the path is relative to the notebook's working directory.
vit = pd.read_csv('../data/no-show-300k.csv')

# Transformations required for modeling
def transform_vitoria(df):
    """Rename, reorder and recode the raw frame into a model-ready frame.

    The input is left untouched: all work happens on a copy, so the cell can be
    re-run and the raw frame keeps its original column labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame whose 15 columns are, positionally, the fields listed in
        ``new_names`` below.

    Returns
    -------
    pandas.DataFrame
        New frame with the 13 columns of ``new_order`` (the two registration /
        appointment timestamp columns are dropped), lower-cased ``week_day`` and
        ``gender``, sign-flipped ``days_to_appointment`` and ``show_up`` recoded
        to 1 / 0.

    Raises
    ------
    ValueError
        If ``df`` does not have exactly ``len(new_names)`` columns.
    """
    new_names = ['age', 'gender', 'app_registration', 'app_date', 'week_day', 'show_up',
                 'diabetes', 'alcoholism', 'hypertension', 'handicap', 'smokes',
                 'monetary_help', 'tuberculosis', 'sms_reminder', 'days_to_appointment']
    new_order = ['age', 'gender', 'week_day', 'days_to_appointment', 'diabetes', 'alcoholism',
                 'hypertension', 'handicap', 'smokes', 'monetary_help', 'tuberculosis',
                 'sms_reminder', 'show_up']
    show_up_codes = {'Show-Up': 1, 'No-Show': 0, '2': 0}

    # Rename on a copy so the caller's frame is not modified as a side effect.
    renamed = df.copy()
    renamed.columns = new_names

    # assign() builds a new frame and keeps the column order of new_order.
    transformed = renamed.loc[:, new_order].assign(
        week_day=lambda frame: frame['week_day'].str.lower(),
        gender=lambda frame: frame['gender'].str.lower(),
        # The raw value is multiplied by -1, exactly as before.
        days_to_appointment=lambda frame: frame['days_to_appointment'] * -1,
        # Explicit element-wise recode: values without a code pass through unchanged,
        # and the result gets a numeric dtype when every value is recoded, without
        # relying on DataFrame.replace silently down-casting an object column.
        show_up=lambda frame: frame['show_up'].map(
            lambda status: show_up_codes.get(status, status)
        ),
    )

    return transformed

# Rebind `vit` to the model-ready frame returned by transform_vitoria.
vit = transform_vitoria(vit)

---
# Preprocessing

In [None]:
from sklearn.model_selection import train_test_split

# One-hot encode the string columns so every feature is numeric.
data = pd.get_dummies(vit)

# Separate the target column from the predictors.
y = data['show_up']
X = data.drop(columns='show_up')

# Hold out a quarter of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [None]:
# Export
import pickle

# Bundle the four pieces of the split under the keys used by the (commented-out) pickle export.
train_test_data = dict(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

# data.to_csv('../data/data_all_features.csv')
# pickle.dump(train_test_data, open("../data/train_test_data_all_features", "wb"))

---
# Supervised Learning with xgboost

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, f1_score
from xgboost import XGBClassifier

# Base booster; max_depth and learning_rate are supplied by the grid search.
clf = XGBClassifier(
    n_estimators=100,
    n_jobs=4,
    silent=False,
)

# Candidate values explored by the grid search (3 x 3 = 9 combinations).
grid_params = dict(
    max_depth=[3, 5, 7],
    learning_rate=[0.1, 0.3, 0.5],
)

# Cross-validation folds are ranked by F1.
scorer = make_scorer(f1_score)

def custom_f1_eval(y_pred, y_true):
    """Custom XGBoost evaluation metric: F1 error (1 - F1) at a 0.5 threshold.

    XGBoost minimises a callable evaluation metric when it drives early
    stopping, so the function must return a "lower is better" value. Returning
    the raw F1 score would make early stopping keep the iteration with the
    worst F1; returning ``1 - F1`` makes it keep the best one.

    Parameters
    ----------
    y_pred : array-like
        Predicted scores; values above 0.5 are treated as the positive class.
    y_true : object with a ``get_label()`` method
        Container of the true labels (XGBoost passes its DMatrix here).

    Returns
    -------
    tuple of (str, float)
        Metric name and value, in the shape XGBoost expects from a custom metric.
    """
    labels = y_true.get_label()
    # Vectorised thresholding; produces the same 1.0 / 0.0 values as a per-element loop.
    y_bin = (np.asarray(y_pred) > 0.5).astype(float)
    return 'f1_err', 1.0 - f1_score(labels, y_bin)

# Extra keyword arguments forwarded to the estimator's fit() by the grid search.
# NOTE(review): early stopping is monitored on (X_test, y_test), the same hold-out that
# is predicted on later in this notebook; consider a separate validation split.
fit_params = dict(
    eval_set=[(X_test, y_test)],
    eval_metric=custom_f1_eval,
    early_stopping_rounds=20,
    verbose=True,
)

# 5-fold cross-validated search over grid_params, scored with F1, using all available cores.
grid = GridSearchCV(
    estimator=clf,
    param_grid=grid_params,
    scoring=scorer,
    cv=5,
    n_jobs=-1,
    verbose=True,
)

In [None]:
# Run the grid search on the training split; fit_params is passed through to the estimator's fit().
grid.fit(X_train, y_train, **fit_params)

In [None]:
# Keep the estimator that the grid search refit with its best-scoring parameters.
model = grid.best_estimator_

In [None]:
# Predicted labels for the held-out test split.
preds = model.predict(X_test)

In [None]:
# Export
import pickle

# pickle.dump(model, open("../data/xgb_model_pickle_v2", "wb"))
# model.save_model("../data/xgb_model_v2")

---
# Feature Selection

### Information Gain

In [None]:
# Pair every feature with the fitted model's importance score, smallest first.
# The score column is named 'mi' although it holds feature importances.
selection_table = (
    pd.DataFrame({'var': X.columns, 'mi': model.feature_importances_})
    .sort_values(by='mi')
)

plt.barh(y=selection_table['var'], width=selection_table['mi'])

### Mutual Information

In [None]:
from sklearn.feature_selection import mutual_info_classif

# Flag every column except the two continuous ones as discrete. Selecting by name gives
# the same positions as the former hard-coded range(2, 19) when X has its expected 19
# columns, and keeps working if the number of dummy columns changes.
continuous_cols = ('age', 'days_to_appointment')
discrete_idx = [i for i, col in enumerate(X.columns) if col not in continuous_cols]

# discrete_features must be passed by keyword in current scikit-learn, and the estimator
# is randomised for continuous features, so seed it for reproducible scores.
selection_scores = mutual_info_classif(X, y, discrete_features=discrete_idx, random_state=42)
mi_table = pd.DataFrame({'var': X.columns, 'mi': selection_scores}).sort_values('mi', ascending=True)

plt.barh(mi_table['var'], mi_table['mi'])

### Creating minimal data

In [None]:
# Get mutual features
# intersect_features comes from the project's local `helper` module (not shown here).
# NOTE(review): presumably 0.1 is a score threshold and the result is a list of column
# names (it is concatenated with ['show_up'] in the next cell); confirm against helper.py.
from helper import intersect_features
mutual_features = intersect_features(selection_table, mi_table, 0.1)

In [None]:
# Restrict the full frame and the training predictors to the selected features.
minimal_data = data.loc[:, mutual_features + ['show_up']]
minimal_X_train = X_train.loc[:, mutual_features]
# y_train is a 1-D Series (the target column), so it has no columns to select and
# `.loc[:, ...]` on it raises IndexingError. Feature selection leaves the labels unchanged.
minimal_y_train = y_train

In [None]:
# Export
import pickle

# The test predictors must carry the same selected columns as the training predictors,
# otherwise a model fit on the reduced X_train cannot be evaluated on X_test.
minimal_train_test_data = {
    'X_train': minimal_X_train,
    'X_test': X_test.loc[:, mutual_features],
    'y_train': minimal_y_train,
    'y_test': y_test,
}

# minimal_data.to_csv('../data/data_selected_features.csv')
# pickle.dump(minimal_train_test_data, open("../data/train_test_data_selected_features", "wb"))

---
# Unsupervised learning with GMM

In [None]:
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score

# Fit a 12-component Gaussian mixture on minimal_data (selected features plus show_up).
# A fixed seed makes the cluster assignment reproducible across kernel restarts.
clusterer = GaussianMixture(n_components=12, random_state=42).fit(minimal_data)
samples = minimal_data[1000:1005]
# NOTE: this rebinds `preds`, which previously held the classifier's test predictions.
preds = clusterer.predict(minimal_data)
centers = clusterer.means_

# Assign the label array positionally. Wrapping it in a default-indexed Series would
# align on index labels and yield NaN clusters if minimal_data's index were not 0..n-1.
clustered_data = minimal_data.assign(cluster=preds)
preds_proba = clusterer.predict_proba(samples)
preds_deter = clusterer.predict(samples)

# Per-cluster mean of every selected feature and of show_up, plus the cluster size.
pivot = clustered_data.pivot_table(index='cluster', values=mutual_features + ['show_up'], aggfunc='mean')
pivot = pivot.assign(count=clustered_data.groupby('cluster')['show_up'].aggregate('count'))