## Build ML Model

---
> Use the data reduced by State.

In [1]:
# import general packages
import numpy as np
import pandas as pd
import src.scripts as src
import matplotlib.pyplot as plt
%matplotlib inline

# import ML packages
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, precision_recall_curve, auc, make_scorer, recall_score, accuracy_score, precision_score, confusion_matrix

In [None]:
# load data
# Pull the three CSV inputs through the project helper src.read_from_efs
# (defined outside this notebook), in the same order as the target names.
df_id_reduced, X_reduced, y_reduced = (
    src.read_from_efs(csv_name)
    for csv_name in ('df_id_reduced.csv', 'X_reduced.csv', 'y_reduced.csv')
)

In [None]:
# other code

# process data
# Fixed seed so the split and the forest are reproducible under Restart & Run All.
SEED = 42

# NOTE(review): y_reduced is loaded earlier but the target is read from
# df_id_reduced['exclusion'] — confirm which one is intended.
y = df_id_reduced['exclusion']

# Map both +inf and -inf to NaN first, then zero-fill, so that no non-finite
# value reaches the estimator (scikit-learn rejects inf/-inf/NaN inputs).
X = X_reduced.replace([np.inf, -np.inf], np.nan).fillna(0)

# train-test split (stratified on the target, 20% held out, seeded)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.2, stratify=y, random_state=SEED)

# show the distribution
print('y_train class distribution')
print(y_train.value_counts(normalize=True))
print('y_test class distribution')
print(y_test.value_counts(normalize=True))

# train model
# Base estimator used as the template for the grid search; seeded so repeated
# fits with the same hyper-parameters give the same forest.
clf = RandomForestClassifier(n_jobs=-1, random_state=SEED)

# Hyper-parameter grid explored by the grid search.
param_grid = {'min_samples_split': [3, 5, 10],
              'n_estimators': [100, 300],
              'max_depth': [3, 5, 15, 25],
              'max_features': [3, 5, 10, 20]}

# Metrics computed for every candidate; one of these keys is chosen as the
# refit criterion when the grid search is run.
scorers = {'precision_score': make_scorer(precision_score),
           'recall_score': make_scorer(recall_score),
           'accuracy_score': make_scorer(accuracy_score)}

In [None]:
def grid_search_wrapper(refit_score='precision_score'):
    """
    Run a stratified 10-fold grid search and report how the winner does on the test split.

    Works on the notebook-level names ``clf``, ``param_grid``, ``scorers``,
    ``X_train``/``y_train`` and ``X_test``/``y_test``. All metrics in
    ``scorers`` are computed for every candidate; the candidate that is best on
    ``refit_score`` is the one used for prediction.

    refit_score : key of ``scorers`` to optimise for.

    Prints the best parameters and a confusion matrix on the test data, and
    returns the fitted GridSearchCV object.
    """
    search = GridSearchCV(
        estimator=clf,
        param_grid=param_grid,
        scoring=scorers,
        refit=refit_score,
        cv=StratifiedKFold(n_splits=10),
        return_train_score=True,
        n_jobs=-1,
    )
    # Fit on plain arrays rather than DataFrames, so predict below gets arrays too.
    search.fit(X_train.values, y_train.values)

    # predictions of the refit best candidate on the held-out split
    test_predictions = search.predict(X_test.values)

    print('Best params for {}'.format(refit_score))
    print(search.best_params_)

    # confusion matrix on the test data.
    # The neg/pos labels assume a two-class target — TODO confirm.
    print('\nConfusion matrix of Random Forest optimized for {} on the test data:'.format(refit_score))
    cm_table = pd.DataFrame(
        confusion_matrix(y_test, test_predictions),
        index=['neg', 'pos'],
        columns=['pred_neg', 'pred_pos'],
    )
    print(cm_table)
    return search

In [None]:
# Grid search optimised for precision; the fitted search object is kept.
grid_search_clf = grid_search_wrapper(refit_score='precision_score')

# NOTE(review): the fit/predict below uses `clf` itself, not the estimator
# selected by the grid search held in `grid_search_clf`, so the counts printed
# here do not reflect the tuned hyper-parameters — confirm this is meant as a
# baseline comparison.
clf.fit(X_train, y_train)

# make a prediction
y_pred = clf.predict(X_test)

# Rows of the confusion matrix are the true classes, columns the predicted
# ones; unpacking row by row assumes a 2x2 (two-class) matrix.
(tn, fp), (fn, tp) = confusion_matrix(y_test, y_pred)
print(tp, fp)
print(tn, fn)