In [None]:
import json
import pandas as pd
import os
import matplotlib.pyplot as plt
import numpy as np
import pickle

from sklearn.preprocessing import OneHotEncoder, MaxAbsScaler, MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import recall_score, accuracy_score, f1_score, auc, roc_auc_score, precision_score, balanced_accuracy_score, fbeta_score, make_scorer
from sklearn.inspection import permutation_importance

from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.naive_bayes import CategoricalNB
from sklearn.svm import SVC

from imblearn.over_sampling import RandomOverSampler, ADASYN, SMOTE
from imblearn.under_sampling import RandomUnderSampler


In [None]:

# Load the feature table and the label table from CSV files in the working directory.
features = pd.read_csv('features.csv')
labels = pd.read_csv('labels.csv')

# Stratified split with a fixed seed: 30% of the rows are held out for testing.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.3,
    random_state=1,
    stratify=labels,
)

# Flatten both label frames into 1-D NumPy arrays.
y_train, y_test = (split.to_numpy().ravel() for split in (y_train, y_test))

In [None]:
def scoring(clf, X, y):
    """Print the recall and precision of ``clf``'s predictions for ``X``.

    Args:
        clf: Fitted estimator exposing a ``predict`` method.
        X: Samples handed to ``clf.predict``.
        y: True labels the predictions are scored against.

    Returns:
        None. Both metrics are written to stdout.
    """
    predictions = clf.predict(X)

    recall = recall_score(y, predictions)
    print(f"Recall:            {recall}")

    # zero_division=0: report a precision of 0 when the score is undefined.
    precision = precision_score(y, predictions, zero_division=0)
    print(f"Precision:         {precision}")

In [None]:
# ## Note: MDI (mean decrease in impurity) is computed on the training set
# ## MDI is biased towards features with high cardinality

# def plot_MDI(forest):
#     importances = forest.feature_importances_
#     std = np.std([tree.feature_importances_ for tree in forest.estimators_], axis=0)
#     forest_importances = pd.Series(importances, index=features.columns)

#     fig, ax = plt.subplots()
#     forest_importances.plot.bar(yerr=std, ax=ax)
#     ax.set_title("Feature importances using MDI")
#     ax.set_ylabel("Mean decrease in impurity")
#     fig.tight_layout()
    
# def plot_permutation_feature_importance(forest, X_test, y_test):
#     result = permutation_importance(forest, X_test, y_test, n_repeats=10, random_state=42, n_jobs=-1)
    
#     forest_importances = pd.Series(result.importances_mean, index=features.columns)
    
#     fig, ax = plt.subplots()
#     forest_importances.plot.bar(yerr=result.importances_std, ax=ax)
#     ax.set_title("Feature importances using permutation on full model")
#     ax.set_ylabel("Mean accuracy decrease")
#     fig.tight_layout()
#     plt.show()

In [None]:
# def eval_models(X_train, y_train, X_test, y_test):
#     print("Decision Tree")
#     clf = DecisionTreeClassifier().fit(X_train, y_train)
#     scoring(clf, X_test, y_test)

#     print("Random Forest")
#     clf = RandomForestClassifier(max_depth=4, random_state=0).fit(X_train, y_train)
#     scoring(clf, X_test, y_test)
#     # plot_MDI(clf)
#     # plot_permutation_feature_importance(clf, X_test, y_test)

#     print("Logistic Regression")
#     clf = LogisticRegression(random_state=0).fit(X_train, y_train)
#     scoring(clf, X_test, y_test)

#     # print("Naive Bayes")
#     # clf = CategoricalNB().fit(X_train, y_train)
#     # scoring(clf, X_test, y_test)

In [None]:
# print("Baseline")
# eval_models(X_train, y_train, X_val, y_val)

# print("Random_OverSampling")
# X_resampled, y_resampled = RandomOverSampler().fit_resample(X_train, y_train)
# eval_models(X_resampled, y_resampled, X_val, y_val)

# print("SMOTE")
# X_resampled, y_resampled = SMOTE().fit_resample(X_train, y_train)
# eval_models(X_resampled, y_resampled, X_val, y_val)

# print("ADASYN")
# X_resampled, y_resampled = ADASYN().fit_resample(X_train, y_train)
# eval_models(X_resampled, y_resampled, X_val, y_val)

# print("Random_UnderSampling")
# X_resampled, y_resampled = RandomUnderSampler().fit_resample(X_train, y_train)
# eval_models(X_resampled, y_resampled, X_val, y_val)

In [None]:

from sklearn.model_selection import GridSearchCV

# Scorer handed to the hyperparameter searches. With beta this close to zero the
# F-beta score weights precision far above recall, so it is effectively precision.
fbeta_scorer = make_scorer(score_func=fbeta_score, beta=1e-4)

def find_best_hyperparam(estimator, param_distributions, X_train, y_train,
                         n_iter=20, cv=5, n_jobs=4, random_state=None):
    """Run a randomized hyperparameter search and return the best model found.

    Candidates are ranked with the module-level ``fbeta_scorer``.

    Args:
        estimator: Unfitted estimator to tune.
        param_distributions: Mapping of parameter name to the values (or
            distribution) to sample from.
        X_train: Training samples passed to the search's ``fit``.
        y_train: Training labels passed to the search's ``fit``.
        n_iter: Number of parameter settings to sample (default 20).
        cv: Cross-validation setting forwarded to the search (default 5).
        n_jobs: Number of parallel jobs (default 4).
        random_state: Seed for the parameter sampling. ``None`` (the default)
            leaves the search unseeded; pass an int for a reproducible search.

    Returns:
        Tuple ``(best_estimator, best_params)`` taken from the fitted search.
    """
    search = RandomizedSearchCV(estimator=estimator,
                                param_distributions=param_distributions,
                                scoring=fbeta_scorer,
                                n_iter=n_iter,
                                cv=cv,
                                verbose=2,
                                n_jobs=n_jobs,
                                random_state=random_state)
    search.fit(X_train, y_train)
    return search.best_estimator_, search.best_params_

# Balance the training classes by random over-sampling. The sampler is seeded
# (same seed as the train/test split) so that re-running the notebook produces
# the same resampled training set.
# NOTE(review): resampling happens before cross-validation, so copies of the same
# row may end up in both the fitting and validation folds of the searches below.
X_resampled, y_resampled = RandomOverSampler(random_state=1).fit_resample(X_train, y_train)

In [None]:
# Decision Tree
try:
    # Candidate values sampled by the randomized search.
    params = {
        'splitter': ['best', 'random'],
        'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
        'max_features': ['log2', 'sqrt'],
        'min_samples_leaf': [1, 2, 4],
        'min_samples_split': [2, 5, 10],
    }

    best_tree, best_tree_params = find_best_hyperparam(DecisionTreeClassifier(), params, X_resampled, y_resampled)
except Exception as exc:
    # Best effort: keep the notebook running, but say why the search failed
    # instead of hiding it. Catching Exception (not a bare except) lets a
    # manual interrupt of the long-running search propagate.
    print(f"Decision tree hyperparameter search failed: {exc!r}")

In [None]:
# Logistic Regression
from sklearn.linear_model import LogisticRegressionCV

# Cross-validated logistic regression (5 folds, fixed seed) fitted on the
# over-sampled training data.
logreg_cv = LogisticRegressionCV(cv=5, random_state=0)
logreg_cv.fit(X_resampled, y_resampled)

In [None]:
# SVC
try:
    # Candidate values sampled by the randomized search.
    params = {
        'C': [1, 10, 100],
        'gamma': [0.001, 0.0001],
        'kernel': ['rbf', 'linear'],
    }

    best_svc, best_svc_params = find_best_hyperparam(SVC(max_iter=5000, probability=True, class_weight = 'balanced'), params, X_resampled, y_resampled)
    print(best_svc.get_params())
except Exception as exc:
    # Best effort: keep the notebook running, but say why the search failed
    # instead of hiding it. Catching Exception (not a bare except) lets a
    # manual interrupt of the long-running search propagate.
    print(f"SVC hyperparameter search failed: {exc!r}")

In [None]:
# # Naive Bayes
# try:
#     params = {'alpha': np.logspace(0,-9, num=100)}
#     best_nb, best_nb_params = find_best_hyperparam(CategoricalNB(min_categories=features.nunique()), params, X_resampled, y_resampled)
#     print(best_nb.get_params())
#     scoring(best_nb, X_val, y_val)
#     print(best_nb_params)
# except:
#     pass

In [None]:
# # Ridge Regression

# params = {'alpha': [0.1, 1.0, 10.0],
#         'solver': ['auto', 'svd', 'cholesky','sparse_cg', 'saga', 'lbfgs']
# }
# best_ridgereg, best_ridgereg_params = find_best_hyperparam(RidgeClassifier(), params, X_resampled, y_resampled)
# print(best_ridgereg.get_params())

# try:
#     scoring(best_ridgereg, X_val, y_val)
#     print(best_ridgereg_params)
# except:
#     pass

In [None]:
# Random Forest
try:
    # Candidate values sampled by the randomized search.
    params = {
        'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
        'max_features': ['log2', 'sqrt'],
        'min_samples_leaf': [1, 2, 4],
        'min_samples_split': [2, 5, 10],
        'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000],
    }

    # Store the forest's parameters under their own name; unpacking into
    # best_tree_params would overwrite the decision tree's search result.
    best_forest, best_forest_params = find_best_hyperparam(RandomForestClassifier(), params, X_resampled, y_resampled)
    print(best_forest.get_params())
except Exception as exc:
    # Best effort: keep the notebook running, but say why the search failed
    # instead of hiding it. Catching Exception (not a bare except) lets a
    # manual interrupt of the long-running search propagate.
    print(f"Random forest hyperparameter search failed: {exc!r}")

In [None]:
# Test set: print recall and precision of every tuned model on the held-out split.
# NOTE(review): each model name below must already exist. The search cells catch
# their own exceptions, so a failed search leaves its model undefined and shows
# up here as a NameError.
print("Decision Tree")
scoring(best_tree, X_test, y_test)

print("Forest")
scoring(best_forest, X_test, y_test)

print("SVC")
scoring(best_svc, X_test, y_test)

print("Logistic Regression")
scoring(logreg_cv, X_test, y_test)

In [None]:
# Undersampling


# Hard-coded hyperparameter sets, one dict per model family.
# NOTE(review): presumably copied from an earlier search run - confirm. These
# assignments replace any same-named values produced by the search cells.

# Random forest
best_forest_params = {
    'bootstrap': False,
    'ccp_alpha': 0.0,
    'class_weight': None,
    'criterion': 'gini',
    'max_depth': 70,
    'max_features': 'log2',
    'max_leaf_nodes': None,
    'max_samples': None,
    'min_impurity_decrease': 0.0,
    'min_samples_leaf': 1,
    'min_samples_split': 2,
    'min_weight_fraction_leaf': 0.0,
    'n_estimators': 600,
    'n_jobs': None,
    'oob_score': False,
    'random_state': None,
    'verbose': 0,
    'warm_start': False,
}

# Decision tree
best_tree_params = {
    'ccp_alpha': 0.0,
    'class_weight': None,
    'criterion': 'gini',
    'max_depth': 30,
    'max_features': 'sqrt',
    'max_leaf_nodes': None,
    'min_impurity_decrease': 0.0,
    'min_samples_leaf': 2,
    'min_samples_split': 2,
    'min_weight_fraction_leaf': 0.0,
    'random_state': None,
    'splitter': 'best',
}

# Logistic regression
best_logreg_params = {
    'C': 10,
    'class_weight': None,
    'dual': False,
    'fit_intercept': True,
    'intercept_scaling': 1,
    'l1_ratio': None,
    'max_iter': 100,
    'multi_class': 'auto',
    'n_jobs': None,
    'penalty': 'l2',
    'random_state': None,
    'solver': 'saga',
    'tol': 0.0001,
    'verbose': 0,
    'warm_start': False,
}

# Ridge classifier
best_ridge_params = {
    'alpha': 0.1,
    'class_weight': None,
    'copy_X': True,
    'fit_intercept': True,
    'max_iter': None,
    'positive': False,
    'random_state': None,
    'solver': 'auto',
    'tol': 0.0001,
}

# Support vector classifier
best_svc_params = {
    'C': 100,
    'break_ties': False,
    'cache_size': 200,
    'class_weight': None,
    'coef0': 0.0,
    'decision_function_shape': 'ovr',
    'degree': 3,
    'gamma': 0.001,
    'kernel': 'rbf',
    'max_iter': 5000,
    'probability': False,
    'random_state': None,
    'shrinking': True,
    'tol': 0.001,
    'verbose': False,
}

In [None]:
# Export models
import pickle

# Serialize each fitted model to its own pickle file in the working directory.
# Only load these files back from a trusted location: unpickling executes code.
with open('decision_tree_model.pkl', 'wb') as f:
    pickle.dump(best_tree, f)

# with open('forest_model.pkl', 'wb') as f:
#     pickle.dump(best_forest, f)

with open('svc_model.pkl', 'wb') as f:
    pickle.dump(best_svc, f)

with open('logreg_model.pkl', 'wb') as f:
    pickle.dump(logreg_cv, f)