# Imports e config dos módulos

In [None]:
import warnings
from collections import Counter
from functools import partial

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Image
from sklearn.feature_selection import f_classif, mutual_info_classif, SequentialFeatureSelector, SelectKBest
from scipy.stats import chi2_contingency
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.base import clone
from sklearn.pipeline import FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector
from sklearn.dummy import DummyClassifier
from sklearn.impute import SimpleImputer, MissingIndicator
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, f1_score, make_scorer, precision_score, recall_score
from sklearn.model_selection import cross_val_predict, LeaveOneOut, cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.svm import SVC
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler
from boruta import BorutaPy
from genetic_selection import GeneticSelectionCV

In [None]:
# Optional pandas display tweak, currently disabled.
# pd.set_option('max_columns', None)
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# Use seaborn's 'darkgrid' style for the plots that follow.
sns.set_style('darkgrid')

# Dataset

In [None]:
# Load the interim dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('../data/interim/hans_outcome.csv')

In [None]:
# Sanity check of the loaded frame: (rows, columns).
df.shape

# Agrupamento e remoção de features

In [None]:
# Split the predictors (every column except the target) by dtype:
# object-dtype columns are treated as categorical, everything else as numeric.
target = 'TPALTA_N'
feats = df.drop(columns=target).columns
cat_feats = [name for name in feats if df[name].dtype == 'O']
num_feats = [name for name in feats if df[name].dtype != 'O']

In [None]:
def drop_low_variance(col, data=None):
    """Return True when column *col* carries too little information to keep.

    A column is flagged when any of the following holds:

    * it has at most one distinct non-null value (this includes columns that
      are entirely missing, which previously raised ``IndexError``);
    * its most frequent value covers more than 90% of the non-null rows;
    * its most frequent value occurs less often than missing values do.

    Parameters
    ----------
    col : hashable
        Name of the column to inspect.
    data : pandas.DataFrame, optional
        Frame holding the column. Defaults to the module-level ``df`` so
        existing single-argument calls keep working.

    Returns
    -------
    bool
    """
    frame = df if data is None else data
    series = frame[col]
    # value_counts() ignores NaN and sorts by frequency, most common first.
    counts = series.value_counts()
    if len(counts) <= 1:
        return True
    top_count = counts.iloc[0]
    if top_count / counts.sum() > 0.9:
        return True
    return bool(top_count < series.isna().sum())

# Flag the categorical columns rejected by drop_low_variance, then keep the rest.
low_var_feats = list(filter(drop_low_variance, cat_feats))
cat_feats = [name for name in cat_feats if name not in low_var_feats]

In [None]:
# Categorical features remaining after the low-variance filter.
cat_feats

# ML Pipeline

## Feature transformers

In [None]:
class GBMFeatTransformer(BaseEstimator, TransformerMixin):
    """Prepare a DataFrame for tree/boosting estimators.

    The behaviour depends on ``EBM``:

    * ``EBM=False`` (default): object-dtype columns are cast to pandas
      ``category`` dtype; ``fit`` learns nothing.
    * ``EBM=True``: missing values in numeric columns are replaced with the
      column mean learned in ``fit``; other columns are returned unchanged.

    Parameters
    ----------
    EBM : bool, default=False
        Selects mean-imputation mode instead of category-casting mode.

    Attributes
    ----------
    num_feats_means : dict
        Maps numeric column name to the mean seen in the last ``fit`` call.
        Empty unless ``EBM`` is truthy.
    """

    def __init__(self, EBM=False):
        self.EBM = EBM
        self.num_feats_means = dict()

    def fit(self, X, y=None):
        """Learn the per-column means used for imputation (EBM mode only).

        ``X`` must be a DataFrame; ``y`` is ignored. Returns ``self``.
        """
        # Start from a clean slate so a refit on a frame with different
        # columns does not keep means of columns that no longer exist.
        self.num_feats_means = dict()
        if self.EBM:
            for col in X.columns:
                # Only genuinely numeric columns have a meaningful mean;
                # category/string extension dtypes would fail in .mean().
                if pd.api.types.is_numeric_dtype(X[col]):
                    self.num_feats_means[col] = X[col].mean()
        return self

    def transform(self, X, y=None):
        """Return a transformed copy of ``X``; the input is not modified."""
        ret = X.copy()
        if not self.EBM:
            for col in ret.columns:
                if ret[col].dtype == 'O':
                    ret[col] = ret[col].astype('category')
        else:
            for col in self.num_feats_means:
                ret[col] = ret[col].fillna(self.num_feats_means[col])
        return ret
        
# Numeric branch: standardise and impute (SimpleImputer defaults), and in
# parallel emit missing-value indicator columns. error_on_new=False lets the
# indicator tolerate columns that only have gaps at transform time.
general_num_transformer = FeatureUnion(
    [
        ('num_pipe', Pipeline(
            [
                ('norm', StandardScaler()),
                ('nan_input', SimpleImputer())
            ]
        )),
        ('nan_flag', MissingIndicator(error_on_new=False))
    ]
)

# Columns are picked by dtype when the transformer is fitted instead of from
# the global num_feats / cat_feats lists. The split is the same one used to
# build those lists (object dtype -> categorical, anything else -> numeric),
# but it also works on frames where some columns were removed, e.g. X_fs.
general_feat_transformer = ColumnTransformer(
    [
        ('num_trans', general_num_transformer, make_column_selector(dtype_exclude=object)),
        ('cat_trans', OneHotEncoder(handle_unknown='ignore'), make_column_selector(dtype_include=object))
    ],
    remainder='passthrough', sparse_threshold=0
)

## Nested K-Fold

In [None]:
class NestedKFoldOpt():
    """Nested cross-validation with hyperparameter search in the inner loop.

    For each split of ``outer_cv`` the hyperparameters are tuned on the
    training part (scored through ``inner_cv``), the tuned model is refitted
    on that training part and evaluated on the held-out part. ``fit`` then
    tunes once more on the full data.

    The reported metrics hard-code the class labels 0 and 1, so the target is
    expected to be binary and encoded as 0/1.

    Relies on the module-level names ``clone``, ``partial``, ``fmin``,
    ``tpe``, ``make_scorer``, ``cross_val_predict``, ``GridSearchCV`` and the
    ``recall/precision/f1/accuracy`` score functions.

    Parameters
    ----------
    ml_model : estimator
        Template model; it is cloned before every fit.
    opt_space : dict
        Search space: passed to ``fmin`` when ``opt_type == 'bayes'``,
        otherwise used as the ``GridSearchCV`` parameter grid.
    loss_metric : callable
        ``loss_metric(y_true, y_pred)``. Treated as a score to maximise: it
        is negated before being returned to ``fmin`` and wrapped with
        ``make_scorer`` for the grid search.
    outer_cv, inner_cv : cross-validation splitters
        Splitters for the evaluation loop and the tuning loop.
    opt_type : str, default='bayes'
        ``'bayes'`` selects ``fmin`` with ``tpe.suggest``; any other value
        selects ``GridSearchCV``.
    max_evals : int, default=10
        Number of ``fmin`` evaluations.

    Attributes
    ----------
    metrics_ : pandas.DataFrame or None
        Mean and standard deviation of per-fold recall/precision per class.
    metrics_oof_ : pandas.DataFrame or None
        Recall, precision, F1 and accuracy on the pooled out-of-fold
        predictions.
    metrics_dist_ : dict or None
        The raw per-fold recall/precision lists.
    best_hyperparameters_ : dict or None
        Hyperparameters found on the full data by ``fit``.
    """

    def __init__(self, ml_model, opt_space, loss_metric, outer_cv, inner_cv,
                 opt_type='bayes', max_evals=10):
        self.ml_model = ml_model
        self.opt_space = opt_space
        self.loss_metric = loss_metric
        self.opt_type = opt_type
        self.max_evals = max_evals
        self.outer_cv = outer_cv
        self.inner_cv = inner_cv
        self.metrics_ = None
        self.metrics_oof_ = None
        self.metrics_dist_ = None
        self.best_hyperparameters_ = None

    def objective(self, x, data):
        """Return the negated inner-CV score of hyperparameters ``x``.

        ``data`` is an ``(X, y)`` pair.
        """
        model = clone(self.ml_model).set_params(**x)

        preds = cross_val_predict(model, data[0], data[1], cv=self.inner_cv, n_jobs=-1)

        # Negated so that minimising this value maximises the metric.
        return -self.loss_metric(data[1], preds)

    def optimize(self, X, y):
        """Search ``opt_space`` on ``(X, y)`` and return the best parameters."""
        if self.opt_type == 'bayes':
            obj = partial(self.objective, data=(X, y))
            # return_argmin=False: the search is asked for the evaluated point
            # itself rather than the argmin representation.
            best = fmin(obj, space=self.opt_space, algo=tpe.suggest,
                        max_evals=self.max_evals, return_argmin=False)
        else:
            loss_metric = make_scorer(self.loss_metric)
            search = GridSearchCV(self.ml_model, self.opt_space, scoring=loss_metric,
                                  n_jobs=-1, cv=self.inner_cv, verbose=3)
            best = search.fit(X, y).best_params_
        return best

    def nested_kfold(self, X, y):
        """Run the outer loop and populate the ``metrics*_`` attributes.

        ``X`` and ``y`` are indexed positionally with ``.iloc``, so they must
        be a DataFrame and a Series.
        """
        recall_0 = []
        recall_1 = []
        precision_0 = []
        precision_1 = []
        # Out-of-fold predictions, filled in one validation fold at a time.
        oof = np.zeros(len(X))
        for train_idx, val_idx in self.outer_cv.split(X, y):
            X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
            X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]
            hypers = self.optimize(X_train, y_train)
            model = clone(self.ml_model).set_params(**hypers).fit(X_train, y_train)
            preds = model.predict(X_val)
            oof[val_idx] = preds
            recall_0.append(recall_score(y_val, preds, pos_label=0))
            recall_1.append(recall_score(y_val, preds, pos_label=1))
            precision_0.append(precision_score(y_val, preds, pos_label=0))
            precision_1.append(precision_score(y_val, preds, pos_label=1))
        self.metrics_ = pd.DataFrame({'recall': [np.mean(recall_0), np.mean(recall_1)],
                                      'recall_std': [np.std(recall_0), np.std(recall_1)],
                                      'precision': [np.mean(precision_0), np.mean(precision_1)],
                                      'precision_std': [np.std(precision_0), np.std(precision_1)]},
                                     index=['class_0', 'class_1'])
        # Accuracy is a single global number; repeat it on both class rows so
        # every column has the same length (a length-1 list next to length-2
        # lists makes the DataFrame constructor raise ValueError).
        accuracy = accuracy_score(y, oof)
        self.metrics_oof_ = pd.DataFrame({'recall': [recall_score(y, oof, pos_label=0), recall_score(y, oof)],
                                          'precision': [precision_score(y, oof, pos_label=0), precision_score(y, oof)],
                                          'f1 score': [f1_score(y, oof, pos_label=0), f1_score(y, oof)],
                                          'accuracy': [accuracy, accuracy],
                                         },
                                         index=['class_0', 'class_1'])
        self.metrics_dist_ = {'recall_0': recall_0, 'recall_1': recall_1,
                              'precision_0': precision_0, 'precision_1': precision_1}

    def fit(self, X, y):
        """Run the nested evaluation, then tune on the full data.

        Works on copies of ``X`` and ``y``. Returns ``self``.
        """
        X = X.copy()
        y = y.copy()
        self.nested_kfold(X, y)
        self.best_hyperparameters_ = self.optimize(X, y)
        return self

In [None]:
# Design matrix: numeric columns first, then the retained categorical ones.
X = df.loc[:, num_feats + cat_feats].copy()
# Encode the target labels as integers and wrap them in a fresh Series.
le = LabelEncoder()
le.fit(df[target])
y = pd.Series(le.transform(df[target]))

## Feature Selection

In [None]:
# Shallow random forest used for the baseline report and by the Boruta,
# sequential and genetic selectors below.
# NOTE(review): no random_state is set here, so results may differ between runs.
base_model = RandomForestClassifier(n_jobs=-1, max_depth=5)

In [None]:
# Build an all-numeric copy of X for the feature-selection routines:
# categorical columns become integer codes, numeric gaps get the column median.
X_num = GBMFeatTransformer().fit_transform(X)
for col in X_num.columns:
    if X_num[col].dtype.name == 'category':
        X_num[col] = X_num[col].cat.codes
    else:
        # Assign the result back instead of calling fillna(..., inplace=True)
        # on the column selection: that chained in-place form is deprecated in
        # pandas and leaves X_num untouched under Copy-on-Write.
        X_num[col] = X_num[col].fillna(X_num[col].median())

columns = X_num.columns.values

In [None]:
# Baseline: cross-validated predictions of base_model on every feature.
baseline_cv = StratifiedKFold(shuffle=True, random_state=9)
baseline_preds = cross_val_predict(base_model, X_num, y, n_jobs=-1, cv=baseline_cv)
print(classification_report(y, baseline_preds))

### Boruta

In [None]:
# Boruta selection wrapped around base_model; it is fitted on plain arrays
# (.values), not on the DataFrame/Series.
bor_selector = BorutaPy(base_model, n_estimators='auto', perc=90, max_iter=100)
bor_selector.fit(X_num.values, y.values)

In [None]:
# Feature names picked by the fitted selector's support_ mask.
columns[bor_selector.support_]

In [None]:
# Order the features by Boruta ranking_ (ascending), keep the first 30 and flag
# the remainder as "bottom". Slicing with [30:] instead of a negative offset
# gives an empty result when there are 30 or fewer features; the old form
# `[-(len(columns)-30):]` wrapped around and flagged kept features instead.
bottom_boruta = columns[bor_selector.ranking_.argsort()[30:]]
bottom_boruta

### Sequential Selector

In [None]:
# Backward sequential selection down to 30 features, scored by macro F1
# under a shuffled stratified split.
sb_scorer = make_scorer(f1_score, average='macro')
sb_cv = StratifiedKFold(shuffle=True, random_state=9)
sb_selector = SequentialFeatureSelector(
    base_model,
    n_features_to_select=30,
    direction='backward',
    scoring=sb_scorer,
    cv=sb_cv,
    n_jobs=-1,
)
sb_selector.fit(X_num, y)

In [None]:
# Features the sequential selector did not keep (inverse of its support mask).
bottom_sb = columns[~sb_selector.get_support()]
bottom_sb

### M.I Filtering

In [None]:
# Filter method: keep the 30 features scored highest by mutual_info_classif.
mi_selector = SelectKBest(mutual_info_classif, k=30).fit(X_num, y)

In [None]:
# Features outside the mutual-information top 30.
bottom_mi = columns[~mi_selector.get_support()]
bottom_mi

### Genetic Algorithm

In [None]:
# Genetic-algorithm search over feature subsets of at most 30 features,
# scored by macro F1 under a shuffled stratified split.
ga_cv = StratifiedKFold(shuffle=True, random_state=9)
ga_scorer = make_scorer(f1_score, average='macro')
ga_selector = GeneticSelectionCV(
    base_model,
    cv=ga_cv,
    scoring=ga_scorer,
    max_features=30,
    n_population=100,
    n_generations=40,
    n_gen_no_change=10,
)
ga_selector.fit(X_num, y)

In [None]:
# Features the genetic search did not keep (inverse of its support mask).
bottom_ga = columns[~ga_selector.get_support()]
bottom_ga

### Aggregation of the methods

In [None]:
# Count, per feature, how many of the four selectors put it in their "bottom" set.
counts = pd.Series(dict(Counter(np.hstack([bottom_boruta, bottom_sb, bottom_mi, bottom_ga]))))
# Alternative kept for reference: vote with Boruta and mutual information only.
# counts = pd.Series(dict(Counter(np.hstack([bottom_boruta, bottom_mi]))))

In [None]:
# Candidates for removal: rejected by more than two (i.e. at least three) selectors.
cols_to_decide = counts[counts>2].index
cols_to_decide

In [None]:
# All candidates are removed; no manual override is applied here.
cols_to_remove = cols_to_decide

In [None]:
# Feature-selected design matrix, built from X (not from the encoded X_num).
X_fs = X.drop(cols_to_remove, axis=1)
X_fs.shape

## Algorithms Comparison

### Logistic Regression

In [None]:
# Logistic regression on the transformed features, with random oversampling
# applied inside the pipeline.
lr_pipe = Pipeline(
    [
        ('feat_trans', general_feat_transformer),
        ('over', RandomOverSampler(random_state=9)),
        ('logreg', LogisticRegression(random_state=0)),
    ]
)

# Search space for the 'logreg' step. An earlier two-parameter definition of
# this name was dropped: it was overwritten by this one before being used.
# NOTE(review): 'logreg__multi_class' may be rejected by recent scikit-learn
# releases; confirm against the installed version.
lr_opt_space = {'logreg__warm_start' : hp.choice('logreg__warm_start', [True, False]),
                'logreg__fit_intercept' : hp.choice('logreg__fit_intercept', [True, False]),
                'logreg__tol' : hp.uniform('logreg__tol', 0.00001, 0.0001),
                'logreg__C' : hp.uniform('logreg__C', 0.05, 3),
                'logreg__solver' : hp.choice('logreg__solver', ['newton-cg', 'lbfgs', 'liblinear']),
                'logreg__multi_class' : 'auto',
                'logreg__class_weight' : 'balanced'}

# Macro F1 is the tuning metric. Both loops use 2 folds here, whereas the
# other models in this notebook use 10.
lr_opt = NestedKFoldOpt(lr_pipe, lr_opt_space, partial(f1_score, average='macro'),
                        outer_cv=StratifiedKFold(2, shuffle=True, random_state=9),
                        inner_cv=StratifiedKFold(2, shuffle=True, random_state=9))

In [None]:
# Nested CV on the full feature set.
lr_opt.fit(X, y)

In [None]:
# Metrics computed on the pooled out-of-fold predictions.
lr_opt.metrics_oof_ 

In [None]:
# Same optimizer refitted on the feature-selected frame; this overwrites the
# metrics stored by the previous fit.
lr_opt.fit(X_fs, y)

In [None]:
# Out-of-fold metrics for the feature-selected run.
lr_opt.metrics_oof_

### Random Forest

In [None]:
# Random-forest mode of LightGBM on category-cast features.
# GBMFeatTransformer is built with its defaults: its only constructor argument
# is the EBM flag, so the list previously passed positionally was interpreted
# as a truthy EBM switch and skipped the category cast this model relies on.
rf_pipe = Pipeline(
    [
        ('feat_trans', GBMFeatTransformer()),
        ('over', RandomOverSampler(random_state=9)),
        ('rf', LGBMClassifier(boosting_type='rf', subsample_freq=1, min_child_samples=1))
    ]
)

# Every hyperopt label now matches its pipeline parameter key
# ('num_leaves' was the one label missing the 'rf__' prefix).
rf_opt_space = {'rf__n_estimators': scope.int(hp.quniform('rf__n_estimators', 50, 300, 10)),
                'rf__num_leaves': scope.int(hp.quniform('rf__num_leaves', 2, 100, 1)),
                'rf__subsample': hp.uniform('rf__subsample', 0.3, 0.95),
                'rf__colsample_bytree': hp.uniform('rf__colsample_bytree', 0.3, 0.95)}

rf_opt = NestedKFoldOpt(rf_pipe, rf_opt_space, partial(f1_score, average='macro'),
                        outer_cv=StratifiedKFold(10, shuffle=True, random_state=9),
                        inner_cv=StratifiedKFold(10, shuffle=True, random_state=9))

In [None]:
# Nested CV on the feature-selected frame.
rf_opt.fit(X_fs, y)

In [None]:
# Mean/std of the per-fold recall and precision for each class.
rf_opt.metrics_

In [None]:
# Refit on the full feature set; this overwrites the stored metrics.
rf_opt.fit(X, y)

In [None]:
# Per-fold summary for the full-feature run.
rf_opt.metrics_

### SVM

# Support-vector classifier on the transformed features, with random
# oversampling applied inside the pipeline.
svm_pipe = Pipeline(steps=[
    ('feat_trans', general_feat_transformer),
    ('over', RandomOverSampler(random_state=9)),
    ('svm', SVC()),
])

# Both hyperparameters are sampled log-uniformly.
svm_opt_space = {
    'svm__C': hp.loguniform('svm__C', np.log(1e-5), np.log(100)),
    'svm__gamma': hp.loguniform('svm__gamma', np.log(1e-6), np.log(10)),
}

# 10x10 nested CV tuned on macro F1.
svm_opt = NestedKFoldOpt(
    ml_model=svm_pipe,
    opt_space=svm_opt_space,
    loss_metric=partial(f1_score, average='macro'),
    outer_cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=9),
    inner_cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=9),
)

In [None]:
# Nested CV on the full feature set.
svm_opt.fit(X, y)

In [None]:
# Mean/std of the per-fold recall and precision for each class.
svm_opt.metrics_

### SGDClassifier

# Linear model trained with SGD on the transformed features, with random
# oversampling applied inside the pipeline.
sgd_pipe = Pipeline(steps=[
    ('feat_trans', general_feat_transformer),
    ('over', RandomOverSampler(random_state=9)),
    ('sgd', SGDClassifier()),
])

# NOTE(review): recent scikit-learn releases spell the logistic loss
# 'log_loss'; confirm that 'log' is accepted by the installed version.
sgd_opt_space = {
    'sgd__loss': hp.choice('sgd__loss', ['hinge', 'log']),
    'sgd__alpha': hp.loguniform('sgd__alpha', np.log(1e-5), np.log(10)),
    'sgd__max_iter': scope.int(hp.quniform('sgd__max_iter', 10, 500, 10)),
}

# 10x10 nested CV tuned on macro F1.
sgd_opt = NestedKFoldOpt(
    ml_model=sgd_pipe,
    opt_space=sgd_opt_space,
    loss_metric=partial(f1_score, average='macro'),
    outer_cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=9),
    inner_cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=9),
)

In [None]:
# Nested CV on the full feature set.
sgd_opt.fit(X, y)

In [None]:
# Mean/std of the per-fold recall and precision for each class.
sgd_opt.metrics_

### LGBM

In [None]:
# Gradient-boosted trees (LightGBM) on category-cast features.
# GBMFeatTransformer is built with its defaults: its only constructor argument
# is the EBM flag, so the list previously passed positionally was interpreted
# as a truthy EBM switch and skipped the category cast this model relies on.
lgbm_pipe = Pipeline(
    [
        ('feat_trans', GBMFeatTransformer()),
        ('over', RandomOverSampler(random_state=9)),
        ('lgbm', LGBMClassifier())
    ]
)

# Learning rate and regularisation strengths are sampled log-uniformly;
# integer parameters are drawn with quniform and cast through scope.int.
lgbm_opt_space = {'lgbm__learning_rate': hp.loguniform('lgbm__learning_rate', np.log(0.001), np.log(0.5)),
                  'lgbm__reg_alpha': hp.loguniform('lgbm__reg_alpha', np.log(0.001), np.log(1)),
                  'lgbm__reg_lambda': hp.loguniform('lgbm__reg_lambda', np.log(0.001), np.log(1)),
                  'lgbm__subsample': hp.uniform('lgbm__subsample', 0.2, 1),
                  'lgbm__colsample_bytree': hp.uniform('lgbm__colsample_bytree', 0.2, 1),
                  'lgbm__min_child_samples': scope.int(hp.quniform('lgbm__min_child_samples', 1, 100, 1)),
                  'lgbm__num_leaves': scope.int(hp.quniform('lgbm__num_leaves', 2, 50, 1)),
                  'lgbm__subsample_freq': scope.int(hp.quniform('lgbm__subsample_freq', 1, 10, 1)),
                  'lgbm__n_estimators': scope.int(hp.quniform('lgbm__n_estimators', 100, 5000, 1))}

lgbm_opt = NestedKFoldOpt(lgbm_pipe, lgbm_opt_space, partial(f1_score, average='macro'),
                          outer_cv=StratifiedKFold(10, shuffle=True, random_state=9),
                          inner_cv=StratifiedKFold(10, shuffle=True, random_state=9))

In [None]:
# Nested CV on the feature-selected frame.
lgbm_opt.fit(X_fs, y)

In [None]:
# Mean/std of the per-fold recall and precision for each class.
lgbm_opt.metrics_

In [None]:
# Refit on the full feature set; this overwrites the stored metrics.
lgbm_opt.fit(X, y)

In [None]:
# Per-fold summary for the full-feature run.
lgbm_opt.metrics_

### EBM

In [None]:
# Explainable Boosting Machine on mean-imputed numeric features.
# GBMFeatTransformer takes only the EBM flag; passing an extra positional
# argument together with EBM=True raised
# "TypeError: got multiple values for argument 'EBM'".
ebm_pipe = Pipeline(
    [
        ('feat_trans', GBMFeatTransformer(EBM=True)),
        #('over', RandomOverSampler(random_state=9)),
        ('ebm', ExplainableBoostingClassifier(n_jobs=1, validation_size=0))
    ]
)

# Commented-out entries are search dimensions that are currently disabled.
ebm_opt_space = {'ebm__learning_rate': hp.loguniform('ebm__learning_rate', np.log(0.001), np.log(0.5)),
                 #'ebm__validation_size': hp.uniform('ebm__validation_size', 0.05, 0.25),
                 #'ebm__early_stopping_rounds': scope.int(hp.quniform('early__stopping_rounds', 5, 100, 1)),
                 'ebm__max_rounds': scope.int(hp.quniform('ebm__max_rounds', 10, 3000, 1)),
                 'ebm__interactions': scope.int(hp.quniform('ebm__interactions', 0, 20, 1)),
                 'ebm__max_leaves': scope.int(hp.quniform('ebm__max_leaves', 2, 10, 1)),
                 'ebm__outer_bags': scope.int(hp.quniform('ebm__outer_bags', 8, 16, 1)),
                 #'ebm__inner_bags': scope.int(hp.quniform('ebm__inner_bags', 0, 5, 1)),
                 'ebm__max_bins': scope.int(hp.quniform('ebm__max_bins', 8, 128, 1)),
                 'ebm__max_interaction_bins': scope.int(hp.quniform('ebm__max_interaction_bins', 8, 64, 1)),
                 'ebm__min_samples_leaf': scope.int(hp.quniform('ebm__min_samples_leaf', 1, 30, 1))}

ebm_opt = NestedKFoldOpt(ebm_pipe, ebm_opt_space, partial(f1_score, average='macro'),
                         outer_cv=StratifiedKFold(10, shuffle=True, random_state=9),
                         inner_cv=StratifiedKFold(10, shuffle=True, random_state=9))

In [None]:
# Nested CV on the feature-selected frame.
ebm_opt.fit(X_fs, y)

In [None]:
# Mean/std of the per-fold recall and precision for each class.
ebm_opt.metrics_

In [None]:
# Refit on the full feature set; this overwrites the stored metrics.
ebm_opt.fit(X, y)

In [None]:
# Per-fold summary for the full-feature run.
ebm_opt.metrics_

### Comparison

In [None]:
# Stack the per-fold metric lists of every optimizer into one long frame,
# tagging each row with its algorithm in a trailing 'algo' column.
# The frames are built from the metrics_dist_ dicts without writing the 'algo'
# key back into them, and concatenated once instead of growing an empty frame
# inside the loop. Row labels are left as-is (they repeat per algorithm).
algo_metrics = {
    'rf': rf_opt.metrics_dist_,
    'svm': svm_opt.metrics_dist_,
    'sgd': sgd_opt.metrics_dist_,
    'lgbm': lgbm_opt.metrics_dist_,
    'ebm': ebm_opt.metrics_dist_,
}
df_metrics = pd.concat(
    [pd.DataFrame(dist).assign(algo=name) for name, dist in algo_metrics.items()]
)

In [None]:
# Plot every metric column (all but the trailing 'algo') grouped by algorithm.
# NOTE(review): make_subplots is not defined or imported in the visible part
# of this notebook; presumably a project plotting helper.
make_subplots(df_metrics, df_metrics.columns[:-1], 'algo')

In [None]:
# Dummy baseline for reference: a stratified DummyClassifier, fitted and
# evaluated on the same data (no cross-validation).
print(classification_report(y, DummyClassifier(strategy='stratified').fit(X, y).predict(X)))