# 1 Imports

In [1]:
import numpy as np
import pandas as pd
import scipy.stats
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score
from imblearn.over_sampling import RandomOverSampler, ADASYN, SMOTE
from imblearn.pipeline import make_pipeline
from imbtools.evaluation import BinaryExperiment

# 2 Load datasets

# 4 Hyperparameter tuning

#### 4.1 Define the hyperparameter space

In [None]:
# Grid of gradient-boosting settings explored by the grid search below:
# every combination of tree depth and number of boosting stages is tried.
hyperparameters = dict(
    max_depth=[2, 3, 5, 8],
    n_estimators=[10, 50, 80, 100],
)

#### 4.2 Get the optimal hyperparameters

In [2]:
# GridSearchCV performs the cross-validated search over `hyperparameters`
# in the tuning cell that follows.
# NOTE(review): this import sits outside the imports cell at the top of the
# notebook; consider moving it there.
from sklearn.model_selection import GridSearchCV
# NOTE(review): exploratory leftover - prints the whole class docstring into
# the notebook output. Consider removing it before sharing the notebook.
help(GridSearchCV)

Help on class GridSearchCV in module sklearn.model_selection._search:

class GridSearchCV(BaseSearchCV)
 |  Exhaustive search over specified parameter values for an estimator.
 |  
 |  Important members are fit, predict.
 |  
 |  GridSearchCV implements a "fit" and a "score" method.
 |  It also implements "predict", "predict_proba", "decision_function",
 |  "transform" and "inverse_transform" if they are implemented in the
 |  estimator used.
 |  
 |  The parameters of the estimator used to apply these methods are optimized
 |  by cross-validated grid-search over a parameter grid.
 |  
 |  Read more in the :ref:`User Guide <grid_search>`.
 |  
 |  Parameters
 |  ----------
 |  estimator : estimator object.
 |      This is assumed to implement the scikit-learn estimator interface.
 |      Either estimator needs to provide a ``score`` function,
 |      or ``scoring`` must be passed.
 |  
 |  param_grid : dict or list of dictionaries
 |      Dictionary with parameters names (string) as ke

In [None]:
# Tune the gradient boosting classifier separately on every dataset and keep
# the winning parameter combination, in dataset order, so the experiment can
# look it up by dataset position.
# The estimator is seeded so that repeated runs of this cell select the same
# hyperparameters (the seed controls the per-split feature permutation).
# NOTE(review): GridSearchCV is left on its default scoring (the estimator's
# own `score`); confirm that this is the intended criterion for imbalanced data.
TUNING_RANDOM_STATE = 0

optimal_hyperparameters = []
for X, y in imbalanced_datasets:
    clf = GradientBoostingClassifier(random_state=TUNING_RANDOM_STATE)
    clfs = GridSearchCV(clf, hyperparameters)
    clfs.fit(X, y)
    optimal_hyperparameters.append(clfs.best_params_)
optimal_hyperparameters

# 5 Experiment

#### 5.1 Parameters

In [None]:
# Flat accumulator: the experiment loop appends one mean cross-validation
# score per (seed, algorithm, dataset, method, metric) combination.
cv_scores = list()

# Scoring criteria passed to cross_val_score; F_measure and G_mean are
# presumably scorer objects defined elsewhere in the notebook - verify.
metrics = ['roc_auc', F_measure, G_mean]

# Five seeds, one per repetition of the experiment: 0, 5, 10, 15, 20.
random_states = list(range(0, 25, 5))

#### 5.2 Run experiment

In [None]:
# Run the full experiment grid. Scores are appended in the nesting order
# seed -> algorithm -> dataset -> oversampling method -> metric; the result
# cells rely on exactly this order when they reshape `cv_scores`.

# Start from an empty list so that re-running this cell replaces the scores
# instead of appending a second batch to the previous run.
cv_scores = []
for random_state in random_states:
    # shuffle=True is what makes random_state affect the folds. Without it the
    # seed is ignored (every repetition would reuse identical folds) and newer
    # scikit-learn versions reject the combination with a ValueError.
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)
    algorithms = [
        LogisticRegression(random_state=random_state),
        GradientBoostingClassifier(random_state=random_state),
    ]
    # None means "no oversampling": the classifier is evaluated on its own.
    # NOTE(review): `kind=` on SMOTE is only accepted by older imbalanced-learn
    # releases (newer ones provide BorderlineSMOTE instead) - confirm the
    # installed version.
    methods = [
        None,
        RandomOverSampler(random_state=random_state),
        SMOTE(random_state=random_state, k_neighbors=3),
        SMOTE(random_state=random_state, kind='borderline1', k_neighbors=3),
        ADASYN(random_state=random_state, n_neighbors=3),
    ]
    for algorithm in algorithms:
        for ind, (X, y) in enumerate(imbalanced_datasets):
            # Gradient boosting uses the hyperparameters tuned for this dataset.
            if isinstance(algorithm, GradientBoostingClassifier):
                algorithm.set_params(**optimal_hyperparameters[ind])
            for method in methods:
                # The estimator does not depend on the metric, so build it once
                # per method rather than once per metric.
                clf = algorithm if method is None else make_pipeline(method, algorithm)
                for metric in metrics:
                    cv_scores.append(cross_val_score(clf, X, y, cv=cv, scoring=metric).mean())
cv_scores

#### 5.3 Results parameters

In [None]:
# Display labels for the result tables. Each list must follow the same order
# as the corresponding objects used in the experiment loop.
algorithms_names = [
    "LR",
    "GBM",
]
metrics_names = [
    "AUC",
    "F",
    "G",
]
methods_names = [
    "None",
    "Random",
    "SMOTE",
    "Borderline SMOTE",
    "ADASYN",
]

#### 5.4 Mean results

In [None]:
# Average the scores over the random seeds, then unfold the flat vector into
# (algorithm, dataset, method, metric) - the order in which the experiment
# loop appended the scores.
mean_cv_scores = (
    np.array(cv_scores)
    .reshape(len(random_states), -1)
    .mean(axis=0)
    .reshape(len(algorithms), len(imbalanced_datasets), len(methods), len(metrics))
)

# Build one small table per (algorithm, dataset) pair - rows are metrics,
# columns are oversampling methods - and concatenate them once at the end.
# Collecting the pieces in a list avoids re-copying the growing frame on every
# iteration and avoids concatenating with an empty DataFrame.
mean_partial_frames = []
for ind1, alg_name in enumerate(algorithms_names):
    for ind2, ds_name in enumerate(datasets_names):
        partial_results = pd.DataFrame(mean_cv_scores[ind1, ind2, :, :].transpose(), columns=methods_names)
        # Inserting at position 0 three times yields the column order
        # Algorithm, Dataset, Metric, <methods...>.
        partial_results.insert(0, "Metric", metrics_names)
        partial_results.insert(0, "Dataset", ds_name)
        partial_results.insert(0, "Algorithm", alg_name)
        mean_partial_frames.append(partial_results)
mean_results = pd.concat(mean_partial_frames, ignore_index=True)

#### 5.5 Standard deviation results

In [None]:
# Standard deviation of the scores over the random seeds, unfolded into
# (algorithm, dataset, method, metric) - the order in which the experiment
# loop appended the scores.
# NOTE(review): ndarray.std uses ddof=0 (population standard deviation) by
# default; confirm that this, not the sample estimate, is what is wanted.
std_cv_scores = (
    np.array(cv_scores)
    .reshape(len(random_states), -1)
    .std(axis=0)
    .reshape(len(algorithms), len(imbalanced_datasets), len(methods), len(metrics))
)

# Build one small table per (algorithm, dataset) pair - rows are metrics,
# columns are oversampling methods - and concatenate them once at the end.
# Collecting the pieces in a list avoids re-copying the growing frame on every
# iteration and avoids concatenating with an empty DataFrame.
std_partial_frames = []
for ind1, alg_name in enumerate(algorithms_names):
    for ind2, ds_name in enumerate(datasets_names):
        partial_results = pd.DataFrame(std_cv_scores[ind1, ind2, :, :].transpose(), columns=methods_names)
        # Inserting at position 0 three times yields the column order
        # Algorithm, Dataset, Metric, <methods...>.
        partial_results.insert(0, "Metric", metrics_names)
        partial_results.insert(0, "Dataset", ds_name)
        partial_results.insert(0, "Algorithm", alg_name)
        std_partial_frames.append(partial_results)
std_results = pd.concat(std_partial_frames, ignore_index=True)

#### 5.6 Export results

In [None]:
# Destination of the exported tables; %s is replaced by the table number.
# NOTE(review): machine-specific absolute path - point it at a configurable
# output directory before running the notebook elsewhere.
EXPORT_PATH_TEMPLATE = '/Users/gdouzas/Main/projects/academic/SOMO/table%s.docx'

# Combine mean and standard deviation into "mean ± std" text cells. A fixed
# three-decimal format keeps every cell the same width (0.900, not 0.9).
results = mean_results[["Algorithm", "Dataset", "Metric"]].copy()
for method_name in methods_names:
    mean_text = mean_results[method_name].map("{:.3f}".format)
    std_text = std_results[method_name].map("{:.3f}".format)
    results[method_name] = mean_text + " \u00B1 " + std_text

# One document per algorithm, numbered from 3; the Algorithm column is dropped
# because it is constant within each exported table.
for ind, alg_name in enumerate(algorithms_names):
    algorithm_table = results.loc[results["Algorithm"] == alg_name, results.columns[1:]]
    export_table(algorithm_table, EXPORT_PATH_TEMPLATE % str(ind + 3))

#### 5.7 Oversampling methods ranking

In [None]:
# Display the table of mean scores that the method ranking is computed from.
mean_results

In [None]:
# NOTE(review): in the visible notebook `ranking` is first assigned in the
# cell after this one, so on a fresh top-to-bottom run this cell would raise
# NameError unless it is defined somewhere not shown. Consider moving this
# display below the cell that computes the ranking, or deleting it.
ranking

In [None]:
# Rank the oversampling methods within every (algorithm, dataset, metric) row
# of the mean-score table: rank 1 is the highest score. Tied scores share the
# average of the ranks they span instead of receiving arbitrary distinct ranks.
# The score columns are selected by dropping the three label columns by name,
# so the ranking does not depend on column positions.
method_scores = mean_results.drop(columns=["Algorithm", "Dataset", "Metric"])
ranking = method_scores.rank(axis=1, ascending=False)

# Mean rank of each method over the datasets, per algorithm and metric.
aggregated_ranking = (
    pd.concat([mean_results[["Algorithm", "Metric"]], ranking], axis=1)
    .groupby(["Algorithm", "Metric"])
    .mean()
    .round(2)
)
aggregated_ranking

#### 5.8 Friedman test

In [None]:
# Friedman test per (algorithm, metric): do the oversampling methods differ
# across datasets? Each method contributes one sample holding its rank on
# every dataset.

# The labelled ranking table is the same for every test, so build it once.
labelled_ranking = pd.concat([mean_results[["Algorithm", "Metric"]], ranking], axis=1)

# Collect the rows in a list and create the frame once at the end, rather
# than enlarging a DataFrame one row at a time.
friedman_records = []
for alg_name in algorithms_names:
    for metric_name in metrics_names:
        selected = (labelled_ranking["Algorithm"] == alg_name) & (labelled_ranking["Metric"] == metric_name)
        friedman_inputs = [labelled_ranking.loc[selected, method_name] for method_name in methods_names]
        pvalue = round(scipy.stats.friedmanchisquare(*friedman_inputs).pvalue, 2)
        friedman_records.append((alg_name, metric_name, pvalue))
friedman_results = pd.DataFrame(friedman_records, columns=["Algorithm", "Metric", "p-value"])
friedman_results