In [None]:
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

from sklearn.linear_model import LogisticRegression

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import PowerTransformer



def standarization_comparison(df: pd.DataFrame, label_name: str, standarization_list=None, scoring_metric: str = 'average_precision', cv_folds: int = 10, cv_seed: int = 124) -> pd.DataFrame:
    """
    Description: Compare several standarization techniques by cross-validating a
        logistic regression with Lasso (L1) regularization on top of each one,
        print the mean and standard deviation of the metric per technique and
        draw one boxplot of the fold scores per technique.
    Input:
        - df: Data frame with the features and the label column.
        - label_name: Name of the column with the label.
        - standarization_list: Dictionary {name: transformer} with the standarization
          techniques to be compared. When None (default), StandardScaler, MinMaxScaler,
          MaxAbsScaler, RobustScaler, Normalizer and PowerTransformer are compared.
          The transformers passed in are not fitted in place.
        - scoring_metric: Metric to be used in the cross validation.
        - cv_folds: Number of folds in the cross validation.
        - cv_seed: Seed for the shuffling of the cross validation folds.
    Output:
        - Data frame with the results of the cross validation: one column per
          technique and one row per fold.
    Raises:
        - KeyError: if label_name is not a column of df.
    """
    # Fail early with a clear message instead of deep inside the comparison loop.
    if label_name not in df.columns:
        raise KeyError(f"Label column {label_name!r} not found in the data frame")

    # Function-scope import: the pipeline helper is only needed by this function.
    from sklearn.pipeline import make_pipeline

    # Build the default techniques per call: a dict of estimator instances as a
    # default argument would be created once and shared between calls.
    if standarization_list is None:
        standarization_list = {
            'StandardScaler': StandardScaler(),
            'MinMaxScaler': MinMaxScaler(),
            'MaxAbsScaler': MaxAbsScaler(),
            'RobustScaler': RobustScaler(),
            'Normalizer': Normalizer(),
            'PowerTransformer': PowerTransformer(),
        }

    features = df.drop(label_name, axis=1)
    target = df[label_name]

    # One seeded splitter shared by every technique, so all of them are scored
    # on exactly the same folds.
    kfold = KFold(n_splits=cv_folds, random_state=cv_seed, shuffle=True)

    scores = {}
    for name, standarization in standarization_list.items():
        # The transformer lives inside the pipeline so it is fitted on the training
        # part of each fold only; fitting it on the full data before splitting would
        # leak information from the validation folds into the scores.
        # cross_val_score clones the pipeline, leaving `standarization` unfitted.
        model = make_pipeline(standarization, LogisticRegression(penalty='l1', solver='liblinear'))
        cv_results = cross_val_score(model, features, target, cv=kfold, scoring=scoring_metric)
        scores[name] = cv_results
        print(f"{name}: {cv_results.mean():f} ({cv_results.std():f})")

    fig, ax = plt.subplots()
    fig.suptitle('Standarization Comparison')
    ax.boxplot(list(scores.values()))
    ax.set_xticklabels(list(scores))
    plt.show()

    return pd.DataFrame(scores)


# standarization_comparison(df,'label')

