In [None]:
from sklearn import model_selection, preprocessing, pipeline
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectFromModel

from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb
from sklearn import svm, neighbors

from sklearn import metrics
from math import log
import numpy as np
import pandas as pd
import timeit

# Imports: all libraries needed to build the benchmark functions

**Modèle 0 : Dummy**

In [None]:
def dum():
    """Build the benchmark entry for the baseline dummy classifier.

    Returns:
        dict with keys 'model_name' (pipeline step name), 'model' (an
        unfitted DummyClassifier) and 'param' (search grid whose keys are
        prefixed with the step name). The grid contains the single
        'most_frequent' strategy.
    """
    estimator = DummyClassifier()
    search_grid = {'dummy__strategy': ['most_frequent']}

    return {
        'model_name': 'dummy',
        'model': estimator,
        'param': search_grid,
    }


**Modèle 1 : Forêt aléatoire**

In [None]:
def ran_for(n_estim, crit):
    """Build the benchmark entry for a random forest classifier.

    Args:
        n_estim: values placed in the grid for 'n_estimators'.
        crit: values placed in the grid for 'criterion'.
        Both are stored unchanged; presumably sequences of candidate
        values, as a grid search expects -- confirm against the caller.

    Returns:
        dict with keys 'model_name', 'model' (an unfitted
        RandomForestClassifier) and 'param' (the search grid, keys prefixed
        with the step name).
    """
    estimator = RandomForestClassifier()

    # Only these two hyper-parameters are searched; max_depth, max_features,
    # min_samples_leaf and min_impurity_decrease are currently not tuned.
    search_grid = {}
    search_grid['random_forest__n_estimators'] = n_estim
    search_grid['random_forest__criterion'] = crit

    return {
        'model_name': 'random_forest',
        'model': estimator,
        'param': search_grid,
    }

**Modèle 2 : GradientBoosting**

In [None]:
def grad_boo(loss, lr, n_estim):
    """Build the benchmark entry for a gradient boosting classifier.

    Args:
        loss: values placed in the grid for 'loss'.
        lr: values placed in the grid for 'learning_rate'.
        n_estim: values placed in the grid for 'n_estimators'.
        All are stored unchanged; presumably sequences of candidate
        values, as a grid search expects -- confirm against the caller.

    Returns:
        dict with keys 'model_name', 'model' (an unfitted
        GradientBoostingClassifier) and 'param' (the search grid, keys
        prefixed with the step name).
    """
    estimator = GradientBoostingClassifier()
    search_grid = {
        'gradient_boosting__loss': loss,
        'gradient_boosting__learning_rate': lr,
        'gradient_boosting__n_estimators': n_estim,
    }

    entry = {'model_name': 'gradient_boosting'}
    entry['model'] = estimator
    entry['param'] = search_grid
    return entry

**Modèle 3 : XGBoost**

In [None]:
def xg_boo(n_estim, booster):
    """Build the benchmark entry for an XGBoost classifier.

    Args:
        n_estim: values placed in the grid for 'n_estimators'.
        booster: values placed in the grid for 'booster'.
        Both are stored unchanged; presumably sequences of candidate
        values, as a grid search expects -- confirm against the caller.

    Returns:
        dict with keys 'model_name', 'model' (an unfitted
        xgb.XGBClassifier) and 'param' (the search grid, keys prefixed with
        the step name).
    """
    estimator = xgb.XGBClassifier()
    search_grid = {
        'xgboost__n_estimators': n_estim,
        'xgboost__booster': booster,
    }

    return {
        'model_name': 'xgboost',
        'model': estimator,
        'param': search_grid,
    }

**Modèle 4 : KNN**

In [None]:
def knn(n_neigh, weight):
    """Build the benchmark entry for a k-nearest-neighbours classifier.

    Args:
        n_neigh: values placed in the grid for 'n_neighbors'.
        weight: values placed in the grid for 'weights'.
        Both are stored unchanged; presumably sequences of candidate
        values, as a grid search expects -- confirm against the caller.

    Returns:
        dict with keys 'model_name', 'model' (an unfitted
        KNeighborsClassifier) and 'param' (the search grid, keys prefixed
        with the step name).
    """
    estimator = neighbors.KNeighborsClassifier()

    search_grid = {}
    search_grid['knn__n_neighbors'] = n_neigh
    search_grid['knn__weights'] = weight

    return {
        'model_name': 'knn',
        'model': estimator,
        'param': search_grid,
    }

**Modèle 5 : SVC**

In [None]:
def svc(penalty, loss):
    """Build the benchmark entry for a linear support vector classifier.

    Args:
        penalty: values placed in the grid for 'penalty'.
        loss: values placed in the grid for 'loss'.
        Both are stored unchanged; presumably sequences of candidate
        values, as a grid search expects -- confirm against the caller.

    Returns:
        dict with keys 'model_name', 'model' (an unfitted svm.LinearSVC)
        and 'param' (the search grid, keys prefixed with the step name).
        'multi_class' is fixed to the single value 'crammer_singer'.
    """
    estimator = svm.LinearSVC()

    # NOTE(review): the scikit-learn LinearSVC documentation states that
    # penalty and loss are ignored when multi_class='crammer_singer';
    # confirm that searching over them here is intended.
    search_grid = {
        'svc__penalty': penalty,
        'svc__loss': loss,
        'svc__multi_class': ['crammer_singer'],
    }

    return {
        'model_name': 'svc',
        'model': estimator,
        'param': search_grid,
    }

**Modèle 6 : SVC (kernel version)**

In [None]:
def svc_kern(c, kern):
    """Build the benchmark entry for a kernel support vector classifier.

    Args:
        c: values placed in the grid for 'C'.
        kern: values placed in the grid for 'kernel'.
        Both are stored unchanged; presumably sequences of candidate
        values, as a grid search expects -- confirm against the caller.

    Returns:
        dict with keys 'model_name', 'model' (an unfitted svm.SVC) and
        'param' (the search grid, keys prefixed with the step name).
    """
    estimator = svm.SVC()
    search_grid = {
        'svc_kernel__C': c,
        'svc_kernel__kernel': kern,
    }

    entry = {'model_name': 'svc_kernel'}
    entry['model'] = estimator
    entry['param'] = search_grid
    return entry

**Benchmark des modèles créés**

In [None]:
def _build_pipeline(scal, entry):
    """Assemble the optional scaling step and the estimator of one entry."""
    steps = []
    if scal == 'standard':
        steps.append(('scaler', preprocessing.StandardScaler()))
    elif scal == 'normal':
        # Normalizer's fit does nothing; it only rescales rows on transform
        steps.append(('norm', preprocessing.Normalizer(norm='l2')))

    # The step name must match the prefix used by the keys of entry['param']
    steps.append((entry['model_name'], entry['model']))
    return pipeline.Pipeline(steps=steps)


def model_benchmark(x, y, scal, models):
    """Grid-search and compare a list of classifiers on one dataset.

    The data is split 70/30 (random_state=0). For every entry of `models`
    a pipeline (optional scaler + estimator) is tuned with a 7-fold
    GridSearchCV on the training part and evaluated on the test part.

    Args:
        x : array or pd.DataFrame of features
        y : one dimension array or pd.Series of class labels
        scal : scaling applied inside the pipeline
               'standard' -> StandardScaler
               'normal'   -> Normalizer (l2 norm)
               'none' or None -> no scaling step
        models : iterable of dicts with the keys 'model_name', 'model' and
                 'param' (grid whose keys are prefixed by
                 '<model_name>__')

    Returns:
        dict with the keys
        'set'   : type(y)
        'perf'  : DataFrame, one column per model (most recently fitted
                  first) and the rows 'Train Accuracy' (best mean
                  cross-validated score of the grid search),
                  'Test Accuracy' and 'Time' (seconds for fit + predict)
        'param' : list of [model_name, best_params_]
        'time'  : list of [model_name, elapsed seconds as str]
        'yt'    : list of [model_name, y_test]
        'yp'    : list of [model_name, y_pred]

    Raises:
        ValueError: if `scal` is not one of the supported values.
    """
    # Fail fast: previously an unsupported value left the pipeline undefined
    if scal not in ('standard', 'normal', 'none', None):
        raise ValueError(
            "scal must be 'standard', 'normal', 'none' or None, "
            "got {!r}".format(scal))

    # Split data in train and test
    x_train, x_test, y_train, y_test = model_selection.train_test_split(
        x, y, test_size=0.3, random_state=0)

    # Per-model results
    perf_columns = []
    param = []
    elaps_time = []
    yt = []
    yp = []

    for entry in models:
        name = entry['model_name']

        # Start timer
        start_time = timeit.default_timer()

        # Grid search with pipeline and grid param in each model function
        pipe = _build_pipeline(scal, entry)
        grids = model_selection.GridSearchCV(pipe, entry['param'], cv=7)

        # Fit the model / then predict y value
        grids.fit(x_train, y_train)
        y_pred = grids.predict(x_test)

        # Time stop
        elapsed = timeit.default_timer() - start_time

        # Best params in training
        param.append([name, grids.best_params_])
        yt.append([name, y_test])
        yp.append([name, y_pred])
        elaps_time.append([name, str(elapsed)])

        # Scoring. Builtin round() works whether the scores are NumPy
        # scalars or plain Python floats (which have no .round() method).
        train_accuracy = round(float(grids.best_score_), 3)
        test_accuracy = round(
            float(metrics.accuracy_score(y_test, y_pred)), 3)

        perf_columns.append(pd.DataFrame(
            {name: [train_accuracy, test_accuracy, elapsed]},
            index=['Train Accuracy', 'Test Accuracy', 'Time'],
            dtype='float64'))

        # Check progression of the model fitted during running of the function
        print("{} is trained and fitted in {:.3f} sec".format(name, elapsed))

    # One concat at the end; reversed so the latest model is the first column
    if perf_columns:
        model_performance = pd.concat(perf_columns[::-1], axis=1)
    else:
        model_performance = pd.DataFrame(dtype='float64')

    # Output dict
    return {
        'set': type(y),
        'perf': model_performance,
        'param': param,
        'time': elaps_time,
        'yt': yt,
        'yp': yp
    }