In [None]:
from sklearn import model_selection, preprocessing, pipeline
from sklearn import kernel_ridge, svm
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Lasso, LassoCV, ElasticNet, Ridge
from sklearn.dummy import DummyRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn import metrics
from math import log
import xgboost as xgb
import numpy as np
import pandas as pd
import timeit

# Import all the libraries needed to build the benchmark functions

**Modèle 0 : Dummy**

In [None]:
def dum():
    """Describe the baseline dummy regressor used in the benchmark.

    Return :
    a dict with three entries
        'model_name' : the string 'dummy'
        'model'      : a new, unfitted DummyRegressor
        'param'      : the parameter grid, whose keys are prefixed with
                       'dummy__' and whose only candidate strategy is 'mean'
    """
    return dict(
        model_name='dummy',
        model=DummyRegressor(),
        param={'dummy__strategy': ['mean']},
    )


**Modèle 1 : Linear regression**

In [None]:
def lin_reg():
    """Describe an ordinary linear regression for the benchmark.

    Return :
    a dict with three entries
        'model_name' : the string 'linear_regression'
        'model'      : a new, unfitted LinearRegression
        'param'      : an empty parameter grid (nothing is tuned)
    """
    return dict(
        model_name='linear_regression',
        model=LinearRegression(),
        param={},
    )

**Modèle 2 : Forêt aléatoire**

In [17]:
def ran_for(n, d, f, leaf, i):
    """Describe a random forest regressor and its parameter grid.

    Each argument is stored unchanged as the candidate values of one grid
    entry (presumably a list of values to try - confirm against callers).

    Args :
    n : candidates for n_estimators
    d : candidates for max_depth
    f : candidates for max_features
    leaf : candidates for min_samples_leaf
    i : candidates for min_impurity_decrease

    Return :
    a dict with 'model_name' ('random_forest'), 'model' (a new, unfitted
    RandomForestRegressor) and 'param' (the grid, keys prefixed with
    'random_forest__')
    """
    grid_keys = (
        'random_forest__n_estimators',
        'random_forest__max_depth',
        'random_forest__max_features',
        'random_forest__min_samples_leaf',
        'random_forest__min_impurity_decrease',
    )
    search_space = dict(zip(grid_keys, (n, d, f, leaf, i)))

    return dict(
        model_name='random_forest',
        model=RandomForestRegressor(),
        param=search_space,
    )

**Modèle 3 : Gradient Boosting**

In [10]:
def grad_boo(n, d, f):
    """Describe a gradient boosting regressor and its parameter grid.

    Each argument is stored unchanged as the candidate values of one grid
    entry (presumably a list of values to try - confirm against callers).

    Args :
    n : candidates for n_estimators
    d : candidates for max_depth
    f : candidates for max_features

    Return :
    a dict with 'model_name' ('gradient_boosting'), 'model' (a new, unfitted
    GradientBoostingRegressor) and 'param' (the grid, keys prefixed with
    'gradient_boosting__')
    """
    description = {'model_name': 'gradient_boosting'}
    description['model'] = GradientBoostingRegressor()
    description['param'] = {
        'gradient_boosting__n_estimators': n,
        'gradient_boosting__max_depth': d,
        'gradient_boosting__max_features': f,
    }
    return description

**Modèle 4 : XGBoost**

In [12]:
def xg_boo(n, d, l, b):
    """Describe an XGBoost regressor and its parameter grid.

    Each argument is stored unchanged as the candidate values of one grid
    entry (presumably a list of values to try - confirm against callers).

    Args :
    n : candidates for n_estimators
    d : candidates for max_depth
    l : candidates for learning_rate
    b : candidates for booster

    Return :
    a dict with 'model_name' ('xgboost'), 'model' (a new, unfitted
    xgb.XGBRegressor) and 'param' (the grid, keys prefixed with 'xgboost__')
    """
    grid_keys = (
        'xgboost__n_estimators',
        'xgboost__max_depth',
        'xgboost__learning_rate',
        'xgboost__booster',
    )
    search_space = dict(zip(grid_keys, (n, d, l, b)))

    return dict(
        model_name='xgboost',
        model=xgb.XGBRegressor(),
        param=search_space,
    )

**Modèle 5 : Lasso**

In [None]:
def lasso(alphas):
    """Describe a Lasso regression and its parameter grid.

    Args :
    alphas : stored unchanged as the candidate values for alpha
             (presumably a list or array - confirm against callers)

    Return :
    a dict with 'model_name' ('lasso'), 'model' (a new, unfitted Lasso) and
    'param' (the grid, keys prefixed with 'lasso__'); besides alpha, the grid
    tries fit_intercept True and False and fixes max_iter to 100000
    """
    search_space = dict([
        ('lasso__alpha', alphas),
        ('lasso__fit_intercept', [True, False]),
        ('lasso__max_iter', [100000]),
    ])

    description = {'model_name': 'lasso'}
    description['model'] = Lasso()
    description['param'] = search_space
    return description

**Modèle 6 : ElasticNet**

In [None]:
def ela_net(alphas, l1):
    """Describe an ElasticNet regression and its parameter grid.

    Args :
    alphas : stored unchanged as the candidate values for alpha
    l1 : stored unchanged as the candidate values for l1_ratio
         (both presumably lists or arrays - confirm against callers)

    Return :
    a dict with 'model_name' ('elastic'), 'model' (a new, unfitted ElasticNet)
    and 'param' (the grid, keys prefixed with 'elastic__'); besides the two
    arguments, the grid tries fit_intercept True and False and fixes max_iter
    to 200000
    """
    tuned = {'elastic__alpha': alphas, 'elastic__l1_ratio': l1}
    fixed = {
        'elastic__fit_intercept': [True, False],
        'elastic__max_iter': [200000],
    }

    return dict(
        model_name='elastic',
        model=ElasticNet(),
        param={**tuned, **fixed},
    )

**Modèle 7a : Régression Ridge**

In [18]:
def ridge(alphas, s):
    """Describe a Ridge regression and its parameter grid.

    Args :
    alphas : stored unchanged as the candidate values for alpha
    s : stored unchanged as the candidate values for solver
        (both presumably lists - confirm against callers)

    Return :
    a dict with 'model_name' ('ridge'), 'model' (a new, unfitted Ridge) and
    'param' (the grid, keys prefixed with 'ridge__'); besides the two
    arguments, the grid tries fit_intercept True and False and fixes max_iter
    to 100000
    """
    search_space = dict([
        ('ridge__alpha', alphas),
        ('ridge__fit_intercept', [True, False]),
        ('ridge__max_iter', [100000]),
        ('ridge__solver', s),
    ])

    return dict(model_name='ridge', model=Ridge(), param=search_space)

**Modèle 7b : Régression Ridge (version à noyau)**

In [28]:
def ridge_kern(alphas, gammas, kern):
    """Describe a kernel ridge regression and its parameter grid.

    Each argument is stored unchanged as the candidate values of one grid
    entry (presumably a list of values to try - confirm against callers).

    Args :
    alphas : candidates for alpha
    gammas : candidates for gamma
    kern : candidates for kernel

    Return :
    a dict with 'model_name' ('ridge_kernel'), 'model' (a new, unfitted
    kernel_ridge.KernelRidge) and 'param' (the grid, keys prefixed with
    'ridge_kernel__')
    """
    grid_keys = (
        'ridge_kernel__alpha',
        'ridge_kernel__gamma',
        'ridge_kernel__kernel',
    )
    search_space = dict(zip(grid_keys, (alphas, gammas, kern)))

    description = {'model_name': 'ridge_kernel'}
    description['model'] = kernel_ridge.KernelRidge()
    description['param'] = search_space
    return description

**Modèle 8 : Support Vector Regression**

In [27]:
def svr(c, tol):
    """Describe a linear support vector regression and its parameter grid.

    Args :
    c : stored unchanged as the candidate values for C
    tol : stored unchanged as the candidate values for tol
          (both presumably lists - confirm against callers)

    Return :
    a dict with 'model_name' ('svr'), 'model' (a new, unfitted svm.LinearSVR)
    and 'param' (the grid, keys prefixed with 'svr__'); besides the two
    arguments, the grid fixes loss to 'squared_epsilon_insensitive',
    fit_intercept to True and max_iter to 100000
    """
    search_space = dict([
        ('svr__loss', ['squared_epsilon_insensitive']),
        ('svr__C', c),
        ('svr__fit_intercept', [True]),
        ('svr__max_iter', [100000]),
        ('svr__tol', tol),
    ])

    return dict(model_name='svr', model=svm.LinearSVR(), param=search_space)

**Benchmark des modèles créés**

In [10]:
def model_benchmark(x, y, reg, models, verbose=False):
    """Grid-search every model on a train split and score it on a test split.

    The data is split once (70 % train / 30 % test, random_state=0). Each
    model is wrapped in a pipeline, tuned with a 10-fold GridSearchCV scored
    on 'neg_root_mean_squared_error', then used to predict the test split.

    Args :
    x : arrays or pd.DataFrame
    y : one dimension array or pd.Series
    reg : preprocessing put before the model in the pipeline
          'lasso' -> StandardScaler, then SelectFromModel(LassoCV)
          'norm'  -> Normalizer(norm='l2')
          any other value -> StandardScaler only
    models : list of dicts with the keys 'model_name' (used as the pipeline
             step name, so the grid keys must be prefixed with
             '<model_name>__'), 'model' (the estimator) and 'param' (the
             parameter grid)
    verbose : if True, also print the full cv_results_ of each grid search
              (large output); False by default

    Return :
    a dict with
        'set'   : y.name, or None when y has no name (e.g. a numpy array)
        'perf'  : dataframe with one column per model (last fitted model
                  first) and the rows 'Train RMSE', 'Test RMSE', 'R²', 'Time'
        'param' : list of [model_name, best_params_]
        'time'  : list of [model_name, elapsed seconds as a string]
        'yt'    : list of [model_name, y_test]
        'yp'    : list of [model_name, y_pred]

    Note : 'Train RMSE' is the sign-flipped best_score_ of the grid search,
    i.e. a cross-validated RMSE computed on the train split, not the error of
    the refitted model on the whole train split.
    """
    # One single-column frame of scores per model, assembled once at the end
    score_frames = []

    # Lists to stock the results of each model
    param = []
    elaps_time = []
    yt = []
    yp = []

    # Split data in train and test
    x_train, x_test, y_train, y_test = model_selection.train_test_split(
        x, y, test_size=0.3, random_state=0)

    # Define the scalers (GridSearchCV clones the pipeline before fitting, so
    # sharing these instances between models is safe)
    std_scale = preprocessing.StandardScaler()
    normalizer = preprocessing.Normalizer(norm='l2')  # fit does nothing

    # Candidate alphas of the LassoCV feature selector (reg == 'lasso' only)
    lasso_alphas = np.logspace(-5, 5, 400)

    for spec in models:
        name = spec['model_name']

        # Start timer
        start_time = timeit.default_timer()

        # Preprocessing steps selected by `reg`, followed by the model itself
        if reg == 'lasso':
            steps = [('scaler', std_scale),
                     ('weighter',
                      SelectFromModel(LassoCV(alphas=lasso_alphas)))]
        elif reg == 'norm':
            steps = [('norm', normalizer)]
        else:
            steps = [('scaler', std_scale)]
        steps.append((name, spec['model']))
        pipe = pipeline.Pipeline(steps=steps)

        # Grid search with pipeline and grid param in each model function
        grids = model_selection.GridSearchCV(
            pipe, spec['param'], cv=10,
            scoring='neg_root_mean_squared_error')

        # Fit the model / then predict y value
        grids.fit(x_train, y_train)
        y_pred = grids.predict(x_test)

        # Time stop: elapsed time covers the grid search and the prediction
        time_stop = timeit.default_timer() - start_time

        # Best params in training
        param.append([name, grids.best_params_])
        yt.append([name, y_test])
        yp.append([name, y_pred])
        elaps_time.append([name, str(time_stop)])

        # Scoring with RMSE train and test, R2 test.
        # The RMSE is the square root of the MSE rather than
        # mean_squared_error(..., squared=False): that keyword is deprecated
        # and removed in recent scikit-learn. Same value for a 1-D target.
        train_rmse = grids.best_score_ * (-1)
        test_rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
        r2 = metrics.r2_score(y_test, y_pred)

        # np.round also accepts plain Python floats, which have no .round()
        score_frames.append(pd.DataFrame(
            {
                name: [
                    np.round(train_rmse, 3),
                    np.round(test_rmse, 3),
                    np.round(r2, 3), time_stop
                ]
            },
            index=['Train RMSE', 'Test RMSE', 'R²', 'Time'],
            dtype='float64'))

        # Check progression of the model fitted during running of the function
        print("{} is trained and fitted in {:.3f} sec".format(
            name, time_stop))
        if verbose:
            print(grids.cv_results_)

    # Columns are ordered from the last fitted model to the first one
    if score_frames:
        model_performance = pd.concat(score_frames[::-1], axis=1)
    else:
        model_performance = pd.DataFrame(dtype='float64')

    # Output dict
    return {
        'set': getattr(y, 'name', None),
        'perf': model_performance,
        'param': param,
        'time': elaps_time,
        'yt': yt,
        'yp': yp
    }

**Plot the performance of the model**

In [31]:
def plot_yt_yp(model_name, y_test, y_pred, rmse, r2, lim):
    """Scatter the predicted values against the true values of a model.

    Identical (true, predicted) pairs are drawn as a single marker whose
    size is the number of times the pair occurs.

    Args :
    model_name : name shown in the title
    y_test : iterable of true values
    y_pred : iterable of predicted values, paired with y_test by position
    rmse : value shown in the legend as RMSE (two decimals)
    r2 : value shown in the legend as R² (two decimals)
    lim : upper bound of both axes (both start at 0)

    Return :
    None, the figure is drawn on a new 7x7 matplotlib figure

    NOTE(review): relies on `plt` (matplotlib.pyplot) being imported
    elsewhere in the notebook - it is not among the imports visible here.
    """
    # Fig param
    plt.figure(figsize=(7, 7))

    # Define size of circle for yt, yp: count each (true, predicted) pair
    sizes = {}
    for pair in zip(y_test, y_pred):
        sizes[pair] = sizes.get(pair, 0) + 1
    pairs = list(sizes)

    # Plot the prediction. The label is a single string: a list would be
    # shown in the legend as its repr, e.g. "['RMSE = ...', 'R² = ...']"
    plt.scatter([p[0] for p in pairs], [p[1] for p in pairs],
                s=[sizes[p] for p in pairs],
                label="RMSE = {:.2f}\nR² = {:.2f}".format(rmse, r2))

    # Set title, x and y labels
    plt.xlabel('True score', fontsize=16)
    plt.ylabel(u'Predicted Score', fontsize=16)
    plt.title(f'Best model {model_name}', fontsize=16)

    # x and y lim
    plt.xlim(0, lim)
    plt.ylim(0, lim)

    plt.legend(loc="lower right", fontsize=12)