In [131]:

import numpy as np
import pandas as pd
# Encoders and scalers
from sklearn.preprocessing import OneHotEncoder, RobustScaler, StandardScaler, MinMaxScaler
# Model selection, column transformers and pipelines
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
# Estimators
from sklearn.linear_model import Ridge, Lasso, LinearRegression
from sklearn.ensemble import GradientBoostingRegressor

import mlflow
from utils import mlflow_it

In [132]:
# Load the CSV stored at data/bronze.csv into a DataFrame.
data = pd.read_csv(filepath_or_buffer='data/bronze.csv')

In [133]:
# Separate the regression target (the MEDV column) from the predictors.
y = data['MEDV']
X = data.drop(columns=['MEDV'])

# Hold out 20% of the rows for testing; the fixed seed makes the shuffled
# split identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

In [134]:
# Column labels of every numeric predictor.
# 'number' is the selector pandas documents for "all numeric dtypes". The
# 'int' / 'float' aliases only cover specific bit widths (which ones depends
# on the pandas / NumPy version and platform), so columns stored with another
# width could be skipped silently. Boolean columns are not matched by 'number'.
numerical_features = X.select_dtypes(include='number').columns

In [135]:
# Column labels of the boolean predictors, treated here as the categorical group.
categorical_features = X.select_dtypes(include='bool').columns

In [136]:
# Candidate feature scalers, all created with their default settings.
scalers = [StandardScaler(), MinMaxScaler(), RobustScaler()]

# Keep the individual short names available as well.
std, minmax, rbst = scalers


In [203]:
# Candidate regressors, created with default hyper-parameters.
lnr = LinearRegression()
ridge = Ridge()
lasso = Lasso()

# Seeded on purpose: gradient boosting draws a random row subsample for each
# tree whenever subsample < 1.0, and the hyper-parameter searches in this
# notebook explore such values. Without a fixed random_state the
# cross-validated scores (and therefore the selected best parameters) change
# from one run to the next.
gbr = GradientBoostingRegressor(random_state=42)

In [183]:
# Search space sampled by the randomized search over the gradient-boosting
# hyper-parameters.
random_params = dict(
    learning_rate=np.linspace(0.01, 0.1, num=10),  # 10 evenly spaced values, 0.01 .. 0.1
    n_estimators=np.arange(100, 1000, 100),        # 100, 200, ..., 900
    max_depth=[3, 5, 7, 9],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    subsample=np.linspace(0.5, 1.0, num=6),        # 6 evenly spaced values, 0.5 .. 1.0
)


In [185]:
# Randomized hyper-parameter search around the gradient-boosting regressor.
# Only the sampling seed is set; every other option keeps its scikit-learn
# default.
random_search = RandomizedSearchCV(
    estimator=gbr,
    param_distributions=random_params,
    random_state=42,
)

In [186]:
# Run the randomized search on the training split only; the test split stays
# untouched.
random_search.fit(X=X_train, y=y_train)

In [201]:
# Show the sampled parameter combination that the randomized search selected
# as best.
random_search.best_params_

{'subsample': 0.8,
 'n_estimators': 500,
 'min_samples_split': 5,
 'min_samples_leaf': 2,
 'max_depth': 3,
 'learning_rate': 0.09000000000000001}

In [202]:
# Show the cross-validated score of the best candidate found by the
# randomized search.
random_search.best_score_
# NOTE(review): 0.7086941878926369 was noted here by hand and does not match
# the output recorded for this cell -- presumably the score of an earlier
# model/configuration. Confirm what it refers to, or delete it.

0.8628239161696012

In [213]:
# Refinement grid centred on the best candidate recorded for the randomized
# search (subsample=0.8, n_estimators=500, min_samples_split=5,
# min_samples_leaf=2, max_depth=3, learning_rate~0.09).
#
# * Float values are listed explicitly. np.arange with a fractional step has
#   an unpredictable length and end-point because of rounding, so it can emit
#   a value at or above the excluded stop (subsample must stay <= 1.0).
# * n_estimators moves in steps of 50; varying it by a single tree around 500
#   multiplies the number of fits without producing distinguishable models.
# * Three values per parameter gives 3**6 = 729 candidates. The previous grid
#   had several thousand, and its recorded run was interrupted.
grid_params = {
    'subsample': [0.7, 0.8, 0.9],
    'n_estimators': [450, 500, 550],
    'min_samples_split': [4, 5, 6],
    'min_samples_leaf': [1, 2, 3],
    'max_depth': [2, 3, 4],
    'learning_rate': [0.08, 0.09, 0.1],
}

In [214]:
# Exhaustive search over every combination in grid_params for the
# gradient-boosting regressor; all other options keep their scikit-learn
# defaults.
grid_search = GridSearchCV(
    estimator=gbr,
    param_grid=grid_params,
)

In [215]:
# Run the exhaustive search on the training split.
# NOTE: the run recorded in this notebook ended with a KeyboardInterrupt, so
# no fitted result was produced by this cell.
grid_search.fit(X=X_train, y=y_train)

KeyboardInterrupt: 

In [178]:
# Show the parameter combination selected by the grid search.
# NOTE(review): the output recorded under this cell lists alpha / max_iter /
# tol, which are not keys of grid_params, and the cell's execution count is
# lower than that of the cell creating the current grid_search. The output
# looks stale (presumably from an earlier search on another estimator);
# re-run this cell after a completed fit.
grid_search.best_params_

{'alpha': 0.098, 'max_iter': 4500, 'tol': 0.0027}

In [182]:
# Show the cross-validated score of the best grid-search candidate.
# NOTE(review): this cell's execution count is lower than that of the cell
# creating the current grid_search, so the recorded output presumably comes
# from an earlier search -- re-run after a completed fit before relying on it.
grid_search.best_score_

0.7088993735948131