In [None]:
!pip install optuna -qq

In [None]:
import optuna
import pandas as pd
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Read the preprocessed house-price table. The path sits under /content/drive,
# so this presumably expects a mounted Colab Drive -- confirm before running elsewhere.
data = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/Medbikri/processed_house_price.csv",
)

In [None]:
# Split the table into the regression target (SalePrice) and the remaining
# predictor columns.
y = data['SalePrice']
X = data.drop(columns=['SalePrice'])

In [None]:
# Hold out 33% of the rows for evaluation; the fixed seed keeps the split
# identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

**XGBoost**

In [None]:
def objective(trial):
    """Optuna objective: fit an XGBoost regressor and return its hold-out RMSE.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters listed in ``param``.

    Returns
    -------
    float
        Root-mean-squared error of the fitted model on ``X_test`` / ``y_test``.

    Notes
    -----
    Reads the notebook-level ``X_train``, ``X_test``, ``y_train`` and ``y_test``.
    The same hold-out split is used both for early stopping and for the returned
    score, so the reported RMSE is likely optimistic.
    NOTE(review): consider carving a separate validation split for early stopping.
    """
    param = {
        # GPU histogram tree builder; presumably requires a GPU runtime -- confirm.
        'tree_method': 'gpu_hist',
        # suggest_float(..., log=True) samples the same log-uniform range as the
        # deprecated suggest_loguniform.
        'lambda': trial.suggest_float('lambda', 1e-3, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-3, 10.0, log=True),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4, 0.5, 0.6, 0.7, 0.8, 1.0]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.008, 0.01, 0.012, 0.014, 0.016, 0.018, 0.02]),
        # Upper bound on boosting rounds; early stopping below decides the actual count.
        'n_estimators': 10000,
        'max_depth': trial.suggest_categorical('max_depth', [5, 7, 9, 11, 13, 15, 17]),
        # Single-choice categorical: fixes the seed while still recording it in the trial params.
        'random_state': trial.suggest_categorical('random_state', [2020]),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
    }

    model = xgb.XGBRegressor(**param, verbosity=0)

    # Stop once the eval-set metric has not improved for 100 consecutive rounds.
    # NOTE(review): newer XGBoost releases expect early_stopping_rounds on the
    # estimator constructor; kept in fit() to match the version this notebook
    # was run with -- confirm before upgrading.
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_test, y_test)],
        early_stopping_rounds=100,
        verbose=False,
    )

    preds = model.predict(X_test)

    # Take the square root explicitly instead of passing squared=False, which
    # newer scikit-learn releases deprecate and then drop.
    rmse = mean_squared_error(y_test, preds) ** 0.5

    return rmse

In [None]:
# Minimise the hold-out RMSE returned by `objective` over 30 trials.
# TPE is already Optuna's default sampler; passing it explicitly with a seed
# makes the sequence of suggested hyperparameters repeatable from run to run
# (as long as the objective itself is deterministic).
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=30)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

[32m[I 2022-05-30 11:46:45,443][0m A new study created in memory with name: no-name-a939bb71-1904-4831-a009-075dc04d936c[0m
[32m[I 2022-05-30 11:47:12,640][0m Trial 0 finished with value: 25735.30808508089 and parameters: {'lambda': 2.5714187998001368, 'alpha': 0.02756318326231303, 'colsample_bytree': 0.4, 'subsample': 0.4, 'learning_rate': 0.016, 'max_depth': 15, 'random_state': 2020, 'min_child_weight': 98}. Best is trial 0 with value: 25735.30808508089.[0m
[32m[I 2022-05-30 11:47:28,052][0m Trial 1 finished with value: 25914.412998882224 and parameters: {'lambda': 2.4068324990989978, 'alpha': 0.005285006678614819, 'colsample_bytree': 0.8, 'subsample': 0.6, 'learning_rate': 0.018, 'max_depth': 7, 'random_state': 2020, 'min_child_weight': 68}. Best is trial 0 with value: 25735.30808508089.[0m
[32m[I 2022-05-30 11:48:08,442][0m Trial 2 finished with value: 28739.524057370912 and parameters: {'lambda': 0.0037573005028438333, 'alpha': 1.0420130362152402, 'colsample_bytree': 0.

Number of finished trials: 30
Best trial: {'lambda': 0.4102759555323203, 'alpha': 0.0013098619028696133, 'colsample_bytree': 0.5, 'subsample': 0.8, 'learning_rate': 0.014, 'max_depth': 11, 'random_state': 2020, 'min_child_weight': 3}


In [None]:
# Show every trial of the XGBoost study as a table (one row per trial, with
# its value, timing, sampled parameters and state).
study.trials_dataframe()

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_alpha,params_colsample_bytree,params_lambda,params_learning_rate,params_max_depth,params_min_child_weight,params_random_state,params_subsample,state
0,0,25735.308085,2022-05-30 11:46:45.448230,2022-05-30 11:47:12.640510,0 days 00:00:27.192280,0.027563,0.4,2.571419,0.016,15,98,2020,0.4,COMPLETE
1,1,25914.412999,2022-05-30 11:47:12.642704,2022-05-30 11:47:28.052342,0 days 00:00:15.409638,0.005285,0.8,2.406832,0.018,7,68,2020,0.6,COMPLETE
2,2,28739.524057,2022-05-30 11:47:28.054466,2022-05-30 11:48:08.442315,0 days 00:00:40.387849,1.042013,0.8,0.003757,0.008,17,290,2020,0.8,COMPLETE
3,3,26232.469189,2022-05-30 11:48:08.444540,2022-05-30 11:48:43.958313,0 days 00:00:35.513773,7.697514,0.6,3.506043,0.01,17,192,2020,0.5,COMPLETE
4,4,31960.929885,2022-05-30 11:48:43.960453,2022-05-30 11:48:51.672101,0 days 00:00:07.711648,0.034639,1.0,0.004168,0.02,9,230,2020,1.0,COMPLETE
5,5,25226.703665,2022-05-30 11:48:51.674284,2022-05-30 11:49:26.906195,0 days 00:00:35.231911,0.107949,0.7,0.00299,0.008,13,128,2020,0.6,COMPLETE
6,6,24664.370885,2022-05-30 11:49:26.912846,2022-05-30 11:49:39.088659,0 days 00:00:12.175813,0.70835,0.3,0.135374,0.012,13,33,2020,0.6,COMPLETE
7,7,25364.166614,2022-05-30 11:49:39.097067,2022-05-30 11:50:13.289532,0 days 00:00:34.192465,0.066627,0.3,0.068265,0.016,9,221,2020,0.5,COMPLETE
8,8,25570.764264,2022-05-30 11:50:13.293969,2022-05-30 11:50:35.415973,0 days 00:00:22.122004,2.423848,0.8,0.019277,0.012,15,170,2020,0.6,COMPLETE
9,9,25218.654832,2022-05-30 11:50:35.418204,2022-05-30 11:50:46.860594,0 days 00:00:11.442390,0.045612,0.4,0.032869,0.02,7,90,2020,0.7,COMPLETE


In [None]:
# Plot the hyperparameter importances Optuna estimates for the XGBoost study.
optuna.visualization.plot_param_importances(study)

**Random Forest**

In [None]:
def objective1(trial):
    """Optuna objective: fit a random-forest regressor and return its hold-out RMSE.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters listed in ``param``.

    Returns
    -------
    float
        Root-mean-squared error of the fitted model on ``X_test`` / ``y_test``.

    Notes
    -----
    Reads the notebook-level ``X_train``, ``X_test``, ``y_train`` and ``y_test``.
    Every Optuna parameter name matches the ``RandomForestRegressor`` keyword it
    feeds, so ``study1.best_trial.params`` can be passed straight back to the
    estimator.
    """
    param = {
        # Registered under its real keyword name (it was previously recorded as
        # 'trial.suggest_int', which is not a RandomForestRegressor argument).
        'n_estimators': trial.suggest_int('n_estimators', 100, 1500),
        'max_depth': trial.suggest_categorical('max_depth', [5, 7, 9, 11, 13, 15, 17]),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2']),
        'min_samples_leaf': trial.suggest_categorical('min_samples_leaf', [5, 7, 9, 11, 13, 15, 17]),
        'min_samples_split': trial.suggest_categorical('min_samples_split', [5, 7, 9, 11, 13, 15, 17]),
    }

    # Fixed seed (same value the XGBoost objective uses) so differences between
    # trials come from the hyperparameters rather than from bootstrap noise.
    model = RandomForestRegressor(**param, random_state=2020)
    model.fit(X_train, y_train)

    preds = model.predict(X_test)

    # Take the square root explicitly instead of passing squared=False, which
    # newer scikit-learn releases deprecate and then drop.
    rmse = mean_squared_error(y_test, preds) ** 0.5

    return rmse

In [None]:
# Minimise the hold-out RMSE returned by `objective1` over 30 trials.
# TPE is already Optuna's default sampler; passing it explicitly with a seed
# makes the sequence of suggested hyperparameters repeatable from run to run
# (as long as the objective itself is deterministic).
study1 = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study1.optimize(objective1, n_trials=30)
print('Number of finished trials:', len(study1.trials))
print('Best trial:', study1.best_trial.params)

[32m[I 2022-05-30 11:56:35,518][0m A new study created in memory with name: no-name-fe9a1130-b194-436e-b368-6318d277e731[0m
[32m[I 2022-05-30 11:56:36,811][0m Trial 0 finished with value: 36145.7971988142 and parameters: {'trial.suggest_int': 894, 'max_depth': 15, 'max_features': 'log2', 'min_samples_leaf': 13, 'min_samples_split': 13}. Best is trial 0 with value: 36145.7971988142.[0m
[32m[I 2022-05-30 11:56:39,526][0m Trial 1 finished with value: 28660.53781198705 and parameters: {'trial.suggest_int': 1421, 'max_depth': 13, 'max_features': 'sqrt', 'min_samples_leaf': 5, 'min_samples_split': 13}. Best is trial 1 with value: 28660.53781198705.[0m
[32m[I 2022-05-30 11:56:40,805][0m Trial 2 finished with value: 35882.120510363864 and parameters: {'trial.suggest_int': 873, 'max_depth': 11, 'max_features': 'log2', 'min_samples_leaf': 13, 'min_samples_split': 13}. Best is trial 1 with value: 28660.53781198705.[0m
[32m[I 2022-05-30 11:56:42,889][0m Trial 3 finished with value: 3

Number of finished trials: 30
Best trial: {'trial.suggest_int': 1487, 'max_depth': 17, 'max_features': 'sqrt', 'min_samples_leaf': 5, 'min_samples_split': 15}


In [None]:
# Plot the hyperparameter importances Optuna estimates for the random-forest study.
optuna.visualization.plot_param_importances(study1)