In [8]:
import optuna 
import xgboost as xgb
from catboost import CatBoostRegressor
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [9]:
train = pd.read_csv('..\\kaggle_data\\train.csv')
test = pd.read_csv('..\\kaggle_data\\test.csv')
sub = pd.read_csv('..\\kaggle_data\\sample_submission.csv')

In [10]:
columns = [col for col in train.columns.to_list() if col not in ['id','target']]
data=train[columns]
target=train['target']

In [11]:
def objective(trial,data=data,target=target):
    
    train_x, test_x, train_y, test_y = train_test_split(data, target, test_size=0.15,random_state=42)
    param = {
        'tree_method':'gpu_hist',  # this parameter means using the GPU when training our model to speedup the training process
        'lambda': trial.suggest_loguniform('lambda', 1e-3, 10.0),
        'alpha': trial.suggest_loguniform('alpha', 1e-3, 10.0),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.008,0.009,0.01,0.012,0.014,0.016,0.018, 0.02]),
        'n_estimators': 4000,
        'max_depth': trial.suggest_categorical('max_depth', [5,7,9,11,13,15,17,20]),
        'random_state': trial.suggest_categorical('random_state', [24, 48,2020]),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
    }
    model = xgb.XGBRegressor(**param)  
    
    model.fit(train_x,train_y,eval_set=[(test_x,test_y)],early_stopping_rounds=100,verbose=False)
    
    preds = model.predict(test_x)
    
    rmse = mean_squared_error(test_y, preds,squared=False)
    
    return rmse

In [12]:
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=3)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

[32m[I 2021-01-21 17:26:38,118][0m A new study created in memory with name: no-name-af955fed-d982-4f6b-afc9-2fe9136fecca[0m
[32m[I 2021-01-21 17:27:13,697][0m Trial 0 finished with value: 0.6949239476862908 and parameters: {'lambda': 0.9667284319221151, 'alpha': 0.0012544885422151389, 'colsample_bytree': 0.3, 'subsample': 0.4, 'learning_rate': 0.012, 'max_depth': 7, 'random_state': 2020, 'min_child_weight': 107}. Best is trial 0 with value: 0.6949239476862908.[0m
[32m[I 2021-01-21 17:28:03,140][0m Trial 1 finished with value: 0.6940814979919743 and parameters: {'lambda': 1.1958160151517383, 'alpha': 0.27102633744027116, 'colsample_bytree': 0.7, 'subsample': 0.6, 'learning_rate': 0.009, 'max_depth': 9, 'random_state': 48, 'min_child_weight': 226}. Best is trial 1 with value: 0.6940814979919743.[0m
[32m[I 2021-01-21 17:28:41,250][0m Trial 2 finished with value: 0.69378219997585 and parameters: {'lambda': 2.5427675856103678, 'alpha': 0.3228187693076243, 'colsample_bytree': 0.5,

Number of finished trials: 3
Best trial: {'lambda': 2.5427675856103678, 'alpha': 0.3228187693076243, 'colsample_bytree': 0.5, 'subsample': 0.6, 'learning_rate': 0.018, 'max_depth': 15, 'random_state': 24, 'min_child_weight': 168}


In [13]:
study.trials_dataframe()

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_alpha,params_colsample_bytree,params_lambda,params_learning_rate,params_max_depth,params_min_child_weight,params_random_state,params_subsample,state
0,0,0.694924,2021-01-21 17:26:38.120455,2021-01-21 17:27:13.696455,0 days 00:00:35.576000,0.001254,0.3,0.966728,0.012,7,107,2020,0.4,COMPLETE
1,1,0.694081,2021-01-21 17:27:13.698457,2021-01-21 17:28:03.140455,0 days 00:00:49.441998,0.271026,0.7,1.195816,0.009,9,226,48,0.6,COMPLETE
2,2,0.693782,2021-01-21 17:28:03.142456,2021-01-21 17:28:41.249453,0 days 00:00:38.106997,0.322819,0.5,2.542768,0.018,15,168,24,0.6,COMPLETE


In [20]:
optuna.visualization.plot_optimization_history(study)

In [15]:
optuna.visualization.plot_parallel_coordinate(study)

In [16]:
optuna.visualization.plot_slice(study)

In [17]:
optuna.visualization.plot_contour(study, params=['alpha',
                            #'max_depth',
                            'lambda',
                            'subsample',
                            'learning_rate',
                            'subsample'])

In [18]:
optuna.visualization.plot_param_importances(study)

In [21]:
optuna.visualization.plot_edf(study)

In [22]:
Best_trial= {'lambda': 0.0030282073258141168, 'alpha': 0.01563845128469084, 'colsample_bytree': 0.5,
             'subsample': 0.7,'n_estimators': 4000, 'learning_rate': 0.01,'max_depth': 15,
             'random_state': 2020, 'min_child_weight': 257,'tree_method':'gpu_hist'}

In [23]:
preds = np.zeros(test.shape[0])
kf = KFold(n_splits=5,random_state=48,shuffle=True)
rmse=[]  # list contains rmse for each fold
n=0
for trn_idx, test_idx in kf.split(train[columns],train['target']):
    X_tr,X_val=train[columns].iloc[trn_idx],train[columns].iloc[test_idx]
    y_tr,y_val=train['target'].iloc[trn_idx],train['target'].iloc[test_idx]
    model = xgb.XGBRegressor(**Best_trial)
    model.fit(X_tr,y_tr,eval_set=[(X_val,y_val)],early_stopping_rounds=100,verbose=False)
    preds+=model.predict(test[columns])/kf.n_splits
    rmse.append(mean_squared_error(y_val, model.predict(X_val), squared=False))
    print(n+1,rmse[n])
    n+=1

1 0.6983663796697697
2 0.6954188897931859
3 0.6960413641711274
4 0.6956234980654018
5 0.696015251762687


In [24]:
np.mean(rmse)

0.6962930766924342

In [None]:
from datetime import datetime
dt_string = datetime.now().strftime("%d-%m-%Y_%H-%M-%S")

sub['target']=preds

print(sub.head())

sub.to_csv(f'..\\kaggle_data\\{dt_string}_submission.csv', index=False)

In [26]:
import math
import sklearn
from sklearn.metrics import mean_squared_error

In [30]:
best_sub = pd.read_csv('..\\kaggle_data\\18-01-2021_18-32-17_submission.csv')
worst_sub = pd.read_csv('..\\kaggle_data\\worst.csv') 

distance_mse_to_best = mean_squared_error(sub, best_sub)
distance_rmse_to_best = math.sqrt(distance_mse_to_best)
print(f'Distance to Best {distance_rmse_to_best}')

distance_mse_to_worst = mean_squared_error(sub, worst_sub)
distance_rmse_to_worst = math.sqrt(distance_mse_to_worst)
print(f'Distance to Worst {distance_rmse_to_worst}')

Distance to Best 0.04116924872377249
Distance to Worst 0.40773848526479634
