In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from bayes_opt import BayesianOptimization
from sklearn.datasets import load_boston
from sklearn.metrics import r2_score, mean_squared_log_error, mean_squared_error
import datetime as dt
import pickle
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split

In [None]:
# Load the pre-built feature table.
# NOTE: unpickling can execute arbitrary code, so only read files produced by a trusted pipeline.
data = (
    pd.read_pickle("full_dataframe_20210803_150443.pkl")
    # Discard the stored index directly instead of materialising it as a column and
    # dropping it by the auto-generated name 'index' (which breaks if the index is named).
    .reset_index(drop=True)
    # 'event_id' is removed so it is not part of the table used for the train/test split.
    .drop(columns=['event_id'])
)
print(data.shape)
data.head()

In [None]:
# Debugging aid: uncomment to run the rest of the notebook on the first 200 rows only.
#data=data.head(200)

In [None]:
# Sanity check: (rows, columns) of the table that is split into train/test in the next cell.
data.shape

In [None]:
# Hold out 25% of the rows as the test split; the fixed random_state keeps the split reproducible.
train, test = train_test_split(data, test_size=0.25, random_state=42)

In [None]:
# Report rows x columns of each split; `shape` is unpacked straight into the two placeholders.
print("Train dataframe dimension {} x {}".format(*train.shape))
print("Test dataframe dimension {} x {}".format(*test.shape))

In [None]:
# Separate the regression target column from the feature columns, for each split.
X_train = train.drop(columns=["COLLISSION_PROBABILITY_TARGET"])
Y_train = train["COLLISSION_PROBABILITY_TARGET"]
X_test = test.drop(columns=["COLLISSION_PROBABILITY_TARGET"])
Y_test = test["COLLISSION_PROBABILITY_TARGET"]


In [None]:
# Short aliases for the training features/target used by the tuning and training cells.
X, y = X_train, Y_train

In [None]:
def bayesian_opt_lgbm(X, y, init_iter=3, n_iters=7, random_state=11, seed = 101, num_iterations = 100):
    """Search LightGBM regression hyper-parameters with Bayesian optimisation.

    Every candidate is scored with 5-fold, non-stratified cross-validation
    (``lgb.cv``) on ``(X, y)`` using R^2 as a custom evaluation function. The
    optimiser maximises the highest mean R^2 reached over the boosting rounds.

    Parameters
    ----------
    X
        Feature matrix, passed to ``lgb.Dataset`` as ``data``.
    y
        Target values, passed to ``lgb.Dataset`` as ``label``.
    init_iter : int
        Forwarded to ``BayesianOptimization.maximize`` as ``init_points``.
    n_iters : int
        Forwarded to ``BayesianOptimization.maximize`` as ``n_iter``.
    random_state : int
        Seed of the ``BayesianOptimization`` instance.
    seed : int
        Seed forwarded to ``lgb.cv``.
    num_iterations : int
        Upper bound on boosting rounds per cross-validation run; training
        stops early after 50 rounds without improvement.

    Returns
    -------
    BayesianOptimization
        The optimiser after ``maximize`` has run. ``.max['params']`` holds the
        best candidate as raw floats: ``num_leaves`` and ``max_depth`` are
        rounded only inside the objective, so callers must round them again.
    """
    dtrain = lgb.Dataset(data=X, label=y)

    def lgb_r2_score(preds, eval_data):
        """Custom LightGBM eval function returning (name, value, is_higher_better)."""
        # The parameter is deliberately not called `dtrain`, to avoid shadowing the outer Dataset.
        labels = eval_data.get_label()
        return 'r2', r2_score(labels, preds), True

    # Objective function
    def hyp_lgbm(num_leaves, feature_fraction, learning_rate, bagging_fraction, max_depth, min_split_gain, min_child_weight):
        """Return the best mean cross-validated R^2 for one hyper-parameter candidate."""
        params = {
            'objective': 'regression',
            'num_iterations': num_iterations,
            'early_stopping_round': 50,
            'verbose': -1,
            # 'lgb_r2_score' is not a built-in metric name. The string 'None' is the
            # documented way to register no built-in metric, so that only the custom
            # `feval` below is evaluated and drives early stopping.
            'metric': 'None',
            # The optimiser proposes floats; the tree-shape parameters must be integers.
            'num_leaves': int(round(num_leaves)),
            'max_depth': int(round(max_depth)),
            'learning_rate': learning_rate,
            # Clamp the sampling fractions to the valid [0, 1] interval.
            'feature_fraction': max(min(feature_fraction, 1), 0),
            # NOTE(review): LightGBM only applies bagging when `bagging_freq` is non-zero,
            # and it is not set here, so this dimension of the search presumably has no
            # effect on the score. Confirm and either set `bagging_freq` or drop it.
            'bagging_fraction': max(min(bagging_fraction, 1), 0),
            'min_split_gain': min_split_gain,
            'min_child_weight': min_child_weight,
        }
        # `verbose_eval` is left at its default (None in LightGBM 3.x); 'verbose': -1 already
        # silences the library.
        cv_results = lgb.cv(params, dtrain, nfold=5, seed=seed, categorical_feature=[],
                            stratified=False, feval=lgb_r2_score)
        # LightGBM 3.x names the entry 'r2-mean'; newer releases prefix it ('valid r2-mean').
        r2_key = next(key for key in cv_results if key.endswith('r2-mean'))
        return np.max(cv_results[r2_key])

    # Domain space -- range of each hyper-parameter explored by the optimiser
    search_space = {
        'num_leaves': (80, 120),
        'feature_fraction': (0.1, 0.9),
        'bagging_fraction': (0.7, 1),
        'max_depth': (7, 15),
        'learning_rate': (0.001, 0.05),
        'min_split_gain': (0.001, 0.1),
        'min_child_weight': (10, 25),
    }

    # Surrogate model
    optimizer = BayesianOptimization(hyp_lgbm, search_space, random_state=random_state)

    # Optimize
    optimizer.maximize(init_points=init_iter, n_iter=n_iters)

    return optimizer

# Run the hyper-parameter search on the training split (this is the expensive cell).
bayesian_ouput = bayesian_opt_lgbm(
    X,
    y,
    init_iter=5,
    n_iters=10,
    random_state=77,
    seed=101,
    num_iterations=1000,
)

In [None]:
# Best parameter set found by the optimiser. The values are the raw floats proposed by the
# search (hyp_lgbm rounds num_leaves/max_depth only internally), so integer-valued
# parameters have to be rounded again before they are reused.
opt_parameters=bayesian_ouput.max['params']
opt_parameters

In [None]:
# Persist the tuned parameters under a timestamped name so every run keeps its own snapshot.
filename="opt_parameters_{}.pkl".format(dt.datetime.now().strftime("%Y%m%d_%H%M%S"))

# The context manager closes (and flushes) the file even if pickling raises.
with open(filename, "wb") as a_file:
    pickle.dump(opt_parameters, a_file)

In [None]:
# Round-trip check: read the snapshot back to confirm it was written correctly.
# NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
with open(filename, "rb") as a_file:  # closed automatically (the handle was previously leaked)
    output = pickle.load(a_file)
output

In [None]:
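# Leftover: `optimizer` is local to bayesian_opt_lgbm and is not defined at notebook level;
# the equivalent lookup on the returned object is bayesian_ouput.max['params'].
#optimizer.max['params']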

In [None]:
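# Pasted optimiser output, presumably from an earlier run (opening brace restored; learning_rate is absent):
# {'bagging_fraction': 1.0, 'feature_fraction': 0.9, 'max_depth': 8.0, 'min_child_weight': 25.0, 'min_split_gain': 0.013771321931506838, 'num_leaves': 88.93816438820497}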

In [None]:
# Final training configuration: fixed settings plus the tuned values from the search.
# Tuned values are read with [] rather than .get() so that a missing key fails here with
# a KeyError instead of silently passing None on to LightGBM.
hyper_params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'l2',
    'learning_rate': opt_parameters["learning_rate"],
    'feature_fraction': opt_parameters["feature_fraction"],
    # NOTE(review): LightGBM only applies bagging when `bagging_freq` is non-zero; with the
    # line below commented out, `bagging_fraction` presumably has no effect. Confirm intent.
    'bagging_fraction': opt_parameters["bagging_fraction"],
    #'bagging_freq': 10,
    'verbose': -1,
    # The optimiser returns floats; the tree-shape parameters must be integers.
    "max_depth": int(round(opt_parameters["max_depth"])),
    "num_leaves": int(round(opt_parameters["num_leaves"])),
    #"max_bin": 512,
    'min_split_gain' : opt_parameters["min_split_gain"],
    # `n_estimators` is an alias of `num_iterations`. The dict previously set both
    # (500 and 10), leaving the number of boosting rounds ambiguous. Only the 500-round
    # budget is kept, which is the value consistent with early stopping after 100 rounds.
    "num_iterations": 500,
    'min_child_weight' : opt_parameters["min_child_weight"]
}

In [None]:
#Construct a gradient boosting model.
# NOTE(review): `gbm` is rebound by the `lgb.train(...)` call two cells below, and the only
# `gbm.fit(...)` in the notebook sits inside a string literal, so this scikit-learn style
# estimator is never fitted in the visible code.
gbm = lgb.LGBMRegressor(**hyper_params)

In [None]:
# Wrap both splits as LightGBM Datasets; the evaluation set references the training set.
# NOTE(review): the evaluation set is built from the test split, which is also what drives
# early stopping in the training cell -- confirm that using the test data this way is intended.
lgbm_train = lgb.Dataset(data=X, label=y)
lgbm_eval = lgb.Dataset(
    data=X_test,
    label=Y_test,
    reference=lgbm_train,
)

In [None]:
# Fit the booster with the tuned parameters. The score on `lgbm_eval` is logged every 20 rounds
# and training stops once it has not improved for 100 rounds.
# An R^2 eval function is not passed here: `lgb_r2_score` is local to bayesian_opt_lgbm.
gbm = lgb.train(
    hyper_params,
    lgbm_train,
    valid_sets=[lgbm_eval],
    early_stopping_rounds=100,
    verbose_eval=20,
)

In [None]:
#Build a gradient boosting model from the training set (X, y)
# NOTE: the triple-quoted text below is disabled code kept for reference. It is evaluated only
# as a string literal (and, as the last expression of the cell, echoed as output); `fit` is
# never called.
""" gbm.fit(X, y,
        eval_set=[(X_test, Y_test)],
        eval_metric='l1',
        early_stopping_rounds=50) """


In [None]:
# Predict on the training features (X_train) using the booster's best early-stopped iteration.
# NOTE(review): these are in-sample predictions, so the scores computed from Y_pred describe
# the fit on the training split, not generalisation -- confirm this is intended.
Y_pred = gbm.predict(X_train, num_iteration=gbm.best_iteration)

In [None]:
# Score Y_pred (computed from X_train in the preceding cell) against the training target `y`.
print('The r2 of prediction is:', r2_score(y_true=y, y_pred=Y_pred))
print('The MSE of prediction is:', mean_squared_error(y_true=y, y_pred=Y_pred, squared=True))
print('The RMSE of prediction is:', mean_squared_error(y_true=y, y_pred=Y_pred, squared=False))

In [None]:
# Put actual and predicted values side by side. Both frames get a fresh 0..n-1 index so that
# concat aligns them by position rather than by the shuffled row labels of the split.
aux_y = pd.DataFrame(y).reset_index(drop=True)
aux_y_pred = pd.DataFrame(Y_pred).reset_index(drop=True)
frames = [aux_y, aux_y_pred]
result = pd.concat(frames, axis=1)
result