In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from bayes_opt import BayesianOptimization
from sklearn.datasets import load_boston
from sklearn.metrics import r2_score, mean_squared_log_error, mean_squared_error,mean_absolute_error
import datetime as dt
import pickle
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
import preparing_data as F

In [3]:
# Load the pre-balanced dataset.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
data = pd.read_pickle("balanced_df_20210907_202919.pkl")
# Renumber the rows 0..n-1 and discard the previous index in one step. This
# avoids relying on the temporary column being called "index", which is only
# true when the stored index is unnamed.
data = data.reset_index(drop=True)
print(data.shape)
data.head()

(216, 531)


Unnamed: 0,MISS_DISTANCE,RELATIVE_SPEED,RELATIVE_POSITION_R,RELATIVE_POSITION_T,RELATIVE_POSITION_N,RELATIVE_VELOCITY_R,RELATIVE_VELOCITY_T,RELATIVE_VELOCITY_N,COLLISSION_PROBABILITY,OBJECT1_CR_R,...,OBJECT2_CORR_CTDOT_R_4,OBJECT2_CORR_CTDOT_T_4,OBJECT2_CORR_CTDOT_N_4,OBJECT2_CORR_CTDOT_RDOT_4,OBJECT2_CORR_CNDOT_R_4,OBJECT2_CORR_CNDOT_T_4,OBJECT2_CORR_CNDOT_N_4,OBJECT2_CORR_CNDOT_RDOT_4,OBJECT2_CORR_CNDOT_TDOT_4,COLLISSION_PROBABILITY_TARGET
0,2227.0,11172.0,101.7,-1463.6,1676.3,-201.4,-8424.3,-7336.0,-4.009306,63.680258,...,-0.965613,0.515422,-0.135461,-0.477611,-0.227886,0.075571,0.510505,-0.06881,0.250754,-30.0
1,2851.0,5277.0,150.9,2661.9,-1011.4,16.2,-1873.6,-4933.5,-4.920819,9.112098,...,-0.999617,0.232796,0.082441,-0.23746,0.02163,-0.148656,0.796492,0.154916,-0.018924,-30.0
2,2949.0,11029.0,-63.2,1995.1,-2170.9,79.0,-8116.4,-7467.5,-4.981716,10.82594,...,-0.08835,0.18181,-0.703869,-0.183976,0.048107,-0.19857,0.842794,0.201411,-0.917799,-5.019997
3,1047.0,9804.0,7.9,-790.9,-686.9,5.6,-6439.2,7394.2,-3.33564,9.38075,...,-0.999819,0.179695,-0.677245,-0.140373,0.549097,-0.571499,0.758658,0.397017,-0.548321,-30.0
4,7164.0,15276.0,-300.8,998.1,7088.5,21.5,-15127.4,2127.7,-4.848937,3486.93497,...,-0.991382,0.727569,-0.365051,-0.728437,-0.56575,0.2153,0.1728,-0.219876,0.624348,-30.0


In [None]:
#data=data.head(100)

In [4]:
# Sanity check: (rows, columns) of the loaded frame.
data.shape

(216, 531)

In [5]:
############################### FEATURE ENGINEERING ##########################################
def _rate_of_change(frame, first_col, last_col, t_first_col, t_last_col):
    """Return (last - first) / |t_last - t_first|, evaluated row by row."""
    elapsed = abs(frame[t_last_col] - frame[t_first_col])
    return (-frame[first_col] + frame[last_col]) / elapsed


# (new column, earlier value, later value, earlier time-to-TCA, later time-to-TCA)
# Columns are looked up by name with brackets so the double-underscore time
# columns are never subject to attribute-name quirks.
_gradient_specs = [
    # Miss distance: two last CDMs, then first vs. last CDM
    ("_GRADIENT_MISS_DISTANCE_34", "MISS_DISTANCE_3", "MISS_DISTANCE_4", "__time_to_tca_3", "__time_to_tca_4"),
    ("_GRADIENT_MISS_DISTANCE_14", "MISS_DISTANCE", "MISS_DISTANCE_4", "__time_to_tca", "__time_to_tca_4"),
    # Collision probability: two last CDMs, then first vs. last CDM
    ("_GRADIENT_PC_34", "COLLISSION_PROBABILITY_3", "COLLISSION_PROBABILITY_4", "__time_to_tca_3", "__time_to_tca_4"),
    ("_GRADIENT_PC_14", "COLLISSION_PROBABILITY", "COLLISSION_PROBABILITY_4", "__time_to_tca", "__time_to_tca_4"),
]
for _new_col, _first, _last, _t_first, _t_last in _gradient_specs:
    data[_new_col] = _rate_of_change(data, _first, _last, _t_first, _t_last)

In [6]:
# Four gradient columns were added above, so the column count grows by 4.
data.shape

(216, 535)

In [7]:
# Hold out 25% of the rows as a test set; the fixed random_state makes the
# partition repeatable across runs.
train, test = train_test_split(data, test_size=0.25, random_state=42)

In [8]:
# A DataFrame's shape is always (rows, columns), so it can be unpacked
# straight into the two placeholders.
print("Train dataframe dimension {} x {}".format(*train.shape))
print("Test dataframe dimension {} x {}".format(*test.shape))

Train dataframe dimension 162 x 535
Test dataframe dimension 54 x 535


In [9]:
# Separate the regression target from the feature columns for both splits.
_target_col = "COLLISSION_PROBABILITY_TARGET"
X_train, Y_train = train.drop(columns=[_target_col]), train[_target_col]
X_test, Y_test = test.drop(columns=[_target_col]), test[_target_col]


In [10]:
# Short aliases for the training split (same objects, no copy is made).
X, y = X_train, Y_train

In [11]:
def bayesian_opt_lgbm(X, y, init_iter=3, n_iters=7, random_state=11, seed = 101, num_iterations = 100):
    """Tune LightGBM regression hyper-parameters with Bayesian optimisation.

    Every candidate parameter set is scored with 5-fold ``lgb.cv`` on
    ``(X, y)`` using a custom evaluation function (R^2 as configured below),
    and the optimiser searches for the parameters that maximise that score.

    Parameters
    ----------
    X : feature matrix accepted by ``lgb.Dataset``.
    y : regression target aligned with ``X``.
    init_iter : int
        Forwarded to ``optimizer.maximize`` as ``init_points``.
    n_iters : int
        Forwarded to ``optimizer.maximize`` as ``n_iter``.
    random_state : int
        Seed of the ``BayesianOptimization`` object.
    seed : int
        Seed forwarded to ``lgb.cv``.
    num_iterations : int
        Upper bound on boosting rounds per CV run (early stopping is set to
        50 rounds).

    Returns
    -------
    BayesianOptimization
        The optimiser after ``maximize``; the best point is in ``.max``.
    """
    dtrain = lgb.Dataset(data=X, label=y)

    # Metric evaluation functions. LightGBM expects a custom eval function to
    # return (name, value, is_higher_better).
    def lgb_r2_score(preds, dtrain):
        labels = dtrain.get_label()
        return 'metric', r2_score(labels, preds), True

    def lgb_mean_absolute_error(preds, dtrain):
        labels = dtrain.get_label()
        # MAE is an error: smaller is better, so the flag must be False.
        return 'metric', mean_absolute_error(labels, preds), False

    def lgb_adjusted_r2_score(preds, dtrain):
        labels = dtrain.get_label()
        n = dtrain.num_data()
        k = dtrain.num_feature()
        if n - k - 1 <= 0:
            # The correction term divides by (n - k - 1); with as many
            # features as samples the statistic is not defined.
            raise ValueError(
                "adjusted R^2 is undefined for n={} samples and k={} features".format(n, k))
        # Adjusted R^2 = 1 - (1 - R^2) * (n - 1) / (n - k - 1)
        adjusted = 1 - ((1 - r2_score(labels, preds)) * (n - 1)) / (n - k - 1)
        return 'metric', adjusted, True

    # Select metric: the eval callable and whether larger values are better.
    metric_feval, higher_is_better = lgb_r2_score, True

    # Objective Function
    def hyp_lgbm(num_leaves, feature_fraction, learning_rate, bagging_fraction, max_depth, min_split_gain, min_child_weight):
        params = {
            'application': 'regression',
            'num_iterations': num_iterations,
            'early_stopping_round': 50,
            'verbose': -1,
            # The string "None" disables LightGBM's built-in metrics so that
            # only the custom eval function is tracked.
            'metric': 'None',
            # Tree-shape parameters must be integers.
            'num_leaves': int(round(num_leaves)),
            'max_depth': int(round(max_depth)),
            'learning_rate': learning_rate,
            # Fractions are clamped to [0, 1].
            'feature_fraction': max(min(feature_fraction, 1), 0),
            # NOTE(review): per the LightGBM docs bagging_fraction only takes
            # effect when bagging_freq > 0, which is not set here - confirm
            # whether bagging was meant to be enabled.
            'bagging_fraction': max(min(bagging_fraction, 1), 0),
            'min_split_gain': min_split_gain,
            'min_child_weight': min_child_weight,
        }
        cv_results = lgb.cv(params, dtrain, nfold=5, seed=seed, categorical_feature=[], stratified=False,
                            verbose_eval=None, feval=metric_feval)
        # The eval functions report under the name 'metric', hence the key.
        scores = cv_results['metric-mean']
        # The optimiser always maximises, so an error metric is negated.
        return np.max(scores) if higher_is_better else -np.min(scores)

    # Domain space -- range of each hyper-parameter
    pds = {
        'num_leaves': (80, 120),
        'feature_fraction': (0.1, 0.9),
        'bagging_fraction': (0.7, 1),
        'max_depth': (7, 15),
        'learning_rate': (0.001, 0.05),
        'min_split_gain': (0.001, 0.1),
        'min_child_weight': (10, 25),
    }

    # Surrogate model
    optimizer = BayesianOptimization(hyp_lgbm, pds, random_state=random_state)

    # Optimize
    optimizer.maximize(init_points=init_iter, n_iter=n_iters)

    return optimizer

# Run the search on the training split: 5 initial points followed by 500
# optimisation steps, each one a 5-fold CV with up to 300 boosting rounds.
# This makes the cell costly to re-run.
bayesian_ouput=bayesian_opt_lgbm(X, y, init_iter=5, n_iters=500, random_state=77, seed = 101,num_iterations=300)

|   iter    |  target   | baggin... | featur... | learni... | max_depth | min_ch... | min_sp... | num_le... |
-------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m 0.7816  [0m | [0m 0.9757  [0m | [0m 0.6138  [0m | [0m 0.03793 [0m | [0m 8.115   [0m | [0m 11.31   [0m | [0m 0.07901 [0m | [0m 93.05   [0m |
| [0m 2       [0m | [0m 0.748   [0m | [0m 0.8623  [0m | [0m 0.2922  [0m | [0m 0.02773 [0m | [0m 10.2    [0m | [0m 20.73   [0m | [0m 0.08383 [0m | [0m 103.5   [0m |
| [0m 3       [0m | [0m 0.7388  [0m | [0m 0.7888  [0m | [0m 0.3248  [0m | [0m 0.03557 [0m | [0m 10.38   [0m | [0m 10.86   [0m | [0m 0.07496 [0m | [0m 98.09   [0m |
| [0m 4       [0m | [0m 0.6228  [0m | [0m 0.7527  [0m | [0m 0.1395  [0m | [0m 0.01533 [0m | [0m 7.534   [0m | [0m 21.27   [0m | [0m 0.007313[0m | [0m 97.28   [0m |
| [0m 5       [0m | [0m 0.7168  [0m | [0m 0.809

In [12]:
# Parameter values of the best-scoring point found by the optimiser.
# All values are floats here, including max_depth and num_leaves.
opt_parameters=bayesian_ouput.max['params']
opt_parameters

{'bagging_fraction': 0.7852616790907327,
 'feature_fraction': 0.8473572138702099,
 'learning_rate': 0.03514995339626963,
 'max_depth': 10.739616821289683,
 'min_child_weight': 10.473906532566208,
 'min_split_gain': 0.08439868886083136,
 'num_leaves': 119.97061197700813}

In [13]:
# Persist the tuned parameters under a timestamped name so earlier runs are
# never overwritten.
filename = "opt_parameters_balanced_df_{}.pkl".format(dt.datetime.now().strftime("%Y%m%d_%H%M%S"))

# The context manager closes the file even if pickling raises.
with open(filename, "wb") as a_file:
    pickle.dump(opt_parameters, a_file)

In [None]:
# filename="opt_parameters_20210804_013351.pkl"
# a_file = open(filename,"rb")
# output = pickle.load(a_file)
# opt_parameters=output
# output

In [None]:
#optimizer.max['params']

In [None]:
#'bagging_fraction': 1.0, 'feature_fraction': 0.9, 'max_depth': 8.0, 'min_child_weight': 25.0, 'min_split_gain': 0.013771321931506838, 'num_leaves': 88.93816438820497}

In [14]:
# Final training configuration assembled from the tuned values.
hyper_params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'l2',
    'learning_rate': opt_parameters.get("learning_rate"),
    'feature_fraction': opt_parameters.get("feature_fraction"),
    # NOTE(review): per the LightGBM docs bagging_fraction only takes effect
    # when bagging_freq > 0; bagging_freq is left disabled below - confirm.
    'bagging_fraction': opt_parameters.get("bagging_fraction"),
    #'bagging_freq': 10,
    'verbose': -1,
    # The optimiser returns floats; tree-shape parameters must be integers.
    "max_depth": int(round(opt_parameters.get("max_depth"))),
    "num_leaves": int(round(opt_parameters.get("num_leaves"))),  
    #"max_bin": 512,
    'min_split_gain' : opt_parameters.get("min_split_gain"),
    # Number of boosting rounds, matching the budget used during the search.
    # Only this one spelling is set: n_estimators is an alias of the same
    # parameter, and giving both with different values makes the effective
    # number of rounds ambiguous.
    "num_iterations": 300,
    'min_child_weight' : opt_parameters.get("min_child_weight")
}

In [15]:
#Construct a gradient boosting model.
# NOTE(review): this estimator is never fitted - `gbm` is reassigned by the
# lgb.train(...) call further down, and the only gbm.fit(...) call in the
# notebook is wrapped in a string literal. Confirm whether it is still needed.
gbm = lgb.LGBMRegressor(**hyper_params)

In [16]:
# LightGBM datasets for the training split and the held-out split. The
# evaluation set is built with reference=lgbm_train.
# NOTE(review): lgbm_eval is only consumed if valid_sets is enabled in the
# training call, where it is currently commented out.
lgbm_train = lgb.Dataset(X, label=y)
lgbm_eval = lgb.Dataset(X_test, label=Y_test,reference=lgbm_train)

In [17]:
# Train the final booster on the training split with the tuned parameters.
# Validation set, custom eval metric and early stopping are all commented
# out, so training runs without any held-out monitoring.
# NOTE(review): with no validation set, verbose_eval=20 presumably has
# nothing to report - confirm.
gbm = lgb.train(params=hyper_params,
                train_set=lgbm_train,
                #valid_sets=lgbm_eval,
                verbose_eval=20,
                #eval_metric='lgb_r2_score',
                #early_stopping_rounds=100
                )

In [None]:
#Build a gradient boosting model from the training set (X, y)
""" gbm.fit(X, y,
        eval_set=[(X_test, Y_test)],
        eval_metric='l1',
        early_stopping_rounds=50) """


In [18]:
# Predict on X_train, i.e. the same rows the booster was trained on, so these
# are in-sample predictions; X_test is not scored here.
# NOTE(review): no early stopping was configured for training, so
# gbm.best_iteration presumably does not truncate the model - confirm.
Y_pred = gbm.predict(X_train, num_iteration=gbm.best_iteration)

In [19]:
# y is the training target and Y_pred was computed on X_train, so the scores
# below are in-sample (training) scores, not held-out performance.
# The MSE is computed once and the RMSE derived from it, which avoids the
# `squared=` keyword that newer scikit-learn releases deprecate.
_mse = mean_squared_error(y, Y_pred)
print('The r2 of prediction is:', r2_score(y, Y_pred))
print('The MSE of prediction is:', _mse)
print('The RMSE of prediction is:', np.sqrt(_mse))

The r2 of prediction is: 0.9968139385546472
The MSE of prediction is: 0.2991664854437583
The RMSE of prediction is: 0.5469611370506667


In [20]:
# Side-by-side table of true vs. predicted values on a fresh 0..n-1 index.
# The target's original row labels are dropped by converting it to a plain
# array, so both columns line up by position.
result = pd.DataFrame({
    "y_true": np.asarray(y),
    "y_predicted": Y_pred,
})
result

Unnamed: 0,y_true,y_predicted
0,-30.000000,-30.071054
1,-8.597739,-9.103203
2,-6.863597,-6.982731
3,-6.708187,-6.823920
4,-30.000000,-30.082275
...,...,...
157,-30.000000,-30.138476
158,-22.936667,-22.373685
159,-30.000000,-29.939103
160,-30.000000,-30.100213


In [29]:
# Add, for each column, a companion "<name>_10" column holding 10 raised to
# the stored value.
for _col in ("y_true", "y_predicted"):
    result[_col + "_10"] = 10 ** result[_col]

In [31]:
# Write the comparison table to disk (overwrites any existing comparison.csv).
result.to_csv("comparison.csv")

In [21]:
# Row label of the largest predicted value.
result["y_predicted"].idxmax()

130

In [22]:
# 10 raised to the largest predicted value.
_top_pred_row = result["y_predicted"].idxmax()
10 ** result.loc[_top_pred_row, "y_predicted"]

0.00015616994768890568

In [23]:
# 10 raised to the true value on the row with the largest prediction.
_top_pred_row = result["y_predicted"].idxmax()
10 ** result.loc[_top_pred_row, "y_true"]

0.0004500999999999995

In [24]:
# 10 raised to the largest true value.
10 ** result["y_true"].max()

0.0004500999999999995

In [25]:
# 10 raised to the predicted value on the row with the largest true value.
_top_true_row = result["y_true"].idxmax()
10 ** result.loc[_top_true_row, "y_predicted"]

0.00015616994768890568

In [26]:
# Row label of the largest true value.
result["y_true"].idxmax()

130

In [32]:
# Rows whose true value, as a power of ten, exceeds 0.00001.
result.loc[result["y_true_10"].gt(0.00001)]

Unnamed: 0,y_true,y_predicted,y_true_10,y_predicted_10
19,-4.198871,-4.775076,6.3e-05,1.7e-05
20,-4.517698,-4.714176,3e-05,1.9e-05
28,-4.74739,-4.953882,1.8e-05,1.1e-05
38,-4.557677,-4.651466,2.8e-05,2.2e-05
51,-4.726073,-4.635717,1.9e-05,2.3e-05
55,-3.913996,-4.180045,0.000122,6.6e-05
60,-3.88941,-4.088024,0.000129,8.2e-05
74,-4.584359,-5.313001,2.6e-05,5e-06
77,-4.407601,-4.451491,3.9e-05,3.5e-05
102,-4.409939,-4.374651,3.9e-05,4.2e-05


In [33]:
# Rows whose true value, as a power of ten, exceeds 0.0001.
result.loc[result["y_true_10"].gt(0.0001)]

Unnamed: 0,y_true,y_predicted,y_true_10,y_predicted_10
55,-3.913996,-4.180045,0.000122,6.6e-05
60,-3.88941,-4.088024,0.000129,8.2e-05
130,-3.346691,-3.806403,0.00045,0.000156
142,-3.753994,-4.490795,0.000176,3.2e-05


In [35]:
# Rows whose predicted value, as a power of ten, exceeds 0.00005.
result.loc[result["y_predicted_10"].gt(0.00005)]

Unnamed: 0,y_true,y_predicted,y_true_10,y_predicted_10
55,-3.913996,-4.180045,0.000122,6.6e-05
60,-3.88941,-4.088024,0.000129,8.2e-05
122,-4.38711,-4.232895,4.1e-05,5.8e-05
130,-3.346691,-3.806403,0.00045,0.000156
