In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from bayes_opt import BayesianOptimization
from sklearn.datasets import load_boston
from sklearn.metrics import r2_score, mean_squared_log_error, mean_squared_error,mean_absolute_error
import datetime as dt
import pickle
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
import preparing_data as F

In [2]:
data=pd.read_pickle("./dataframe/df_20210912_133401_ONLY_RISKY_EVENTS-6-withFE.pkl")
data.reset_index(inplace=True)
data.drop(['index'], inplace=True, axis=1)
print(data.shape)
data.head()

(272, 86)


Unnamed: 0,__time_to_tca,MISS_DISTANCE,RELATIVE_SPEED,RELATIVE_POSITION_R,RELATIVE_POSITION_T,RELATIVE_POSITION_N,RELATIVE_VELOCITY_R,RELATIVE_VELOCITY_T,RELATIVE_VELOCITY_N,COLLISSION_PROBABILITY,...,PC_mavg_1,PC_trend_1,PC_trend_3,PC_gradient_1,PC_gradient_3,MD_mavg_1,MD_trend_1,MD_trend_3,MD_gradient_1,MD_gradient_3
0,0.832257,2244.0,11172.0,87.9,-1474.9,1689.9,-201.4,-8424.3,-7336.0,-3.418392,...,-3.475175,0.132439,0.590914,0.354472,0.579354,2181.333333,47.0,17.0,125.795356,16.667421
1,0.580855,966.0,212.0,191.0,-944.8,-68.9,33.8,-8.4,209.3,-3.711974,...,-3.981494,0.310256,0.575376,0.747481,0.536846,1131.0,-201.0,-326.0,-484.256505,-304.169574
2,0.189825,1102.0,212.0,193.3,-1083.1,-75.0,33.9,-8.4,209.3,-3.913996,...,-3.882734,-0.202022,0.296282,-0.51664,0.270366,1078.333333,136.0,-158.0,347.799165,-144.179478
3,0.994637,456.0,11707.0,-22.1,288.8,353.3,-10.2,-9077.7,7392.6,-2.055073,...,-2.841298,1.280567,1.312874,3.634346,1.445698,709.666667,-577.0,124.0,-1637.570088,136.545083
4,0.759012,629.0,11707.0,-18.7,395.3,489.9,-10.3,-9077.7,7392.6,-2.649558,...,-2.680091,-0.594485,0.483623,-2.523011,0.486607,706.0,173.0,-11.0,734.217036,-11.067877


In [3]:
data.shape

(272, 86)

In [4]:
train, test = train_test_split(data, test_size=0.30, random_state=42)

In [5]:
print("Train dataframe dimension {} x {}".format(train.shape[0],train.shape[1]))
print("Test dataframe dimension {} x {}".format(test.shape[0],test.shape[1]))

Train dataframe dimension 190 x 86
Test dataframe dimension 82 x 86


In [6]:
Y_train = train["COLLISSION_PROBABILITY"]
X_train= train.drop(["COLLISSION_PROBABILITY"], axis=1)
Y_test = test["COLLISSION_PROBABILITY"]
X_test= test.drop(["COLLISSION_PROBABILITY"], axis=1)


In [7]:
X = X_train
y = Y_train

In [8]:
# improving function
def bayesian_opt_lgbm(X, y, init_iter=3, n_iters=7, random_state=11, seed = 101, num_iterations = 100,evalm="lgb_r2"):
      dtrain = lgb.Dataset(data=X, label=y)
      #Metric evaluation functions
      def lgb_r2(preds, dtrain):                #R2
            labels = dtrain.get_label()
            return 'metric', r2_score(labels, preds), True
      def lgb_rmse(preds, dtrain):      #RMSE
            labels = dtrain.get_label()
            return 'metric', mean_squared_error(labels, preds,squared=False), True
      def lgb_mae(preds, dtrain):     #MAE
            labels = dtrain.get_label()
            return 'metric', mean_absolute_error(labels, preds), True
      def lgb_adjusted_r2(preds, dtrain):       #ADJUSTED R2
            labels = dtrain.get_label()
            n=dtrain.num_data()
            k=dtrain.num_feature()
            return 'metric', ((1-r2_score(labels, preds))*(n-1))/(n-k-1), True


      metrics_dict= {   "lgb_r2" : lgb_r2,
                        "lgb_rmse":lgb_rmse,
                        "lgb_mae":lgb_mae,
                        "lgb_adjusted_r2": lgb_adjusted_r2
                        }
      # Select metric
      metric=str(evalm)
      metric_feval=metrics_dict.get(str(evalm))

      # Objective Function
      def hyp_lgbm(num_leaves, feature_fraction, learning_rate, bagging_fraction, max_depth, min_split_gain, min_child_weight):
              params = {      'application':'regression',
                              'num_iterations': num_iterations,
                              'early_stopping_round':50,
                              'verbose':-1,
                              'metric':metric} # Default parameters
              params["num_leaves"] = int(round(num_leaves))
              params["learning_rate"] = learning_rate
              params['feature_fraction'] = max(min(feature_fraction, 1), 0)
              params['bagging_fraction'] = max(min(bagging_fraction, 1), 0)
              params['max_depth'] = int(round(max_depth))
              params['min_split_gain'] = min_split_gain
              params['min_child_weight'] = min_child_weight
              cv_results = lgb.cv(params, dtrain, nfold=5, seed=seed,categorical_feature=[], stratified=False,
                                  verbose_eval =None, feval=metric_feval)
              #print(cv_results)
              return np.max(cv_results['metric-mean'])
    
              # Domain space-- Range of hyperparameters 
      pds = {     'num_leaves': (60, 120),
                  'feature_fraction': (0.1, 0.9),
                  'bagging_fraction': (0.7, 1),
                  'max_depth': (7, 15),
                  'learning_rate':(0.001,0.05), 
                  'min_split_gain': (0.001, 0.1),
                  'min_child_weight': (10, 35)
                  }
      # Surrogate model
      optimizer = BayesianOptimization(hyp_lgbm, pds, random_state=random_state)
                                          
      # Optimize
      optimizer.maximize(init_points=init_iter, n_iter=n_iters)

      # Output dictionary
      output_dict=optimizer.max['params']
      output_dict["num_iterations"]=num_iterations
      output_dict["n_estimators"]=n_iters

      # Save dictionary to file
      filename="./opt_parameters_bo/param_{}_{}.pkl".format(dt.datetime.now().strftime("%Y%m%d_%H%M%S"),metric)
      a_file = open(filename, "wb")
      pickle.dump(output_dict, a_file)
      a_file.close()

      return optimizer,output_dict

In [9]:
def compare_true_vs_prediction(df_true,df_pred):
    aux_y=pd.DataFrame(df_true)
    aux_y.reset_index(inplace=True)
    aux_y.drop(['index'], inplace=True, axis=1)
    aux_y_pred=pd.DataFrame(df_pred)
    aux_y_pred.reset_index(inplace=True)
    aux_y_pred.drop(['index'], inplace=True, axis=1)
    frames=[aux_y,aux_y_pred]
    result=pd.concat(frames,axis=1)
    result.columns=["y_true","y_predicted"]
    result["y_true_10"]=10**result.y_true
    result["y_predicted_10"]=10**result.y_predicted
    result[result["y_true_10"]>0.00001]
    result[result["y_true_10"]>0.0001]
    return result


In [10]:
def create_and_validate_model(X, y, 
                            init_iter=5, n_iters=500, random_state=77, seed = 101,num_iterations=300,
                            evalm="lgb_r2",hp_metric="regression_L2"):
    bayesian=bayesian_opt_lgbm(X, y, init_iter, n_iters, random_state, seed,num_iterations,evalm)
    opt_parameters=bayesian[1]
    print("------------------------ OPTIMAL PARAMETERS ------------------------")
    print(opt_parameters)
    print("-------------------------------------------------------------------")
    
    # LOAD OPTIMAL PARAMETERS FOR FURTHER COMPUTATION
    hyper_params = {
                    'task': 'train',
                    'boosting_type': 'gbdt',
                    'objective': 'regression',
                    'metric': str(hp_metric),
                    'learning_rate': opt_parameters.get("learning_rate"),
                    'feature_fraction': opt_parameters.get("feature_fraction"),
                    'bagging_fraction': opt_parameters.get("bagging_fraction"),
                    'verbose': -1,
                    "max_depth": int(round(opt_parameters.get("max_depth"))),
                    "num_leaves": int(round(opt_parameters.get("num_leaves"))),  
                    'min_split_gain' : opt_parameters.get("min_split_gain"),
                    "num_iterations": opt_parameters.get("num_iterations"),
                    "n_estimators": opt_parameters.get("n_estimators"),
                    'min_child_weight' : opt_parameters.get("min_child_weight")
                    }
    # TRAIN MODEL WITH OPTIMAL PARAMETERS
    lgbm_train = lgb.Dataset(X, label=y)
    gbm = lgb.train(params=hyper_params,train_set=lgbm_train)

    # TEST MODEL WITH TESTING SUBPART OF DATASET
    Y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)

    # REGRESION MODEL METRICS
    print('The r2 of prediction is:', r2_score(Y_test, Y_pred))
    print('The MSE of prediction is:', mean_squared_error(Y_test, Y_pred, squared=True))
    print('The RMSE of prediction is:', mean_squared_error(Y_test, Y_pred, squared=False))
    print('The MAE of prediction is:', mean_absolute_error(Y_test, Y_pred))

    # COMPARE TEST VALUES VS PREDICTED VALUES
    df_results=compare_true_vs_prediction(df_true=Y_test,df_pred=Y_pred)

    # WRITE TO A FILE
    
    outF = open("./validation-results/r_{}.txt".format(dt.datetime.now().strftime("%Y%m%d_%H%M%S")), "w+")
    # write line to output file
    outF.write("------------------------- MODEL HYPER-PARAMETERS ------------------------- \n")
    outF.write(str(hyper_params))
    outF.write("\n")
    outF.write("\n")
    outF.write("------------------------ REGRESSION MODEL METRICS ------------------------ \n")
    outF.write(str('The r2 of prediction is: ') + str(r2_score(Y_test, Y_pred))+str("\n"))
    outF.write(str('The MSE of prediction is: ') + str(mean_squared_error(Y_test, Y_pred, squared=True))+str("\n"))
    outF.write(str('The RMSE of prediction is: ')+ str(mean_squared_error(Y_test, Y_pred, squared=False))+str("\n"))
    outF.write(str('The MAE of prediction is: ')+ str(mean_absolute_error(Y_test, Y_pred))+str("\n"))
    outF.write("\n")
    outF.write("\n")
    outF.write("-------------------------- ADDITIONAL COMMENTS -------------------------- \n")
    outF.write(str("This model was created and validated at {}".format(dt.datetime.fromtimestamp(dt.datetime.timestamp(dt.datetime.now())))))
    outF.close()

    return gbm, df_results

In [11]:
model=create_and_validate_model(X, y,
                        init_iter=5, n_iters=1000, random_state=77, seed = 101,num_iterations=300,evalm="lgb_mae",hp_metric="mae")

|   iter    |  target   | baggin... | featur... | learni... | max_depth | min_ch... | min_sp... | num_le... |
-------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m 0.5654  [0m | [0m 0.9757  [0m | [0m 0.6138  [0m | [0m 0.03793 [0m | [0m 8.115   [0m | [0m 12.18   [0m | [0m 0.07901 [0m | [0m 79.57   [0m |
| [95m 2       [0m | [95m 0.5664  [0m | [95m 0.8623  [0m | [95m 0.2922  [0m | [95m 0.02773 [0m | [95m 10.2    [0m | [95m 27.88   [0m | [95m 0.08383 [0m | [95m 95.31   [0m |
| [0m 3       [0m | [0m 0.5643  [0m | [0m 0.7888  [0m | [0m 0.3248  [0m | [0m 0.03557 [0m | [0m 10.38   [0m | [0m 11.43   [0m | [0m 0.07496 [0m | [0m 87.14   [0m |
| [95m 4       [0m | [95m 0.5702  [0m | [95m 0.7527  [0m | [95m 0.1395  [0m | [95m 0.01533 [0m | [95m 7.534   [0m | [95m 28.78   [0m | [95m 0.007313[0m | [95m 85.91   [0m |
| [0m 5       [0m | [0m 0.5685 

In [None]:
# filename="./opt_parameters_bo/param_20210912_213528_lgb_r2.pkl"
# a_file = open(filename,"rb")
# output = pickle.load(a_file)
# opt_parameters=output
# output