In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

In [2]:
#./dataframe/df_only_risky_events.pkl
#data=pd.read_pickle("dataframe/_PRUEBA_df_20211225_143242.pkl") #dataframe full 2022
data=pd.read_pickle("dataframe/_PRUEBA_df_filtered_20220115_112153.pkl")
data.reset_index(inplace=True)
data.drop(['index'], inplace=True, axis=1)
print(data.shape)
data.head()

(5394, 85)


Unnamed: 0,__time_to_tca,MISS_DISTANCE,RELATIVE_SPEED,RELATIVE_POSITION_R,RELATIVE_POSITION_T,RELATIVE_POSITION_N,RELATIVE_VELOCITY_R,RELATIVE_VELOCITY_T,RELATIVE_VELOCITY_N,COLLISSION_PROBABILITY,...,OBJECT2_CORR_CNDOT_TDOT,PC_trend_1,PC_trend_3,PC_gradient_1,PC_gradient_3,MD_trend_1,MD_trend_3,MD_gradient_1,MD_gradient_3,TARGET_PC
0,5.775947,568.0,2001.0,-20.9,-562.8,-75.9,0.8,-268.6,1983.8,-5.415895,...,-0.054742,0.22617,0.773872,0.65261,0.763062,-123.0,144.0,-354.914301,141.988456,-5.345246
1,5.420762,611.0,2001.0,-19.9,-605.4,-81.8,0.8,-268.6,1983.8,-5.345246,...,0.029148,0.070649,0.237282,0.198907,0.234313,43.0,50.0,121.063636,49.374243,-4.792366
2,5.119489,576.0,2001.0,-9.6,-571.2,-77.4,0.8,-268.6,1983.8,-4.792366,...,-0.03092,0.55288,0.8497,1.835148,0.847141,-35.0,-115.0,-116.173757,-114.653689,-4.20845
3,4.750068,328.0,2001.0,2.7,-325.3,-43.7,0.5,-268.6,1983.8,-4.20845,...,-0.104064,0.583916,1.207445,1.580622,1.176985,-248.0,-240.0,-671.319754,-233.945596,-4.049879
4,4.087221,56.0,2001.0,9.2,-55.1,-7.3,0.2,-268.6,1983.8,-4.049879,...,-0.117181,0.158571,1.295367,0.239227,0.971374,-272.0,-555.0,-410.351242,-416.185112,-5.289798


In [3]:
train, test = train_test_split(data, test_size=0.30, random_state=42)

In [4]:
print("Train dataframe dimension {} x {}".format(train.shape[0],train.shape[1]))
print("Test dataframe dimension {} x {}".format(test.shape[0],test.shape[1]))

Train dataframe dimension 3775 x 85
Test dataframe dimension 1619 x 85


In [5]:
columnt_to_predict_name="TARGET_PC"

In [6]:
Y_train = train[columnt_to_predict_name]
X_train= train.drop([columnt_to_predict_name], axis=1)
Y_test = test[columnt_to_predict_name]
X_test= test.drop([columnt_to_predict_name], axis=1)

In [7]:
X = X_train
y = Y_train

In [8]:
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import AdaBoostRegressor
import datetime as dt
import pickle


def grid_search_optimization(
    X,
    y,
    n_estimators_list=[10, 50],
    learning_rate_list=[0.0001, 0.001],
    loss_functions_list = ["linear"],
    scoring_metric="r2",
    n_splits_for_cv = 5,
    n_repeats_for_cv = 3,
    n_cores = -1
):
    """Computes the optimal values for the LGBM model

    Parameters
    ----------
    X : dataframe
        Train dataset
    y : target dataframe
        Values to be predicted
    n_estimators_list: list
        List of n_estimators values for grid search
    learning_rate_list: list
        List of learning_rate values for grid search
    scoring_metric: 
        Scoring metrics from sci-kit learn default r2
    n_splits_for_cv:
        Number of splits for K Fold cross validation
    n_repeats_for_cv:
        Number of repetition for cross validation
    n_cores: int
        Number of CPU cores for computation. Default -1 = all

    Returns
    -------
    dictionary
        Optimized values
    """
    # define the model with default hyperparameters
    model = AdaBoostRegressor()
    # define the grid of values to search
    grid = dict()
    grid['n_estimators'] = n_estimators_list
    grid['learning_rate'] = learning_rate_list
    grid['loss'] = loss_functions_list
    # define the evaluation procedure
    cv = RepeatedKFold(n_splits=n_splits_for_cv, n_repeats=n_repeats_for_cv, random_state=1)
    # define the grid search procedure
    grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs= n_cores, cv=cv, scoring=scoring_metric,verbose=10)

    # execute the grid search
    grid_result = grid_search.fit(X, y)

    # summarize the best score and configuration
    now=dt.datetime.now()
    filename1="./opt_parameters_ada_boost/{}_{}_gs_opt_param.pkl".format(
            now.strftime("%Y%m%d_%H%M%S"),scoring_metric)
    filename2="./opt_parameters_ada_boost/{}_gs_full_run.txt".format(
            now.strftime("%Y%m%d_%H%M%S"),scoring_metric)
    with open(filename1, "wb") as optimal_parameters_logger:
        output_dict = grid_result.best_params_
        output_dict["scoring"] = scoring_metric
        output_dict["best_score"] = grid_result.best_score_
        output_dict["n_splits"] = n_splits_for_cv
        output_dict["n_repeats"] = n_repeats_for_cv
        pickle.dump(output_dict, optimal_parameters_logger)
        optimal_parameters_logger.close()
    with open(filename2, "a") as results_logger:
        output_1 = "Best: {} using {} \n".format(grid_result.best_score_, grid_result.best_params_)
        print(output_1)
        results_logger.write(output_1)
        # summarize all scores that were evaluated
        means = grid_result.cv_results_['mean_test_score']
        stds = grid_result.cv_results_['std_test_score']
        params = grid_result.cv_results_['params']
        for mean, stdev, param in zip(means, stds, params):
            output_2 = "{} ({}) with: {} \n".format(mean, stdev, param)
            print(output_2)
            results_logger.write(output_2)
        results_logger.close()
    return output_dict

In [None]:
n_estimators_v = [100,300,500,800,1000,1500,2000,3000,4000,5000]
learning_rate_v=[0.001,0.002,0.003,0.005]
#n_estimators_v = [100,300,500]
#learning_rate_v=[0.01]
loss_function_v = ["exponential"]
#regression_metrics=["neg_root_mean_squared_error"]

In [None]:
grid_search_optimization(X,y,
                            n_estimators_v,
                            learning_rate_v,
                            scoring_metric="r2",
                            loss_functions_list=loss_function_v,
                            n_cores = -1)

In [26]:
n_estimator_opt=100
learning_rate_opt=0.05

In [27]:
model = AdaBoostRegressor(n_estimators=n_estimator_opt,learning_rate=learning_rate_opt,loss="exponential")
model.fit(X_train,Y_train)
pred=model.predict(X_test)
# REGRESION MODEL METRICS
print("The r2 of prediction is:", r2_score(Y_test, pred))
print("The MSE of prediction is:", mean_squared_error(Y_test, pred, squared=True))
print("The RMSE of prediction is:", mean_squared_error(Y_test, pred, squared=False))
print("The MAE of prediction is:", mean_absolute_error(Y_test, pred))

The r2 of prediction is: 0.76242439973564
The MSE of prediction is: 0.11082532850518953
The RMSE of prediction is: 0.3329043834274183
The MAE of prediction is: 0.2221136467729529


In [28]:
filename = "adaboost_filtered_trees_{}_learning_rate_{}.sav".format(n_estimator_opt, learning_rate_opt)
pickle.dump(model, open(filename, 'wb'))

In [21]:
# load the model from disk
filename = "adaboost_filtered_trees_{}_learning_rate_{}.sav".format(n_estimator_opt, learning_rate_opt)

loaded_model = pickle.load(open(filename, 'rb'))

In [22]:
bla=loaded_model.predict(X_test)
# REGRESION MODEL METRICS
print("The r2 of prediction is:", r2_score(Y_test, bla))
print("The MSE of prediction is:", mean_squared_error(Y_test, bla, squared=True))
print("The RMSE of prediction is:", mean_squared_error(Y_test, bla, squared=False))
print("The MAE of prediction is:", mean_absolute_error(Y_test, bla))

The r2 of prediction is: 0.7668075383990707
The MSE of prediction is: 0.10878066237904704
The RMSE of prediction is: 0.32981913585940864
The MAE of prediction is: 0.2144781031720866


In [43]:
# list_of_filenames=[
#                     "adaboost_unfiltered_trees_100_learning_rate_0.001.sav",
#                     "adaboost_unfiltered_trees_300_learning_rate_0.001.sav",
#                     "adaboost_unfiltered_trees_500_learning_rate_0.001.sav",
#                     "adaboost_unfiltered_trees_800_learning_rate_0.001.sav",
#                     "adaboost_unfiltered_trees_100_learning_rate_0.002.sav",
#                     "adaboost_unfiltered_trees_300_learning_rate_0.002.sav",
#                     "adaboost_unfiltered_trees_100_learning_rate_0.003.sav",
#                     "adaboost_unfiltered_trees_300_learning_rate_0.003.sav",
#                     ]

list_of_filenames=[
                    "adaboost_filtered_trees_100_learning_rate_0.01.sav",
                    "adaboost_filtered_trees_100_learning_rate_0.03.sav",
                    "adaboost_filtered_trees_500_learning_rate_0.001.sav",
                    "adaboost_filtered_trees_800_learning_rate_0.001.sav",
                    "adaboost_filtered_trees_100_learning_rate_0.002.sav",
                    ]

In [27]:
list_of_filenames[0][44:49]

'0.001'

In [60]:
results_df = pd.DataFrame(columns=['trees','learning_rate','r2','MSE','RMSE','MAE'])
for i in list_of_filenames:
    loaded_model = pickle.load(open(i, 'rb'))
    prediction_over_x_test=loaded_model.predict(X_test)
    # print(i)
    # print("\n")
    # REGRESION MODEL METRICS
    trees = i[26:29]
    lr = i[44:49]
    r2=r2_score(Y_test, prediction_over_x_test)
    MSE=mean_squared_error(Y_test, prediction_over_x_test, squared=True)
    RMSE = mean_squared_error(Y_test, prediction_over_x_test, squared=False)
    MAE = mean_absolute_error(Y_test, prediction_over_x_test)
    # print("The r2 of prediction is:", r2)
    # print("The MSE of prediction is:", MSE)
    # print("The RMSE of prediction is:", RMSE)
    # print("The MAE of prediction is:", MAE)
    to_append = [trees,lr,r2,MSE,RMSE,MAE]
    a_series = pd.Series(to_append, index = results_df.columns)
    results_df = results_df.append(a_series, ignore_index=True)
    loaded_model = None

In [61]:
results_df

Unnamed: 0,trees,learning_rate,r2,MSE,RMSE,MAE
0,100,0.001,0.825056,16.646892,4.08006,2.108679
1,300,0.001,0.824558,16.694322,4.085869,2.163587
2,500,0.001,0.824573,16.692851,4.085689,2.217877
3,800,0.001,0.824359,16.713268,4.088186,2.286927
4,100,0.002,0.824781,16.673068,4.083267,2.135083
5,300,0.002,0.82451,16.698901,4.086429,2.236963
6,100,0.003,0.824643,16.686173,4.084871,2.160679
7,300,0.003,0.823975,16.749759,4.092647,2.316794


In [63]:
np.round(results_df,5).to_csv("adaboost_results_unfiltered_dataframe")