In [None]:
import os

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

In [None]:
# Load the pickled dataframe and renumber its rows 0..n-1.
# reset_index(drop=True) discards the stored index directly, so this also works
# when that index is named or multi-level, and it never removes a genuine data
# column that happens to be called "index".
# NOTE(review): unpickling can execute arbitrary code - only load trusted files.
data = pd.read_pickle("./dataframe/df_only_risky_events.pkl").reset_index(drop=True)
print(data.shape)
data.head()

In [None]:
# Hold out a quarter of the rows for testing; the fixed seed keeps the split reproducible.
TEST_FRACTION = 0.25
SPLIT_SEED = 42
train, test = train_test_split(
    data,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)

In [None]:
# Report (rows x columns) of each split; DataFrame.shape is a (rows, columns) pair.
print("Train dataframe dimension {} x {}".format(*train.shape))
print("Test dataframe dimension {} x {}".format(*test.shape))

In [None]:
# Separate the regression target from the feature columns in both splits.
# The column name is spelled exactly as it appears in the dataframe.
TARGET_COLUMN = "COLLISSION_PROBABILITY"

Y_train = train[TARGET_COLUMN]
X_train = train.drop(columns=[TARGET_COLUMN])

Y_test = test[TARGET_COLUMN]
X_test = test.drop(columns=[TARGET_COLUMN])


In [None]:
# Short aliases for the training features/target used by the grid search below.
X, y = X_train, Y_train

In [None]:
#print(list(X.columns))

In [None]:
# from sklearn.model_selection import RepeatedKFold
# from sklearn.model_selection import GridSearchCV
# from sklearn.ensemble import AdaBoostRegressor
# import datetime as dt
# import pickle

# # define the model with default hyperparameters
# model = AdaBoostRegressor()
# # define the grid of values to search
# grid = dict()
# #grid['n_estimators'] = [10, 50, 100, 500,1000,1500,2000,3000]
# grid['n_estimators'] = [10, 50]
# grid['learning_rate'] = [0.0001, 0.001]
# # define the evaluation procedure
# cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# # define the grid search procedure
# grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='r2')

# # execute the grid search
# grid_result = grid_search.fit(X, y)

# # summarize the best score and configuration
# now=dt.datetime.now()
# filename1="./opt_parameters_ada_boost/{}_gs_opt_param.pkl".format(
#         now.strftime("%Y%m%d_%H%M%S"))
# filename2="./opt_parameters_ada_boost/{}_gs_full_run.txt".format(
#         now.strftime("%Y%m%d_%H%M%S"))
# with open(filename1, "wb") as optimal_parameters_logger:
#     output_dict = grid_result.best_params_
#     pickle.dump(output_dict, optimal_parameters_logger)
#     optimal_parameters_logger.close()
# with open(filename2, "a") as results_logger:
#     output_1 = "Best: {} using {} \n".format(grid_result.best_score_, grid_result.best_params_)
#     print(output_1)
#     results_logger.write(output_1)
#     # summarize all scores that were evaluated
#     means = grid_result.cv_results_['mean_test_score']
#     stds = grid_result.cv_results_['std_test_score']
#     params = grid_result.cv_results_['params']
#     for mean, stdev, param in zip(means, stds, params):
#         output_2 = "{} ({}) with: {} \n".format(mean, stdev, param)
#         #print("%f (%f) with: %r" % (mean, stdev, param))
#         print(output_2)
#         results_logger.write(output_2)
#     results_logger.close()

In [None]:
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import AdaBoostRegressor
import datetime as dt
import pickle


def grid_search_optimization(
    X,
    y,
    n_estimators_list=[10, 50],
    learning_rate_list=[0.0001, 0.001],
    loss_functions_list = ["linear"],
    scoring_metric="r2",
    n_splits_for_cv = 5,
    n_repeats_for_cv = 3
):
    """Grid-search AdaBoostRegressor hyperparameters with repeated K-fold CV.

    Runs a GridSearchCV over ``n_estimators``, ``learning_rate`` and ``loss``,
    then writes two timestamped files into ``./opt_parameters_ada_boost/``
    (the directory is created if missing):

    * ``<timestamp>_<scoring_metric>_gs_opt_param.pkl`` - pickled result dict
    * ``<timestamp>_<scoring_metric>_gs_full_run.txt`` - best score plus the
      mean/std test score of every evaluated parameter combination

    The same lines written to the text log are also printed.

    Parameters
    ----------
    X : dataframe
        Training features, passed unchanged to ``GridSearchCV.fit``.
    y : target dataframe
        Values to be predicted, passed unchanged to ``GridSearchCV.fit``.
    n_estimators_list : iterable
        ``n_estimators`` values for the grid search.
    learning_rate_list : iterable
        ``learning_rate`` values for the grid search.
    loss_functions_list : iterable
        ``loss`` values for the grid search.
    scoring_metric : str
        scikit-learn scoring name (default ``"r2"``); also embedded in the
        output file names.
    n_splits_for_cv : int
        Number of splits for the repeated K-fold cross validation.
    n_repeats_for_cv : int
        Number of repetitions of the K-fold cross validation.

    Returns
    -------
    dictionary
        The best parameters found plus the keys ``"scoring"``,
        ``"best_score"``, ``"n_splits"`` and ``"n_repeats"``.
    """
    output_dir = "./opt_parameters_ada_boost"
    # Create the output directory before fitting so that a long search cannot
    # be lost to a FileNotFoundError when the results are written.
    os.makedirs(output_dir, exist_ok=True)

    # define the model with default hyperparameters
    model = AdaBoostRegressor()
    # define the grid of values to search; list() takes a private copy and
    # lets callers pass any iterable (e.g. a range)
    grid = {
        "n_estimators": list(n_estimators_list),
        "learning_rate": list(learning_rate_list),
        "loss": list(loss_functions_list),
    }
    # define the evaluation procedure (fixed seed keeps the folds reproducible)
    cv = RepeatedKFold(n_splits=n_splits_for_cv, n_repeats=n_repeats_for_cv, random_state=1)
    # define the grid search procedure
    grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring=scoring_metric)

    # execute the grid search
    grid_result = grid_search.fit(X, y)

    # Both output files share one timestamp and carry the scoring metric in
    # their name, so runs with different metrics stay distinguishable.
    timestamp = dt.datetime.now().strftime("%Y%m%d_%H%M%S")
    filename1 = "{}/{}_{}_gs_opt_param.pkl".format(output_dir, timestamp, scoring_metric)
    filename2 = "{}/{}_{}_gs_full_run.txt".format(output_dir, timestamp, scoring_metric)

    # Copy the best parameters: adding the run metadata must not mutate
    # grid_result.best_params_ (which is also reported in the text log below).
    output_dict = dict(grid_result.best_params_)
    output_dict["scoring"] = scoring_metric
    output_dict["best_score"] = grid_result.best_score_
    output_dict["n_splits"] = n_splits_for_cv
    output_dict["n_repeats"] = n_repeats_for_cv

    # The with-blocks close the files; no explicit close() is needed.
    with open(filename1, "wb") as optimal_parameters_logger:
        pickle.dump(output_dict, optimal_parameters_logger)

    with open(filename2, "a") as results_logger:
        # summarize the best score and configuration
        output_1 = "Best: {} using {} \n".format(grid_result.best_score_, grid_result.best_params_)
        print(output_1)
        results_logger.write(output_1)
        # summarize all scores that were evaluated
        means = grid_result.cv_results_['mean_test_score']
        stds = grid_result.cv_results_['std_test_score']
        params = grid_result.cv_results_['params']
        for mean, stdev, param in zip(means, stds, params):
            output_2 = "{} ({}) with: {} \n".format(mean, stdev, param)
            print(output_2)
            results_logger.write(output_2)
    return output_dict

In [None]:
# Hyperparameter search space handed to grid_search_optimization.
# n_estimators: 50, 75, ..., 4000 (step of 25; the stop value 4025 is exclusive).
n_estimators_v = [*range(50, 4025, 25)]
learning_rate_v = [
    0.001, 0.002, 0.0025, 0.005, 0.0075,
    0.01, 0.0125, 0.015, 0.0175,
    0.02, 0.025, 0.03, 0.04, 0.05,
]
loss_function_v = ["linear", "exponential"]
# scikit-learn scoring names; one grid search is run per entry.
regression_metrics = [
    "r2",
    "neg_mean_absolute_error",
    "neg_mean_squared_error",
    "neg_root_mean_squared_error",
]

In [None]:
# Run one complete grid search per scoring metric. Each call refits the whole
# grid and writes its own result files; the returned dict is not kept here.
for metric in regression_metrics:
    grid_search_optimization(
        X,
        y,
        n_estimators_list=n_estimators_v,
        learning_rate_list=learning_rate_v,
        loss_functions_list=loss_function_v,
        scoring_metric=metric,
    )