In [1]:
import sys
sys.path.append('..') 
import pandas as pd

## Data Loading

In [2]:
# # Save feature_engineered_df 
# feature_engineered_df.to_csv("data/feature_engineered_df.csv")
# # Load feature_engineered_df 
parse_dates = ["date"]
feature_engineered_df = pd.read_csv('data/feature_engineered_df.csv', index_col=[0], parse_dates=parse_dates)
feature_engineered_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4698 entries, 15 to 5912
Data columns (total 22 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   date                         4698 non-null   datetime64[ns]
 1   province                     4698 non-null   object        
 2   current_month_consumption    4698 non-null   float64       
 3   ARIMA_prediction             4698 non-null   float64       
 4   last_year_total_consumption  4698 non-null   float64       
 5   rolling_mean_2               4698 non-null   float64       
 6   rolling_mean_3               4698 non-null   float64       
 7   lag1_monthly_share           4698 non-null   float64       
 8   lag2_monthly_share           4698 non-null   float64       
 9   lag3_monthly_share           4698 non-null   float64       
 10  lag1                         4698 non-null   float64       
 11  lag2                         4698 non-null

## Data Preperation

### Ultimate Train Test Split

In [3]:
from src.model_selection.data_preperation import DataPreperation as dp
model_df = feature_engineered_df.sort_values(by=["date"]).reset_index(drop=True).copy()
main_train, main_test = dp.train_test_split(data=model_df, index_column1="date",index_column2="province",lag=1)

Date range date at train is:  2017-04-01 00:00:00 2021-11-01 00:00:00 with shape of:  (4536, 21)
Date range date at test is:  2021-12-01 00:00:00 2021-12-01 00:00:00 with shape of:  (81, 21)


### Grid Search Train Test Split

In [4]:
X_train, y_train, X_test, y_test = dp.ts_train_test_split(data=main_train.reset_index(),
                                                          index_column1="date",
                                                          index_column2="province",
                                                          lag=0)

Date range date at train is:  2017-04-01 00:00:00 2021-10-01 00:00:00 with shape of:  (4455, 21)
Date range date at test is:  2021-11-01 00:00:00 2021-11-01 00:00:00 with shape of:  (81, 21)
Maximum date at train is:  (Timestamp('2021-10-01 00:00:00'), 'ŞIRNAK')  Shape is:  (4455, 19)
Minimum date at train is:  (Timestamp('2017-04-01 00:00:00'), 'ADANA')  Shape is:  (4455, 19)
Maximum date at test is:  (Timestamp('2021-11-01 00:00:00'), 'ŞIRNAK')  Shape is:  (81, 19)
Minimum date at test is:  (Timestamp('2021-11-01 00:00:00'), 'ADANA')  Shape is:  (81, 19)


## Hyperparameter Optimization

In [57]:
import numpy as np
import lightgbm as lgb
import xgboost as xgb
import catboost as cbt
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV, RandomizedSearchCV

In [59]:
gs_pipeline_params = {
    "estimator_list":["xgboost","lightgbm","catboost"],
    "estimator_params": {
        "init_params":
            {
            "xgboost__init_params": {"objective":"reg:gamma"},
            "lightgbm__init_params": {"objective":"gamma"},
            "catboost__init_params": {}
            },
        "grid_search_params":
            {
            "xgboost__grid_search_params": {
                'alpha': [i for i in np.linspace(0.0,0.01, num=4)], # Defaults to 0
                'colsample_bytree': [0.8, 0.9, 1], # Defaults to 1
                'lambda': [i for i in np.linspace(0.33,1,num=4)], # Defaults to 1
                'learning_rate': [i for i in np.linspace(0.075,0.225, num=6)], # Defaults to 0.3
                'max_depth': [6, 8, 10], # Defaults to 6
                'min_child_weight': [1,2,3,4], # Defaults to 1
                'n_estimators': [150,200,250], 
                'subsample': [0.5,1], # Defaults to 1
                },
            "lightgbm__grid_search_params": {
                'num_iterations': [100, 250, 500],
                'num_leaves':[20,31,50],
                'max_depth':[-1,8,10],
                'learning_rate':[i for i in np.linspace(0.01,0.15,num=4)],
                'min_data_in_leaf':[15, 20, 25],
                'min_child_samples':[5,10,15],
                'feature_fraction': [0.25, 0.5, 1],
                'bagging_fraction': [i for i in np.linspace(0,1,num=4)],
                'bagging_freq': [50, 75, 100],
                },
            "catboost__grid_search_params":{
                
            }        
            },
        "fit_params": 
            {
            "xgboost__fit_params":{},
            "lightgbm__fit_params":{},
            "catboost__fit_params":{}                        
            }
    },
    "cross_validation_params": {
        "validator": "time_series",
        "k": 9,
        "test_size": 81,
        "gap": 0
    },
    
    "train_test_split":{"index_column1":"date",
                           "index_column2":"province",
                           "lag":0, 
                           "verbose": False},
    "hp_optimizer": {
        "optimizer_type": "RandomSearchCV",
        "scoring":"neg_mean_absolute_percantage_error",
        "n_jobs": -1,
    }
}
gs_pipeline_params["train_test_split"]

{'index_column1': 'date',
 'index_column2': 'province',
 'lag': 0,
 'verbose': False}

In [60]:
from typing import Any
def grid_search_pipeline(data: pd.DataFrame, estimator_list: list,
                        estimator_params: dict,
                        scoring: str="neg_mean_absolute_percantage_error", **kwargs):
    # Train Test Split the data for Grid Search
    X_train, y_train, X_test, y_test = dp.ts_train_test_split(data=data.reset_index(),
                                                            **kwargs["train_test_split"])
    # Establish Estimators pipeline
    estimator_pipeline = make_estimator_pipeline(estimator_list=estimator_list, **estimator_params["init_params"])
    print(estimator_params["init_params"])
    # Initialize the Cross Validator
    cv = get_cross_validator(**kwargs["cross_validation_params"])
    # Perform Random Search and Store Results
    for est in estimator_list:
        print("Estimator is",est)
        grid = hp_optimizer(estimator=estimator_pipeline[est], cv=cv, **kwargs["hp_optimizer"]).fit(X=X_train, y=y_train, **estimator_params["fit_params"][f"{est}__fit_params"])
        
        
def hp_optimizer(estimator: Any, search_params:dict, cv:Any, optimizer_type:str, scoring:str, n_jobs:int):
    if "GridSearchCV" in optimizer_type:
        return GridSearchCV(
                        estimator=estimator, # Target Estimator
                        param_distributions=search_params, # Hyperparameters set
                        cv=cv, # Cross Validator
                        n_jobs=n_jobs, # Number of parallel jobs
                        scoring=scoring, # Scoring parameter
                        Verbose=True
                        )
    elif "RandomSearchCV" in optimizer_type:
        return RandomizedSearchCV(
                        estimator=estimator, # Target Estimator
                        param_distributions=search_params, # Hyperparameters set
                        cv=cv, # Cross Validator
                        n_jobs=n_jobs, # Number of parallel jobs
                        scoring=scoring, # Scoring parameter
                        )    
        
def make_estimator_pipeline(estimator_list: list, **kwargs):
    estimator_pipeline = dict()
    print("Estimators are:",estimator_list)
    # print(kwargs)
    if "xgboost" in estimator_list:
        estimator_pipeline["xgboost"] = xgb.XGBRegressor(**kwargs["xgboost__init_params"])
        # print(kwargs["xgboost__init_params"])
    if "lightgbm" in estimator_list:
        estimator_pipeline["lightgbm"] = lgb.LGBMRegressor(**kwargs["lightgbm__init_params"])
        # print(kwargs["lightgbm__init_params"])
    if "catboost" in estimator_list:
        estimator_pipeline["catboost"] = cbt.CatBoostRegressor(**kwargs["catboost__init_params"])
        # print(kwargs["catboost__init_params"])
    return estimator_pipeline

def get_cross_validator(validator: str, k: int=9, test_size: int=81, gap: int=0):
    if validator == "time_series":
        from sklearn.model_selection import TimeSeriesSplit
        # print("time_series_cross_val")
        tscv = TimeSeriesSplit(gap=gap, max_train_size=None, n_splits=k, test_size=test_size)
        return tscv
    else:
        print("k-fold cross validation")
        return k
grid_search_pipeline(data=main_train, **gs_pipeline_params)

Date range date at train is:  2017-04-01 00:00:00 2021-10-01 00:00:00 with shape of:  (4455, 21)
Date range date at test is:  2021-11-01 00:00:00 2021-11-01 00:00:00 with shape of:  (81, 21)
Estimators are: ['xgboost', 'lightgbm', 'catboost']
{'xgboost__init_params': {'objective': 'reg:gamma'}, 'lightgbm__init_params': {'objective': 'gamma'}, 'catboost__init_params': {}}
Estimator is xgboost
{'optimizer_type': 'RandomSearchCV', 'scoring': 'neg_mean_absolute_percantage_error', 'n_jobs': -1}
Estimator is lightgbm
{'optimizer_type': 'RandomSearchCV', 'scoring': 'neg_mean_absolute_percantage_error', 'n_jobs': -1}
Estimator is catboost
{'optimizer_type': 'RandomSearchCV', 'scoring': 'neg_mean_absolute_percantage_error', 'n_jobs': -1}
