In [1]:
import sys
sys.path.append('..') 
import pandas as pd

## Data Loading

In [2]:
# Kept for reference only (disabled): the call that writes the CSV read below.
# feature_engineered_df.to_csv("data/feature_engineered_df.csv")

# Load the feature-engineered table: the first CSV column becomes the index
# and the "date" column is parsed into datetimes.
parse_dates = ["date"]
feature_engineered_df = pd.read_csv(
    "data/feature_engineered_df.csv",
    index_col=[0],
    parse_dates=parse_dates,
)
feature_engineered_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4698 entries, 15 to 5912
Data columns (total 22 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   date                         4698 non-null   datetime64[ns]
 1   province                     4698 non-null   object        
 2   current_month_consumption    4698 non-null   float64       
 3   ARIMA_prediction             4698 non-null   float64       
 4   last_year_total_consumption  4698 non-null   float64       
 5   rolling_mean_2               4698 non-null   float64       
 6   rolling_mean_3               4698 non-null   float64       
 7   lag1_monthly_share           4698 non-null   float64       
 8   lag2_monthly_share           4698 non-null   float64       
 9   lag3_monthly_share           4698 non-null   float64       
 10  lag1                         4698 non-null   float64       
 11  lag2                         4698 non-null

## Data Preparation

### Ultimate Train Test Split

In [3]:
from src.model_selection.data_preperation import DataPreperation as dp
# Order the rows chronologically and renumber them from 0 before splitting.
model_df = (
    feature_engineered_df
    .sort_values(by=["date"])
    .reset_index(drop=True)
    .copy()
)
# Outer ("ultimate") split; the helper prints the date range and shape of each part.
main_train, main_test = dp.train_test_split(
    data=model_df,
    index_column1="date",
    index_column2="province",
    lag=1,
)

Date range date at train is:  2017-04-01 00:00:00 2021-11-01 00:00:00 with shape of:  (4536, 21)
Date range date at test is:  2021-12-01 00:00:00 2021-12-01 00:00:00 with shape of:  (81, 21)


### Grid Search Train Test Split

In [4]:
# Inner split used for the grid search, taken from the outer training set only.
# reset_index() moves the current index of main_train back into ordinary columns
# before the frame is handed to the splitter.
X_train, y_train, X_test, y_test = dp.ts_train_test_split(
    data=main_train.reset_index(),
    index_column1="date",
    index_column2="province",
    lag=0,
)

Date range date at train is:  2017-04-01 00:00:00 2021-10-01 00:00:00 with shape of:  (4455, 21)
Date range date at test is:  2021-11-01 00:00:00 2021-11-01 00:00:00 with shape of:  (81, 21)
Maximum date at train is:  (Timestamp('2021-10-01 00:00:00'), 'ŞIRNAK')  Shape is:  (4455, 19)
Minimum date at train is:  (Timestamp('2017-04-01 00:00:00'), 'ADANA')  Shape is:  (4455, 19)
Maximum date at test is:  (Timestamp('2021-11-01 00:00:00'), 'ŞIRNAK')  Shape is:  (81, 19)
Minimum date at test is:  (Timestamp('2021-11-01 00:00:00'), 'ADANA')  Shape is:  (81, 19)


## Hyperparameter Optimization

In [19]:
import lightgbm as lgb
import xgboost as xgb
import catboost as cbt
from sklearn.model_selection import TimeSeriesSplit

In [42]:
# Keyword configuration that is unpacked into grid_search_pipeline(...).
gs_pipeline_params = {
    # Names of the estimators to build; each needs a "<name>__init_params" entry below.
    "estimator_list":["xgboost","lightgbm","catboost"],
    # "cv":,
    "estim_params": {
        # Constructor keyword arguments, one dict per estimator.
        "init_params":
                {
                "xgboost__init_params":{"objective":"reg:gamma"},
                "lightgbm__init_params":{"objective":"gamma"},
                "catboost__init_params":{}
                },
        # "grid_search_params":value,
        # "fit_params":value
    },
    # Spelling fixed: scikit-learn's scorer name is "neg_mean_absolute_percentage_error";
    # the previous "...percantage..." value is not a registered scorer.
    "scoring":"neg_mean_absolute_percentage_error",
    # Forwarded verbatim to dp.ts_train_test_split inside grid_search_pipeline.
    "gs_train_test_split":{"index_column1":"date",
                           "index_column2":"province",
                           "lag":0, 
                           "verbose": False}
}
gs_pipeline_params["gs_train_test_split"]

{'index_column1': 'date',
 'index_column2': 'province',
 'lag': 0,
 'verbose': False}

In [43]:
from typing import Any
def grid_search_pipeline(data: pd.DataFrame, estimator_list: list,
                        estim_params: dict, cv: Any=None,
                        scoring: str="neg_mean_absolute_percentage_error", **kwargs):
    """Build the grid-search train/test split and the estimators to be tuned.

    Parameters
    ----------
    data : pd.DataFrame
        Frame to split; ``reset_index()`` is applied before it is passed to
        ``dp.ts_train_test_split``.
    estimator_list : list
        Estimator names forwarded to ``make_estimator_pipeline``.
    estim_params : dict
        Must contain an ``"init_params"`` mapping, which is unpacked as keyword
        arguments into ``make_estimator_pipeline``.
    cv : Any, optional
        Accepted but not used by the current body.
    scoring : str, optional
        Accepted but not used by the current body. The default is the
        scikit-learn scorer name ``"neg_mean_absolute_percentage_error"``
        (the earlier default was a misspelling of it).
    **kwargs
        Must contain ``"gs_train_test_split"``: a dict of keyword arguments
        forwarded to ``dp.ts_train_test_split``.

    Raises
    ------
    KeyError
        If ``"gs_train_test_split"`` is missing from ``kwargs`` or
        ``"init_params"`` is missing from ``estim_params``.

    Returns
    -------
    None
        NOTE(review): the split and the estimators are built but neither is
        used nor returned yet; the search itself still has to be added.
    """
    # Grid Search Train Test Split
    X_train, y_train, X_test, y_test = dp.ts_train_test_split(data=data.reset_index(),
                                                            **kwargs["gs_train_test_split"])
    # Get Estimators from the pipeline
    estimator_pipeline = make_estimator_pipeline(estimator_list=estimator_list, **estim_params["init_params"])
def make_estimator_pipeline(estimator_list: list, **kwargs):
    """Instantiate the requested regressors.

    Parameters
    ----------
    estimator_list : list
        Estimator names to build. Recognised names are ``"xgboost"``,
        ``"lightgbm"`` and ``"catboost"``; any other entry is ignored.
    **kwargs
        Optional ``"<name>__init_params"`` dicts (for example
        ``xgboost__init_params``) whose contents are passed to the matching
        constructor. A missing or ``None`` entry means "use the library
        defaults" instead of raising ``KeyError``.

    Returns
    -------
    dict
        Maps each requested, recognised name to its new estimator instance,
        in the order xgboost, lightgbm, catboost.
    """
    # The constructors are wrapped in lambdas so that a library is only touched
    # when its estimator was actually requested.
    constructors = {
        "xgboost": lambda params: xgb.XGBRegressor(**params),
        "lightgbm": lambda params: lgb.LGBMRegressor(**params),
        "catboost": lambda params: cbt.CatBoostRegressor(**params),
    }
    print("Estimators are:",estimator_list)
    estimator_pipeline = dict()
    for name, build in constructors.items():
        if name not in estimator_list:
            continue
        init_params = kwargs.get(f"{name}__init_params") or {}
        estimator_pipeline[name] = build(init_params)
    return estimator_pipeline
# Run the pipeline on the outer training set; the configuration dict is
# unpacked into keyword arguments of grid_search_pipeline.
grid_search_pipeline(data=main_train, **gs_pipeline_params)

Date range date at train is:  2017-04-01 00:00:00 2021-10-01 00:00:00 with shape of:  (4455, 21)
Date range date at test is:  2021-11-01 00:00:00 2021-11-01 00:00:00 with shape of:  (81, 21)
Estimators are: ['xgboost', 'lightgbm', 'catboost']
