In [1]:
import sys
sys.path.append('..') 
import os

In [2]:
import pandas as pd
from datetime import date, timedelta, datetime
from dateutil.relativedelta import relativedelta

## Data Loading

In [3]:
# # Save feature_engineered_df 
# feature_engineered_df.to_csv("data/feature_engineered_df.csv")
# # Load feature_engineered_df 
parse_dates = ["date"]
feature_engineered_df = pd.read_csv('data/feature_engineered_df.csv', index_col=[0], parse_dates=parse_dates)
feature_engineered_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4698 entries, 15 to 5912
Data columns (total 18 columns):
 #   Column                              Non-Null Count  Dtype         
---  ------                              --------------  -----         
 0   date                                4698 non-null   datetime64[ns]
 1   province                            4698 non-null   object        
 2   current_month_consumption           4698 non-null   float64       
 3   last_year_same_month_consumption    4698 non-null   float64       
 4   last_year_total_consumption         4698 non-null   float64       
 5   current_month_share                 4698 non-null   float64       
 6   previous_1_month_share              4698 non-null   float64       
 7   previous_2_month_share              4698 non-null   float64       
 8   previous_3_month_share              4698 non-null   float64       
 9   previous_1_month_consumption        4698 non-null   float64       
 10  previous_2_month_consum

## Time-Series Split Testing

In [4]:
deneme = feature_engineered_df.sort_values(by=["date"]).reset_index(drop=True).copy()
from sklearn.model_selection import TimeSeriesSplit
tscv = TimeSeriesSplit(gap=0, max_train_size=None, n_splits=5, test_size=81)
cv = tscv
for train_index, test_index in tscv.split(deneme):
    print("TRAIN:", train_index, "TEST:", test_index)

TRAIN: [   0    1    2 ... 4290 4291 4292] TEST: [4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306
 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320
 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334
 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348
 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362
 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373]
TRAIN: [   0    1    2 ... 4371 4372 4373] TEST: [4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387
 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401
 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415
 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429
 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443
 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454]
TRAIN: [   0    1    2 ... 4452 4453 4454] TEST: [4455 4456 4457 4458 4459 446

## Data Preperation

In [5]:
model_df = feature_engineered_df.sort_values(by=["date"]).reset_index(drop=True).copy()

In [6]:
# from src.model_selection.data_preperation import DataPreperation

# Initialize Data Preperation module
# dp = DataPreperation(data=model_df)
# # Train-Test split
# X_train, y_train, X_test, y_test = dp.tts_last_month(index_column1="date" ,index_column2="province")

# latter
# X_train, y_train, X_test, y_test, X_val, y_val = dp.ttvs_last_month(index_column1="date" ,index_column2="province",lag=lag, split_size = 0.05)
# print("Maximum date at val is: ", X_test.index.max(), " Shape is: ", X_val.shape)
# print("Minimum date at val is: ", X_test.index.min(), " Shape is: ", X_val.shape)  
# print("Maximum date at train is: ", X_train.index.max()," Shape is: ", X_train.shape)
# print("Minimum date at train is: ", X_train.index.min()," Shape is: ", X_train.shape)
# print("Maximum date at test is: ", X_test.index.max(), " Shape is: ", X_test.shape)
# print("Minimum date at test is: ", X_test.index.min(), " Shape is: ", X_test.shape)

## Model Selection

### Utils

#### Scoring

In [7]:
from typing import Any
import numpy as np
from sklearn.metrics import mean_squared_error 
def rmse(actual_value: Any, prediction: Any):
    return np.sqrt(((actual_value - prediction) ** 2)/2)

def ape(actual_value: Any, prediction: Any):
    return abs(actual_value-prediction)/actual_value*100
     
def prep_scoring_df(X_test: pd.DataFrame, y_test: pd.DataFrame, target_col: str="current_month_consumption", col_list: list=None)-> pd.DataFrame:
    scoring_df = X_test.copy()
    scoring_df["actual_value"] = y_test[target_col]
    scoring_df.reset_index(level="province",inplace=True)
    if not col_list:
        col_list = ["province", "actual_value"]
    scoring_df = scoring_df[col_list]
    return scoring_df

def scoring(X_test: pd.DataFrame, y_test: pd.DataFrame, scoring_df: pd.DataFrame, predictions: Any, model_name: str="XGB")-> pd.DataFrame:
    if scoring_df.empty:
        scoring_df = prep_scoring_df(X_test=X_test, y_test=y_test)
    else: 
        scoring_df = scoring_df.copy()
    # Insert predictions
    scoring_df[f"{model_name}_prediction"] = predictions
    # Add rmse
    scoring_df[f"{model_name}_rmse"] = scoring_df.apply(lambda row: rmse(row.actual_value, row[f"{model_name}_prediction"]), axis=1)
    # Add percentage difference
    scoring_df[f"{model_name}_absolute_percent_error"] = scoring_df.apply(lambda row: ape(row.actual_value, row[f"{model_name}_prediction"]), axis=1)
    
    print("Number of records greater than 20 percent error:",(scoring_df[f"{model_name}_absolute_percent_error"] > 20).sum())
    print("Number of records greater than 15 percent error:",(scoring_df[f"{model_name}_absolute_percent_error"] > 15).sum())
    print("Number of records greater than 10 percent error:",(scoring_df[f"{model_name}_absolute_percent_error"] > 10).sum())
    print("Number of records greater than 5 percent error:",(scoring_df[f"{model_name}_absolute_percent_error"] > 5).sum())
    return scoring_df

### XGB

In [8]:
# Initialization parameters
init_params = {
    # "early_stopping_rounds": 20,
    # "eval_metric": ["rmsle","mape","rmse"] # default ,mae(kötü) ,mape(kötü), rmse, rmsle(1 tane 54 gerisi 20nin altında), 
}

# Grid Search Parameters
# grid_search_params = {
#     'alpha': [0.005, 0.01, 0.015],    
#     'colsample_bytree': [1.0],
#     'learning_rate': [0.045, 0.050],    
#     'max_depth': [7, 8, 9],
#     'min_child_weight': [2, 3],
#     'n_estimators': [400, 450, 500],
#     'subsample': [1],
# }
grid_search_params = {
    'alpha': [0.005, 0.0075, 0.01, 0.0125, 0.015],    
    'colsample_bytree': [0.8, 1.0],
    'learning_rate': [0.045, 0.050],    
    'max_depth': [6, 7, 8],
    'min_child_weight': [1,2, 3],
    'n_estimators': [400, 450, 500, 550, 1000],
    'subsample': [0.5, 1],
}
fit_params = {}

In [9]:
from src.model_selection.data_preperation import DataPreperation
from src.model_selection.modelling import xgb_simulator

for lag in range(3):
    print("Run for Lag=", lag)
    # Train test split
    dp = DataPreperation(data=model_df)
    X_train, y_train, X_test, y_test = dp.tts_last_month(index_column1="date" ,index_column2="province", lag=lag)

    # Initialize Simulator
    xgb = xgb_simulator(init_params=init_params, fit_params=fit_params, 
                        grid_search_params=grid_search_params, save=False)
    
    # Train the model
    xgb(X_train=X_train, y_train=y_train)
    # Set the best_model and best_parameter
    best_model = xgb.best_model
    best_params = xgb.best_params
    print(f"Best Parameters when lag = {lag}",best_params)
    # Predict test set
    predictons = best_model.predict(X_test)
    # Log results
    scoring_df=pd.DataFrame()
    scoring_df = scoring(X_test=X_test, y_test=y_test, predictions=predictons, scoring_df=scoring_df, model_name=f"Lag_{lag}_XGB_")    


Run for Lag= 0
Fitting 5 folds for each of 1800 candidates, totalling 9000 fits
Best Parameters when lag = 0 {'alpha': 0.015, 'colsample_bytree': 1.0, 'learning_rate': 0.05, 'max_depth': 8, 'min_child_weight': 2, 'n_estimators': 1000, 'subsample': 0.5}
Number of records greater than 20 percent error: 5
Number of records greater than 15 percent error: 9
Number of records greater than 10 percent error: 12
Number of records greater than 5 percent error: 33
Run for Lag= 1
Fitting 5 folds for each of 1800 candidates, totalling 9000 fits
Best Parameters when lag = 1 {'alpha': 0.0075, 'colsample_bytree': 1.0, 'learning_rate': 0.045, 'max_depth': 8, 'min_child_weight': 1, 'n_estimators': 1000, 'subsample': 0.5}
Number of records greater than 20 percent error: 1
Number of records greater than 15 percent error: 1
Number of records greater than 10 percent error: 2
Number of records greater than 5 percent error: 4
Run for Lag= 2
Fitting 5 folds for each of 1800 candidates, totalling 9000 fits
Best

In [10]:
"""With cv = timseries.split"""
# Run for Lag= 0
# Fitting 5 folds for each of 108 candidates, totalling 540 fits
# Best Parameters when lag = 0 {'alpha': 0.005, 'colsample_bytree': 1.0, 'learning_rate': 0.045, 'max_depth': 7, 'min_child_weight': 3, 'n_estimators': 500, 'subsample': 1}
# Number of records greater than 20 percent error: 6
# Number of records greater than 15 percent error: 9
# Number of records greater than 10 percent error: 16
# Number of records greater than 5 percent error: 37
# Run for Lag= 1
# Fitting 5 folds for each of 108 candidates, totalling 540 fits
# Best Parameters when lag = 1 {'alpha': 0.01, 'colsample_bytree': 1.0, 'learning_rate': 0.045, 'max_depth': 8, 'min_child_weight': 3, 'n_estimators': 500, 'subsample': 1}
# Number of records greater than 20 percent error: 1
# Number of records greater than 15 percent error: 1
# Number of records greater than 10 percent error: 2
# Number of records greater than 5 percent error: 10
# Run for Lag= 2
# Fitting 5 folds for each of 108 candidates, totalling 540 fits
# Best Parameters when lag = 2 {'alpha': 0.01, 'colsample_bytree': 1.0, 'learning_rate': 0.045, 'max_depth': 7, 'min_child_weight': 3, 'n_estimators': 500, 'subsample': 1}
# Number of records greater than 20 percent error: 0
# Number of records greater than 15 percent error: 0
# Number of records greater than 10 percent error: 2
# Number of records greater than 5 percent error: 11

'With cv = timseries.split'

In [11]:
from src.utils.plotting import plot_metrics
plot_metrics(scoring_df, col_x="province", col_y1="XGB_absolute_percent_error", col_y2="XGB_absolute_percent_error")

Shape of dataframe is :  (81, 5)


KeyError: 'XGB_absolute_percent_error'

<Figure size 1800x360 with 0 Axes>