In [1]:
import os
import boto3
from dotenv import load_dotenv
import io 
import pandas as pd

# Load variables from a local .env file (if any) into the process environment,
# then read the four YC_* settings used by the cells below.
# Each value is None when the corresponding variable is not set.
load_dotenv()

(
    YC_ACCESS_KEY_ID,
    YC_SECRET_ACCESS_KEY,
    YC_ENDPOINT_URL,
    YC_BUCKET_NAME,
) = (
    os.environ.get(setting_name)
    for setting_name in (
        "YC_ACCESS_KEY_ID",
        "YC_SECRET_ACCESS_KEY",
        "YC_ENDPOINT_URL",
        "YC_BUCKET_NAME",
    )
)

In [2]:
# Create an S3 client that talks to the endpoint and uses the credentials
# read from the environment in the configuration cell.
session = boto3.session.Session()
s3_client = session.client(
    's3',
    aws_access_key_id=YC_ACCESS_KEY_ID,
    aws_secret_access_key=YC_SECRET_ACCESS_KEY,
    endpoint_url=YC_ENDPOINT_URL,
)

In [3]:
import gzip


# Key prefix inside the bucket and the gzip-compressed CSV objects to download.
data_location = "final_train_datasets/"
file_names = ["0m_lags.csv.gzip", "3m_lags.csv.gzip", "6m_lags.csv.gzip", "12m_lags.csv.gzip"]

# Extension shared by the objects above. It is stripped to build the dictionary
# key (e.g. "12m_lags.csv.gzip" -> "12m_lags") instead of slicing off a
# hard-coded number of characters.
file_suffix = ".csv.gzip"

# Maps the dataset name (file name without the suffix) to the loaded DataFrame.
data_storage = dict()
for file_name in file_names:
    response = s3_client.get_object(Bucket=YC_BUCKET_NAME, Key=f'{data_location}{file_name}')
    # Names that do not carry the expected suffix are used as keys unchanged.
    if file_name.endswith(file_suffix):
        dataset_key = file_name[: -len(file_suffix)]
    else:
        dataset_key = file_name
    # The object body is read fully into memory and decompressed by pandas.
    data_storage[dataset_key] = pd.read_csv(io.BytesIO(response['Body'].read()), compression='gzip')

A small feature-engineering step was performed beforehand: lag features were added to the data.

In [5]:
# Preview the first rows of the dataset stored under the '12m_lags' key.
data_storage['12m_lags'].head()

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month,item_category_id,item_cnt_month_3m_avg,item_cnt_month_6m_avg,item_cnt_month_12m_avg,item_cnt_month_lag_1,item_cnt_month_lag_2,item_cnt_month_lag_3,item_cnt_month_lag_4,item_cnt_month_lag_5,item_cnt_month_lag_6,item_cnt_month_lag_7,item_cnt_month_lag_8,item_cnt_month_lag_9,item_cnt_month_lag_10,item_cnt_month_lag_11,item_cnt_month_lag_12
0,1,0,30,22.0,40,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0,31,11.0,37,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,0,32,6.0,40,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,0,32,10.0,40,6.0,6.0,6.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,0,33,3.0,37,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Add '../src' to the module search path so the project-local
# validation_schema module can be imported, then import the validator class.
# NOTE(review): '../src' is a relative path, so it presumably resolves against
# the kernel's working directory; the saved output of this cell is a
# ModuleNotFoundError — confirm the notebook is started from a directory where
# ../src contains validation_schema.py.
import sys
sys.path.append('../src')
from validation_schema import TimeSeriesRollingValidator

ModuleNotFoundError: No module named 'validation_schema'

Generating a sliding window for model evaluation.

In [5]:
# Build the rolling splits from the dataset without lag features, using
# 'date_block_num' as the time column.
# NOTE(review): presumably train_window / test_window are counted in
# date_block_num units (months) — confirm against TimeSeriesRollingValidator.
validator = TimeSeriesRollingValidator(
    data_storage["0m_lags"],
    'date_block_num',
    train_window=24,
    test_window=1,
)
splits = validator.split_data_rolling()

In [6]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error

In [7]:
# Fit one LinearRegression per rolling split, in split order, so that
# models[i] corresponds to splits[i].
# Only the training frame is needed for fitting; the validation and test
# frames of each split are left untouched here (evaluation is delegated to
# validator.validate), so the previously computed-but-unused X_val / y_val /
# X_test / y_test frames are no longer built.
models = []
for train_set, val_set, test_set in splits:
    # Features are every column except the target.
    X_train = train_set.drop(columns=['item_cnt_month'])
    y_train = train_set['item_cnt_month']

    model = LinearRegression()
    model.fit(X_train, y_train)

    models.append(model)

In [8]:
# Evaluate the fitted models against the splits with 'item_cnt_month' as the
# target column; the returned table is displayed as the cell output.
validator.validate(models, splits, "item_cnt_month")

Unnamed: 0,model,rmse,train_months,val_months,test_months
0,LinearRegression,2.035288,1242622,46218,41008
1,LinearRegression,2.081189,1226186,41008,40039
2,LinearRegression,2.486169,1207732,40039,32463
3,LinearRegression,2.129647,1184277,32463,31799
4,LinearRegression,2.056785,1162536,31799,31605
...,...,...,...,...,...
76,LinearRegression,2.040697,1162536,31799,31605
77,LinearRegression,1.824572,1141404,31605,33248
78,LinearRegression,2.011911,1117272,33248,33183
79,LinearRegression,2.475891,1092955,33183,29271


I've used a **rolling-window forecast** approach to evaluate my model. This is the gold standard for time series, as it prevents data leakage and provides a realistic measure of future performance. I used the last month for validation and the preceding months for training, which accurately simulates a real-world forecasting scenario.