In [1]:
import os
import boto3
from dotenv import load_dotenv
import io 
import pandas as pd

# Populate the process environment from a .env file before reading settings.
load_dotenv()

# Object-storage settings, all taken from the environment (never hard-coded).
# The two credential values may legitimately be None: boto3 then uses the
# credentials already configured for the session.
YC_ACCESS_KEY_ID = os.getenv("YC_ACCESS_KEY_ID")
YC_SECRET_ACCESS_KEY = os.getenv("YC_SECRET_ACCESS_KEY")
YC_ENDPOINT_URL = os.getenv("YC_ENDPOINT_URL")
YC_BUCKET_NAME = os.getenv("YC_BUCKET_NAME")

# Fail fast on the settings that have no usable fallback: without an endpoint
# the client would target boto3's default endpoint, and without a bucket name
# every download below fails with a far less obvious error.
_missing_settings = [
    name
    for name, value in (
        ("YC_ENDPOINT_URL", YC_ENDPOINT_URL),
        ("YC_BUCKET_NAME", YC_BUCKET_NAME),
    )
    if not value
]
if _missing_settings:
    raise RuntimeError(
        "Missing required environment variables: " + ", ".join(_missing_settings)
    )

In [2]:
# Create an S3 client from a dedicated boto3 session, pointed at the endpoint
# and credentials read from the environment.
session = boto3.session.Session()
s3_client_options = {
    'service_name': 's3',
    'endpoint_url': YC_ENDPOINT_URL,
    'aws_access_key_id': YC_ACCESS_KEY_ID,
    'aws_secret_access_key': YC_SECRET_ACCESS_KEY,
}
s3_client = session.client(**s3_client_options)

In [3]:
import gzip


# Key prefix inside the bucket under which the training sets are stored.
data_location = "final_train_datasets/"
file_names = ["0m_lags.csv.gzip", "3m_lags.csv.gzip", "6m_lags.csv.gzip", "12m_lags.csv.gzip"]
# Extension shared by the files above; it is stripped to build the dict key
# (e.g. "0m_lags.csv.gzip" -> "0m_lags").
FILE_SUFFIX = ".csv.gzip"


# Download every file and parse it into a DataFrame keyed by its base name.
data_storage = dict()
for file_name in file_names:
    response = s3_client.get_object(Bucket=YC_BUCKET_NAME, Key=f'{data_location}{file_name}')
    # removesuffix strips the extension only when it is actually present,
    # instead of blindly cutting a fixed number of characters off the name.
    dataset_name = file_name.removesuffix(FILE_SUFFIX)
    # The object is read fully into memory and decompressed by pandas.
    data_storage[dataset_name] = pd.read_csv(io.BytesIO(response['Body'].read()), compression='gzip')




In [4]:
import sys
# Extend the module search path so the import below can find validation_schema.
# NOTE(review): '../src' is a relative path, so it is resolved against the
# kernel's current working directory — confirm the notebook is always started
# from its own folder.
sys.path.append('../src')
from validation_schema import TimeSeriesRollingValidator

Генерируем скользящее окно для оценки модели.

In [5]:
# Build the rolling validator over the "0m_lags" dataset, using the
# 'date_block_num' column, with train_window=24 and test_window=1.
# NOTE(review): the window sizes are presumably counted in 'date_block_num'
# units — confirm in validation_schema.
validator = TimeSeriesRollingValidator(data_storage["0m_lags"], 'date_block_num', train_window=24, test_window=1)
# The training loop below unpacks each element of `splits` as
# (train_set, val_set, test_set).
splits = validator.split_data_rolling()

In [6]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error

In [7]:
# Name of the column the models learn to predict.
TARGET_COLUMN = 'item_cnt_month'

# Fit one independent LinearRegression per rolling split. Only the training
# part of each split is needed for fitting, so the validation and test parts
# are unpacked but deliberately left untouched here; they are still passed on
# to the validator through `splits`.
models = list()
for train_set, val_set, test_set in splits:
    X_train = train_set.drop(columns=[TARGET_COLUMN])
    y_train = train_set[TARGET_COLUMN]

    model = LinearRegression()
    model.fit(X_train, y_train)

    # models[i] corresponds to splits[i].
    models.append(model)

In [8]:
# Pass the fitted models, their splits and the target column name to the
# validator; the recorded output is a table with one row per model
# (model name, rmse and train/val/test sizes).
# NOTE(review): presumably each model is scored on its own split — confirm in
# validation_schema.
validator.validate(models, splits, "item_cnt_month")

Unnamed: 0,model,rmse,train_months,val_months,test_months
0,LinearRegression,2.035288,1242622,46218,41008
1,LinearRegression,2.087133,1226186,41008,40039
2,LinearRegression,2.489635,1207732,40039,32463
3,LinearRegression,2.133188,1184277,32463,31799
4,LinearRegression,2.048215,1162536,31799,31605
5,LinearRegression,1.831536,1141404,31605,33248
6,LinearRegression,2.015752,1117272,33248,33183
7,LinearRegression,2.478341,1092955,33183,29271
8,LinearRegression,2.247612,1068589,29271,30950


Видно, что чем больше окно, тем меньше ошибка.

Применять агрегирующие функции для получения новых фич следует после разбиения на splits — тогда не будет утечек данных.