In [4]:
import numpy as np
import pandas as pd
import polars as pl
from autogluon.timeseries import TimeSeriesDataFrame, TimeSeriesPredictor

from datetime import timedelta
from epiweeks import Week

In [5]:
def get_epiweek_start_dates(start_week, end_week):
    """Build a table of epidemiological weeks from ``start_week`` to ``end_week``, inclusive.

    Parameters
    ----------
    start_week : Week
        First week of the range.
    end_week : Week
        Last week of the range (included in the result).

    Returns
    -------
    pl.DataFrame
        One row per week, sorted by ``date``, with two columns:
        ``date`` (the value of ``Week.startdate()``) and ``epiweek``
        (an integer of the form ``YYYYWW``, e.g. ``202241``).

    Raises
    ------
    ValueError
        If ``start_week`` falls after ``end_week``. The loop below only stops
        when it lands exactly on ``end_week``, so a reversed range would
        otherwise never terminate.
    """
    if (start_week.year, start_week.week) > (end_week.year, end_week.week):
        raise ValueError(
            f"start_week ({start_week.year}-W{start_week.week:02d}) must not be after "
            f"end_week ({end_week.year}-W{end_week.week:02d})"
        )

    week_dates = []
    epiweeks = []
    current_week = start_week

    while True:
        week_dates.append(current_week.startdate())
        # Encode year and zero-padded week number as a single integer, e.g. 2022/41 -> 202241
        epiweeks.append(int(f"{current_week.year}{current_week.week:02d}"))

        if current_week.year == end_week.year and current_week.week == end_week.week:
            break

        # Step forward by seven days and let Week resolve the year rollover
        next_date = current_week.startdate() + timedelta(days=7)
        current_week = Week.fromdate(next_date)

    return pl.DataFrame({'date': week_dates, 'epiweek': epiweeks}).sort('date')

# Every validation set spans epiweek 41 of one year through epiweek 40 of the next.

# Validation set 1: 2022-W41 .. 2023-W40
start_week, end_week = Week(2022, 41), Week(2023, 40)
validation_dates_1 = get_epiweek_start_dates(start_week, end_week)

# Validation set 2: 2023-W41 .. 2024-W40
start_week, end_week = Week(2023, 41), Week(2024, 40)
validation_dates_2 = get_epiweek_start_dates(start_week, end_week)

# Validation set 3: 2024-W41 .. 2025-W40
start_week, end_week = Week(2024, 41), Week(2025, 40)
validation_dates_3 = get_epiweek_start_dates(start_week, end_week)

# Date tables keyed by validation set number (1, 2, 3)
validation_date = dict(
    enumerate((validation_dates_1, validation_dates_2, validation_dates_3), start=1)
)

In [6]:
# Load the dataset, drop the 'ES' rows, expose the case counts under the
# name 'target', and add their log1p transform as 'log_casos'.
data = (
    pl.read_parquet('../data/3_primary/complete_dataset.parquet')
    .filter(pl.col('uf') != 'ES')
    .rename({'casos': 'target'})
    .with_columns(np.log1p(pl.col("target")).alias("log_casos"))
)

# One row per 'uf' holding the first latitude/longitude seen for it
static_features = data.group_by('uf').agg(
    pl.col('uf_latitude').first(),
    pl.col('uf_longitude').first(),
)

In [7]:
def train_test_sprint_split(data,validation_set=1):
    """Split ``data`` into train and test ``TimeSeriesDataFrame`` objects for one validation set.

    Parameters
    ----------
    data : pl.DataFrame
        Must contain the columns ``uf``, ``date``, ``target``, ``enso``,
        ``log_casos`` and the boolean flag columns ``train_<validation_set>``
        and ``target_<validation_set>``.
    validation_set : int, default 1
        Suffix selecting which ``train_*`` / ``target_*`` flag columns to use.

    Returns
    -------
    tuple
        ``(train, test)``. ``train`` holds the rows flagged by
        ``train_<validation_set>``. ``test`` holds the rows flagged by
        ``target_<validation_set>`` plus every row dated strictly before the
        latest flagged target date.

    Raises
    ------
    ValueError
        If no row is flagged by ``target_<validation_set>``, so that no cutoff
        date can be derived.

    Notes
    -----
    Reads the module-level ``static_features`` frame and attaches it to both
    outputs as static features.
    """
    exo = ['enso','log_casos']
    columns = ['uf', 'date', 'target', *exo]
    train_flag = f'train_{validation_set}'
    target_flag = f'target_{validation_set}'

    # Latest target date, taken directly in polars (no sort or numpy round-trip needed)
    last_target_date = data.filter(pl.col(target_flag))['date'].max()
    if last_target_date is None:
        raise ValueError(f"no rows flagged by '{target_flag}'; cannot derive the test cutoff date")

    def to_timeseries(frame):
        # Shared conversion so train and test are built with identical settings
        return TimeSeriesDataFrame.from_data_frame(
            frame.to_pandas(),
            id_column='uf',
            timestamp_column='date',
            static_features_df=static_features.to_pandas(),
        )

    train = to_timeseries(data.filter(pl.col(train_flag)).select(columns))
    test = to_timeseries(
        data.filter(
            pl.col(target_flag) | (pl.col('date') < last_target_date)
        ).select(columns)
    )
    return train,test

def train_model(train, validation_set=1, prediction_length=67):
    """Fit a ``TimeSeriesPredictor`` with a single Chronos ``bolt_small`` model.

    Parameters
    ----------
    train : TimeSeriesDataFrame
        Training data passed to ``fit`` as ``train_data``.
    validation_set : int, default 1
        Appended to the model directory name so that each validation set is
        saved under its own path.
    prediction_length : int, default 67
        Forecast horizon given to the predictor. The default keeps the value
        that was previously hard-coded.
        NOTE(review): 67 presumably covers the gap between the end of the
        training window and the end of the target season — confirm.

    Returns
    -------
    TimeSeriesPredictor
        The object returned by ``TimeSeriesPredictor(...).fit(...)``.
    """
    predictor = TimeSeriesPredictor(
        prediction_length=prediction_length,
        path='./SprintModels/chronos_bolt_small'+str(validation_set)
    ).fit(
        train_data=train,
        hyperparameters={
            "Chronos": [
                # fine_tune is off here; flip it to True to try the fine-tuned variant
                {"model_path": "bolt_small", "fine_tune": False, "ag_args": {"name_suffix": ""}},
            ]
        },
        # No time_limit is set; pass time_limit=<seconds> to cap the fit
        enable_ensemble=False,
    )
    return predictor

def validation_sprint(validation_dates, train, predictor, validation_set=1):
    """Forecast from ``train`` and keep only the rows that fall on validation dates.

    Parameters
    ----------
    validation_dates : pl.DataFrame
        Table with ``date`` and ``epiweek`` columns; joined onto the forecast
        by ``date``.
    train : TimeSeriesDataFrame
        Data handed to ``predictor.predict``.
    predictor : TimeSeriesPredictor
        Fitted predictor that contains a model named ``'Chronos[bolt_small]'``.
    validation_set : int, default 1
        Not used by this function; kept in the signature for callers.

    Returns
    -------
    pl.DataFrame
        The forecast with ``item_id`` renamed to ``uf`` and ``timestamp``
        renamed to ``date`` (converted to a plain date), restricted to rows
        whose date matched an entry in ``validation_dates``.
    """
    forecast = predictor.predict(data=train, model='Chronos[bolt_small]')

    forecast_frame = (
        pl.DataFrame(forecast.reset_index())
        .rename({'item_id': 'uf', 'timestamp': 'date'})
        # Drop the time component so the join key matches the date column of validation_dates
        .with_columns(pl.col('date').dt.date())
    )

    # Left join then discard rows with no epiweek, i.e. dates outside the validation window
    with_epiweek = forecast_frame.join(validation_dates, on='date', how='left')
    return with_epiweek.filter(pl.col('epiweek').is_not_null())


In [8]:
# For each validation set: split, fit, forecast, and save the forecast to parquet.
for v in (1, 2, 3):
    # `test` is produced by the split but is not used in this loop
    train, test = train_test_sprint_split(data, v)
    predictor = train_model(train, validation_set=v)
    validation = validation_sprint(
        validation_date[v],
        train,
        predictor,
        validation_set=v,
    )
    output_path = f'../data/4_model_output/validation_sprint_{v}.parquet'
    validation.write_parquet(output_path)

Beginning AutoGluon training...
AutoGluon will save models to '/Users/davibarreira/Documents/DSProjects/Mosqlimate/jbd-mosqlimate-sprint/train_model/SprintModels/chronos_bolt_small1'
AutoGluon Version:  1.4.0
Python Version:     3.11.11
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 24.5.0: Tue Apr 22 19:54:33 PDT 2025; root:xnu-11417.121.6~2/RELEASE_ARM64_T8122
CPU Count:          8
GPU Count:          1
Memory Avail:       4.10 GB / 16.00 GB (25.6%)
Disk Space Avail:   378.45 GB / 926.35 GB (40.9%)

Fitting with arguments:
{'enable_ensemble': False,
 'eval_metric': WQL,
 'hyperparameters': {'Chronos': [{'ag_args': {'name_suffix': ''},
                                  'fine_tune': False,
                                  'model_path': 'bolt_small'}]},
 'known_covariates_names': [],
 'num_val_windows': 1,
 'prediction_length': 67,
 'quantile_levels': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
 'random_seed': 123,
 'refit_every_n_wind

In [9]:
def _sorted_target_dates(validation_set):
    """Return the distinct dates flagged by ``target_<validation_set>`` in ``data``, ascending."""
    # Deduplicate first and sort last: unique() does not guarantee row order by default,
    # so sorting before it could leave the dates out of order and break the comparison.
    return data.filter(pl.col(f'target_{validation_set}'))[['date']].unique().sort(by='date')


# Sets 1 and 2: the flagged target dates must match the generated epiweek start dates exactly
print((_sorted_target_dates(1) == validation_dates_1['date']).to_numpy().all())
print((_sorted_target_dates(2) == validation_dates_2['date']).to_numpy().all())

# Set 3: compare only up to the last flagged target date in `data`
target_dates_3 = _sorted_target_dates(3)
max_date_v3 = target_dates_3['date'].max()
(target_dates_3 == validation_dates_3[['date']].filter(pl.col('date') <= max_date_v3)).to_numpy().all()

True
True


True