In [252]:
import os
import sys
import pandas as pd
# import polars as pl
import numpy as np
import json
import random
import matplotlib.pyplot as plt
import lightgbm as lgb
from sklearn.ensemble import VotingRegressor
from catboost import CatBoostRegressor
import time

import json

# Project root = parent of the current working directory
ROOT = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Register the project root on sys.path (only once) so local packages import
if ROOT not in sys.path:
    sys.path.append(ROOT)

import warnings
warnings.filterwarnings('ignore')

In [253]:
# Input artefacts live under <ROOT>/data/processed, outputs under <ROOT>/predictions
PROCESSED_DIR = os.path.join(ROOT, 'data', 'processed')
CLEAN_DATA_PATH = os.path.join(PROCESSED_DIR, 'forecast_clean.csv')
CLEAN_CLIENT_PATH = os.path.join(PROCESSED_DIR, 'client_clean.pkl')
PREDICTION_IDS_PATH = os.path.join(ROOT, 'predictions', 'predictions_4.json')

# `ds` is parsed as datetime at load time
data = pd.read_csv(CLEAN_DATA_PATH, parse_dates=['ds'])

In [254]:
# NOTE(review): read_pickle can execute code stored in the file - only load trusted artefacts.
clients = pd.read_pickle(CLEAN_CLIENT_PATH)
# Rename the key so it matches the `unique_id` column used for the merge below
clients = clients.rename(columns={"client_id": "unique_id"})

In [255]:
# Column names of the client table
STATIC = clients.columns.tolist()

# Creating past variables

In [256]:
# Left join: every row of `data` is kept and gains the matching client columns
merged_data = data.merge(clients, on="unique_id", how="left")

In [257]:
def bin_financial_features(df, cols_to_bin=None, q=4):
    """
    Add quantile-bin columns for selected financial features.

    For every column in ``cols_to_bin`` a new ``<col>_bin`` column is added
    holding the quantile bucket of each row (1 = lowest bucket, ``q`` = highest),
    and the bucket edges are printed.

    Parameters:
    df: pandas DataFrame containing the columns to bin
    cols_to_bin: list of column names to bin; defaults to
        ['yearly_income', 'total_debt', 'credit_score']
    q: number of quantile buckets (default 4, i.e. quartiles)

    Returns:
    A copy of df with one extra categorical ``<col>_bin`` column per binned column
    """
    if cols_to_bin is None:
        cols_to_bin = ['yearly_income', 'total_debt', 'credit_score']

    df = df.copy()
    labels = list(range(1, q + 1))

    for col in cols_to_bin:
        # A single qcut call returns both the bucket labels and the bucket edges
        binned, bin_ranges = pd.qcut(df[col], q=q, labels=labels, retbins=True)
        df[f'{col}_bin'] = binned

        # Print the ranges for each bin
        print(f"\nBin ranges for {col}:")
        for i in range(len(bin_ranges) - 1):
            print(f"Bin {i+1}: {bin_ranges[i]:.2f} to {bin_ranges[i+1]:.2f}")

    return df

# Attach the `<col>_bin` columns; the function works on a copy and returns it
merged_data = bin_financial_features(merged_data)


Bin ranges for yearly_income:
Bin 1: 1.00 to 31912.00
Bin 2: 31912.00 to 39947.00
Bin 3: 39947.00 to 52172.00
Bin 4: 52172.00 to 280199.00

Bin ranges for total_debt:
Bin 1: 0.00 to 17392.00
Bin 2: 17392.00 to 51633.00
Bin 3: 51633.00 to 83828.00
Bin 4: 83828.00 to 461854.00

Bin ranges for credit_score:
Bin 1: 488.00 to 684.00
Bin 2: 684.00 to 715.00
Bin 3: 715.00 to 755.00
Bin 4: 755.00 to 850.00


In [258]:
def build_group_features(df, cols_to_group, target_column, lags, aggfunc):
    """
    Add lagged month-end aggregates of ``target_column`` computed per group.

    The target is aggregated with ``aggfunc`` for every combination of
    ``cols_to_group`` and month-end period of ``ds``; each aggregate is then
    shifted by the requested lags within its group and joined back onto ``df``.

    Parameters:
    df: pandas DataFrame with a datetime ``ds`` column
    cols_to_group: list of column names defining the groups
    target_column: name of the column to aggregate
    lags: list of integer lags (in months) to create
    aggfunc: aggregation name (string), also used in the new column names

    Returns:
    tuple: (df with the lag columns added, list of the new column names)

    Raises:
    ValueError: if ``lags`` is not a list
    """
    if not isinstance(lags, list):
        raise ValueError("Lags should be a list of integers.")

    base_name = "_".join(cols_to_group + [target_column, aggfunc])

    # Month-end aggregate of the target for every group
    monthly = (
        df.set_index("ds")
        .groupby(cols_to_group, observed=False)
        .resample("ME")[target_column]
        .agg(aggfunc)
    )
    monthly = monthly.reset_index().rename(columns={target_column: base_name})

    # Shift inside each group so a row only sees earlier months
    lag_cols = [f'{base_name}_lag{n}' for n in lags]
    for n, lag_col in zip(lags, lag_cols):
        per_group = monthly.groupby(cols_to_group, observed=False)[base_name]
        monthly[lag_col] = per_group.shift(n)

    # Only the lagged versions are joined back, not the same-month aggregate
    monthly = monthly.drop(columns=base_name)
    enriched = pd.merge(df, monthly, on=['ds'] + cols_to_group, how='left')

    return enriched, lag_cols


print(f"Initial DataFrame shape: {merged_data.shape}")

# Each single-column grouping gets lagged monthly aggregates of `y`
groups = [[name] for name in (
    'current_age',
    'retirement_age',
    'gender',
    'latitude',
    'num_credit_cards',
    'yearly_income_bin',
    'total_debt_bin',
    'credit_score_bin',
)]

agg_functions = ['sum', 'max', 'min', 'mean']

df = merged_data.copy()
for group in groups:
    for agg_func in agg_functions:
        # Features are always computed from merged_data, then attached to df
        temp_df, new_group_cols = build_group_features(
            df=merged_data,
            cols_to_group=group,
            target_column='y',
            aggfunc=agg_func,
            lags=[1, 2, 3],
        )
        lag_block = temp_df[['ds', 'unique_id'] + new_group_cols]
        df = df.merge(lag_block, on=['ds', 'unique_id'], how='left')

# Drops every row that has a missing value in any column
df = df.dropna()
print(f"Final DataFrame shape: {df.shape}")

Initial DataFrame shape: (139946, 18)
Final DataFrame shape: (136645, 114)


In [259]:
def create_expense_interactions(df):
    """
    Create meaningful feature interactions for expense forecasting.

    Every ratio clips its denominator to a minimum of 1 so that zero values
    cannot produce infinities.

    Parameters:
    df: pandas DataFrame containing the expense data. It must provide the
        columns total_debt, yearly_income, num_credit_cards, per_capita_income,
        retirement_age, current_age, latitude, longitude, credit_score and ds.

    Returns:
    DataFrame (a copy of df) with added interaction features
    """
    df = df.copy()

    # 1. Financial Capacity Interactions
    # Debt to Income ratio
    df['debt_to_income'] = df['total_debt'] / df['yearly_income'].clip(lower=1)

    # Credit utilization proxy (debt per credit card)
    df['debt_per_card'] = df['total_debt'] / df['num_credit_cards'].clip(lower=1)

    # Income adequacy (per capita income ratio)
    df['income_adequacy'] = df['yearly_income'] / df['per_capita_income'].clip(lower=1)

    # 2. Age-based Financial Interactions
    # Years to retirement
    df['years_to_retirement'] = df['retirement_age'] - df['current_age']

    # Financial pressure index (debt relative to years to retirement)
    df['retirement_debt_pressure'] = df['total_debt'] / (df['years_to_retirement'].clip(lower=1))

    # Age-income comparison (denominator clipped like the other ratios, so an
    # age of 0 no longer yields inf)
    df['income_age_ratio'] = df['yearly_income'] / df['current_age'].clip(lower=1)

    # 3. Geographic Financial Context
    # Location-income interaction (coordinates * income, scaled by 1e5)
    df['lat_income_effect'] = df['latitude'] * df['yearly_income'] / 100000
    df['long_income_effect'] = df['longitude'] * df['yearly_income'] / 100000

    # 4. Credit-based Interactions
    # Credit score to debt ratio
    df['credit_score_debt_ratio'] = df['credit_score'] / df['total_debt'].clip(lower=1)

    # Credit efficiency (credit score per card)
    df['credit_efficiency'] = df['credit_score'] / df['num_credit_cards'].clip(lower=1)

    # 5. Temporal Components
    # Extract temporal features from ds (parsed once and reused)
    timestamps = pd.to_datetime(df['ds'])
    df['month'] = timestamps.dt.month
    df['year'] = timestamps.dt.year

    # Monthly income
    df['monthly_income'] = df['yearly_income'] / 12

    # Seasonal debt burden (debt relative to month)
    df['seasonal_debt_ratio'] = df['total_debt'] * df['month'] / 12

    # 6. Complex Interactions
    # Financial health score: credit score as a fraction of 850, times the
    # share of income not taken by debt, times income adequacy capped at 5
    df['financial_health_score'] = (
        (df['credit_score'] / 850) *
        (1 - df['debt_to_income'].clip(upper=1)) *
        df['income_adequacy'].clip(upper=5)
    )

    # Risk metric: debt-to-income weighted by the credit-score gap to 850 and
    # by the years left until age 70 (0 once current_age reaches 70)
    df['risk_metric'] = (
        df['debt_to_income'] *
        (850 - df['credit_score']) / 850 *
        (70 - df['current_age'].clip(upper=70)) / 70
    )

    return df

def get_most_important_features(df, target_col='y', n_correlations=10):
    """
    Print the features most correlated with the target variable.

    Parameters:
    df: pandas DataFrame holding the features and the target column
    target_col: name of the target column (default 'y')
    n_correlations: number of highest correlations to print

    Returns:
    None; the result is only printed. The target itself appears in the
    listing with a correlation of 1.
    """
    # Exclude the identifiers and the requested target (not a hard-coded 'y'),
    # so the target is selected exactly once below.
    excluded = {'unique_id', 'ds', target_col}
    interaction_cols = [col for col in df.columns if col not in excluded]
    correlations = (
        df[interaction_cols + [target_col]]
        .corr()[target_col]
        .sort_values(ascending=False)
    )

    print(f"\nTop correlations with expenses ({target_col}):")
    print(correlations.head(n_correlations))

In [260]:
# Add the interaction features on top of the lagged group features in `df`
final_data = create_expense_interactions(df)
# Prints the columns most correlated with the target `y` (returns nothing)
get_most_important_features(final_data)


Top correlations with expenses (y):
y                             1.000000
current_age_y_mean_lag2       0.207646
current_age_y_mean_lag3       0.206801
current_age_y_mean_lag1       0.206325
current_age_y_min_lag1        0.176297
current_age_y_min_lag2        0.176142
current_age_y_min_lag3        0.175852
retirement_age_y_mean_lag2    0.171851
retirement_age_y_mean_lag3    0.170826
retirement_age_y_mean_lag1    0.170796
Name: y, dtype: float64


In [261]:
def analyze_feature_variability(data):
    """
    Analyze which features are static (constant) or dynamic (varying) for each unique_id.

    A feature is static when it has exactly one distinct non-null value within
    every unique_id; every other feature is reported as dynamic. The
    ``unique_id`` column itself is always reported as static.

    Parameters:
    data: pandas DataFrame with 'unique_id' column

    Returns:
    tuple: (static_features, dynamic_features), two alphabetically sorted
        lists of column names. Both are empty when data has no rows.
    """
    # Distinct non-null values per column within each unique_id, computed in
    # one vectorised pass (rows = ids, columns = features).
    counts = data.groupby('unique_id').nunique()
    feature_cols = [col for col in counts.columns if col != 'unique_id']

    static_features, dynamic_features = [], []
    if len(counts) > 0:
        # True for a column only if it has exactly one value in *every* id
        always_single = counts[feature_cols].eq(1).all(axis=0)
        static_features = sorted(
            [col for col in feature_cols if always_single[col]] + ['unique_id']
        )
        dynamic_features = sorted(
            col for col in feature_cols if not always_single[col]
        )

    # Print results
    print("\nStatic Features (constant within each unique_id):")
    print("-" * 50)
    for feature in static_features:
        print(f"- {feature}")

    print("\nDynamic Features (varying within each unique_id):")
    print("-" * 50)
    for feature in dynamic_features:
        print(f"- {feature}")

    return static_features, dynamic_features

# Use the function
static_features, dynamic_features = analyze_feature_variability(final_data)

# Print summary (the total is taken from final_data, the frame that was analysed)
print("\nSummary:")
print("-" * 50)
print(f"Total features: {len(final_data.columns)}")
print(f"Static features: {len(static_features)}")
print(f"Dynamic features: {len(dynamic_features)}")


Static Features (constant within each unique_id):
--------------------------------------------------
- birth_month
- birth_year
- credit_efficiency
- credit_score
- credit_score_bin
- credit_score_bin_y_max_lag1
- credit_score_bin_y_max_lag2
- credit_score_bin_y_max_lag3
- credit_score_debt_ratio
- current_age
- debt_per_card
- debt_to_income
- financial_health_score
- gender
- gender_y_max_lag1
- gender_y_max_lag2
- gender_y_max_lag3
- income_adequacy
- income_age_ratio
- lat_income_effect
- latitude
- long_income_effect
- longitude
- monthly_income
- num_credit_cards
- per_capita_income
- retirement_age
- retirement_debt_pressure
- risk_metric
- total_debt
- total_debt_bin
- total_debt_bin_y_max_lag1
- total_debt_bin_y_max_lag2
- total_debt_bin_y_max_lag3
- unique_id
- yearly_income
- yearly_income_bin
- yearly_income_bin_y_max_lag1
- yearly_income_bin_y_max_lag2
- yearly_income_bin_y_max_lag3
- years_to_retirement

Dynamic Features (varying within each unique_id):
-----------------

In [262]:
# One row of static attributes per unique_id (the first occurrence is kept)
static_features_df = (
    final_data[static_features]
    .reset_index(drop=True)
    .drop_duplicates(subset=['unique_id'])
)
static_features_df

Unnamed: 0,gender,unique_id,credit_score_debt_ratio,credit_score,yearly_income_bin_y_max_lag3,longitude,retirement_debt_pressure,yearly_income_bin_y_max_lag2,income_age_ratio,monthly_income,...,lat_income_effect,risk_metric,credit_score_bin_y_max_lag3,gender_y_max_lag2,total_debt,financial_health_score,retirement_age,num_credit_cards,yearly_income,credit_score_bin_y_max_lag1
0,0,0,0.021078,763,0.0,-70.0,1005.527778,0.0,1806.454545,4967.750000,...,26.22972,0.032852,0.0,0.0,36199,0.718867,69,4,59613,0.0
115,0,1,0.048262,704,0.0,-87.0,470.548387,0.0,1054.883721,3780.000000,...,13.60800,0.021306,0.0,0.0,14587,1.145650,74,3,45360,0.0
230,0,2,0.008324,673,0.0,-74.0,5053.125000,0.0,571.812500,2287.250000,...,11.25327,0.192781,0.0,0.0,80850,0.000000,64,5,27447,0.0
345,0,3,0.036431,681,0.0,-99.0,1168.312500,0.0,570.265306,2328.583333,...,9.50062,0.039902,0.0,0.0,18693,0.540743,65,4,27943,0.0
460,0,4,0.006207,716,0.0,-122.0,6409.000000,0.0,1415.388889,6369.250000,...,36.68688,0.054388,0.0,0.0,115362,0.000000,72,5,76431,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
136078,0,1994,0.007201,740,0.0,-120.0,5708.833333,0.0,1628.750000,6515.000000,...,30.49020,0.053459,0.0,0.0,102759,0.000000,66,3,78180,0.0
136185,0,1995,0.065625,605,0.0,-78.0,9219.000000,0.0,612.593750,3267.166667,...,15.68240,0.005809,0.0,0.0,9219,1.019132,62,4,39206,0.0
136300,0,1996,0.012161,728,0.0,-95.0,2993.100000,0.0,973.217391,3730.666667,...,12.98272,0.065802,0.0,0.0,59862,0.000000,66,3,44768,0.0
136415,0,1997,0.029304,758,0.0,-93.0,25867.000000,0.0,510.526316,3233.333333,...,17.46000,0.000000,0.0,0.0,25867,0.294552,69,7,38800,0.0


In [264]:
# Dynamic columns plus the series identifier, re-indexed from 0.
# Note: this rebinds `data`, which earlier held the raw forecast CSV.
dynamic_cols = [*dynamic_features, 'unique_id']
data = final_data.loc[:, dynamic_cols].reset_index(drop=True).copy()
data

Unnamed: 0,yearly_income_bin_y_sum_lag2,gender_y_mean_lag3,current_age_y_max_lag2,latitude_y_min_lag3,credit_score_bin_y_sum_lag1,latitude_y_sum_lag3,num_credit_cards_y_sum_lag3,gender_y_sum_lag3,total_debt_bin_y_sum_lag3,yearly_income_bin_y_mean_lag2,...,gender_y_sum_lag1,retirement_age_y_min_lag1,latitude_y_max_lag2,yearly_income_bin_y_mean_lag3,num_credit_cards_y_mean_lag3,latitude_y_sum_lag1,retirement_age_y_mean_lag3,gender_y_min_lag1,credit_score_bin_y_min_lag1,unique_id
0,-136064.00,-491.150702,0.0,-2433.0,-127367.00,-8718.0,-149299.00,-531916.21,-114990.10,-503.940741,...,-529503.93,-2099.0,0.0,-586.353383,-522.024476,-11665.0,-440.296296,-7104.0,-5215.0,0
1,-152725.00,-448.587967,0.0,-2279.0,-128749.55,-13643.0,-144894.00,-492101.00,-104237.00,-563.560886,...,-536853.55,-3391.0,0.0,-503.940741,-497.917526,-12966.0,-495.452381,-7238.0,-5270.0,0
2,-149270.00,-480.930000,-56.0,-2099.0,-133831.80,-11665.0,-142645.00,-529503.93,-119788.93,-544.781022,...,-553721.45,-2795.0,0.0,-563.560886,-490.189003,-9799.0,-392.107143,-7458.0,-4565.0,0
3,-157104.38,-485.401040,0.0,-2833.0,-129734.00,-12966.0,-155532.00,-536853.55,-124180.00,-571.288655,...,-578581.41,-2058.0,-53.0,-544.781022,-530.825939,-13445.0,-459.869048,-8036.0,-4769.0,0
4,-156773.41,-497.950944,0.0,-2075.0,-134586.00,-9799.0,-148377.72,-553721.45,-116858.00,-563.933129,...,-589434.59,-3042.0,0.0,-571.288655,-499.588283,-12948.0,-447.261905,-7536.0,-4536.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
136640,-116457.48,-484.171385,0.0,-2365.0,-147153.62,-29174.0,-163572.00,-583910.69,-138853.00,-389.489900,...,-596590.62,-4669.0,0.0,-408.140468,-543.428571,-33207.0,-455.348837,-7236.0,-4670.0,1998
136641,-116825.62,-470.151874,0.0,-2757.0,-130844.24,-32083.0,-153931.20,-567003.16,-131394.20,-390.721137,...,-576193.24,-4313.0,0.0,-389.489900,-511.399336,-33542.0,-489.529070,-7420.0,-4608.0,1998
136642,-119794.00,-494.685423,0.0,-2750.0,-143808.00,-33207.0,-159443.00,-596590.62,-143499.00,-400.648829,...,-607200.18,-4904.0,0.0,-390.721137,-529.710963,-36616.0,-454.837209,-7304.0,-5129.0,1998
136643,-124675.00,-477.772172,0.0,-2573.0,-146394.00,-33542.0,-159625.24,-576193.24,-141635.00,-416.973244,...,-628489.05,-4894.0,0.0,-400.648829,-530.316412,-30557.0,-463.220930,-7642.0,-4453.0,1998


# AutoGluon

In [266]:
from autogluon.timeseries import TimeSeriesDataFrame, TimeSeriesPredictor

In [267]:
# Wrap the long-format frame for AutoGluon: `unique_id` identifies each series,
# `ds` is its timestamp, and static_features_df is passed as the per-series
# static features.
train_data = TimeSeriesDataFrame.from_data_frame(
    data,
    id_column="unique_id",
    timestamp_column="ds",
    static_features_df=static_features_df,
)

# Preview the first rows of the converted frame
train_data.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,yearly_income_bin_y_sum_lag2,gender_y_mean_lag3,current_age_y_max_lag2,latitude_y_min_lag3,credit_score_bin_y_sum_lag1,latitude_y_sum_lag3,num_credit_cards_y_sum_lag3,gender_y_sum_lag3,total_debt_bin_y_sum_lag3,yearly_income_bin_y_mean_lag2,...,num_credit_cards_y_min_lag3,gender_y_sum_lag1,retirement_age_y_min_lag1,latitude_y_max_lag2,yearly_income_bin_y_mean_lag3,num_credit_cards_y_mean_lag3,latitude_y_sum_lag1,retirement_age_y_mean_lag3,gender_y_min_lag1,credit_score_bin_y_min_lag1
item_id,timestamp,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
0,2010-04-30,-136064.0,-491.150702,0.0,-2433.0,-127367.0,-8718.0,-149299.0,-531916.21,-114990.1,-503.940741,...,-6347.0,-529503.93,-2099.0,0.0,-586.353383,-522.024476,-11665.0,-440.296296,-7104.0,-5215.0
0,2010-05-31,-152725.0,-448.587967,0.0,-2279.0,-128749.55,-13643.0,-144894.0,-492101.0,-104237.0,-563.560886,...,-6662.0,-536853.55,-3391.0,0.0,-503.940741,-497.917526,-12966.0,-495.452381,-7238.0,-5270.0
0,2010-06-30,-149270.0,-480.93,-56.0,-2099.0,-133831.8,-11665.0,-142645.0,-529503.93,-119788.93,-544.781022,...,-6998.0,-553721.45,-2795.0,0.0,-563.560886,-490.189003,-9799.0,-392.107143,-7458.0,-4565.0
0,2010-07-31,-157104.38,-485.40104,0.0,-2833.0,-129734.0,-12966.0,-155532.0,-536853.55,-124180.0,-571.288655,...,-6702.0,-578581.41,-2058.0,-53.0,-544.781022,-530.825939,-13445.0,-459.869048,-8036.0,-4769.0
0,2010-08-31,-156773.41,-497.950944,0.0,-2075.0,-134586.0,-9799.0,-148377.72,-553721.45,-116858.0,-563.933129,...,-7458.0,-589434.59,-3042.0,0.0,-571.288655,-499.588283,-12948.0,-447.261905,-7536.0,-4536.0


In [268]:
train_data.static_features.head()

Unnamed: 0_level_0,gender,credit_score_debt_ratio,credit_score,yearly_income_bin_y_max_lag3,longitude,retirement_debt_pressure,yearly_income_bin_y_max_lag2,income_age_ratio,monthly_income,total_debt_bin_y_max_lag3,...,lat_income_effect,risk_metric,credit_score_bin_y_max_lag3,gender_y_max_lag2,total_debt,financial_health_score,retirement_age,num_credit_cards,yearly_income,credit_score_bin_y_max_lag1
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0.021078,763,0.0,-70.0,1005.527778,0.0,1806.454545,4967.75,0.0,...,26.22972,0.032852,0.0,0.0,36199,0.718867,69,4,59613,0.0
1,0,0.048262,704,0.0,-87.0,470.548387,0.0,1054.883721,3780.0,0.0,...,13.608,0.021306,0.0,0.0,14587,1.14565,74,3,45360,0.0
2,0,0.008324,673,0.0,-74.0,5053.125,0.0,571.8125,2287.25,0.0,...,11.25327,0.192781,0.0,0.0,80850,0.0,64,5,27447,0.0
3,0,0.036431,681,0.0,-99.0,1168.3125,0.0,570.265306,2328.583333,0.0,...,9.50062,0.039902,0.0,0.0,18693,0.540743,65,4,27943,0.0
4,0,0.006207,716,0.0,-122.0,6409.0,0.0,1415.388889,6369.25,0.0,...,36.68688,0.054388,0.0,0.0,115362,0.0,72,5,76431,0.0


In [269]:
# True: fit a new predictor; False: load an existing one from MODELS_PATH
TRAIN = True

In [270]:
MODELS_PATH = os.path.join(ROOT, 'models')

# Model types passed to fit() as excluded_model_types
EXCLUDED_MODELS = [
    "DeepAR",
    "TemporalFusionTransformer",
    "AutoARIMA",
    "Prophet",
]

if not TRAIN:
    # Reuse the predictor stored under MODELS_PATH
    predictor = TimeSeriesPredictor.load(path=MODELS_PATH)
else:
    predictor = TimeSeriesPredictor(
        prediction_length=3,  # forecast horizon
        path=MODELS_PATH,
        target="y",  # name of the target column
        eval_metric="RMSE",  # evaluation metric
    )
    predictor.fit(
        train_data=train_data,
        presets="fast_training",
        excluded_model_types=EXCLUDED_MODELS,
        time_limit=3600,  # time limit in seconds
    )

Beginning AutoGluon training... Time limit = 3600s
AutoGluon will save models to '/home/ezemriv/other_projects/hackathon-caixabank-data-ai-report/models'
AutoGluon Version:  1.1.1
Python Version:     3.10.15
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #47-Ubuntu SMP PREEMPT_DYNAMIC Fri Sep 27 21:40:26 UTC 2024
CPU Count:          8
GPU Count:          0
Memory Avail:       1.96 GB / 7.57 GB (25.8%)
Disk Space Avail:   106.07 GB / 232.64 GB (45.6%)
Setting presets to: fast_training

Fitting with arguments:
{'enable_ensemble': True,
 'eval_metric': RMSE,
 'excluded_model_types': ['DeepAR',
                          'TemporalFusionTransformer',
                          'AutoARIMA',
                          'Prophet'],
 'hyperparameters': 'very_light',
 'known_covariates_names': [],
 'num_val_windows': 1,
 'prediction_length': 3,
 'quantile_levels': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
 'random_seed': 123,
 'refit_every_n_windows': 1,
 'refit_full':

KeyboardInterrupt: 

In [186]:
# Leaderboard of the fitted models, printed under a heading
leaderboard = predictor.leaderboard()
print("Model Leaderboard:", leaderboard, sep="\n")

# fast training mode

# Model Leaderboard:
#               model   score_val  pred_time_val  fit_time_marginal  fit_order
# 0  WeightedEnsemble -318.520044      56.570607           1.057646          6
# 1               ETS -318.900128      56.180754           0.173531          4
# 2             Theta -326.901913      23.104767           0.141360          5
# 3  RecursiveTabular -339.327945       0.389853           2.717882          3
# 4             Naive -435.845209       3.403606           0.137586          1
# 5     SeasonalNaive -438.106356       0.974761           0.151612          2

Model Leaderboard:
              model   score_val  pred_time_val  fit_time_marginal  fit_order
0  WeightedEnsemble -316.834665      53.738745           1.313350          7
1     DirectTabular -317.716762       0.263793           1.732658          4
2               ETS -318.900128      53.258255           0.135220          5
3             Theta -326.901913      25.662149           0.136749          6
4  RecursiveTabular -339.673437       0.216697           3.917453          3
5             Naive -435.845209       3.884053           0.168334          1
6     SeasonalNaive -438.106356       0.893244           0.124744          2


In [187]:
MY_SELECTION = True

# Extra keyword arguments for predict(); left empty when no model is hand-picked
predict_kwargs = {}
if MY_SELECTION:
    selected_model = 'DirectTabular'
    predict_kwargs['model'] = selected_model

test_preds = predictor.predict(train_data, **predict_kwargs).reset_index()

In [188]:
# Keep id, date and mean forecast; dates become 'YYYY-MM' strings and the
# forecast is rounded to 2 decimals
test_preds = (
    test_preds
    .rename(columns={'item_id': 'unique_id', 'timestamp': 'ds'})
    [['unique_id', 'ds', 'mean']]
    .assign(
        ds=lambda frame: frame['ds'].dt.to_period('M').astype(str),
        mean=lambda frame: frame['mean'].astype(float).round(2),
    )
)

test_preds

Unnamed: 0,unique_id,ds,mean
0,0,2019-11,-652.99
1,0,2019-12,-652.99
2,0,2020-01,-654.18
3,1,2019-11,-180.05
4,1,2019-12,-159.50
...,...,...,...
3652,1997,2019-12,-567.36
3653,1997,2020-01,-568.40
3654,1998,2019-11,-160.91
3655,1998,2019-12,-163.67


In [189]:
def save_predictions_to_json(test_preds, pred_col, save_path):
    """
    Write predictions to a JSON file shaped as
    {"target": {"<unique_id>": {"<ds>": <prediction>, ...}, ...}}.

    Parameters:
    test_preds: DataFrame with 'unique_id' and 'ds' columns plus the prediction column
    pred_col: name of the column holding the predicted values
    save_path: destination of the JSON file (missing parent folders are created)

    Returns:
    None; the file is written and a confirmation line is printed
    """
    # One groupby pass instead of re-filtering the whole frame once per id;
    # sort=False keeps the ids in order of first appearance.
    per_id = test_preds.groupby('unique_id', sort=False)
    preds_dict = {
        'target': {
            str(unique_id): rows.set_index('ds')[pred_col].to_dict()
            for unique_id, rows in per_id
        }
    }

    # Make sure the target folder exists before writing
    os.makedirs(os.path.dirname(save_path) or '.', exist_ok=True)

    # Save the predictions dictionary to JSON
    with open(save_path, 'w') as f:
        json.dump(preds_dict, f)

    print(f"Updated predictions saved to {save_path}")


save_predictions_to_json(test_preds, 'mean', PREDICTION_IDS_PATH)

# Run jq command with dynamic path
!jq . {PREDICTION_IDS_PATH} > temp.json && mv temp.json {PREDICTION_IDS_PATH}

# Stage, commit, and push the changes
time.sleep(2)
!git add {ROOT}
!git commit -m "Add updated predictions"
!git push

Updated predictions saved to /home/ezemriv/other_projects/hackathon-caixabank-data-ai-report/predictions/predictions_4.json
[main f690c28] Add updated predictions
 36 files changed, 5065 insertions(+), 3993 deletions(-)
Enumerating objects: 117, done.
Counting objects: 100% (117/117), done.
Delta compression using up to 8 threads
Compressing objects: 100% (59/59), done.
Writing objects: 100% (71/71), 21.94 MiB | 5.20 MiB/s, done.
Total 71 (delta 19), reused 0 (delta 0), pack-reused 0
To https://bitbucket.org/nuweio/hackathon-caixabank-data-ai-report-2130
   c7f66fc..f690c28  main -> main
