In [533]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import optuna
import lightgbm as lgb

df_train = pd.read_csv(('train.csv'))
df_test = pd.read_csv('test.csv')
census = pd.read_csv('census_starter.csv', index_col='cfips')
df_merged_df = pd.read_csv('sample_submission.csv')
df_new = pd.read_csv('revealed_test.csv')

In [534]:
df_train = pd.concat([df_train, df_new]).sort_values(by=['cfips', 'first_day_of_month']).reset_index()
drop_index = (df_test.first_day_of_month == '2022-11-01') | (df_test.first_day_of_month == '2022-12-01')
df_test = df_test.loc[~ drop_index]

In [535]:
df_train['is_train'] = 1
df_test['is_train'] = 0

In [536]:
state_dict = df_train[['cfips', 'state', 'county']]
state_dict = state_dict.set_index('cfips')
state_dict = state_dict.drop_duplicates()
state_dict = state_dict.to_dict()

df_test['state'] = df_test['cfips'].map(state_dict['state'])
df_test['county'] = df_test['cfips'].map(state_dict['county'])
zero_mask = (df_train['microbusiness_density'] == 0) & (df_train['active'] == 0) 
df_train['ratio'] = df_train.loc[~ zero_mask].groupby('cfips', group_keys=False).apply(lambda x: x.active / x.microbusiness_density)
df_train['ratio'] = df_train['ratio'].fillna(0)
df = pd.concat([df_train, df_test], axis=0)
df.state = df['state'].astype('category').cat.codes
df = df.drop(columns=['county'])

In [537]:
df.first_day_of_month = pd.to_datetime(df.first_day_of_month)
df['month'] = df.first_day_of_month.dt.month
df['year'] = df.first_day_of_month.dt.year - 2019
df['is_new_year'] = 0
df.loc[df.month == 1, 'is_new_year'] = 1
seasons = {1: 'Winter', 2: 'Winter', 3: 'Spring', 4: 'Spring', 5: 'Spring', 6: 'Summer', 7: 'Summer', 8: 'Summer', 9: 'Fall', 10: 'Fall', 11: 'Fall', 12: 'Winter'}
df['season'] = df['month'].apply(lambda x: seasons[x])
df['md_national_avg'] = df.groupby(['year','month'])['microbusiness_density'].transform('mean')
df['md_state_avg'] = df.groupby(['state','year','month'])['microbusiness_density'].transform('mean')
df['a_national_avg'] = df.groupby(['year','month'])['active'].transform('mean')
df['a_state_avg'] = df.groupby(['state','year','month'])['active'].transform('mean')


In [538]:
df = pd.concat([df, pd.get_dummies(df['season'], prefix='season')], axis=1)
df = df.drop(columns='season')

In [539]:
from pandas.tseries.holiday import USFederalHolidayCalendar
from pandas.tseries.offsets import CustomBusinessDay

# Create a US holiday calendar
cal = USFederalHolidayCalendar()

holidays = cal.holidays(start='2019-01-01', end='2023-06-01')
df['holidays'] = pd.to_datetime(df['first_day_of_month'], format='%Y-%m-%d').dt.to_period('M').apply(lambda x: len(holidays[holidays.to_period('M') == x]))

In [540]:
df['idx'] = df.groupby('cfips')['cfips'].cumcount()
df = df.merge(census, on='cfips')

In [541]:
import matplotlib.pyplot as plt
%matplotlib inline

df = df.set_index('row_id')
mask = ((df.microbusiness_density == 0) & (df.active == 0))

df['md_state_avg_log_diff'] = df.loc[~ mask].groupby('cfips', group_keys=False)['md_state_avg'].apply(lambda x: np.log(x).diff())
df.loc[mask, [f'md_state_avg_log_diff']] = 0
df[f'md_state_avg_log_diff'] = df.loc[~ mask].groupby('cfips', group_keys=False)['md_state_avg_log_diff'].bfill()# is backfill needed?

df['a_state_avg_log_diff'] = df.loc[~ mask].groupby('cfips', group_keys=False)['a_state_avg'].apply(lambda x: np.log(x).diff())
df.loc[mask, [f'a_state_avg_log_diff']] = 0
df[f'a_state_avg_log_diff'] = df.loc[~ mask].groupby('cfips', group_keys=False)['a_state_avg_log_diff'].bfill()# is backfill needed?

LAGS = [1, 2, 3, 4, 5, 6, 7, 8]
for i in LAGS:
    df[f'md_log_diff_{i}'] = df.loc[~ mask].groupby('cfips', group_keys=False)['microbusiness_density'].apply(lambda x: np.log(x).diff())
    df.loc[mask, [f'md_log_diff_{i}' for i in LAGS]] = 0
    df[f'md_log_diff_{i}'] = df.loc[~ mask].groupby('cfips', group_keys=False)[f'md_log_diff_{i}'].bfill()# is backfill needed?
    df[f'a_log_diff_{i}'] = df.loc[~ mask].groupby('cfips', group_keys=False)['active'].apply(lambda x: np.log(x).diff())
    df.loc[mask, [f'a_log_diff_{i}' for i in LAGS]] = 0
    df[f'a_log_diff_{i}'] = df.loc[~ mask].groupby('cfips', group_keys=False)[f'a_log_diff_{i}'].bfill() # is backfill needed?
    df[f'md_lag_{i}'] = df.groupby('cfips', group_keys=False)['microbusiness_density'].shift(i)
    df[f'a_lag_{i}'] = df.groupby('cfips', group_keys=False)['microbusiness_density'].shift(i)
    
df['target'] = df.loc[~ mask].groupby('cfips', group_keys=False)['microbusiness_density'].apply(lambda x: np.log(x.shift(-5)) - np.log(x))
df.loc[mask, ['target']] = 0

In [542]:
year_mask = True
if year_mask:
    mask = (df['year'] == 1) | (df['year'] == 0)
    df = df.loc[~mask]
df

Unnamed: 0_level_0,index,cfips,state,first_day_of_month,microbusiness_density,active,is_train,ratio,month,year,...,a_lag_4,md_lag_5,a_lag_5,md_lag_6,a_lag_6,md_lag_7,a_lag_7,md_lag_8,a_lag_8,target
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1001_2021-01-01,17.0,1001,0,2021-01-01,2.947244,1243.0,1,421.749990,1,2,...,3.205756,3.174679,3.174679,3.083837,3.083837,3.019292,3.019292,3.004948,3.004948,0.089221
1001_2021-02-01,18.0,1001,0,2021-02-01,3.106106,1310.0,1,421.749988,2,2,...,3.193804,3.205756,3.205756,3.174679,3.174679,3.083837,3.083837,3.019292,3.019292,0.033036
1001_2021-03-01,19.0,1001,0,2021-03-01,3.144043,1326.0,1,421.749997,3,2,...,3.038416,3.193804,3.193804,3.205756,3.205756,3.174679,3.174679,3.083837,3.083837,0.023846
1001_2021-04-01,20.0,1001,0,2021-04-01,3.224659,1360.0,1,421.749995,4,2,...,3.002558,3.038416,3.038416,3.193804,3.193804,3.205756,3.205756,3.174679,3.174679,-0.011834
1001_2021-05-01,21.0,1001,0,2021-05-01,3.227030,1361.0,1,421.749991,5,2,...,2.947244,3.002558,3.002558,3.038416,3.038416,3.193804,3.193804,3.205756,3.205756,-0.007375
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56045_2023-02-01,,56045,50,2023-02-01,,,0,,2,4,...,1.785395,1.785395,1.785395,1.785395,1.785395,1.803249,1.803249,1.803249,1.803249,
56045_2023-03-01,,56045,50,2023-03-01,,,0,,3,4,...,1.785395,1.785395,1.785395,1.785395,1.785395,1.785395,1.785395,1.803249,1.803249,
56045_2023-04-01,,56045,50,2023-04-01,,,0,,4,4,...,1.803249,1.785395,1.785395,1.785395,1.785395,1.785395,1.785395,1.785395,1.785395,
56045_2023-05-01,,56045,50,2023-05-01,,,0,,5,4,...,,1.803249,1.803249,1.785395,1.785395,1.785395,1.785395,1.785395,1.785395,


In [543]:
df = df.to_csv('base_df.csv')

In [544]:
df = pd.read_csv('base_df.csv', index_col='row_id')

In [545]:
# reorganize train test
df_train = df[df.is_train == 1]
df_train = df_train.fillna(0)
df_test = df[df.is_train == 0]
target_train = df[df.is_train == 1]['target']
target_train = target_train.fillna(0)

In [546]:
from sklearn.neighbors import NearestNeighbors
from tqdm import tqdm

def get_NN_cfips_by_feats(df, N, p, metric, feature):
    df = df.reset_index()
    NN_feature = ['cfips', 'first_day_of_month', feature]
    df = df[NN_feature]
    df = df.pivot(index='cfips', columns='first_day_of_month', values=feature)
    df.columns.name = None
    nn = NearestNeighbors(
                n_neighbors=N, 
                p=p, 
                metric=metric, 
                # metric_params=metric_params
            )
    nn.fit(df)
    neighbors = nn.kneighbors(df, return_distance=False)
    df_nn = pd.DataFrame(neighbors, index=df.index)
    df_nn = df_nn.apply(lambda col: df.index[col])
    df_nn = df_nn.iloc[:, 1:]
    return df_nn

def get_NN_cfips_by_census(census, N, p, metric):
    nn_census= NearestNeighbors(
                n_neighbors=N, 
                p=p, 
                metric=metric, 
                # metric_params=metric_params
            )
    census_na = census[census.isna().any(axis=1)]
    na_mask = census_na.index
    census.loc[na_mask] = census.loc[na_mask].fillna(method='ffill', axis=1)
    df_mask = df.cfips.unique()
    census = census.loc[df_mask]
    nn_census.fit(census)
    neighbors = nn_census.kneighbors(census, return_distance=False)
    df_nn = pd.DataFrame(neighbors, index=census.index)
    df_nn = df_nn.iloc[:, 1:]
    df_nn= df_nn.loc[df_mask]
    df_nn = df_nn.rename(columns={1: 'NN_1', 2: 'NN_2', 3: 'NN_3'})
    df_nn[['NN_1', 'NN_2', 'NN_3']] = df_nn[['NN_1', 'NN_2', 'NN_3']].applymap(lambda x: census.index[x])
    return df_nn

def generate_feature(df_nn, df, feature, colname):
    cfips = df.cfips.unique()
    df_grouped = df.groupby('cfips')
    dfs = []
    for cfips_val in tqdm(cfips):
        nn_list = df_nn.loc[cfips_val].values.tolist()
        feats = pd.DataFrame(np.asarray([df_grouped.get_group(i)[feature].values for i in nn_list]).T, columns=[f'{colname}_NN_{i}' for i in range(len(nn_list))])
        # print(nn_list)
        # print(feats)
        dfs.append(feats)
    result = pd.concat(dfs, axis=0)
    result = result.reset_index()
    result = result.drop(columns='index')
    return result

def combine_nn_feats(df, df_nn):
    df = df.reset_index()
    df = pd.concat([df, df_nn], axis=1)
    df = df.drop(columns='first_day_of_month')
    df = df.set_index('row_id')
    return df

In [547]:
# df_nn_c = get_NN_cfips_by_census(census, N=4, p=2)
# df_nn_c
# generate_feature(df_nn_c, df_train, 'microbusiness_density', 'md')

In [548]:
KNN = True

if KNN:
    # NN by md:
    df_nn_c1 = get_NN_cfips_by_census(census, N=4, p=1, metric='manhattan')
    df_nn_1 = get_NN_cfips_by_feats(df_train, N=4, p=1, metric='manhattan', feature='microbusiness_density')
    md_log_diff_1_p1 = generate_feature(df_nn_1, df_train, 'md_log_diff_1', 'md_log_diff_1_1')
    md_log_diff_2_p1 = generate_feature(df_nn_1, df_train, 'md_log_diff_2', 'md_log_diff_2_1')
    md_log_diff_3_p1 = generate_feature(df_nn_1, df_train, 'md_log_diff_3', 'md_log_diff_3_1')
    md_log_diff_4_p1 = generate_feature(df_nn_1, df_train, 'md_log_diff_4', 'md_log_diff_4_1')
    md_log_diff_5_p1 = generate_feature(df_nn_1, df_train, 'md_log_diff_5', 'md_log_diff_5_1')
    md_state_avg_diff_c_p1 = generate_feature(df_nn_c1, df_train, 'md_state_avg_log_diff', 'md_state_avg_c_1')
    md_state_avg_diff_p1 = generate_feature(df_nn_1, df_train, 'md_state_avg_log_diff', 'md_state_avg_1')
    a_state_avg_diff_c_p1 = generate_feature(df_nn_c1, df_train, 'a_state_avg_log_diff', 'a_state_avg_c_1')
    a_state_avg_diff_p1 = generate_feature(df_nn_1, df_train, 'a_state_avg_log_diff', 'a_state_avg_1')
    md_lag_1_c_p1 = generate_feature(df_nn_c1, df_train, 'md_lag_1', 'lag_1_c_1')
    md_lag_2_c_p1 = generate_feature(df_nn_c1, df_train, 'md_lag_2', 'lag_2_c_1')
    md_lag_3_c_p1 = generate_feature(df_nn_c1, df_train, 'md_lag_3', 'lag_3_c_1')
    md_lag_4_c_p1 = generate_feature(df_nn_c1, df_train, 'md_lag_4', 'lag_4_c_1')
    md_lag_5_c_p1 = generate_feature(df_nn_c1, df_train, 'md_lag_5', 'lag_5_c_1')
    md_lag_1_p1 = generate_feature(df_nn_1, df_train, 'md_lag_1', 'lag_1_1')
    md_lag_2_p1 = generate_feature(df_nn_1, df_train, 'md_lag_2', 'lag_2_1')
    md_lag_3_p1 = generate_feature(df_nn_1, df_train, 'md_lag_3', 'lag_3_1')
    md_lag_4_p1 = generate_feature(df_nn_1, df_train, 'md_lag_4', 'lag_4_1')
    md_lag_5_p1 = generate_feature(df_nn_1, df_train, 'md_lag_5', 'lag_5_1')

    # NN by log_diff:
    df_nn_c2 = get_NN_cfips_by_census(census, N=4, p=2, metric='euclidean')
    df_nn_2 = get_NN_cfips_by_feats(df_train, N=4, p=2, metric = 'euclidean', feature='microbusiness_density')
    md_log_diff_1_p2 = generate_feature(df_nn_2, df_train, 'md_log_diff_1', 'md_log_diff_1_2')
    md_log_diff_2_p2 = generate_feature(df_nn_2, df_train, 'md_log_diff_2', 'md_log_diff_2_2')
    md_log_diff_3_p2 = generate_feature(df_nn_2, df_train, 'md_log_diff_3', 'md_log_diff_3_2')
    md_log_diff_4_p2 = generate_feature(df_nn_2, df_train, 'md_log_diff_4', 'md_log_diff_4_2')
    md_log_diff_5_p2 = generate_feature(df_nn_2, df_train, 'md_log_diff_5', 'md_log_diff_5_2')
    md_state_avg_diff_c_p2 = generate_feature(df_nn_c2, df_train, 'md_state_avg_log_diff', 'md_state_avg_c_2')
    md_state_avg_diff_p2 = generate_feature(df_nn_2, df_train, 'md_state_avg_log_diff', 'md_state_avg_2')
    a_state_avg_diff_c_p2 = generate_feature(df_nn_c2, df_train, 'a_state_avg_log_diff', 'a_state_avg_c_2')
    a_state_avg_diff_p2 = generate_feature(df_nn_2, df_train, 'a_state_avg_log_diff', 'a_state_avg_2')
    md_lag_1_c_p2 = generate_feature(df_nn_c2, df_train, 'md_lag_1', 'lag_1_c_2')
    md_lag_2_c_p2 = generate_feature(df_nn_c2, df_train, 'md_lag_2', 'lag_2_c_2')
    md_lag_3_c_p2 = generate_feature(df_nn_c2, df_train, 'md_lag_3', 'lag_3_c_2')
    md_lag_4_c_p2 = generate_feature(df_nn_c2, df_train, 'md_lag_4', 'lag_4_c_2')
    md_lag_5_c_p2 = generate_feature(df_nn_c2, df_train, 'md_lag_5', 'lag_5_c_2')
    md_lag_1_p2 = generate_feature(df_nn_2, df_train, 'md_lag_1', 'lag_1_2')
    md_lag_2_p2 = generate_feature(df_nn_2, df_train, 'md_lag_2', 'lag_2_2')
    md_lag_3_p2 = generate_feature(df_nn_2, df_train, 'md_lag_3', 'lag_3_2')
    md_lag_4_p2 = generate_feature(df_nn_2, df_train, 'md_lag_4', 'lag_4_2')
    md_lag_5_p2 = generate_feature(df_nn_2, df_train, 'md_lag_5', 'lag_5_2')
    NN_feats_list = [
        md_state_avg_diff_c_p1,
        md_state_avg_diff_c_p2,
        a_state_avg_diff_c_p1,
        a_state_avg_diff_c_p2,
        md_lag_1_c_p1,
        md_lag_1_c_p2,
        md_lag_2_c_p1,
        md_lag_2_c_p2,
        md_lag_3_c_p1,
        md_lag_3_c_p2,
        md_lag_4_c_p1,
        md_lag_4_c_p2,
        md_lag_5_c_p1,
        md_lag_5_c_p2,
        md_state_avg_diff_p1, 
        md_state_avg_diff_p2,
        a_state_avg_diff_p1,
        a_state_avg_diff_p2,
        md_lag_1_p1,
        md_lag_1_p2,
        md_lag_2_p1,
        md_lag_2_p2,
        md_lag_3_p1,
        md_lag_3_p2,
        md_lag_4_p1,
        md_lag_4_p2,
        md_lag_5_p1,
        md_lag_5_p2,
        md_log_diff_1_p1,
        md_log_diff_1_p2,
        md_log_diff_2_p1,
        md_log_diff_2_p2,
        md_log_diff_3_p1,
        md_log_diff_3_p2,
        md_log_diff_4_p1,
        md_log_diff_4_p2,
        md_log_diff_5_p1,
        md_log_diff_5_p2,
        ]
    NN_feats = pd.concat(NN_feats_list, axis=1)
    df_train = combine_nn_feats(df_train, NN_feats)
else:
    df_train = df_train.drop(columns='first_day_of_month')

100%|██████████| 3135/3135 [00:00<00:00, 3686.98it/s]
100%|██████████| 3135/3135 [00:00<00:00, 3679.24it/s]
100%|██████████| 3135/3135 [00:00<00:00, 3824.13it/s]
100%|██████████| 3135/3135 [00:00<00:00, 3889.68it/s]
100%|██████████| 3135/3135 [00:00<00:00, 3786.11it/s]
100%|██████████| 3135/3135 [00:00<00:00, 4010.39it/s]
100%|██████████| 3135/3135 [00:00<00:00, 3172.22it/s]
100%|██████████| 3135/3135 [00:00<00:00, 3659.33it/s]
100%|██████████| 3135/3135 [00:00<00:00, 3247.36it/s]
100%|██████████| 3135/3135 [00:00<00:00, 3694.53it/s]
100%|██████████| 3135/3135 [00:00<00:00, 3198.89it/s]
100%|██████████| 3135/3135 [00:00<00:00, 3646.98it/s]
100%|██████████| 3135/3135 [00:00<00:00, 3636.93it/s]
100%|██████████| 3135/3135 [00:00<00:00, 3238.50it/s]
100%|██████████| 3135/3135 [00:00<00:00, 3743.99it/s]
100%|██████████| 3135/3135 [00:00<00:00, 3811.61it/s]
100%|██████████| 3135/3135 [00:00<00:00, 3727.59it/s]
100%|██████████| 3135/3135 [00:01<00:00, 2694.08it/s]
100%|██████████| 3135/3135 [

In [549]:
df_train

Unnamed: 0_level_0,index,cfips,state,microbusiness_density,active,is_train,ratio,month,year,is_new_year,...,md_log_diff_4_1_NN_2,md_log_diff_4_2_NN_0,md_log_diff_4_2_NN_1,md_log_diff_4_2_NN_2,md_log_diff_5_1_NN_0,md_log_diff_5_1_NN_1,md_log_diff_5_1_NN_2,md_log_diff_5_2_NN_0,md_log_diff_5_2_NN_1,md_log_diff_5_2_NN_2
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1001_2021-01-01,17.0,1001,0,2.947244,1243.0,1,421.749990,1,2,1,...,0.014377,0.005454,0.006040,-0.012909,0.003468,0.005454,0.014377,0.005454,0.006040,-0.012909
1001_2021-02-01,18.0,1001,0,3.106106,1310.0,1,421.749988,2,2,0,...,0.004673,0.017133,0.053859,-0.022776,-0.012601,0.017133,0.004673,0.017133,0.053859,-0.022776
1001_2021-03-01,19.0,1001,0,3.144043,1326.0,1,421.749997,3,2,0,...,0.013889,0.014644,0.007463,0.005158,0.010811,0.014644,0.013889,0.014644,0.007463,0.005158
1001_2021-04-01,20.0,1001,0,3.224659,1360.0,1,421.749995,4,2,0,...,-0.002302,0.004396,0.000000,0.010237,0.000000,0.004396,-0.002302,0.004396,0.000000,0.010237
1001_2021-05-01,21.0,1001,0,3.227030,1361.0,1,421.749991,5,2,0,...,-0.006936,0.000000,0.014760,-0.003711,-0.001794,0.000000,-0.006936,0.000000,0.014760,-0.003711
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56045_2022-08-01,122262.0,56045,50,1.785395,100.0,1,56.009999,8,3,0,...,0.000854,0.002457,-0.015198,-0.016807,-0.015198,-0.018405,0.000854,0.002457,-0.015198,-0.016807
56045_2022-09-01,122263.0,56045,50,1.785395,100.0,1,56.009999,9,3,0,...,0.009342,0.008552,-0.006144,-0.008511,-0.006144,0.000000,0.009342,0.008552,-0.006144,-0.008511
56045_2022-10-01,122264.0,56045,50,1.785395,100.0,1,56.009999,10,3,0,...,0.010093,-0.011009,0.006144,-0.008584,0.006144,0.000000,0.010093,-0.011009,0.006144,-0.008584
56045_2022-11-01,6268.0,56045,50,1.785395,100.0,1,56.009999,11,3,0,...,-0.009248,0.002457,0.004584,0.008584,0.004584,-0.012461,-0.009248,0.002457,0.004584,0.008584


In [550]:
# df_test = df_test.drop(columns='first_day_of_month')
# df = pd.concat([df_train, df_test], axis=0)

In [551]:
from sklearn.metrics import make_scorer
from sklearn.model_selection import TimeSeriesSplit

def smape_loss(y_true, y_pred):
    """
    Custom SMAPE loss function for LightGBM
    """
    return np.mean(2 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred))) * 100

def smape_cv(model, df_train, target_train):
    tscv = TimeSeriesSplit(n_splits=5)
    smape_list = []
    for _, (train_index, test_index) in enumerate(tscv.split(df_train), start=1):
        # Create training and validation datasets
        X_train = df_train.iloc[train_index]
        y_train = target_train.iloc[train_index]
        X_valid = df_train.iloc[test_index]
        y_valid = target_train.iloc[test_index]

        # Train the model
        model.fit(X_train, y_train)

        # Make predictions
        y_pred = np.exp(model.predict(X_valid)) * X_valid['microbusiness_density']
        y_valid = np.exp(y_valid) * X_valid['microbusiness_density']

        # Calculate SMAPE score
        smape = smape_loss(y_valid, y_pred)
        smape_list.append(smape)
    
    print(f'Mean SMAPE: {np.mean(smape_list):.4f}')
    return np.mean(smape_list)

def lgbm_objective(trial, df_train, target_train):
    """
    Objective function for LightGBM optimization without rolling window cross-validation.
    """
    params = {
        'n_iter'           : 200,
        'verbosity'        : -1,
        'objective'        : 'l1',
        'random_state'     : 42,
        'extra_trees'      : True,
        'colsample_bytree' : trial.suggest_float('colsample_bytree', 0.1, 1.0),
        'colsample_bynode' : trial.suggest_float('colsample_bynode', 0.1, 1.0),
        'max_depth'        : trial.suggest_int('max_depth', 3, 10),
        'learning_rate'    : trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'lambda_l1'        : trial.suggest_float('lambda_l1', 1e-2, 10.0),
        'lambda_l2'        : trial.suggest_float('lambda_l2', 1e-2, 10.0),
        'num_leaves'       : trial.suggest_int('num_leaves', 8, 1024),
        'min_data_in_leaf' : trial.suggest_int('min_data_in_leaf', 5, 250),}

    model = lgb.LGBMRegressor(**params)
    score = smape_cv(model, df_train, target_train)
    return score


In [553]:
import optuna
import pandas as pd

# Load your data into df_train and target_train
# df_train = ...
# target_train = ...

# Create the study
study = optuna.create_study(direction='minimize')

# Optimize the study
study.optimize(lambda trial: lgbm_objective(trial, df_train, target_train), n_trials=20)

# Print the best trial
print(f"Best trial: {study.best_trial.value} with parameters {study.best_trial.params}")


[32m[I 2023-03-14 19:31:00,525][0m A new study created in memory with name: no-name-94532758-556b-4cdb-b52a-ca376201e263[0m
















[32m[I 2023-03-14 19:31:09,836][0m Trial 0 finished with value: 1.5173975379537745 and parameters: {'colsample_bytree': 0.49154151260249135, 'colsample_bynode': 0.11142946725011174, 'max_depth': 5, 'learning_rate': 0.07354210694050282, 'lambda_l1': 8.798090798420677, 'lambda_l2': 9.663972207751238, 'num_leaves': 344, 'min_data_in_leaf': 228}. Best is trial 0 with value: 1.5173975379537745.[0m


Mean SMAPE: 1.5174


















[32m[I 2023-03-14 19:31:25,162][0m Trial 1 finished with value: 2.1222443958002506 and parameters: {'colsample_bytree': 0.4144823683016665, 'colsample_bynode': 0.3998192032311526, 'max_depth': 8, 'learning_rate': 0.012383608038792614, 'lambda_l1': 6.956412581820987, 'lambda_l2': 4.68013846881712, 'num_leaves': 157, 'min_data_in_leaf': 141}. Best is trial 0 with value: 1.5173975379537745.[0m


Mean SMAPE: 2.1222


















[32m[I 2023-03-14 19:31:33,092][0m Trial 2 finished with value: 1.0254241274843972 and parameters: {'colsample_bytree': 0.9977205230313255, 'colsample_bynode': 0.7080360647784012, 'max_depth': 4, 'learning_rate': 0.018978445945649083, 'lambda_l1': 2.7602839764331377, 'lambda_l2': 7.776303590311883, 'num_leaves': 687, 'min_data_in_leaf': 17}. Best is trial 2 with value: 1.0254241274843972.[0m


Mean SMAPE: 1.0254


















[32m[I 2023-03-14 19:31:39,260][0m Trial 3 finished with value: 1.658345108796468 and parameters: {'colsample_bytree': 0.2245332218709437, 'colsample_bynode': 0.7863241004983526, 'max_depth': 4, 'learning_rate': 0.03203098904076456, 'lambda_l1': 2.4135939101094235, 'lambda_l2': 2.366492814227422, 'num_leaves': 407, 'min_data_in_leaf': 37}. Best is trial 2 with value: 1.0254241274843972.[0m


Mean SMAPE: 1.6583


















[32m[I 2023-03-14 19:31:52,708][0m Trial 4 finished with value: 0.6695384759267544 and parameters: {'colsample_bytree': 0.8034255449224383, 'colsample_bynode': 0.768360669652405, 'max_depth': 8, 'learning_rate': 0.05701226637350264, 'lambda_l1': 6.708374329205309, 'lambda_l2': 2.414955103177976, 'num_leaves': 22, 'min_data_in_leaf': 238}. Best is trial 4 with value: 0.6695384759267544.[0m


Mean SMAPE: 0.6695


















[32m[I 2023-03-14 19:31:59,969][0m Trial 5 finished with value: 2.7295873062391203 and parameters: {'colsample_bytree': 0.22067708705850753, 'colsample_bynode': 0.12741106929572957, 'max_depth': 5, 'learning_rate': 0.02108475511279568, 'lambda_l1': 6.436315127470481, 'lambda_l2': 4.7615465632915335, 'num_leaves': 14, 'min_data_in_leaf': 17}. Best is trial 4 with value: 0.6695384759267544.[0m


Mean SMAPE: 2.7296


















[32m[I 2023-03-14 19:32:14,270][0m Trial 6 finished with value: 2.793215726943933 and parameters: {'colsample_bytree': 0.14998808081028622, 'colsample_bynode': 0.36510560544423265, 'max_depth': 7, 'learning_rate': 0.01653332482542995, 'lambda_l1': 0.26686774693203275, 'lambda_l2': 0.666895622047969, 'num_leaves': 1022, 'min_data_in_leaf': 57}. Best is trial 4 with value: 0.6695384759267544.[0m


Mean SMAPE: 2.7932


















[32m[I 2023-03-14 19:32:20,527][0m Trial 7 finished with value: 2.196127524502728 and parameters: {'colsample_bytree': 0.3867991372071178, 'colsample_bynode': 0.37925908719328005, 'max_depth': 3, 'learning_rate': 0.01771983101236604, 'lambda_l1': 3.139448389025574, 'lambda_l2': 4.867169332526855, 'num_leaves': 950, 'min_data_in_leaf': 166}. Best is trial 4 with value: 0.6695384759267544.[0m


Mean SMAPE: 2.1961


















[32m[I 2023-03-14 19:32:37,509][0m Trial 8 finished with value: 1.0440343875222937 and parameters: {'colsample_bytree': 0.8010683740723699, 'colsample_bynode': 0.17930759951733294, 'max_depth': 7, 'learning_rate': 0.056246708329188454, 'lambda_l1': 8.955379393565995, 'lambda_l2': 3.7379255333411545, 'num_leaves': 996, 'min_data_in_leaf': 137}. Best is trial 4 with value: 0.6695384759267544.[0m


Mean SMAPE: 1.0440


















[32m[I 2023-03-14 19:32:51,328][0m Trial 9 finished with value: 1.2702664549340086 and parameters: {'colsample_bytree': 0.6055680119010364, 'colsample_bynode': 0.6995098011813041, 'max_depth': 10, 'learning_rate': 0.020297683369053222, 'lambda_l1': 8.301091501093564, 'lambda_l2': 8.604296885882471, 'num_leaves': 768, 'min_data_in_leaf': 98}. Best is trial 4 with value: 0.6695384759267544.[0m


Mean SMAPE: 1.2703


















[32m[I 2023-03-14 19:33:10,545][0m Trial 10 finished with value: 0.5541441907908725 and parameters: {'colsample_bytree': 0.734433635548391, 'colsample_bynode': 0.9927792740206849, 'max_depth': 10, 'learning_rate': 0.09786532872252506, 'lambda_l1': 5.5290029335075594, 'lambda_l2': 0.0892597537625397, 'num_leaves': 218, 'min_data_in_leaf': 233}. Best is trial 10 with value: 0.5541441907908725.[0m


Mean SMAPE: 0.5541


















[32m[I 2023-03-14 19:33:28,015][0m Trial 11 finished with value: 0.5710503230898868 and parameters: {'colsample_bytree': 0.6990214346641206, 'colsample_bynode': 0.9913040720840991, 'max_depth': 10, 'learning_rate': 0.09752922004575239, 'lambda_l1': 5.298625722692774, 'lambda_l2': 0.1346009087164929, 'num_leaves': 179, 'min_data_in_leaf': 249}. Best is trial 10 with value: 0.5541441907908725.[0m


Mean SMAPE: 0.5711


















[32m[I 2023-03-14 19:33:48,189][0m Trial 12 finished with value: 0.5711293806179187 and parameters: {'colsample_bytree': 0.6525864531802021, 'colsample_bynode': 0.9891698094094834, 'max_depth': 10, 'learning_rate': 0.09348441691985523, 'lambda_l1': 5.225789514940636, 'lambda_l2': 0.27778129979336985, 'num_leaves': 279, 'min_data_in_leaf': 203}. Best is trial 10 with value: 0.5541441907908725.[0m


Mean SMAPE: 0.5711


















[32m[I 2023-03-14 19:34:09,007][0m Trial 13 finished with value: 0.553661847851888 and parameters: {'colsample_bytree': 0.7308647708963529, 'colsample_bynode': 0.9818415726909208, 'max_depth': 9, 'learning_rate': 0.09473476194418658, 'lambda_l1': 4.884834179054331, 'lambda_l2': 0.04809727874257863, 'num_leaves': 554, 'min_data_in_leaf': 194}. Best is trial 13 with value: 0.553661847851888.[0m


Mean SMAPE: 0.5537


















[32m[I 2023-03-14 19:34:30,349][0m Trial 14 finished with value: 0.6451437067255299 and parameters: {'colsample_bytree': 0.780211469158408, 'colsample_bynode': 0.8718457706345211, 'max_depth': 9, 'learning_rate': 0.04891884320709357, 'lambda_l1': 4.329915524162238, 'lambda_l2': 1.6366086459857399, 'num_leaves': 553, 'min_data_in_leaf': 193}. Best is trial 13 with value: 0.553661847851888.[0m


Mean SMAPE: 0.6451


















[32m[I 2023-03-14 19:34:48,288][0m Trial 15 finished with value: 0.49877623401352195 and parameters: {'colsample_bytree': 0.922550834662478, 'colsample_bynode': 0.8865208494474415, 'max_depth': 9, 'learning_rate': 0.09982813819054616, 'lambda_l1': 4.135437333012513, 'lambda_l2': 1.4608779450861495, 'num_leaves': 539, 'min_data_in_leaf': 189}. Best is trial 15 with value: 0.49877623401352195.[0m


Mean SMAPE: 0.4988


















[32m[I 2023-03-14 19:35:04,598][0m Trial 16 finished with value: 0.5514725352063831 and parameters: {'colsample_bytree': 0.9309563903376601, 'colsample_bynode': 0.8813253268921614, 'max_depth': 8, 'learning_rate': 0.07148947095569, 'lambda_l1': 4.151358673676448, 'lambda_l2': 1.8356104846465877, 'num_leaves': 511, 'min_data_in_leaf': 180}. Best is trial 15 with value: 0.49877623401352195.[0m


Mean SMAPE: 0.5515


















[32m[I 2023-03-14 19:35:26,226][0m Trial 17 finished with value: 0.5670109115013255 and parameters: {'colsample_bytree': 0.9853906628769907, 'colsample_bynode': 0.6073365839725319, 'max_depth': 8, 'learning_rate': 0.0696531329223106, 'lambda_l1': 3.8875975347030316, 'lambda_l2': 3.324985236587981, 'num_leaves': 663, 'min_data_in_leaf': 98}. Best is trial 15 with value: 0.49877623401352195.[0m


Mean SMAPE: 0.5670


















[32m[I 2023-03-14 19:35:39,822][0m Trial 18 finished with value: 0.7033207231183802 and parameters: {'colsample_bytree': 0.8965114782805012, 'colsample_bynode': 0.873317613937075, 'max_depth': 6, 'learning_rate': 0.03789695318250636, 'lambda_l1': 2.063108786028798, 'lambda_l2': 1.9809163759416053, 'num_leaves': 429, 'min_data_in_leaf': 168}. Best is trial 15 with value: 0.49877623401352195.[0m


Mean SMAPE: 0.7033


















[32m[I 2023-03-14 19:36:02,758][0m Trial 19 finished with value: 0.5430054714553861 and parameters: {'colsample_bytree': 0.876672240265486, 'colsample_bynode': 0.8643191960002234, 'max_depth': 9, 'learning_rate': 0.07277889651307486, 'lambda_l1': 3.817536482492845, 'lambda_l2': 1.2552769958120122, 'num_leaves': 786, 'min_data_in_leaf': 166}. Best is trial 15 with value: 0.49877623401352195.[0m


Mean SMAPE: 0.5430
Best trial: 0.49877623401352195 with parameters {'colsample_bytree': 0.922550834662478, 'colsample_bynode': 0.8865208494474415, 'max_depth': 9, 'learning_rate': 0.09982813819054616, 'lambda_l1': 4.135437333012513, 'lambda_l2': 1.4608779450861495, 'num_leaves': 539, 'min_data_in_leaf': 189}


In [567]:
import xgboost as xgb

def xgb_objective(trial, df_train, target_train):
    """
    Objective function for XGBoost optimization without rolling window cross-validation.
    """
    params = {
        'n_estimators': 200,
        'verbosity': 0,
        'objective': 'reg:squarederror',
        'random_state': 42,
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1.0),
        'colsample_bynode': trial.suggest_float('colsample_bynode', 0.1, 1.0),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'alpha': trial.suggest_float('alpha', 1e-2, 10.0),
        'lambda': trial.suggest_float('lambda', 1e-2, 10.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 5, 250),
    }

    model = xgb.XGBRegressor(**params)
    score = smape_cv(model, df_train, target_train)
    return score


In [569]:
# Create the study
study = optuna.create_study(direction='minimize')

# Optimize the study
study.optimize(lambda trial: xgb_objective(trial, df_train, target_train), n_trials=30)

# Print the best trial
print(f"Best trial: {study.best_trial.value} with parameters {study.best_trial.params}")


[32m[I 2023-03-14 19:38:17,243][0m A new study created in memory with name: no-name-6bced3e3-1f87-4326-8537-55141f03b4db[0m
[32m[I 2023-03-14 19:38:37,577][0m Trial 0 finished with value: 2.704374945494864 and parameters: {'colsample_bytree': 0.4682941391932415, 'colsample_bynode': 0.6982854631495193, 'max_depth': 8, 'learning_rate': 0.015216171491942062, 'alpha': 9.758805941563816, 'lambda': 0.4437882972622384, 'min_child_weight': 22}. Best is trial 0 with value: 2.704374945494864.[0m


Mean SMAPE: 2.7044


[32m[I 2023-03-14 19:38:52,449][0m Trial 1 finished with value: 1.0140977498720574 and parameters: {'colsample_bytree': 0.412737287458086, 'colsample_bynode': 0.24590672717561096, 'max_depth': 6, 'learning_rate': 0.045272347755726104, 'alpha': 9.088250070490984, 'lambda': 1.9219035329453777, 'min_child_weight': 7}. Best is trial 1 with value: 1.0140977498720574.[0m


Mean SMAPE: 1.0141


[32m[I 2023-03-14 19:39:11,892][0m Trial 2 finished with value: 0.634442486787863 and parameters: {'colsample_bytree': 0.34220562707296787, 'colsample_bynode': 0.5125269664182217, 'max_depth': 6, 'learning_rate': 0.05499796909602932, 'alpha': 3.3487545746642224, 'lambda': 2.97233068127703, 'min_child_weight': 164}. Best is trial 2 with value: 0.634442486787863.[0m


Mean SMAPE: 0.6344


[32m[I 2023-03-14 19:40:04,890][0m Trial 3 finished with value: 0.3947248514907172 and parameters: {'colsample_bytree': 0.9704938060255041, 'colsample_bynode': 0.6347958159527731, 'max_depth': 8, 'learning_rate': 0.07148620646545495, 'alpha': 5.637277708051547, 'lambda': 2.1040886445097766, 'min_child_weight': 131}. Best is trial 3 with value: 0.3947248514907172.[0m


Mean SMAPE: 0.3947


[32m[I 2023-03-14 19:40:19,716][0m Trial 4 finished with value: 4.415756868484651 and parameters: {'colsample_bytree': 0.43317525570134396, 'colsample_bynode': 0.5479895614972511, 'max_depth': 5, 'learning_rate': 0.012556416286093395, 'alpha': 3.803379833191915, 'lambda': 7.573408106063435, 'min_child_weight': 99}. Best is trial 3 with value: 0.3947248514907172.[0m


Mean SMAPE: 4.4158


[32m[I 2023-03-14 19:40:26,434][0m Trial 5 finished with value: 2.3727163816313572 and parameters: {'colsample_bytree': 0.16841460812657605, 'colsample_bynode': 0.2695972141646038, 'max_depth': 3, 'learning_rate': 0.04437588681751722, 'alpha': 6.477419844861327, 'lambda': 9.7624847104419, 'min_child_weight': 223}. Best is trial 3 with value: 0.3947248514907172.[0m


Mean SMAPE: 2.3727


[32m[I 2023-03-14 19:40:36,534][0m Trial 6 finished with value: 3.086685909456854 and parameters: {'colsample_bytree': 0.11236394790089554, 'colsample_bynode': 0.7691182043135943, 'max_depth': 8, 'learning_rate': 0.01879933494761972, 'alpha': 5.979739570588936, 'lambda': 8.722094960761007, 'min_child_weight': 85}. Best is trial 3 with value: 0.3947248514907172.[0m


Mean SMAPE: 3.0867


[32m[I 2023-03-14 19:40:52,036][0m Trial 7 finished with value: 1.103094195365116 and parameters: {'colsample_bytree': 0.1626064839356779, 'colsample_bynode': 0.5889749587825721, 'max_depth': 7, 'learning_rate': 0.06559151330886043, 'alpha': 5.869296614911299, 'lambda': 8.40109276010664, 'min_child_weight': 114}. Best is trial 3 with value: 0.3947248514907172.[0m


Mean SMAPE: 1.1031


[32m[I 2023-03-14 19:41:48,713][0m Trial 8 finished with value: 0.37355391510470765 and parameters: {'colsample_bytree': 0.9811239871446191, 'colsample_bynode': 0.9880110640642417, 'max_depth': 10, 'learning_rate': 0.02664991657063991, 'alpha': 8.694159070362746, 'lambda': 3.198759207939015, 'min_child_weight': 32}. Best is trial 8 with value: 0.37355391510470765.[0m


Mean SMAPE: 0.3736


[32m[I 2023-03-14 19:41:54,026][0m Trial 9 finished with value: 5.5722010116715515 and parameters: {'colsample_bytree': 0.16747325557100784, 'colsample_bynode': 0.21400761085049358, 'max_depth': 7, 'learning_rate': 0.012061107934022442, 'alpha': 8.39774298017449, 'lambda': 6.875664527028463, 'min_child_weight': 57}. Best is trial 8 with value: 0.37355391510470765.[0m


Mean SMAPE: 5.5722


[32m[I 2023-03-14 19:43:10,036][0m Trial 10 finished with value: 0.6030201477390257 and parameters: {'colsample_bytree': 0.9824505003489774, 'colsample_bynode': 0.9909140551039898, 'max_depth': 10, 'learning_rate': 0.025266628462142584, 'alpha': 0.9430175016124718, 'lambda': 4.561797020462248, 'min_child_weight': 247}. Best is trial 8 with value: 0.37355391510470765.[0m


Mean SMAPE: 0.6030


[32m[I 2023-03-14 19:44:28,255][0m Trial 11 finished with value: 0.40770975313113167 and parameters: {'colsample_bytree': 0.9963081468724337, 'colsample_bynode': 0.9783536463800468, 'max_depth': 10, 'learning_rate': 0.09506606128845521, 'alpha': 7.804857418111723, 'lambda': 3.386831516086929, 'min_child_weight': 161}. Best is trial 8 with value: 0.37355391510470765.[0m


Mean SMAPE: 0.4077


[32m[I 2023-03-14 19:45:09,696][0m Trial 12 finished with value: 0.4403990640213202 and parameters: {'colsample_bytree': 0.8325631133608942, 'colsample_bynode': 0.847944309746787, 'max_depth': 9, 'learning_rate': 0.029230889918562417, 'alpha': 7.749936615036565, 'lambda': 0.06375027081316675, 'min_child_weight': 157}. Best is trial 8 with value: 0.37355391510470765.[0m


Mean SMAPE: 0.4404


[32m[I 2023-03-14 19:45:58,743][0m Trial 13 finished with value: 0.3231983642535061 and parameters: {'colsample_bytree': 0.7769442483931559, 'colsample_bynode': 0.8687191780722157, 'max_depth': 9, 'learning_rate': 0.035130120924274415, 'alpha': 7.049640654593863, 'lambda': 5.149566711277331, 'min_child_weight': 46}. Best is trial 13 with value: 0.3231983642535061.[0m


Mean SMAPE: 0.3232


[32m[I 2023-03-14 19:46:38,685][0m Trial 14 finished with value: 0.6570556218623265 and parameters: {'colsample_bytree': 0.7762297490990856, 'colsample_bynode': 0.8504486297064486, 'max_depth': 10, 'learning_rate': 0.023373659695271845, 'alpha': 9.813455752540357, 'lambda': 5.6591762440070905, 'min_child_weight': 47}. Best is trial 13 with value: 0.3231983642535061.[0m


Mean SMAPE: 0.6571


[32m[I 2023-03-14 19:47:29,049][0m Trial 15 finished with value: 0.33660470573846124 and parameters: {'colsample_bytree': 0.6802133288252847, 'colsample_bynode': 0.9992080325123118, 'max_depth': 9, 'learning_rate': 0.03489024768696658, 'alpha': 7.463047108437568, 'lambda': 5.119704029077876, 'min_child_weight': 60}. Best is trial 13 with value: 0.3231983642535061.[0m


Mean SMAPE: 0.3366


[32m[I 2023-03-14 19:48:14,294][0m Trial 16 finished with value: 0.3562230731827654 and parameters: {'colsample_bytree': 0.6412056135826667, 'colsample_bynode': 0.8687727899684035, 'max_depth': 9, 'learning_rate': 0.03616614468305965, 'alpha': 7.121859642173829, 'lambda': 5.319228675640063, 'min_child_weight': 69}. Best is trial 13 with value: 0.3231983642535061.[0m


Mean SMAPE: 0.3562


[32m[I 2023-03-14 19:48:40,741][0m Trial 17 finished with value: 0.3431757253487018 and parameters: {'colsample_bytree': 0.6647946424243605, 'colsample_bynode': 0.7524799347984756, 'max_depth': 4, 'learning_rate': 0.03584456963000797, 'alpha': 6.8631961821855505, 'lambda': 6.295863302066506, 'min_child_weight': 75}. Best is trial 13 with value: 0.3231983642535061.[0m


Mean SMAPE: 0.3432


[32m[I 2023-03-14 19:49:24,973][0m Trial 18 finished with value: 0.315527144979913 and parameters: {'colsample_bytree': 0.5883228563151464, 'colsample_bynode': 0.9155622657255146, 'max_depth': 9, 'learning_rate': 0.03430375555235966, 'alpha': 4.988248109104449, 'lambda': 4.468698542706674, 'min_child_weight': 37}. Best is trial 18 with value: 0.315527144979913.[0m


Mean SMAPE: 0.3155


[32m[I 2023-03-14 19:50:01,766][0m Trial 19 finished with value: 0.8859331945999285 and parameters: {'colsample_bytree': 0.5773300076132888, 'colsample_bynode': 0.8847151268331112, 'max_depth': 8, 'learning_rate': 0.021424951740198376, 'alpha': 5.073678663990298, 'lambda': 4.150987577386679, 'min_child_weight': 11}. Best is trial 18 with value: 0.315527144979913.[0m


Mean SMAPE: 0.8859


[32m[I 2023-03-14 19:50:37,353][0m Trial 20 finished with value: 0.39584091248838166 and parameters: {'colsample_bytree': 0.781973649355837, 'colsample_bynode': 0.4424624425573453, 'max_depth': 9, 'learning_rate': 0.03199835147273825, 'alpha': 4.200717806611307, 'lambda': 5.713394732051283, 'min_child_weight': 35}. Best is trial 18 with value: 0.315527144979913.[0m


Mean SMAPE: 0.3958


[32m[I 2023-03-14 19:51:22,080][0m Trial 21 finished with value: 0.36987006114431986 and parameters: {'colsample_bytree': 0.5965624031035428, 'colsample_bynode': 0.9207417229022221, 'max_depth': 9, 'learning_rate': 0.03806193014213736, 'alpha': 7.220679973813675, 'lambda': 4.689041264784632, 'min_child_weight': 50}. Best is trial 18 with value: 0.315527144979913.[0m


Mean SMAPE: 0.3699


[32m[I 2023-03-14 19:52:02,547][0m Trial 22 finished with value: 0.4246261860663479 and parameters: {'colsample_bytree': 0.6755389950457995, 'colsample_bynode': 0.7937964175799626, 'max_depth': 9, 'learning_rate': 0.02992040013461166, 'alpha': 4.830671050216401, 'lambda': 6.37718516440858, 'min_child_weight': 90}. Best is trial 18 with value: 0.315527144979913.[0m


Mean SMAPE: 0.4246


[33m[W 2023-03-14 19:52:06,134][0m Trial 23 failed with parameters: {'colsample_bytree': 0.5208313996967846, 'colsample_bynode': 0.9120042568617732, 'max_depth': 7, 'learning_rate': 0.04413552365828709, 'alpha': 7.725905284934811, 'lambda': 5.1487578637215226, 'min_child_weight': 65} because of the following error: KeyboardInterrupt().[0m
Traceback (most recent call last):
  File "/opt/homebrew/Caskroom/miniforge/base/lib/python3.9/site-packages/optuna/study/_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "/var/folders/bb/g8c6d14n37lb8f846cg2t1_m0000gn/T/ipykernel_28597/2937589560.py", line 5, in <lambda>
    study.optimize(lambda trial: xgb_objective(trial, df_train, target_train), n_trials=30)
  File "/var/folders/bb/g8c6d14n37lb8f846cg2t1_m0000gn/T/ipykernel_28597/3957172496.py", line 22, in xgb_objective
    score = smape_cv(model, df_train, target_train)
  File "/var/folders/bb/g8c6d14n37lb8f846cg2t1_m0000gn/T/ipykernel_28597/3551183721.py", line

KeyboardInterrupt: 

In [555]:
from sklearn.linear_model import LinearRegression, ElasticNet, Lasso, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb


LGBM = lgb.LGBMRegressor(**params)

def smape_cv2(model):
    tscv = TimeSeriesSplit(n_splits=5)
    smape_list = []
    model_name = model.__class__.__name__
    for _, (train_index, test_index) in tqdm(enumerate(tscv.split(df_train), start=1), 
                                             desc=f'{model_name} Cross Validations', total=7):
        X_train, X_test = df_train.iloc[train_index], df_train.iloc[test_index]
        y_train, y_test = target_train.iloc[train_index], target_train.iloc[test_index]
        clf = model.fit(X_train, y_train)
        pred = clf.predict(X_test)
        pred =  np.exp(pred) * X_test['microbusiness_density']
        y_test = np.exp(y_test) * X_test['microbusiness_density']
        smape = smape_loss(y_test, pred) 
        smape_list.append(smape)
    return model_name, smape_list

def print_smape_score(model):
    # print cv and save the average
    model_name, score = smape_cv2(model)
    for i, r in enumerate(score, start=1):
        print(f'{i} FOLDS: {model_name} smape: {r:.4f}')
    print(f'\n{model_name} mean smape: {np.mean(score):.4f}')
    print('='*30)
    return model_name, np.mean(score)

models = []
scores = []
for model in [LGBM]:
    model_name, mean_score = print_smape_score(model)
    models.append(model_name)
    scores.append(mean_score)

LGBMRegressor Cross Validations:   0%|          | 0/7 [00:00<?, ?it/s]



LGBMRegressor Cross Validations:  14%|█▍        | 1/7 [00:00<00:03,  1.62it/s]



LGBMRegressor Cross Validations:  29%|██▊       | 2/7 [00:01<00:03,  1.33it/s]



LGBMRegressor Cross Validations:  43%|████▎     | 3/7 [00:02<00:03,  1.11it/s]



LGBMRegressor Cross Validations:  57%|█████▋    | 4/7 [00:03<00:03,  1.01s/it]



LGBMRegressor Cross Validations:  71%|███████▏  | 5/7 [00:05<00:02,  1.08s/it]

1 FOLDS: LGBMRegressor smape: 0.7201
2 FOLDS: LGBMRegressor smape: 0.3819
3 FOLDS: LGBMRegressor smape: 0.5324
4 FOLDS: LGBMRegressor smape: 0.3208
5 FOLDS: LGBMRegressor smape: 0.3340

LGBMRegressor mean smape: 0.4578





In [213]:
model = xgb.XGBRegressor(**params)

In [556]:
x_test = df_train[df_train.idx == 40]
model.fit(df_train, target_train)

y_pred = np.exp(model.predict(x_test)) * x_test.microbusiness_density
y_true = np.exp(x_test.md_log_diff_3) * x_test.microbusiness_density
y_base = x_test.microbusiness_density



In [557]:
y_pred = y_pred.reset_index('row_id')
y_base = y_base.reset_index()
y_base = y_base.rename(columns={'microbusiness_density': 'y_base'})
y_true = y_true.reset_index()
y_true = y_true.rename(columns={0: 'y_true'})

In [558]:
y_pred

Unnamed: 0,row_id,microbusiness_density
0,1001_2022-12-01,3.471754
1,1003_2022-12-01,8.249109
2,1005_2022-12-01,1.252116
3,1007_2022-12-01,1.287124
4,1009_2022-12-01,1.851950
...,...,...
3130,56037_2022-12-01,2.877461
3131,56039_2022-12-01,26.267223
3132,56041_2022-12-01,4.050750
3133,56043_2022-12-01,3.060968


In [559]:
clip_err = True

if clip_err:
    df_record = y_pred.merge(y_base, on = 'row_id', how='inner')
    df_record = df_record.merge(y_true, on = 'row_id', how='inner')
    def smape_(y_true, y_pred):
        return 100  * np.sum(2 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred)))
    
    base_err = df_record.apply(lambda x: smape_(x.y_true, x.y_base), axis=1)
    pred_err = df_record.apply(lambda x: smape_(x.y_true, x.microbusiness_density), axis=1)
    blacklist = base_err[(base_err + 1e-3) < pred_err].index
    y_pred.iloc[blacklist] = y_base.iloc[blacklist]
    avg_base_err = base_err.mean()
    avg_pred_err = pred_err.mean()
    print(avg_base_err, avg_pred_err)
    df_sub = pd.read_csv('sample_submission.csv')
    df_sub = df_sub.drop(columns='microbusiness_density')
    df_sub['cfips'] = df_sub.row_id.str.split('_', expand=True)[0]
    y_pred['cfips'] = y_pred.row_id.str.split('_', expand=True)[0]
    merged_df = df_sub.merge(y_pred, on='cfips', how='left', suffixes=('', '_drop'))
    merged_df = merged_df.loc[:,~merged_df.columns.str.endswith('_drop')]
    # merged_df = merged_df.drop(columns='cfips')
else:
    df_sub = pd.read_csv('sample_submission.csv')
    df_sub = df_sub.drop(columns='microbusiness_density')
    df_sub['cfips'] = df_sub.row_id.str.split('_', expand=True)[0]
    y_pred['cfips'] = y_pred.row_id.str.split('_', expand=True)[0]
    merged_df = df_sub.merge(y_pred, on='cfips', how='left', suffixes=('', '_drop'))
    merged_df = merged_df.loc[:,~merged_df.columns.str.endswith('_drop')]
    # merged_df = merged_df.drop(columns='cfips')

1.8284295961187311 1.8374568832227278


In [560]:
merged_df

Unnamed: 0,row_id,cfips,microbusiness_density
0,1001_2022-11-01,1001,3.471754
1,1003_2022-11-01,1003,8.249109
2,1005_2022-11-01,1005,1.252272
3,1007_2022-11-01,1007,1.287240
4,1009_2022-11-01,1009,1.852060
...,...,...,...
25075,56037_2023-06-01,56037,2.877461
25076,56039_2023-06-01,56039,26.267223
25077,56041_2023-06-01,56041,4.050750
25078,56043_2023-06-01,56043,3.060380


In [561]:
COLS = ['GEO_ID','NAME','S0101_C01_026E']
df2020 = pd.read_csv('ACSST5Y2020.S0101-Data.csv', usecols=COLS)
df2020 = df2020.iloc[1:]
df2020['S0101_C01_026E'] = df2020['S0101_C01_026E'].astype('int')
df2020


  df2020 = pd.read_csv('ACSST5Y2020.S0101-Data.csv', usecols=COLS)


Unnamed: 0,GEO_ID,NAME,S0101_C01_026E
1,0500000US01001,"Autauga County, Alabama",42496
2,0500000US01003,"Baldwin County, Alabama",171296
3,0500000US01005,"Barbour County, Alabama",19804
4,0500000US01007,"Bibb County, Alabama",17790
5,0500000US01009,"Blount County, Alabama",44383
...,...,...,...
3217,0500000US72145,"Vega Baja Municipio, Puerto Rico",41383
3218,0500000US72147,"Vieques Municipio, Puerto Rico",6884
3219,0500000US72149,"Villalba Municipio, Puerto Rico",17344
3220,0500000US72151,"Yabucoa Municipio, Puerto Rico",26803


In [562]:
df2021 = pd.read_csv('ACSST5Y2021.S0101-Data.csv', usecols=COLS)
df2021 = df2021.iloc[1:]
df2021['S0101_C01_026E'] = df2021['S0101_C01_026E'].astype('int')
df2021

  df2021 = pd.read_csv('ACSST5Y2021.S0101-Data.csv', usecols=COLS)


Unnamed: 0,GEO_ID,NAME,S0101_C01_026E
1,0500000US01001,"Autauga County, Alabama",44438
2,0500000US01003,"Baldwin County, Alabama",178105
3,0500000US01005,"Barbour County, Alabama",19995
4,0500000US01007,"Bibb County, Alabama",17800
5,0500000US01009,"Blount County, Alabama",45201
...,...,...,...
3217,0500000US72145,"Vega Baja Municipio, Puerto Rico",44604
3218,0500000US72147,"Vieques Municipio, Puerto Rico",6760
3219,0500000US72149,"Villalba Municipio, Puerto Rico",17811
3220,0500000US72151,"Yabucoa Municipio, Puerto Rico",25533


In [563]:
df2020['cfips'] = df2020.GEO_ID.apply(lambda x: int(x.split('US')[-1]) )
adult2020 = df2020.set_index('cfips').S0101_C01_026E.to_dict()

df2021['cfips'] = df2021.GEO_ID.apply(lambda x: int(x.split('US')[-1]) )
adult2021 = df2021.set_index('cfips').S0101_C01_026E.to_dict()

In [564]:
df2020

Unnamed: 0,GEO_ID,NAME,S0101_C01_026E,cfips
1,0500000US01001,"Autauga County, Alabama",42496,1001
2,0500000US01003,"Baldwin County, Alabama",171296,1003
3,0500000US01005,"Barbour County, Alabama",19804,1005
4,0500000US01007,"Bibb County, Alabama",17790,1007
5,0500000US01009,"Blount County, Alabama",44383,1009
...,...,...,...,...
3217,0500000US72145,"Vega Baja Municipio, Puerto Rico",41383,72145
3218,0500000US72147,"Vieques Municipio, Puerto Rico",6884,72147
3219,0500000US72149,"Villalba Municipio, Puerto Rico",17344,72149
3220,0500000US72151,"Yabucoa Municipio, Puerto Rico",26803,72151


In [565]:
merged_df.cfips = merged_df.cfips.astype(int)
merged_df['adult2020'] = merged_df.cfips.map(adult2020)
merged_df['adult2021'] = merged_df.cfips.map(adult2021)
merged_df

Unnamed: 0,row_id,cfips,microbusiness_density,adult2020,adult2021
0,1001_2022-11-01,1001,3.471754,42496,44438
1,1003_2022-11-01,1003,8.249109,171296,178105
2,1005_2022-11-01,1005,1.252272,19804,19995
3,1007_2022-11-01,1007,1.287240,17790,17800
4,1009_2022-11-01,1009,1.852060,44383,45201
...,...,...,...,...,...
25075,56037_2023-06-01,56037,2.877461,32049,31514
25076,56039_2023-06-01,56039,26.267223,19164,19169
25077,56041_2023-06-01,56041,4.050750,14516,14641
25078,56043_2023-06-01,56043,3.060380,6045,6000


In [566]:
merged_df.microbusiness_density = merged_df.microbusiness_density * merged_df.adult2020 / merged_df.adult2021
merged_df = merged_df.drop(['adult2020','adult2021','cfips'], axis=1)
merged_df.to_csv('lgbm_MAY.csv',index=False)
merged_df.head()

Unnamed: 0,row_id,microbusiness_density
0,1001_2022-11-01,3.320034
1,1003_2022-11-01,7.933743
2,1005_2022-11-01,1.24031
3,1007_2022-11-01,1.286517
4,1009_2022-11-01,1.818544
