In [29]:
from sklearn.model_selection import TimeSeriesSplit
import numpy as np
import lightgbm as lgb
import pandas as pd

train = pd.read_csv('train.csv')
df_train = pd.read_csv('df_train.csv', index_col='row_id')
df_test = pd.read_csv('df_test.csv', index_col='row_id')
target_train = pd.read_csv('target_train.csv', index_col='row_id').squeeze()
target_test = pd.read_csv('target_test.csv', index_col='row_id').squeeze()

In [30]:
def smape_loss(y_true, y_pred):
    """
    Custom SMAPE loss function for LightGBM
    """
    return np.mean(2 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred))) * 100

def smape_cv(model, df_train, target_train):
    tscv = TimeSeriesSplit(n_splits=5)
    smape_list = []
    for _, (train_index, test_index) in enumerate(tscv.split(df_train), start=1):
        # Create training and validation datasets
        X_train = df_train.iloc[train_index]
        y_train = target_train.iloc[train_index]
        X_valid = df_train.iloc[test_index]
        y_valid = target_train.iloc[test_index]

        # Train the model
        model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], eval_metric='mape', verbose=False)

        # Make predictions
        y_pred = np.exp(model.predict(X_valid)) * X_valid['microbusiness_density']
        y_valid = np.exp(y_valid) * X_valid['microbusiness_density']

        # Calculate SMAPE score
        smape = smape_loss(y_valid, y_pred)
        smape_list.append(smape)
    
    print(f'Mean SMAPE: {np.mean(smape_list):.4f}')
    return np.mean(smape_list)

def lgbm_objective(trial, df_train, target_train):
    """
    Objective function for LightGBM optimization without rolling window cross-validation.
    """
    params = {
        'n_estimators'      : 200,
        'verbose'           : -1,
        'objective'         : 'regression_l1',
        'random_state'      : 42,
        'extra_trees'       : True,
        'colsample_bytree'  : trial.suggest_float('colsample_bytree', 0.1, 1.0),
        'colsample_bynode'  : trial.suggest_float('colsample_bynode', 0.1, 1.0),
        'max_depth'         : trial.suggest_int('max_depth', 3, 10),
        'learning_rate'     : trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'lambda_l1'         : trial.suggest_float('lambda_l1', 1e-2, 10.0),
        'lambda_l2'         : trial.suggest_float('lambda_l2', 1e-2, 10.0),
        'num_leaves'        : trial.suggest_int('num_leaves', 8, 1024),
        'min_data_in_leaf'  : trial.suggest_int('min_data_in_leaf', 5, 250),
        'early_stopping_round': 50,}

    model = lgb.LGBMRegressor(**params)
    score = smape_cv(model, df_train, target_train)
    return score

In [31]:
import optuna
import pandas as pd

study = optuna.create_study(direction='minimize')
study.optimize(lambda trial: lgbm_objective(trial, df_train, target_train), n_trials=20)

# Print the best trial
print(f"Best trial: {study.best_trial.value} with parameters {study.best_trial.params}")

[32m[I 2023-04-16 04:03:04,336][0m A new study created in memory with name: no-name-79375568-c738-477b-9927-23ebdb2f8d7c[0m




[32m[I 2023-04-16 04:03:36,167][0m Trial 0 finished with value: 1.743668328012426 and parameters: {'colsample_bytree': 0.9101158510912557, 'colsample_bynode': 0.7950496453512623, 'max_depth': 8, 'learning_rate': 0.01822180689543358, 'lambda_l1': 2.5162708518776586, 'lambda_l2': 2.9497479075951083, 'num_leaves': 445, 'min_data_in_leaf': 14}. Best is trial 0 with value: 1.743668328012426.[0m


Mean SMAPE: 1.7437


[32m[I 2023-04-16 04:03:43,156][0m Trial 1 finished with value: 1.7594763742191233 and parameters: {'colsample_bytree': 0.7869775799988069, 'colsample_bynode': 0.3231580682880848, 'max_depth': 3, 'learning_rate': 0.0913837832744824, 'lambda_l1': 4.833291533136985, 'lambda_l2': 1.1082355556861707, 'num_leaves': 912, 'min_data_in_leaf': 113}. Best is trial 0 with value: 1.743668328012426.[0m


Mean SMAPE: 1.7595


[32m[I 2023-04-16 04:04:09,265][0m Trial 2 finished with value: 1.7451131523857533 and parameters: {'colsample_bytree': 0.694606850736839, 'colsample_bynode': 0.46263929362924106, 'max_depth': 9, 'learning_rate': 0.08586990864129194, 'lambda_l1': 4.052087948078223, 'lambda_l2': 9.669237847358422, 'num_leaves': 477, 'min_data_in_leaf': 98}. Best is trial 0 with value: 1.743668328012426.[0m


Mean SMAPE: 1.7451


[32m[I 2023-04-16 04:04:26,034][0m Trial 3 finished with value: 1.750253627962214 and parameters: {'colsample_bytree': 0.6207516419315072, 'colsample_bynode': 0.22679922764141547, 'max_depth': 7, 'learning_rate': 0.09320530812950635, 'lambda_l1': 7.092336873263224, 'lambda_l2': 2.8697342532911643, 'num_leaves': 985, 'min_data_in_leaf': 73}. Best is trial 0 with value: 1.743668328012426.[0m


Mean SMAPE: 1.7503


[32m[I 2023-04-16 04:04:50,144][0m Trial 4 finished with value: 1.765970658421228 and parameters: {'colsample_bytree': 0.3895490294996682, 'colsample_bynode': 0.10745088172479553, 'max_depth': 10, 'learning_rate': 0.040187954499281926, 'lambda_l1': 2.5584560677185886, 'lambda_l2': 6.595828779817709, 'num_leaves': 995, 'min_data_in_leaf': 46}. Best is trial 0 with value: 1.743668328012426.[0m


Mean SMAPE: 1.7660


[32m[I 2023-04-16 04:05:20,239][0m Trial 5 finished with value: 1.749977290417964 and parameters: {'colsample_bytree': 0.8245367376571622, 'colsample_bynode': 0.3387809566068425, 'max_depth': 9, 'learning_rate': 0.018708397886125373, 'lambda_l1': 6.559444976912319, 'lambda_l2': 2.7020526588380442, 'num_leaves': 277, 'min_data_in_leaf': 41}. Best is trial 0 with value: 1.743668328012426.[0m


Mean SMAPE: 1.7500


[32m[I 2023-04-16 04:05:28,318][0m Trial 6 finished with value: 1.7513512290098496 and parameters: {'colsample_bytree': 0.8552275286070412, 'colsample_bynode': 0.6637729679740559, 'max_depth': 4, 'learning_rate': 0.07787725184741012, 'lambda_l1': 1.6042721065671621, 'lambda_l2': 6.753227814837282, 'num_leaves': 801, 'min_data_in_leaf': 9}. Best is trial 0 with value: 1.743668328012426.[0m


Mean SMAPE: 1.7514


[32m[I 2023-04-16 04:05:38,370][0m Trial 7 finished with value: 1.773574647209599 and parameters: {'colsample_bytree': 0.11036901920718034, 'colsample_bynode': 0.4977137163134838, 'max_depth': 6, 'learning_rate': 0.030730494551666432, 'lambda_l1': 2.8947939146450734, 'lambda_l2': 3.3799910024679796, 'num_leaves': 408, 'min_data_in_leaf': 9}. Best is trial 0 with value: 1.743668328012426.[0m


Mean SMAPE: 1.7736


[32m[I 2023-04-16 04:05:45,221][0m Trial 8 finished with value: 1.7577607620193105 and parameters: {'colsample_bytree': 0.8984565314847984, 'colsample_bynode': 0.6847951130815854, 'max_depth': 3, 'learning_rate': 0.0840625467948917, 'lambda_l1': 5.989140113506917, 'lambda_l2': 2.5396833135600487, 'num_leaves': 142, 'min_data_in_leaf': 58}. Best is trial 0 with value: 1.743668328012426.[0m


Mean SMAPE: 1.7578


[32m[I 2023-04-16 04:06:09,738][0m Trial 9 finished with value: 1.750823060232793 and parameters: {'colsample_bytree': 0.8916947974277349, 'colsample_bynode': 0.14686104810899425, 'max_depth': 9, 'learning_rate': 0.07861234636521508, 'lambda_l1': 8.624962194085922, 'lambda_l2': 6.286225889630309, 'num_leaves': 344, 'min_data_in_leaf': 119}. Best is trial 0 with value: 1.743668328012426.[0m


Mean SMAPE: 1.7508


[32m[I 2023-04-16 04:06:31,913][0m Trial 10 finished with value: 1.750752922862907 and parameters: {'colsample_bytree': 0.9683700434897472, 'colsample_bynode': 0.9799530057000598, 'max_depth': 7, 'learning_rate': 0.010207934161279629, 'lambda_l1': 0.03371173673889594, 'lambda_l2': 0.33159892685769776, 'num_leaves': 636, 'min_data_in_leaf': 193}. Best is trial 0 with value: 1.743668328012426.[0m


Mean SMAPE: 1.7508


[32m[I 2023-04-16 04:06:53,854][0m Trial 11 finished with value: 1.744158466432299 and parameters: {'colsample_bytree': 0.6652945019007349, 'colsample_bynode': 0.8415916271935976, 'max_depth': 8, 'learning_rate': 0.04720396037068302, 'lambda_l1': 4.305074359702532, 'lambda_l2': 9.97722440977874, 'num_leaves': 589, 'min_data_in_leaf': 184}. Best is trial 0 with value: 1.743668328012426.[0m


Mean SMAPE: 1.7442


[32m[I 2023-04-16 04:07:08,868][0m Trial 12 finished with value: 1.7466983905369773 and parameters: {'colsample_bytree': 0.5230721527774299, 'colsample_bynode': 0.8731053367680398, 'max_depth': 6, 'learning_rate': 0.04402608218651887, 'lambda_l1': 3.567714107792349, 'lambda_l2': 8.98718387361253, 'num_leaves': 640, 'min_data_in_leaf': 179}. Best is trial 0 with value: 1.743668328012426.[0m


Mean SMAPE: 1.7467


[32m[I 2023-04-16 04:07:34,383][0m Trial 13 finished with value: 1.745604896813941 and parameters: {'colsample_bytree': 0.9664908365641642, 'colsample_bynode': 0.8013114711302523, 'max_depth': 8, 'learning_rate': 0.02062226274575233, 'lambda_l1': 4.841782539208692, 'lambda_l2': 4.681677371994051, 'num_leaves': 627, 'min_data_in_leaf': 239}. Best is trial 0 with value: 1.743668328012426.[0m


Mean SMAPE: 1.7456


[32m[I 2023-04-16 04:07:50,354][0m Trial 14 finished with value: 1.7483403773146755 and parameters: {'colsample_bytree': 0.728993975929298, 'colsample_bynode': 0.7881774462808858, 'max_depth': 8, 'learning_rate': 0.051794827865504434, 'lambda_l1': 1.8804938166078893, 'lambda_l2': 8.025283785311483, 'num_leaves': 17, 'min_data_in_leaf': 163}. Best is trial 0 with value: 1.743668328012426.[0m


Mean SMAPE: 1.7483


[32m[I 2023-04-16 04:08:47,813][0m Trial 15 finished with value: 1.7448932613112849 and parameters: {'colsample_bytree': 0.6681411431229313, 'colsample_bynode': 0.9867350918149229, 'max_depth': 10, 'learning_rate': 0.025574304796259363, 'lambda_l1': 3.9176662710381085, 'lambda_l2': 9.918705971662554, 'num_leaves': 760, 'min_data_in_leaf': 228}. Best is trial 0 with value: 1.743668328012426.[0m


Mean SMAPE: 1.7449


[32m[I 2023-04-16 04:09:34,707][0m Trial 16 finished with value: 1.755279492017774 and parameters: {'colsample_bytree': 0.5483119686324778, 'colsample_bynode': 0.6348714642927674, 'max_depth': 8, 'learning_rate': 0.015228890075960719, 'lambda_l1': 9.703828730426089, 'lambda_l2': 5.299486963627453, 'num_leaves': 504, 'min_data_in_leaf': 142}. Best is trial 0 with value: 1.743668328012426.[0m


Mean SMAPE: 1.7553


[32m[I 2023-04-16 04:10:09,632][0m Trial 17 finished with value: 1.7464101400105463 and parameters: {'colsample_bytree': 0.7840298241217043, 'colsample_bynode': 0.8603925113970067, 'max_depth': 6, 'learning_rate': 0.03281113767563762, 'lambda_l1': 5.3676481834549525, 'lambda_l2': 8.214724832275662, 'num_leaves': 230, 'min_data_in_leaf': 209}. Best is trial 0 with value: 1.743668328012426.[0m


Mean SMAPE: 1.7464


[32m[I 2023-04-16 04:10:35,045][0m Trial 18 finished with value: 1.7457025152486119 and parameters: {'colsample_bytree': 0.9630255761229617, 'colsample_bynode': 0.7318270917587809, 'max_depth': 5, 'learning_rate': 0.057252551852280856, 'lambda_l1': 0.6664869719924802, 'lambda_l2': 4.3448241800941165, 'num_leaves': 763, 'min_data_in_leaf': 136}. Best is trial 0 with value: 1.743668328012426.[0m


Mean SMAPE: 1.7457


[32m[I 2023-04-16 04:11:07,270][0m Trial 19 finished with value: 1.743683551110469 and parameters: {'colsample_bytree': 0.9964136059729585, 'colsample_bynode': 0.5756934790502, 'max_depth': 7, 'learning_rate': 0.060227554758627434, 'lambda_l1': 2.7182194656122167, 'lambda_l2': 7.4992739745951456, 'num_leaves': 582, 'min_data_in_leaf': 166}. Best is trial 0 with value: 1.743668328012426.[0m


Mean SMAPE: 1.7437
Best trial: 1.743668328012426 with parameters {'colsample_bytree': 0.9101158510912557, 'colsample_bynode': 0.7950496453512623, 'max_depth': 8, 'learning_rate': 0.01822180689543358, 'lambda_l1': 2.5162708518776586, 'lambda_l2': 2.9497479075951083, 'num_leaves': 445, 'min_data_in_leaf': 14}


In [32]:
params = study.best_trial.params
LGBM = lgb.LGBMRegressor(**params)

In [33]:
LGBM.fit(df_train, target_train)
y_pred = np.exp(LGBM.predict(df_test)) * df_test.microbusiness_density



In [34]:
pred = y_pred.values
pred

array([3.35343867, 8.52822966, 1.19579881, ..., 3.90972377, 3.13056649,
       1.80607769])

In [35]:
my_cfips = df_train['cfips'].unique()
train = train[train.cfips.isin(my_cfips)]
y_true = train[train.row_id.str.contains('2022-07-01')].microbusiness_density.values
print(smape_loss(y_true, pred))

2.250033742275787


In [37]:
target_val = train[train.row_id.str.contains('2022-07-01')][['row_id', 'microbusiness_density']]
target_val.to_csv('target_val.csv')

In [None]:
# clip_err = True

# if clip_err:
#     df_record = y_pred.merge(y_base, on = 'row_id', how='inner')
#     df_record = df_record.merge(y_true, on = 'row_id', how='inner')
#     def smape_(y_true, y_pred):
#         return 100  * np.sum(2 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred)))
    
#     base_err = df_record.apply(lambda x: smape_(x.y_true, x.y_base), axis=1)
#     pred_err = df_record.apply(lambda x: smape_(x.y_true, x.microbusiness_density), axis=1)
#     blacklist = base_err[(base_err + 1e-3) < pred_err].index
#     y_pred.iloc[blacklist] = y_base.iloc[blacklist]
#     avg_base_err = base_err.mean()
#     avg_pred_err = pred_err.mean()
#     print(avg_base_err, avg_pred_err)
#     df_sub = pd.read_csv('sample_submission.csv')
#     df_sub = df_sub.drop(columns='microbusiness_density')
#     df_sub['cfips'] = df_sub.row_id.str.split('_', expand=True)[0]
#     y_pred['cfips'] = y_pred.row_id.str.split('_', expand=True)[0]
#     merged_df = df_sub.merge(y_pred, on='cfips', how='left', suffixes=('', '_drop'))
#     merged_df = merged_df.loc[:,~merged_df.columns.str.endswith('_drop')]
#     # merged_df = merged_df.drop(columns='cfips')
# else:
#     df_sub = pd.read_csv('sample_submission.csv')
#     df_sub = df_sub.drop(columns='microbusiness_density')
#     df_sub['cfips'] = df_sub.row_id.str.split('_', expand=True)[0]
#     y_pred['cfips'] = y_pred.row_id.str.split('_', expand=True)[0]
#     merged_df = df_sub.merge(y_pred, on='cfips', how='left', suffixes=('', '_drop'))
#     merged_df = merged_df.loc[:,~merged_df.columns.str.endswith('_drop')]
#     # merged_df = merged_df.drop(columns='cfips')