In [59]:
from sklearn.model_selection import TimeSeriesSplit
import numpy as np
import lightgbm as lgb
import pandas as pd

train = pd.read_csv('train.csv')
df_train = pd.read_csv('df_train.csv', index_col='row_id')
df_test = pd.read_csv('df_test.csv', index_col='row_id')
target_train = pd.read_csv('target_train.csv', index_col='row_id').squeeze()
target_test = pd.read_csv('target_test.csv', index_col='row_id').squeeze()

In [None]:
def smape_loss(y_true, y_pred):
    """
    Custom SMAPE loss function for LightGBM
    """
    return np.mean(2 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred))) * 100

def smape_cv(model, df_train, target_train):
    tscv = TimeSeriesSplit(n_splits=5)
    smape_list = []
    for _, (train_index, test_index) in enumerate(tscv.split(df_train), start=1):
        # Create training and validation datasets
        X_train = df_train.iloc[train_index]
        y_train = target_train.iloc[train_index]
        X_valid = df_train.iloc[test_index]
        y_valid = target_train.iloc[test_index]

        # Train the model
        model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], eval_metric='mape', verbose=False)

        # Make predictions
        y_pred = np.exp(model.predict(X_valid)) * X_valid['microbusiness_density']
        y_valid = np.exp(y_valid) * X_valid['microbusiness_density']

        # Calculate SMAPE score
        smape = smape_loss(y_valid, y_pred)
        smape_list.append(smape)
    
    print(f'Mean SMAPE: {np.mean(smape_list):.4f}')
    return np.mean(smape_list)

def lgbm_objective(trial, df_train, target_train):
    """
    Objective function for LightGBM optimization without rolling window cross-validation.
    """
    params = {
        'n_estimators'      : 200,
        'verbose'           : -1,
        'objective'         : 'regression_l1',
        'random_state'      : 42,
        'extra_trees'       : True,
        'colsample_bytree'  : trial.suggest_float('colsample_bytree', 0.1, 1.0),
        'colsample_bynode'  : trial.suggest_float('colsample_bynode', 0.1, 1.0),
        'max_depth'         : trial.suggest_int('max_depth', 3, 10),
        'learning_rate'     : trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'lambda_l1'         : trial.suggest_float('lambda_l1', 1e-2, 10.0),
        'lambda_l2'         : trial.suggest_float('lambda_l2', 1e-2, 10.0),
        'num_leaves'        : trial.suggest_int('num_leaves', 8, 1024),
        'min_data_in_leaf'  : trial.suggest_int('min_data_in_leaf', 5, 250),
        'early_stopping_round': 50,}

    model = lgb.LGBMRegressor(**params)
    score = smape_cv(model, df_train, target_train)
    return score

In [15]:
import optuna
import pandas as pd

study = optuna.create_study(direction='minimize')
study.optimize(lambda trial: lgbm_objective(trial, df_train, target_train), n_trials=20)

# Print the best trial
print(f"Best trial: {study.best_trial.value} with parameters {study.best_trial.params}")

[32m[I 2023-04-14 21:10:14,202][0m A new study created in memory with name: no-name-a1c265f8-a19c-4d54-bb6b-9315034f9c9d[0m




[32m[I 2023-04-14 21:10:23,761][0m Trial 0 finished with value: 1.7542303345776638 and parameters: {'colsample_bytree': 0.6954366493504512, 'colsample_bynode': 0.888367794528408, 'max_depth': 5, 'learning_rate': 0.05557089796308, 'lambda_l1': 0.6018665058271135, 'lambda_l2': 5.026156487310038, 'num_leaves': 139, 'min_data_in_leaf': 248}. Best is trial 0 with value: 1.7542303345776638.[0m


Mean SMAPE: 1.7542


[32m[I 2023-04-14 21:10:31,604][0m Trial 1 finished with value: 1.7753899123422638 and parameters: {'colsample_bytree': 0.6406399870245952, 'colsample_bynode': 0.8551289858335458, 'max_depth': 4, 'learning_rate': 0.010465074144003937, 'lambda_l1': 2.2653315282894138, 'lambda_l2': 9.995542437705659, 'num_leaves': 402, 'min_data_in_leaf': 102}. Best is trial 0 with value: 1.7542303345776638.[0m


Mean SMAPE: 1.7754


[32m[I 2023-04-14 21:10:40,269][0m Trial 2 finished with value: 1.7788806497870582 and parameters: {'colsample_bytree': 0.11592893439415639, 'colsample_bynode': 0.4984914896736866, 'max_depth': 9, 'learning_rate': 0.05342106248210964, 'lambda_l1': 2.6479298867089063, 'lambda_l2': 8.181170607113343, 'num_leaves': 713, 'min_data_in_leaf': 214}. Best is trial 0 with value: 1.7542303345776638.[0m


Mean SMAPE: 1.7789


[32m[I 2023-04-14 21:10:45,099][0m Trial 3 finished with value: 1.796864832532448 and parameters: {'colsample_bytree': 0.24036751297728018, 'colsample_bynode': 0.10926868162637751, 'max_depth': 3, 'learning_rate': 0.0605160518523824, 'lambda_l1': 9.86497858014404, 'lambda_l2': 1.3930755348355035, 'num_leaves': 17, 'min_data_in_leaf': 61}. Best is trial 0 with value: 1.7542303345776638.[0m


Mean SMAPE: 1.7969


[32m[I 2023-04-14 21:10:52,181][0m Trial 4 finished with value: 1.7581176505388072 and parameters: {'colsample_bytree': 0.6730038444881149, 'colsample_bynode': 0.8244038005691205, 'max_depth': 4, 'learning_rate': 0.052806867963640265, 'lambda_l1': 9.60049294528308, 'lambda_l2': 0.457718928404495, 'num_leaves': 425, 'min_data_in_leaf': 207}. Best is trial 0 with value: 1.7542303345776638.[0m


Mean SMAPE: 1.7581


[32m[I 2023-04-14 21:11:13,759][0m Trial 5 finished with value: 1.7564238622396762 and parameters: {'colsample_bytree': 0.6444311594139467, 'colsample_bynode': 0.4598655888918951, 'max_depth': 9, 'learning_rate': 0.02397154994118437, 'lambda_l1': 2.671539223682533, 'lambda_l2': 1.4575759658560499, 'num_leaves': 535, 'min_data_in_leaf': 158}. Best is trial 0 with value: 1.7542303345776638.[0m


Mean SMAPE: 1.7564


[32m[I 2023-04-14 21:11:28,194][0m Trial 6 finished with value: 1.7555572733122975 and parameters: {'colsample_bytree': 0.3007155705420981, 'colsample_bynode': 0.8540005315375161, 'max_depth': 9, 'learning_rate': 0.061507206678068575, 'lambda_l1': 4.042821233917824, 'lambda_l2': 9.430279746575115, 'num_leaves': 233, 'min_data_in_leaf': 145}. Best is trial 0 with value: 1.7542303345776638.[0m


Mean SMAPE: 1.7556


[32m[I 2023-04-14 21:11:37,518][0m Trial 7 finished with value: 1.781446722730513 and parameters: {'colsample_bytree': 0.5764038441164506, 'colsample_bynode': 0.11604321307444028, 'max_depth': 5, 'learning_rate': 0.02527785695962093, 'lambda_l1': 4.049836517381972, 'lambda_l2': 6.615306938153532, 'num_leaves': 582, 'min_data_in_leaf': 144}. Best is trial 0 with value: 1.7542303345776638.[0m


Mean SMAPE: 1.7814


[32m[I 2023-04-14 21:11:49,982][0m Trial 8 finished with value: 1.7661400765657251 and parameters: {'colsample_bytree': 0.29292239495357575, 'colsample_bynode': 0.3829897103405372, 'max_depth': 9, 'learning_rate': 0.03720958011459214, 'lambda_l1': 5.484070009727365, 'lambda_l2': 9.156959873395612, 'num_leaves': 295, 'min_data_in_leaf': 148}. Best is trial 0 with value: 1.7542303345776638.[0m


Mean SMAPE: 1.7661


[32m[I 2023-04-14 21:12:07,696][0m Trial 9 finished with value: 1.7726874811364808 and parameters: {'colsample_bytree': 0.19753834021890676, 'colsample_bynode': 0.5423393362509015, 'max_depth': 9, 'learning_rate': 0.03019648550370554, 'lambda_l1': 0.10934767408981935, 'lambda_l2': 6.956863473783457, 'num_leaves': 550, 'min_data_in_leaf': 100}. Best is trial 0 with value: 1.7542303345776638.[0m


Mean SMAPE: 1.7727


[32m[I 2023-04-14 21:12:18,240][0m Trial 10 finished with value: 1.7547864148866121 and parameters: {'colsample_bytree': 0.8683977638068292, 'colsample_bynode': 0.995996562134052, 'max_depth': 7, 'learning_rate': 0.09211157476852351, 'lambda_l1': 0.495301233185272, 'lambda_l2': 4.053692173000287, 'num_leaves': 862, 'min_data_in_leaf': 6}. Best is trial 0 with value: 1.7542303345776638.[0m


Mean SMAPE: 1.7548


[32m[I 2023-04-14 21:12:30,911][0m Trial 11 finished with value: 1.7515253660027548 and parameters: {'colsample_bytree': 0.9181979269158975, 'colsample_bynode': 0.9829250069655396, 'max_depth': 7, 'learning_rate': 0.09931514406901268, 'lambda_l1': 0.16304171108970622, 'lambda_l2': 4.0034753132066525, 'num_leaves': 998, 'min_data_in_leaf': 21}. Best is trial 11 with value: 1.7515253660027548.[0m


Mean SMAPE: 1.7515


[32m[I 2023-04-14 21:12:42,686][0m Trial 12 finished with value: 1.7512830113285887 and parameters: {'colsample_bytree': 0.9021574727585524, 'colsample_bynode': 0.9979866482171309, 'max_depth': 7, 'learning_rate': 0.09621989991340792, 'lambda_l1': 0.28196323343258434, 'lambda_l2': 4.036834389962285, 'num_leaves': 1006, 'min_data_in_leaf': 244}. Best is trial 12 with value: 1.7512830113285887.[0m


Mean SMAPE: 1.7513


[32m[I 2023-04-14 21:12:54,079][0m Trial 13 finished with value: 1.7526703566508783 and parameters: {'colsample_bytree': 0.9860668887807842, 'colsample_bynode': 0.9998948628556265, 'max_depth': 7, 'learning_rate': 0.09668967356958046, 'lambda_l1': 0.028211046877731155, 'lambda_l2': 3.192841036184949, 'num_leaves': 1012, 'min_data_in_leaf': 5}. Best is trial 12 with value: 1.7512830113285887.[0m


Mean SMAPE: 1.7527


[32m[I 2023-04-14 21:13:05,325][0m Trial 14 finished with value: 1.7520903016366678 and parameters: {'colsample_bytree': 0.8080214736534358, 'colsample_bynode': 0.7011809406335497, 'max_depth': 6, 'learning_rate': 0.09668122574635463, 'lambda_l1': 1.6809422250357078, 'lambda_l2': 3.0810377818521864, 'num_leaves': 1011, 'min_data_in_leaf': 53}. Best is trial 12 with value: 1.7512830113285887.[0m


Mean SMAPE: 1.7521


[32m[I 2023-04-14 21:13:19,181][0m Trial 15 finished with value: 1.7507524635585754 and parameters: {'colsample_bytree': 0.9852786192420953, 'colsample_bynode': 0.7056108869176172, 'max_depth': 7, 'learning_rate': 0.07995077763843395, 'lambda_l1': 1.0906700323714915, 'lambda_l2': 4.986165411020496, 'num_leaves': 826, 'min_data_in_leaf': 187}. Best is trial 15 with value: 1.7507524635585754.[0m


Mean SMAPE: 1.7508


[32m[I 2023-04-14 21:13:35,638][0m Trial 16 finished with value: 1.751282756249139 and parameters: {'colsample_bytree': 0.9932144499300561, 'colsample_bynode': 0.6663177719861635, 'max_depth': 8, 'learning_rate': 0.07259729510493959, 'lambda_l1': 2.016052996755408, 'lambda_l2': 5.754926949696287, 'num_leaves': 777, 'min_data_in_leaf': 193}. Best is trial 15 with value: 1.7507524635585754.[0m


Mean SMAPE: 1.7513


[32m[I 2023-04-14 21:13:53,069][0m Trial 17 finished with value: 1.7514677042387075 and parameters: {'colsample_bytree': 0.7947505540608778, 'colsample_bynode': 0.6613977757261544, 'max_depth': 8, 'learning_rate': 0.07213897018620868, 'lambda_l1': 1.6774012430934238, 'lambda_l2': 5.747661790117883, 'num_leaves': 779, 'min_data_in_leaf': 190}. Best is trial 15 with value: 1.7507524635585754.[0m


Mean SMAPE: 1.7515


[32m[I 2023-04-14 21:14:16,740][0m Trial 18 finished with value: 1.750562904110117 and parameters: {'colsample_bytree': 0.9851161038051562, 'colsample_bynode': 0.6647720786039399, 'max_depth': 10, 'learning_rate': 0.04242766504090565, 'lambda_l1': 1.5090908556438065, 'lambda_l2': 6.105983280243719, 'num_leaves': 687, 'min_data_in_leaf': 178}. Best is trial 18 with value: 1.750562904110117.[0m


Mean SMAPE: 1.7506


[32m[I 2023-04-14 21:14:35,498][0m Trial 19 finished with value: 1.7541432597372073 and parameters: {'colsample_bytree': 0.4651571259079887, 'colsample_bynode': 0.7325173422461719, 'max_depth': 10, 'learning_rate': 0.04105100171960733, 'lambda_l1': 3.5490622332470467, 'lambda_l2': 7.2261788423304285, 'num_leaves': 694, 'min_data_in_leaf': 174}. Best is trial 18 with value: 1.750562904110117.[0m


Mean SMAPE: 1.7541
Best trial: 1.750562904110117 with parameters {'colsample_bytree': 0.9851161038051562, 'colsample_bynode': 0.6647720786039399, 'max_depth': 10, 'learning_rate': 0.04242766504090565, 'lambda_l1': 1.5090908556438065, 'lambda_l2': 6.105983280243719, 'num_leaves': 687, 'min_data_in_leaf': 178}


In [31]:
params = study.best_trial.params
LGBM = lgb.LGBMRegressor(**params)

In [33]:
LGBM.fit(df_train, target_train)
y_pred = np.exp(LGBM.predict(df_test)) * df_test.microbusiness_density



array([3.36006323, 8.54059812, 1.19667252, ..., 3.91537597, 3.13450942,
       1.81122218])

In [67]:
my_cfips = df_train['cfips'].unique()
train = train[train.cfips.isin(my_cfips)]
pred = y_pred.values
y_true = train[train.row_id.str.contains('2022-07-01')].microbusiness_density.values
print(smape_loss(y_true, pred))

2.1456129466724616


In [None]:
# clip_err = True

# if clip_err:
#     df_record = y_pred.merge(y_base, on = 'row_id', how='inner')
#     df_record = df_record.merge(y_true, on = 'row_id', how='inner')
#     def smape_(y_true, y_pred):
#         return 100  * np.sum(2 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred)))
    
#     base_err = df_record.apply(lambda x: smape_(x.y_true, x.y_base), axis=1)
#     pred_err = df_record.apply(lambda x: smape_(x.y_true, x.microbusiness_density), axis=1)
#     blacklist = base_err[(base_err + 1e-3) < pred_err].index
#     y_pred.iloc[blacklist] = y_base.iloc[blacklist]
#     avg_base_err = base_err.mean()
#     avg_pred_err = pred_err.mean()
#     print(avg_base_err, avg_pred_err)
#     df_sub = pd.read_csv('sample_submission.csv')
#     df_sub = df_sub.drop(columns='microbusiness_density')
#     df_sub['cfips'] = df_sub.row_id.str.split('_', expand=True)[0]
#     y_pred['cfips'] = y_pred.row_id.str.split('_', expand=True)[0]
#     merged_df = df_sub.merge(y_pred, on='cfips', how='left', suffixes=('', '_drop'))
#     merged_df = merged_df.loc[:,~merged_df.columns.str.endswith('_drop')]
#     # merged_df = merged_df.drop(columns='cfips')
# else:
#     df_sub = pd.read_csv('sample_submission.csv')
#     df_sub = df_sub.drop(columns='microbusiness_density')
#     df_sub['cfips'] = df_sub.row_id.str.split('_', expand=True)[0]
#     y_pred['cfips'] = y_pred.row_id.str.split('_', expand=True)[0]
#     merged_df = df_sub.merge(y_pred, on='cfips', how='left', suffixes=('', '_drop'))
#     merged_df = merged_df.loc[:,~merged_df.columns.str.endswith('_drop')]
#     # merged_df = merged_df.drop(columns='cfips')