In [1]:
import xgboost as xgb
from sklearn.model_selection import TimeSeriesSplit
import numpy as np
import pandas as pd

df_train = pd.read_csv('df_train.csv', index_col='row_id')
df_test = pd.read_csv('df_test.csv', index_col='row_id')
target_train = pd.read_csv('target_train.csv', index_col='row_id').squeeze()
target_test = pd.read_csv('target_test.csv', index_col='row_id').squeeze()

In [2]:
def smape_loss(y_true, y_pred):
    """
    Custom SMAPE loss function for LightGBM
    """
    return np.mean(2 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred))) * 100

def xgb_smape_eval(y_pred, y_true):
    y_true = y_true.get_label()
    return 'SMAPE', smape_loss(y_true, y_pred)

def smape_cv(model, df_train, target_train):
    tscv = TimeSeriesSplit(n_splits=5)
    smape_list = []
    for _, (train_index, test_index) in enumerate(tscv.split(df_train), start=1):
        # Create training and validation datasets
        X_train = df_train.iloc[train_index]
        y_train = target_train.iloc[train_index]
        X_valid = df_train.iloc[test_index]
        y_valid = target_train.iloc[test_index]

        model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], eval_metric=xgb_smape_eval, verbose=False)

        # Make predictions
        y_pred = np.exp(model.predict(X_valid)) * X_valid['microbusiness_density']
        y_valid = np.exp(y_valid) * X_valid['microbusiness_density']

        # Calculate SMAPE score
        smape = smape_loss(y_valid, y_pred)
        smape_list.append(smape)

    print(f'Mean SMAPE: {np.mean(smape_list):.4f}')
    return np.mean(smape_list)

def xgb_objective(trial, df_train, target_train):
    """
    Objective function for XGBoost optimization without rolling window cross-validation.
    """
    params = {
        'n_estimators': 200,
        'verbosity': 0,
        'objective': 'reg:squarederror',
        'random_state': 42,
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1.0),
        'colsample_bynode': trial.suggest_float('colsample_bynode', 0.1, 1.0),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'alpha': trial.suggest_float('alpha', 1e-2, 10.0),
        'lambda': trial.suggest_float('lambda', 1e-2, 10.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 5, 250),
    }

    model = xgb.XGBRegressor(**params)
    score = smape_cv(model, df_train, target_train)
    return score

In [3]:
import optuna

# Optimize the XGBoost model
xgb_study = optuna.create_study(direction='minimize')
xgb_study.optimize(lambda trial: xgb_objective(trial, df_train, target_train), n_trials=20)

# Print the best trial for XGBoost
print(f"XGBoost best trial: {xgb_study.best_trial.value} with parameters {xgb_study.best_trial.params}")


[32m[I 2023-04-16 04:07:39,409][0m A new study created in memory with name: no-name-6b691d0c-60f4-46c8-b810-d51344c6fb12[0m
[32m[I 2023-04-16 04:08:08,833][0m Trial 0 finished with value: 1.8864803834509267 and parameters: {'colsample_bytree': 0.12134592038068481, 'colsample_bynode': 0.7465569808584384, 'max_depth': 10, 'learning_rate': 0.02497674143276924, 'alpha': 7.309729803932332, 'lambda': 2.0324445135383695, 'min_child_weight': 186}. Best is trial 0 with value: 1.8864803834509267.[0m


Mean SMAPE: 1.8865


[32m[I 2023-04-16 04:08:44,441][0m Trial 1 finished with value: 1.812176511859881 and parameters: {'colsample_bytree': 0.3369702902509209, 'colsample_bynode': 0.8741513757711705, 'max_depth': 3, 'learning_rate': 0.07108795537272454, 'alpha': 8.69679809574608, 'lambda': 3.738281801767208, 'min_child_weight': 43}. Best is trial 1 with value: 1.812176511859881.[0m


Mean SMAPE: 1.8122


[32m[I 2023-04-16 04:09:10,393][0m Trial 2 finished with value: 2.0957377560088113 and parameters: {'colsample_bytree': 0.4318908676707047, 'colsample_bynode': 0.6868024870813386, 'max_depth': 3, 'learning_rate': 0.02043114113600379, 'alpha': 8.604899792232104, 'lambda': 5.58533976523905, 'min_child_weight': 197}. Best is trial 1 with value: 1.812176511859881.[0m


Mean SMAPE: 2.0957


[32m[I 2023-04-16 04:10:29,955][0m Trial 3 finished with value: 1.8397112278893257 and parameters: {'colsample_bytree': 0.752132811308811, 'colsample_bynode': 0.9350314198755895, 'max_depth': 4, 'learning_rate': 0.09170154109086298, 'alpha': 1.754192831516174, 'lambda': 2.681372052294992, 'min_child_weight': 194}. Best is trial 1 with value: 1.812176511859881.[0m


Mean SMAPE: 1.8397


[32m[I 2023-04-16 04:10:55,667][0m Trial 4 finished with value: 3.410968582423306 and parameters: {'colsample_bytree': 0.48968729646605125, 'colsample_bynode': 0.43520393656785894, 'max_depth': 6, 'learning_rate': 0.014580627567377318, 'alpha': 2.9515804170786843, 'lambda': 2.3090762026273866, 'min_child_weight': 237}. Best is trial 1 with value: 1.812176511859881.[0m


Mean SMAPE: 3.4110


[32m[I 2023-04-16 04:11:29,132][0m Trial 5 finished with value: 2.5526663109355843 and parameters: {'colsample_bytree': 0.6055886523891348, 'colsample_bynode': 0.7534452508868785, 'max_depth': 6, 'learning_rate': 0.017289031474369655, 'alpha': 6.086836274722343, 'lambda': 5.296131272969717, 'min_child_weight': 109}. Best is trial 1 with value: 1.812176511859881.[0m


Mean SMAPE: 2.5527


[32m[I 2023-04-16 04:12:17,017][0m Trial 6 finished with value: 2.596732662265556 and parameters: {'colsample_bytree': 0.9980654747070558, 'colsample_bynode': 0.33357911715687827, 'max_depth': 5, 'learning_rate': 0.016966857367296004, 'alpha': 0.034259893231894586, 'lambda': 4.205638679614789, 'min_child_weight': 145}. Best is trial 1 with value: 1.812176511859881.[0m


Mean SMAPE: 2.5967


[32m[I 2023-04-16 04:12:59,032][0m Trial 7 finished with value: 2.0083211273459116 and parameters: {'colsample_bytree': 0.4365250216096015, 'colsample_bynode': 0.8338478658670877, 'max_depth': 8, 'learning_rate': 0.022032209746142957, 'alpha': 7.106992456673899, 'lambda': 7.812731162613958, 'min_child_weight': 148}. Best is trial 1 with value: 1.812176511859881.[0m


Mean SMAPE: 2.0083


[32m[I 2023-04-16 04:13:25,641][0m Trial 8 finished with value: 1.9380592697439956 and parameters: {'colsample_bytree': 0.9487524994946809, 'colsample_bynode': 0.21471268171614605, 'max_depth': 6, 'learning_rate': 0.023331988968375215, 'alpha': 7.802023256795793, 'lambda': 7.585295077808368, 'min_child_weight': 223}. Best is trial 1 with value: 1.812176511859881.[0m


Mean SMAPE: 1.9381


[32m[I 2023-04-16 04:14:01,214][0m Trial 9 finished with value: 1.8341569222905278 and parameters: {'colsample_bytree': 0.4179174343244001, 'colsample_bynode': 0.4403339109377974, 'max_depth': 5, 'learning_rate': 0.07866260196134645, 'alpha': 1.6439813973330528, 'lambda': 4.962988373905039, 'min_child_weight': 93}. Best is trial 1 with value: 1.812176511859881.[0m


Mean SMAPE: 1.8342


[32m[I 2023-04-16 04:14:54,169][0m Trial 10 finished with value: 1.826038698107545 and parameters: {'colsample_bytree': 0.21475115916666707, 'colsample_bynode': 0.9981350358497658, 'max_depth': 8, 'learning_rate': 0.048882126575198914, 'alpha': 9.457660147435973, 'lambda': 0.18898420611075828, 'min_child_weight': 16}. Best is trial 1 with value: 1.812176511859881.[0m


Mean SMAPE: 1.8260


[32m[I 2023-04-16 04:15:43,140][0m Trial 11 finished with value: 1.8297273125875477 and parameters: {'colsample_bytree': 0.19906572447247514, 'colsample_bynode': 0.9983689701111463, 'max_depth': 8, 'learning_rate': 0.05005984021849211, 'alpha': 9.998856538140673, 'lambda': 0.6652053823724264, 'min_child_weight': 9}. Best is trial 1 with value: 1.812176511859881.[0m


Mean SMAPE: 1.8297


[32m[I 2023-04-16 04:16:36,056][0m Trial 12 finished with value: 1.8770244648378445 and parameters: {'colsample_bytree': 0.27080075085501776, 'colsample_bynode': 0.9864769143288631, 'max_depth': 8, 'learning_rate': 0.04625130488890541, 'alpha': 9.441480048625193, 'lambda': 0.3027281010804812, 'min_child_weight': 6}. Best is trial 1 with value: 1.812176511859881.[0m


Mean SMAPE: 1.8770


[32m[I 2023-04-16 04:17:29,521][0m Trial 13 finished with value: 1.8201223000338846 and parameters: {'colsample_bytree': 0.31088558815540474, 'colsample_bynode': 0.608658203013638, 'max_depth': 10, 'learning_rate': 0.042973841566650475, 'alpha': 9.918514407740465, 'lambda': 9.857192541520647, 'min_child_weight': 57}. Best is trial 1 with value: 1.812176511859881.[0m


Mean SMAPE: 1.8201


[32m[I 2023-04-16 04:18:23,828][0m Trial 14 finished with value: 1.8309547693736747 and parameters: {'colsample_bytree': 0.3128501959442068, 'colsample_bynode': 0.6381679441432268, 'max_depth': 10, 'learning_rate': 0.03378871396505022, 'alpha': 5.387109456294783, 'lambda': 9.813593619040049, 'min_child_weight': 63}. Best is trial 1 with value: 1.812176511859881.[0m


Mean SMAPE: 1.8310


[32m[I 2023-04-16 04:18:42,290][0m Trial 15 finished with value: 6.4676387170988345 and parameters: {'colsample_bytree': 0.3342692489467215, 'colsample_bynode': 0.5835416812931024, 'max_depth': 3, 'learning_rate': 0.01054534119717197, 'alpha': 8.311758750014329, 'lambda': 6.507180539675498, 'min_child_weight': 55}. Best is trial 1 with value: 1.812176511859881.[0m


Mean SMAPE: 6.4676


[32m[I 2023-04-16 04:19:39,782][0m Trial 16 finished with value: 1.8078326390409394 and parameters: {'colsample_bytree': 0.1590365164551108, 'colsample_bynode': 0.8167703932344198, 'max_depth': 9, 'learning_rate': 0.06997542512775441, 'alpha': 9.884726213519107, 'lambda': 9.85552222162039, 'min_child_weight': 47}. Best is trial 16 with value: 1.8078326390409394.[0m


Mean SMAPE: 1.8078


[32m[I 2023-04-16 04:20:33,639][0m Trial 17 finished with value: 1.8077138298700537 and parameters: {'colsample_bytree': 0.13050739045276077, 'colsample_bynode': 0.8412740504246687, 'max_depth': 9, 'learning_rate': 0.06884656516242062, 'alpha': 6.511191612332466, 'lambda': 3.5858974351323982, 'min_child_weight': 37}. Best is trial 17 with value: 1.8077138298700537.[0m


Mean SMAPE: 1.8077


[32m[I 2023-04-16 04:21:13,334][0m Trial 18 finished with value: 1.7958464330764052 and parameters: {'colsample_bytree': 0.10685820513121513, 'colsample_bynode': 0.757182951942658, 'max_depth': 9, 'learning_rate': 0.06363704354933969, 'alpha': 6.522802336114437, 'lambda': 8.738845417207408, 'min_child_weight': 95}. Best is trial 18 with value: 1.7958464330764052.[0m


Mean SMAPE: 1.7958


[32m[I 2023-04-16 04:21:50,884][0m Trial 19 finished with value: 1.7951311298755097 and parameters: {'colsample_bytree': 0.10880224509618677, 'colsample_bynode': 0.7127090048295895, 'max_depth': 9, 'learning_rate': 0.06040546947751284, 'alpha': 6.330624917943333, 'lambda': 6.337656972390408, 'min_child_weight': 93}. Best is trial 19 with value: 1.7951311298755097.[0m


Mean SMAPE: 1.7951
XGBoost best trial: 1.7951311298755097 with parameters {'colsample_bytree': 0.10880224509618677, 'colsample_bynode': 0.7127090048295895, 'max_depth': 9, 'learning_rate': 0.06040546947751284, 'alpha': 6.330624917943333, 'lambda': 6.337656972390408, 'min_child_weight': 93}


In [8]:
train_df = pd.read_csv('train.csv')
params = xgb_study.best_params
xgb_model = xgb.XGBRegressor(**params)

In [9]:
xgb_model.fit(df_train, target_train)
y_pred = np.exp(xgb_model.predict(df_test)) * df_test.microbusiness_density

In [10]:
my_cfips = df_train['cfips'].unique()
train = train_df[train_df.cfips.isin(my_cfips)]
pred = y_pred.values
y_true = train[train.row_id.str.contains('2022-07-01')].microbusiness_density.values
print(smape_loss(y_true, pred))

2.169519725462987


In [None]:
# clip_err = True

# if clip_err:
#     df_record = y_pred.merge(y_base, on = 'row_id', how='inner')
#     df_record = df_record.merge(y_true, on = 'row_id', how='inner')
#     def smape_(y_true, y_pred):
#         return 100  * np.sum(2 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred)))
    
#     base_err = df_record.apply(lambda x: smape_(x.y_true, x.y_base), axis=1)
#     pred_err = df_record.apply(lambda x: smape_(x.y_true, x.microbusiness_density), axis=1)
#     blacklist = base_err[(base_err + 1e-3) < pred_err].index
#     y_pred.iloc[blacklist] = y_base.iloc[blacklist]
#     avg_base_err = base_err.mean()
#     avg_pred_err = pred_err.mean()
#     print(avg_base_err, avg_pred_err)
#     df_sub = pd.read_csv('sample_submission.csv')
#     df_sub = df_sub.drop(columns='microbusiness_density')
#     df_sub['cfips'] = df_sub.row_id.str.split('_', expand=True)[0]
#     y_pred['cfips'] = y_pred.row_id.str.split('_', expand=True)[0]
#     merged_df = df_sub.merge(y_pred, on='cfips', how='left', suffixes=('', '_drop'))
#     merged_df = merged_df.loc[:,~merged_df.columns.str.endswith('_drop')]
#     # merged_df = merged_df.drop(columns='cfips')
# else:
#     df_sub = pd.read_csv('sample_submission.csv')
#     df_sub = df_sub.drop(columns='microbusiness_density')
#     df_sub['cfips'] = df_sub.row_id.str.split('_', expand=True)[0]
#     y_pred['cfips'] = y_pred.row_id.str.split('_', expand=True)[0]
#     merged_df = df_sub.merge(y_pred, on='cfips', how='left', suffixes=('', '_drop'))
#     merged_df = merged_df.loc[:,~merged_df.columns.str.endswith('_drop')]
#     # merged_df = merged_df.drop(columns='cfips')