In [2]:
import xgboost as xgb
from sklearn.model_selection import TimeSeriesSplit
import numpy as np
import pandas as pd

df_train = pd.read_csv('df_train.csv', index_col='row_id')
df_test = pd.read_csv('df_test.csv', index_col='row_id')
target_train = pd.read_csv('target_train.csv', index_col='row_id').squeeze()
target_test = pd.read_csv('target_test.csv', index_col='row_id').squeeze()

In [8]:
def smape_loss(y_true, y_pred):
    """
    Custom SMAPE loss function for LightGBM
    """
    return np.mean(2 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred))) * 100

def xgb_smape_eval(y_pred, y_true):
    y_true = y_true.get_label()
    return 'SMAPE', smape_loss(y_true, y_pred)

def smape_cv(model, df_train, target_train):
    tscv = TimeSeriesSplit(n_splits=5)
    smape_list = []
    for _, (train_index, test_index) in enumerate(tscv.split(df_train), start=1):
        # Create training and validation datasets
        X_train = df_train.iloc[train_index]
        y_train = target_train.iloc[train_index]
        X_valid = df_train.iloc[test_index]
        y_valid = target_train.iloc[test_index]

        model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], eval_metric=xgb_smape_eval, verbose=False)

        # Make predictions
        y_pred = np.exp(model.predict(X_valid)) * X_valid['microbusiness_density']
        y_valid = np.exp(y_valid) * X_valid['microbusiness_density']

        # Calculate SMAPE score
        smape = smape_loss(y_valid, y_pred)
        smape_list.append(smape)

    print(f'Mean SMAPE: {np.mean(smape_list):.4f}')
    return np.mean(smape_list)

def xgb_objective(trial, df_train, target_train):
    """
    Objective function for XGBoost optimization without rolling window cross-validation.
    """
    params = {
        'n_estimators': 200,
        'verbosity': 0,
        'objective': 'reg:squarederror',
        'random_state': 42,
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1.0),
        'colsample_bynode': trial.suggest_float('colsample_bynode', 0.1, 1.0),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'alpha': trial.suggest_float('alpha', 1e-2, 10.0),
        'lambda': trial.suggest_float('lambda', 1e-2, 10.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 5, 250),
    }

    model = xgb.XGBRegressor(**params)
    score = smape_cv(model, df_train, target_train)
    return score

In [10]:
import optuna

# Optimize the XGBoost model
xgb_study = optuna.create_study(direction='minimize')
xgb_study.optimize(lambda trial: xgb_objective(trial, df_train, target_train), n_trials=20)

# Print the best trial for XGBoost
print(f"XGBoost best trial: {xgb_study.best_trial.value} with parameters {xgb_study.best_trial.params}")


[32m[I 2023-04-14 21:22:14,607][0m A new study created in memory with name: no-name-6f57d486-3fe6-49f4-890f-e0572cb824b0[0m
[32m[I 2023-04-14 21:22:35,883][0m Trial 0 finished with value: 1.803249470593744 and parameters: {'colsample_bytree': 0.15996327708571992, 'colsample_bynode': 0.2377195265350667, 'max_depth': 8, 'learning_rate': 0.04916567077570185, 'alpha': 4.247074663537053, 'lambda': 5.1522160099247, 'min_child_weight': 73}. Best is trial 0 with value: 1.803249470593744.[0m


Mean SMAPE: 1.8032


[32m[I 2023-04-14 21:23:02,881][0m Trial 1 finished with value: 1.8079803846530176 and parameters: {'colsample_bytree': 0.5246112284274684, 'colsample_bynode': 0.32254866065534993, 'max_depth': 6, 'learning_rate': 0.04041848464609895, 'alpha': 6.910955818510056, 'lambda': 8.201931571879708, 'min_child_weight': 59}. Best is trial 0 with value: 1.803249470593744.[0m


Mean SMAPE: 1.8080


[32m[I 2023-04-14 21:23:37,531][0m Trial 2 finished with value: 1.8374242055638423 and parameters: {'colsample_bytree': 0.8969775740037893, 'colsample_bynode': 0.5517303227364284, 'max_depth': 4, 'learning_rate': 0.06452171612080242, 'alpha': 2.0727376185425914, 'lambda': 2.7888406027341457, 'min_child_weight': 34}. Best is trial 0 with value: 1.803249470593744.[0m


Mean SMAPE: 1.8374


[32m[I 2023-04-14 21:23:58,209][0m Trial 3 finished with value: 1.9501427948556753 and parameters: {'colsample_bytree': 0.5388187708976597, 'colsample_bynode': 0.7177488836801681, 'max_depth': 4, 'learning_rate': 0.022658561867709908, 'alpha': 5.752395293591239, 'lambda': 6.915065587423101, 'min_child_weight': 165}. Best is trial 0 with value: 1.803249470593744.[0m


Mean SMAPE: 1.9501


[32m[I 2023-04-14 21:24:17,855][0m Trial 4 finished with value: 1.8023944964579424 and parameters: {'colsample_bytree': 0.4454304843601765, 'colsample_bynode': 0.4406783649105078, 'max_depth': 3, 'learning_rate': 0.07044926152321077, 'alpha': 3.4816204603697933, 'lambda': 8.968833568760601, 'min_child_weight': 128}. Best is trial 4 with value: 1.8023944964579424.[0m


Mean SMAPE: 1.8024


[32m[I 2023-04-14 21:24:30,936][0m Trial 5 finished with value: 6.595866871511253 and parameters: {'colsample_bytree': 0.9117134020108526, 'colsample_bynode': 0.4359084046021626, 'max_depth': 3, 'learning_rate': 0.010435218884365577, 'alpha': 9.658662633175634, 'lambda': 2.8619230022715056, 'min_child_weight': 184}. Best is trial 4 with value: 1.8023944964579424.[0m


Mean SMAPE: 6.5959


[32m[I 2023-04-14 21:24:49,898][0m Trial 6 finished with value: 1.8531438690552533 and parameters: {'colsample_bytree': 0.8679495154930775, 'colsample_bynode': 0.43015765224230973, 'max_depth': 3, 'learning_rate': 0.026594879439767957, 'alpha': 8.602585904809867, 'lambda': 0.9604610647518953, 'min_child_weight': 174}. Best is trial 4 with value: 1.8023944964579424.[0m


Mean SMAPE: 1.8531


[32m[I 2023-04-14 21:25:24,008][0m Trial 7 finished with value: 1.8082388857368525 and parameters: {'colsample_bytree': 0.8815294347264775, 'colsample_bynode': 0.3592983166094136, 'max_depth': 6, 'learning_rate': 0.04047243092559217, 'alpha': 3.56826210277354, 'lambda': 8.633045780212402, 'min_child_weight': 135}. Best is trial 4 with value: 1.8023944964579424.[0m


Mean SMAPE: 1.8082


[32m[I 2023-04-14 21:26:19,250][0m Trial 8 finished with value: 1.8349215576156368 and parameters: {'colsample_bytree': 0.5644285037421747, 'colsample_bynode': 0.6036854782740698, 'max_depth': 8, 'learning_rate': 0.05314486587317957, 'alpha': 3.645582394958934, 'lambda': 6.062061777466211, 'min_child_weight': 27}. Best is trial 4 with value: 1.8023944964579424.[0m


Mean SMAPE: 1.8349


[32m[I 2023-04-14 21:27:05,847][0m Trial 9 finished with value: 1.8412029462615387 and parameters: {'colsample_bytree': 0.45319047304363336, 'colsample_bynode': 0.9440636329933008, 'max_depth': 6, 'learning_rate': 0.04691720878277397, 'alpha': 0.17023645924574246, 'lambda': 4.059910156509896, 'min_child_weight': 36}. Best is trial 4 with value: 1.8023944964579424.[0m


Mean SMAPE: 1.8412


[32m[I 2023-04-14 21:27:25,352][0m Trial 10 finished with value: 1.8072038827946453 and parameters: {'colsample_bytree': 0.12564512295103042, 'colsample_bynode': 0.14910921354247603, 'max_depth': 10, 'learning_rate': 0.0985986032252408, 'alpha': 6.152583204674475, 'lambda': 9.771032367449173, 'min_child_weight': 248}. Best is trial 4 with value: 1.8023944964579424.[0m


Mean SMAPE: 1.8072


[32m[I 2023-04-14 21:27:41,023][0m Trial 11 finished with value: 1.8067008322337845 and parameters: {'colsample_bytree': 0.1002090050748849, 'colsample_bynode': 0.14089114605772923, 'max_depth': 8, 'learning_rate': 0.0823428507291614, 'alpha': 4.626372392856667, 'lambda': 5.673181776263391, 'min_child_weight': 93}. Best is trial 4 with value: 1.8023944964579424.[0m


Mean SMAPE: 1.8067


[32m[I 2023-04-14 21:28:09,020][0m Trial 12 finished with value: 1.809980119680862 and parameters: {'colsample_bytree': 0.27507990701484786, 'colsample_bynode': 0.2618529938083808, 'max_depth': 8, 'learning_rate': 0.0644493798369822, 'alpha': 2.74358613273084, 'lambda': 7.057967771693754, 'min_child_weight': 97}. Best is trial 4 with value: 1.8023944964579424.[0m


Mean SMAPE: 1.8100


[32m[I 2023-04-14 21:28:40,411][0m Trial 13 finished with value: 1.8041517405499838 and parameters: {'colsample_bytree': 0.2899925761382155, 'colsample_bynode': 0.22808660835517652, 'max_depth': 9, 'learning_rate': 0.07845204895371492, 'alpha': 4.714974622844053, 'lambda': 4.860425723447842, 'min_child_weight': 89}. Best is trial 4 with value: 1.8023944964579424.[0m


Mean SMAPE: 1.8042


[32m[I 2023-04-14 21:28:57,351][0m Trial 14 finished with value: 1.817219927990374 and parameters: {'colsample_bytree': 0.2726904513520999, 'colsample_bynode': 0.10002118637172003, 'max_depth': 5, 'learning_rate': 0.09849447863173595, 'alpha': 1.8819185918764898, 'lambda': 9.939059561139535, 'min_child_weight': 136}. Best is trial 4 with value: 1.8023944964579424.[0m


Mean SMAPE: 1.8172


[32m[I 2023-04-14 21:29:29,108][0m Trial 15 finished with value: 1.8070408612375513 and parameters: {'colsample_bytree': 0.3851920268652001, 'colsample_bynode': 0.4056552716757093, 'max_depth': 7, 'learning_rate': 0.05840356173504955, 'alpha': 7.233879314269442, 'lambda': 7.924305313633008, 'min_child_weight': 77}. Best is trial 4 with value: 1.8023944964579424.[0m


Mean SMAPE: 1.8070


[32m[I 2023-04-14 21:30:02,485][0m Trial 16 finished with value: 1.807770577873298 and parameters: {'colsample_bytree': 0.6731101633671689, 'colsample_bynode': 0.2564802577098606, 'max_depth': 10, 'learning_rate': 0.0358345720356659, 'alpha': 4.844376479873517, 'lambda': 6.672596485777712, 'min_child_weight': 212}. Best is trial 4 with value: 1.8023944964579424.[0m


Mean SMAPE: 1.8078


[32m[I 2023-04-14 21:30:27,579][0m Trial 17 finished with value: 1.797837432080779 and parameters: {'colsample_bytree': 0.1855095282932457, 'colsample_bynode': 0.4767718115446537, 'max_depth': 7, 'learning_rate': 0.049571666682710215, 'alpha': 3.9080858816277497, 'lambda': 4.8380412070727985, 'min_child_weight': 117}. Best is trial 17 with value: 1.797837432080779.[0m


Mean SMAPE: 1.7978


[32m[I 2023-04-14 21:30:56,736][0m Trial 18 finished with value: 1.8133464667972459 and parameters: {'colsample_bytree': 0.37588198210016677, 'colsample_bynode': 0.5150589517352075, 'max_depth': 5, 'learning_rate': 0.07410277178128263, 'alpha': 3.2256701142538433, 'lambda': 4.00769844613148, 'min_child_weight': 119}. Best is trial 17 with value: 1.797837432080779.[0m


Mean SMAPE: 1.8133


[32m[I 2023-04-14 21:31:20,094][0m Trial 19 finished with value: 1.823068282980378 and parameters: {'colsample_bytree': 0.20585844831985012, 'colsample_bynode': 0.6666972960081703, 'max_depth': 7, 'learning_rate': 0.030065556249230234, 'alpha': 5.608142891394996, 'lambda': 8.959388838705339, 'min_child_weight': 153}. Best is trial 17 with value: 1.797837432080779.[0m


Mean SMAPE: 1.8231
XGBoost best trial: 1.797837432080779 with parameters {'colsample_bytree': 0.1855095282932457, 'colsample_bynode': 0.4767718115446537, 'max_depth': 7, 'learning_rate': 0.049571666682710215, 'alpha': 3.9080858816277497, 'lambda': 4.8380412070727985, 'min_child_weight': 117}


In [5]:
train_df = pd.read_csv('train.csv')
params = {'colsample_bytree': 0.1855095282932457, 'colsample_bynode': 0.4767718115446537, 'max_depth': 7, 'learning_rate': 0.049571666682710215, 'alpha': 3.9080858816277497, 'lambda': 4.8380412070727985, 'min_child_weight': 117}
xgb_model = xgb.XGBRegressor(**params)

In [6]:
xgb_model.fit(df_train, target_train)
y_pred = np.exp(xgb_model.predict(df_test)) * df_test.microbusiness_density

In [9]:
my_cfips = df_train['cfips'].unique()
train = train_df[train_df.cfips.isin(my_cfips)]
pred = y_pred.values
y_true = train[train.row_id.str.contains('2022-07-01')].microbusiness_density.values
print(smape_loss(y_true, pred))

2.0149112484480667


In [None]:
# clip_err = True

# if clip_err:
#     df_record = y_pred.merge(y_base, on = 'row_id', how='inner')
#     df_record = df_record.merge(y_true, on = 'row_id', how='inner')
#     def smape_(y_true, y_pred):
#         return 100  * np.sum(2 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred)))
    
#     base_err = df_record.apply(lambda x: smape_(x.y_true, x.y_base), axis=1)
#     pred_err = df_record.apply(lambda x: smape_(x.y_true, x.microbusiness_density), axis=1)
#     blacklist = base_err[(base_err + 1e-3) < pred_err].index
#     y_pred.iloc[blacklist] = y_base.iloc[blacklist]
#     avg_base_err = base_err.mean()
#     avg_pred_err = pred_err.mean()
#     print(avg_base_err, avg_pred_err)
#     df_sub = pd.read_csv('sample_submission.csv')
#     df_sub = df_sub.drop(columns='microbusiness_density')
#     df_sub['cfips'] = df_sub.row_id.str.split('_', expand=True)[0]
#     y_pred['cfips'] = y_pred.row_id.str.split('_', expand=True)[0]
#     merged_df = df_sub.merge(y_pred, on='cfips', how='left', suffixes=('', '_drop'))
#     merged_df = merged_df.loc[:,~merged_df.columns.str.endswith('_drop')]
#     # merged_df = merged_df.drop(columns='cfips')
# else:
#     df_sub = pd.read_csv('sample_submission.csv')
#     df_sub = df_sub.drop(columns='microbusiness_density')
#     df_sub['cfips'] = df_sub.row_id.str.split('_', expand=True)[0]
#     y_pred['cfips'] = y_pred.row_id.str.split('_', expand=True)[0]
#     merged_df = df_sub.merge(y_pred, on='cfips', how='left', suffixes=('', '_drop'))
#     merged_df = merged_df.loc[:,~merged_df.columns.str.endswith('_drop')]
#     # merged_df = merged_df.drop(columns='cfips')