In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

In [2]:
# Load the competition CSVs, using the "id" column as the row index.
train = pd.read_csv('/kaggle/input/playground-series-s4e12/train.csv', index_col="id")
test = pd.read_csv('/kaggle/input/playground-series-s4e12/test.csv', index_col="id")

# NOTE(review): only the first 1000 rows of the sample submission are kept --
# presumably a leftover from quick experimentation; confirm before using this
# frame to write a real submission file.
sample_submission = pd.read_csv(
    '/kaggle/input/playground-series-s4e12/sample_submission.csv', index_col="id"
).iloc[:1000]

# Replace spaces in the column labels with underscores (literal match, no regex).
train.columns = train.columns.str.replace(" ", "_", regex=False)
test.columns = test.columns.str.replace(" ", "_", regex=False)

def date_preprocessing(df, yearT = 360):
    """Expand ``Policy_Start_Date`` into calendar and cyclical features.

    The input frame is left untouched; a new frame is returned in which the
    ``Policy_Start_Date`` column has been removed and these columns added:

    * ``Year``, ``Month``, ``Day``, ``Day_of_Year`` -- plain calendar parts.
    * ``Sin_Month`` / ``Cos_Month`` -- month encoded on a circle of period 12.
    * ``Sin_Year_Day`` / ``Cos_Year_Day`` -- day of year on a circle of
      period ``yearT``.
    * ``Sin_Day`` / ``Cos_Day`` -- day of month on a circle whose period is
      the number of days in that particular month.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``Policy_Start_Date`` column that
        ``pd.to_datetime`` can parse.
    yearT : number, default 360
        Period used for the day-of-year encoding.

    Returns
    -------
    pandas.DataFrame
        A new frame with the date column replaced by the derived features.
    """
    def _angle(position, period):
        # Position on a circle of the given period, expressed in radians.
        return position * np.pi * 2 / period

    start = pd.to_datetime(df["Policy_Start_Date"])
    month = start.dt.month
    day = start.dt.day
    day_of_year = start.dt.dayofyear

    month_angle = _angle(month, 12)
    year_angle = _angle(day_of_year, yearT)
    # Months differ in length, so each row uses its own month's day count.
    day_angle = _angle(day, start.dt.days_in_month)

    # Insertion order below fixes the order of the appended columns.
    derived = {
        "Year": start.dt.year,
        "Month": month,
        "Day": day,
        "Sin_Month": np.sin(month_angle),
        "Cos_Month": np.cos(month_angle),
        "Day_of_Year": day_of_year,
        "Sin_Year_Day": np.sin(year_angle),
        "Cos_Year_Day": np.cos(year_angle),
        "Sin_Day": np.sin(day_angle),
        "Cos_Day": np.cos(day_angle),
    }

    # drop() returns a new frame, so the caller's data is never modified.
    result = df.drop("Policy_Start_Date", axis=1)
    for column, values in derived.items():
        result[column] = values

    return result


# Expand the policy start date into calendar/cyclical features on both splits.
train = date_preprocessing(train)
test = date_preprocessing(test)

# Every object-dtype column of the training frame is treated as categorical.
cat_features = [col for col in train.columns if train[col].dtype == 'object']

for col in cat_features:
    # Missing values become their own explicit "None" level.
    train_filled = train[col].fillna("None")
    test_filled = test[col].fillna("None")

    # Cast both splits with ONE shared category list. Calling
    # astype("category") on each frame separately derives the levels (and so
    # the integer codes) per frame; if a level is present in only one split,
    # the same string would get different codes in train and test, and a model
    # that consumes the codes would silently misread the test rows.
    shared_levels = pd.Index(train_filled.unique()).union(pd.Index(test_filled.unique()))
    shared_dtype = pd.CategoricalDtype(categories=shared_levels)

    train[col] = train_filled.astype(shared_dtype)
    test[col] = test_filled.astype(shared_dtype)

In [3]:
# Parameters for the native XGBoost training API: histogram-based boosted
# trees fitted with squared error and monitored with RMSE.
xgb_params = {'booster': "gbtree",
             'verbosity': 0,
             'tree_method': "hist",
             'objective': 'reg:squarederror',
             'eval_metric': "rmse",
             'seed':42,
             'enable_categorical': True}

X,y = train.drop("Premium_Amount",axis = 1).copy(),train["Premium_Amount"].copy()
# The model is fitted on log1p(target), so RMSE measured in that space is the
# RMSLE of the original target. The transform does not depend on the fold, so
# it is computed once instead of on every iteration.
y_log = np.log1p(y)
kf = KFold(n_splits = 5,random_state = 42,shuffle = True)

xgb_oof_preds = np.zeros(len(y))         # out-of-fold predictions (log1p scale)
xgb_test_preds = np.zeros(len(test))     # running sum of test predictions (original scale)
xgb_train_scores =[]                     # validation RMSLE of each fold

# The test matrix is the same for every fold; build it a single time.
xgb_test = xgb.DMatrix(test, enable_categorical=True)

for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
    X_train, X_valid = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_valid = y_log.iloc[train_idx], y_log.iloc[val_idx]

    xgb_train = xgb.DMatrix(X_train, y_train, enable_categorical = True)
    xgb_valid = xgb.DMatrix(X_valid, y_valid, enable_categorical = True)

    print("--------------")
    # "valid" is listed last in `evals`, which is the set early stopping monitors.
    model = xgb.train(
        xgb_params,
        xgb_train,
        evals=[(xgb_train, "train"), (xgb_valid, "valid")],
        verbose_eval=0,
        num_boost_round = 3000,
        early_stopping_rounds=50,
    )

    # Predict with the trees up to (and including) the best iteration only.
    best_range = (0, model.best_iteration + 1)
    # Reuse the validation matrix built above rather than constructing it again.
    y_pred = model.predict(xgb_valid, iteration_range=best_range)

    # Clip at zero on the log1p scale before scoring.
    y_pred = np.maximum(0,y_pred)
    y_valid = np.maximum(0,y_valid)

    # `squared=False` was deprecated and later removed from
    # mean_squared_error; taking the square root explicitly works on every
    # scikit-learn version and yields the same value.
    rmsle = float(np.sqrt(mean_squared_error(y_valid, y_pred)))
    print("Fold:", fold + 1,"RMSLE:", rmsle)
    xgb_train_scores.append(rmsle)

    y_test_preds = model.predict(xgb_test, iteration_range=best_range)
    # NOTE(review): out-of-fold predictions are kept on the log1p scale
    # (clipped at 0), while test predictions are converted back with expm1
    # (unclipped) before averaging. Anything that consumes both arrays must
    # account for the different scales -- confirm this is intended.
    xgb_oof_preds[val_idx] = y_pred
    xgb_test_preds += np.expm1(y_test_preds)

# Turn the accumulated sum into the mean over folds.
xgb_test_preds /= kf.get_n_splits()
print("Mean RMSLE:", np.mean(xgb_train_scores),"±", np.std(xgb_train_scores))

--------------
Fold: 1 RMSLE: 1.0469992863275432
--------------
Fold: 2 RMSLE: 1.0461114440610348
--------------
Fold: 3 RMSLE: 1.0471152432996285
--------------
Fold: 4 RMSLE: 1.0452340735032741
--------------
Fold: 5 RMSLE: 1.0466784588548186
Mean RMSLE: 1.0464277012092598 ± 0.000690951337588659


In [4]:
# One-row summary table: one column per fold holding that fold's score.
fold_columns = [f"Fold_{i}" for i, _ in enumerate(xgb_train_scores, start=1)]
experiment_results = pd.DataFrame(
    [xgb_train_scores],
    index=["XGB GBDT Base"],
    columns=fold_columns,
)

# Show the table and store it next to the notebook.
print(experiment_results)
experiment_results.to_csv("XGB_GBDT_Base.csv")

# Persist the out-of-fold and averaged test predictions as plain-text arrays.
np.savetxt("xgb_gbdt_oof.txt", xgb_oof_preds)
np.savetxt("xgb_gbdt_test.txt", xgb_test_preds)

                 Fold_1    Fold_2    Fold_3    Fold_4    Fold_5
XGB GBDT Base  1.046999  1.046111  1.047115  1.045234  1.046678
