In [1]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

In [2]:
# Load the competition files; the "id" column becomes the row index of each frame.
train = pd.read_csv('/kaggle/input/playground-series-s4e12/train.csv', index_col="id")
test = pd.read_csv('/kaggle/input/playground-series-s4e12/test.csv', index_col="id")
sample_submission = pd.read_csv(
    '/kaggle/input/playground-series-s4e12/sample_submission.csv', index_col="id"
)

# Replace spaces in feature names with underscores. sample_submission is left
# untouched, so its column names keep their original spelling.
train.columns = train.columns.str.replace(" ", "_", regex=False)
test.columns = test.columns.str.replace(" ", "_", regex=False)

In [3]:
def date_preprocessing(df, yearT = 360):
    """Expand ``Policy_Start_Date`` into calendar and cyclical features.

    The input frame is not modified; a new frame is returned in which the
    ``Policy_Start_Date`` column has been replaced by (appended in this order)
    ``Year``, ``Month``, ``Day``, ``Sin_Month``, ``Cos_Month``, ``Day_of_Year``,
    ``Sin_Year_Day``, ``Cos_Year_Day``, ``Sin_Day`` and ``Cos_Day``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``Policy_Start_Date`` column parseable by
        ``pd.to_datetime``.
    yearT : number, default 360
        Period (in days) used for the day-of-year sine/cosine encoding.
        NOTE(review): with the default of 360 the encoding wraps before the
        calendar year ends (days 361+ alias onto days 1+) -- confirm this is
        intended rather than 365.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` without ``Policy_Start_Date`` and with the new columns.
    """
    start = pd.to_datetime(df["Policy_Start_Date"])
    month_length = start.dt.days_in_month

    # drop() returns a new frame, so the caller's data is left untouched.
    out = df.drop(columns="Policy_Start_Date")

    # Plain calendar components.
    out["Year"] = start.dt.year
    out["Month"] = start.dt.month
    out["Day"] = start.dt.day

    # Month of year mapped onto the unit circle (period: 12 months).
    month_angle = out["Month"] * np.pi * 2 / 12
    out["Sin_Month"] = np.sin(month_angle)
    out["Cos_Month"] = np.cos(month_angle)

    # Day of year mapped onto the unit circle (period: yearT days).
    out["Day_of_Year"] = start.dt.dayofyear
    year_angle = out["Day_of_Year"] * np.pi * 2 / yearT
    out["Sin_Year_Day"] = np.sin(year_angle)
    out["Cos_Year_Day"] = np.cos(year_angle)

    # Day of month mapped onto the unit circle; the period is the actual
    # length of that row's month (28-31 days).
    day_angle = out["Day"] * np.pi * 2 / month_length
    out["Sin_Day"] = np.sin(day_angle)
    out["Cos_Day"] = np.cos(day_angle)

    return out


# Run the identical date expansion on both splits so their feature columns match.
train, test = (date_preprocessing(frame) for frame in (train, test))

In [4]:
# Every object-dtype column of the training frame is treated as categorical.
cat_features = [name for name, dtype in train.dtypes.items() if dtype == 'object']

# Missing values become an explicit "None" level before the column is cast to
# pandas' category dtype; the same conversion is applied to both splits.
for frame in (train, test):
    for name in cat_features:
        frame[name] = frame[name].fillna("None").astype("category")

In [5]:
# Split features from the target. The model is fitted on log1p(target), so the
# RMSE it reports on that scale is the RMSLE of the original target.
X,y = train.drop("Premium_Amount",axis = 1).copy(),train["Premium_Amount"].copy()
y_log = np.log1p(y)  # fold-independent, so transform once instead of per fold
kf = KFold(n_splits = 5,random_state = 42,shuffle = True)

# Accumulators filled by the CV loop below.
# NOTE: lgb_oof_preds is stored on the log1p scale (raw model output, clipped
# at 0), whereas lgb_test_preds is mapped back to the original scale with expm1.
lgb_oof_preds = np.zeros(len(y))
lgb_test_preds = np.zeros(len(test))
lgb_train_scores =[]  # per-fold validation RMSLE

# 'enable_categorical' was dropped from this dict: it is an XGBoost option, not
# a LightGBM parameter. Categorical columns are declared through
# `categorical_feature` on the lgb.Dataset objects instead.
lgb_params = {
                'random_state': 42,
                'objective': 'regression',
                'metric': 'rmse',
                'boosting_type': 'gbdt',
                'verbosity': -1,
                "feature_pre_filter": False,
                "lambda_l1": 4.940290299092639e-08,
                "lambda_l2": 4.624768708621535,
                "num_leaves": 73,
                "feature_fraction": 1.0,
                "bagging_fraction": 1.0,
                "bagging_freq": 0,
                "min_child_samples": 50,
             }

for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
    X_train, X_valid = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_valid = y_log.iloc[train_idx], y_log.iloc[val_idx]

    lgb_train = lgb.Dataset(X_train, y_train, categorical_feature=cat_features)
    # `reference` makes the validation set reuse the training set's binning.
    lgb_valid = lgb.Dataset(X_valid, y_valid, categorical_feature=cat_features, reference=lgb_train)

    print("--------------")
    model = lgb.train(
        lgb_params,
        lgb_train,
        valid_sets=[lgb_train,lgb_valid],
        valid_names=["Training","Validation"],
        num_boost_round = 3000,
        callbacks=[lgb.early_stopping(stopping_rounds=50)],
    )

    y_pred = model.predict(X_valid, num_iteration=model.best_iteration)

    # Clip at zero on the log1p scale.
    y_pred = np.maximum(0,y_pred)
    y_valid = np.maximum(0,y_valid)

    # `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6;
    # taking the square root explicitly works on every version.
    rmsle = np.sqrt(mean_squared_error(y_valid, y_pred))
    print("Fold:", fold + 1,"RMSLE:", rmsle)
    lgb_train_scores.append(rmsle)

    # Use the same (best) iteration for the test set as for validation, then
    # undo the log1p transform so the fold predictions are on the raw scale.
    y_test_preds = np.expm1(model.predict(test, num_iteration=model.best_iteration))
    lgb_oof_preds[val_idx] = y_pred
    lgb_test_preds += y_test_preds

# Average the per-fold test predictions.
lgb_test_preds /= kf.get_n_splits()
print("Mean RMSLE:", np.mean(lgb_train_scores),"±", np.std(lgb_train_scores))

--------------
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[50]	Training's rmse: 1.04108	Validation's rmse: 1.04632
Fold: 1 RMSLE: 1.0463204322636546
--------------
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[76]	Training's rmse: 1.03931	Validation's rmse: 1.04567
Fold: 2 RMSLE: 1.045665457539552
--------------
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[52]	Training's rmse: 1.04083	Validation's rmse: 1.04652
Fold: 3 RMSLE: 1.0465176152208147
--------------
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[69]	Training's rmse: 1.03976	Validation's rmse: 1.04403
Fold: 4 RMSLE: 1.0440267680848214
--------------
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[52]	Training's rmse: 1.04085	Validation's rmse: 1.04599
Fold: 5 RMSLE: 1.04599495

In [6]:
# One-row summary table: a "Fold_<k>" column per entry of lgb_train_scores.
fold_scores = {
    f"Fold_{fold_no}": [score]
    for fold_no, score in enumerate(lgb_train_scores, start=1)
}
experiment_results = pd.DataFrame(fold_scores, index=["LGBM GBDT Tuned"])

# Persist the out-of-fold and test prediction vectors as plain-text files.
for path, preds in (
    ("lgb_gbdt_tuned_oof.txt", lgb_oof_preds),
    ("lgb_gbdt_tuned_test.txt", lgb_test_preds),
):
    np.savetxt(path, preds)

print(experiment_results)
experiment_results.to_csv("LGBM_GBDT_Tuned.csv")

# Write the test predictions into the submission template (assigned by
# position) and save it.
sample_submission["Premium Amount"] = lgb_test_preds
sample_submission.to_csv("submission.csv")

                  Fold_1    Fold_2    Fold_3    Fold_4    Fold_5
LGBM GBDT Tuned  1.04632  1.045665  1.046518  1.044027  1.045995
