In [1]:
import numpy as np
import pandas as pd
import catboost as cat
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

In [2]:
# Read the competition tables; the "id" column becomes the row index of each frame.
train = pd.read_csv(
    '/kaggle/input/playground-series-s4e12/train.csv',
    index_col="id",
)
test = pd.read_csv(
    '/kaggle/input/playground-series-s4e12/test.csv',
    index_col="id",
)
sample_submission = pd.read_csv(
    '/kaggle/input/playground-series-s4e12/sample_submission.csv',
    index_col="id",
)

# Swap spaces for underscores in the feature names so later cells can refer to
# columns such as "Policy_Start_Date" and "Premium_Amount".
for frame in (train, test):
    frame.columns = [name.replace(" ", "_") for name in frame.columns]

In [3]:
def date_preprocessing(df, yearT = 360):
    """Replace ``Policy_Start_Date`` with calendar and cyclical date features.

    The input frame is left untouched. The returned frame has the
    ``Policy_Start_Date`` column removed and these columns set, in this order:
    ``Year``, ``Month``, ``Day``, ``Sin_Month``, ``Cos_Month`` (period 12),
    ``Day_of_Year``, ``Sin_Year_Day``, ``Cos_Year_Day`` (period ``yearT``),
    ``Sin_Day``, ``Cos_Day`` (period = number of days in the row's month).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``Policy_Start_Date`` column that
        ``pd.to_datetime`` can parse.
    yearT : int or float, default 360
        Period, in days, of the day-of-year sine/cosine pair.

    Returns
    -------
    pandas.DataFrame
        New frame with the derived columns and without ``Policy_Start_Date``.
    """
    stamps = pd.to_datetime(df["Policy_Start_Date"])
    parts = stamps.dt

    month = parts.month
    day = parts.day
    year_day = parts.dayofyear

    # Angles on the unit circle for each of the three cycles.
    month_angle = month * np.pi * 2 / 12
    year_angle = year_day * np.pi * 2 / yearT
    day_angle = day * np.pi * 2 / parts.days_in_month

    return df.drop(columns="Policy_Start_Date").assign(
        Year=parts.year,
        Month=month,
        Day=day,
        Sin_Month=np.sin(month_angle),
        Cos_Month=np.cos(month_angle),
        Day_of_Year=year_day,
        Sin_Year_Day=np.sin(year_angle),
        Cos_Year_Day=np.cos(year_angle),
        Sin_Day=np.sin(day_angle),
        Cos_Day=np.cos(day_angle),
    )


# Expand the policy start date into date features on both splits.
train, test = map(date_preprocessing, (train, test))

In [4]:
# Every object-dtype column of the training frame is treated as categorical.
cat_features = []
for name in train.columns:
    if train[name].dtype == 'object':
        cat_features.append(name)

# Missing categories become the literal string "None" and all values are cast
# to str, applied identically to the train and test frames.
for name in cat_features:
    for frame in (train, test):
        frame[name] = frame[name].fillna("None").astype("str")

In [5]:
# Features and raw target. The model is fitted on log1p(Premium_Amount), so an
# RMSE measured in that space is the RMSLE of the raw premium.
X,y = train.drop("Premium_Amount",axis = 1).copy(),train["Premium_Amount"].copy()
y_log = np.log1p(y)  # identical for every fold, so computed once outside the loop
kf = KFold(n_splits = 5,random_state = 42,shuffle = True)

# Out-of-fold and fold-averaged test predictions; both stay in log1p space.
cat_oof_preds = np.zeros(len(y))
cat_test_preds = np.zeros(len(test))
cat_train_scores =[]  # one validation RMSLE per fold (despite the name)


cat_params = {
              'random_state': 42,
                'objective': 'RMSE',
                'boosting_type': 'Plain',
                'use_best_model': True,   # keep the iteration that scored best on eval_set
                'task_type': "GPU"        # requires a GPU runtime
             }


for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
    X_train, X_valid = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_valid = y_log.iloc[train_idx], y_log.iloc[val_idx]

    cat_train = cat.Pool(X_train, y_train, cat_features=cat_features)
    cat_valid = cat.Pool(X_valid, y_valid, cat_features=cat_features)

    print("--------------")
    # The validation fold doubles as the early-stopping set.
    model = cat.train(
        cat_train,
        cat_params,
        eval_set=[cat_valid],
        num_boost_round = 3000,
        early_stopping_rounds = 50,
        verbose = 0
    )

    # Predictions and targets are clipped at 0 in log1p space before scoring.
    y_pred = np.maximum(0, model.predict(X_valid))
    y_valid = np.maximum(0,y_valid)

    # `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6;
    # taking the square root of the MSE gives the same value on every version.
    rmsle = float(np.sqrt(mean_squared_error(y_valid, y_pred)))
    print("Fold:", fold + 1,"RMSLE:", rmsle)
    cat_train_scores.append(rmsle)

    # Apply the same clip to the test predictions so the saved OOF and test
    # arrays are post-processed consistently.
    y_test_preds = np.maximum(0, model.predict(test))
    cat_oof_preds[val_idx] = y_pred
    cat_test_preds += y_test_preds

# Average the per-fold test predictions.
cat_test_preds /= kf.get_n_splits()
print("Mean RMSLE:", np.mean(cat_train_scores),"±", np.std(cat_train_scores))

--------------
Fold: 1 RMSLE: 1.0487697868203678
--------------
Fold: 2 RMSLE: 1.0481439607707566
--------------
Fold: 3 RMSLE: 1.0487131338513997
--------------
Fold: 4 RMSLE: 1.046949432229331
--------------
Fold: 5 RMSLE: 1.0482351287173692
Mean RMSLE: 1.048162288477845 ± 0.0006556137195994901


In [6]:
# One-row summary table: a column per fold score, labelled with the experiment name.
fold_labels = [f"Fold_{i}" for i, _ in enumerate(cat_train_scores, start=1)]
experiment_results = pd.DataFrame(
    [cat_train_scores],
    index=["CAT Plain Base"],
    columns=fold_labels,
)

# Persist the out-of-fold and averaged test predictions as plain-text arrays.
for out_path, preds in (
    ("cat_plain_base_oof.txt", cat_oof_preds),
    ("cat_plain_base_test.txt", cat_test_preds),
):
    np.savetxt(out_path, preds)


print(experiment_results)
experiment_results.to_csv("CAT_Plain_Base.csv")

                 Fold_1    Fold_2    Fold_3    Fold_4    Fold_5
CAT Plain Base  1.04877  1.048144  1.048713  1.046949  1.048235
