## Import libraries

In [1]:
import gc
import pickle
import numpy as np
import pandas as pd
from sklearn.metrics import r2_score
from catboost import CatBoostRegressor
from sklearn.model_selection import StratifiedKFold

## Prepare data for model training

In [2]:
# Load the pickled dict that holds the train and test frames.
# NOTE: pickle can execute arbitrary code while loading -- only unpickle
# files that come from a trusted source.
with open("../input/he-ffi-preprocess-data-v2-2/HE_FFI_Dataset.txt", 'rb') as handle:
    # Unpickle straight from the file handle so the raw byte buffer is not
    # kept alive in a notebook-level variable after loading.
    processed_data = pickle.load(handle)

train_df = processed_data['train_df']
test_df = processed_data['test_df']

# Drop the container dict; the two frames stay referenced by their own names.
del processed_data
gc.collect()

0

In [3]:
# Columns handed to CatBoost as categorical features.
cat_cols = ['Insurance_company','expiry_dt_year','expiry_dt_quarter','Condition',
            'expiry_dt_month','expiry_dt_day_week','expiry_dt_day_weekend']

# Cast to int in both frames before they are passed as categorical features.
train_df[cat_cols] = train_df[cat_cols].astype(int)
test_df[cat_cols] = test_df[cat_cols].astype(int)

# The model is fitted on the frame *without* 'Image_path' and 'Amount', so the
# positional indices must be computed against that reduced column set.
# Looking them up in the full train_df would shift every index that sits after
# a dropped column.
feature_cols = train_df.columns[~train_df.columns.isin(['Image_path','Amount'])]
cat_cols_indices = [feature_cols.get_loc(col) for col in cat_cols]
print(cat_cols_indices)

[0, 6, 7, 3, 8, 12, 13]


In [4]:
# Target vector, plus a 5-bin quantile label of it that is used only to
# stratify the cross-validation folds.
Ytrain = train_df['Amount'].copy()
Ytrain_strat = pd.qcut(Ytrain.values, q=5, labels=range(5))

# Feature matrices: everything except the image identifier (and the target).
Xtrain = train_df.drop(columns=['Image_path', 'Amount'], errors='ignore')
Xtest = test_df.drop(columns=['Image_path'], errors='ignore')

shape_report = "Xtrain: {} \nYtrain: {} \nXtest: {}"
print(shape_report.format(Xtrain.shape, Ytrain.shape, Xtest.shape))

# The source frames are no longer needed once the matrices are built.
del train_df, test_df
gc.collect()

Xtrain: (1382, 1691) 
Ytrain: (1382,) 
Xtest: (600, 1691)


0

## Build and validate the model

In [5]:
FOLD = 5        # stratified CV folds per seed
NUM_SEED = 3    # number of differently-shuffled CV repetitions

# Seed NumPy's global RNG so the same fold-shuffling seeds are drawn on every run.
np.random.seed(3)
seeds = np.random.randint(0, 100, size=NUM_SEED)

oof_score = 0                                     # running sum of per-fold R2 (in percent)
y_pred_meta_cb = np.zeros((Ytrain.shape[0], 1))   # running sum of out-of-fold predictions
y_pred_final_cb = 0                               # running sum of test-set predictions
counter = 0                                       # total number of models trained


for sidx, seed in enumerate(seeds):
    seed_score = 0

    # Folds are stratified on the binned target (Ytrain_strat), not the raw amount.
    kfold = StratifiedKFold(n_splits=FOLD, shuffle=True, random_state=seed)

    for idx, (train, val) in enumerate(kfold.split(Xtrain.values, Ytrain_strat)):
        counter += 1

        train_x, train_y = Xtrain.iloc[train], Ytrain.iloc[train]
        val_x, val_y = Xtrain.iloc[val], Ytrain.iloc[val]

        # A fresh model per fold so no state leaks between folds.
        model = CatBoostRegressor(
            objective='RMSE',
            eval_metric='RMSE',
            num_boost_round=8000,
            learning_rate=0.0177,
            reg_lambda=0.8454,
            bootstrap_type='Poisson',
            subsample=0.9221,
            max_depth=11, 
            grow_policy='Lossguide',
            min_data_in_leaf=7, 
            max_leaves=54,
            task_type='GPU',
            verbose=0
        )

        # The validation fold doubles as the early-stopping set; the `verbose`
        # given here overrides the constructor value for this fit call.
        model.fit(train_x, train_y, eval_set=[(val_x, val_y)], 
                  cat_features=cat_cols_indices,
                  early_stopping_rounds=200, verbose=200)

        y_pred = model.predict(val_x)
        # Store the fold's predictions as a column vector at the validation rows.
        y_pred_meta_cb[val] += np.array([y_pred]).T
        y_pred_final_cb += model.predict(Xtest)

        score = 100 * r2_score(val_y, y_pred)
        oof_score += score
        seed_score += score
        print("Seed-{} | Fold-{} | OOF Score: {}".format(seed, idx, score))

    print("\nSeed: {} | Aggregate OOF Score: {}\n\n".format(seed, (seed_score / FOLD)))


# Each training row falls into exactly one validation fold per seed, so it has
# accumulated NUM_SEED out-of-fold predictions. Dividing by `counter`
# (= FOLD * NUM_SEED) would shrink the meta-features by a factor of FOLD.
y_pred_meta_cb = y_pred_meta_cb / float(NUM_SEED)
# Test predictions and fold scores are accumulated once per trained model.
y_pred_final_cb = y_pred_final_cb / float(counter)
oof_score /= float(counter)
print("Aggregate OOF Score: {}".format(oof_score))

0:	learn: 2676.2045874	test: 2712.9200701	best: 2712.9200701 (0)	total: 54ms	remaining: 7m 11s
200:	learn: 544.6360136	test: 2463.3699790	best: 2457.2884354 (107)	total: 8.4s	remaining: 5m 25s
bestTest = 2457.288435
bestIteration = 107
Shrink model to first 108 iterations.
Seed-24 | Fold-0 | OOF Score: 18.454469795539573
0:	learn: 2691.6699790	test: 2650.2673988	best: 2650.2673988 (0)	total: 52ms	remaining: 6m 55s
200:	learn: 564.4449322	test: 2366.0247396	best: 2365.3293556 (192)	total: 9.15s	remaining: 5m 55s
400:	learn: 179.8114951	test: 2370.7925772	best: 2364.4112722 (229)	total: 18s	remaining: 5m 40s
bestTest = 2364.411272
bestIteration = 229
Shrink model to first 230 iterations.
Seed-24 | Fold-1 | OOF Score: 20.861542976719093
0:	learn: 2689.1118858	test: 2664.4220408	best: 2664.4220408 (0)	total: 52.8ms	remaining: 7m 1s
200:	learn: 552.1501748	test: 2432.7933489	best: 2430.3771282 (164)	total: 9.22s	remaining: 5m 57s
bestTest = 2430.377128
bestIteration = 164
Shrink model to fi

In [6]:
# Persist the averaged out-of-fold predictions, the aggregate CV score and the
# averaged test predictions in a single compressed archive.
cb_meta_outputs = {
    'y_pred_meta_cb': y_pred_meta_cb,
    'oof_score': oof_score,
    'y_pred_final_cb': y_pred_final_cb,
}
np.savez_compressed('./CB_Meta_Features.npz', **cb_meta_outputs)

## Create submission file

In [7]:
# Reload the test frame (it was deleted after the feature matrices were built)
# so its 'Condition' column can be copied into the submission.
# NOTE(review): this path points at "he-ffi-preprocess-data", whereas the
# training data was read from "he-ffi-preprocess-data-v2-2" -- confirm both
# versions have identical test row order and 'Condition' values, or load from
# the same dataset.
# NOTE: pickle can execute arbitrary code while loading -- only unpickle
# files that come from a trusted source.
with open("../input/he-ffi-preprocess-data/HE_FFI_Dataset.txt", 'rb') as handle:
    # Unpickle straight from the file handle so the raw byte buffer is not
    # kept alive in a notebook-level variable after loading.
    processed_data = pickle.load(handle)

test_df = processed_data['test_df']

# Only the test frame is needed from here on.
del processed_data
gc.collect()

40

In [8]:
# The raw test CSV supplies the image identifiers for the submission.
test_df1 = pd.read_csv("../input/he-fast-furious-insured/dataset/test.csv")

# Start from the identifier column, then attach the condition flag (aligned on
# index) and the averaged CatBoost predictions (positional).
submit_df = test_df1[['Image_path']].assign(
    Condition=test_df['Condition'],
    Amount=y_pred_final_cb,
)

submit_df.to_csv("./CB_Submission_wo_Adjustment.csv", index=False)
submit_df.head()

Unnamed: 0,Image_path,Condition,Amount
0,img_4538519.jpg,1.0,3056.054086
1,img_7766002.jpg,1.0,4120.957088
2,img_4637390.jpg,1.0,4368.545914
3,img_4516108.jpg,1.0,4222.799556
4,img_4517008.jpg,1.0,3933.760296


In [9]:
# The raw test CSV supplies the image identifiers for the submission.
test_df1 = pd.read_csv("../input/he-fast-furious-insured/dataset/test.csv")

# Same layout as the unadjusted submission: identifier, condition flag, prediction.
submit_df = test_df1[['Image_path']].copy()
submit_df['Condition'] = test_df['Condition']
submit_df['Amount'] = y_pred_final_cb

# Adjustment: force the predicted amount to zero wherever Condition is 0.
# Presumably Condition 0 marks a case with nothing to pay out -- confirm
# against the dataset description.
zero_condition = test_df['Condition'] == 0
submit_df.loc[zero_condition, 'Amount'] = 0

submit_df.to_csv("./CB_Submission_w_Adjustment.csv", index=False)
submit_df.head()

Unnamed: 0,Image_path,Condition,Amount
0,img_4538519.jpg,1.0,3056.054086
1,img_7766002.jpg,1.0,4120.957088
2,img_4637390.jpg,1.0,4368.545914
3,img_4516108.jpg,1.0,4222.799556
4,img_4517008.jpg,1.0,3933.760296
