## Import libraries

In [1]:
import warnings
warnings.filterwarnings("ignore")

import gc
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm
from xgboost import XGBRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import StratifiedKFold

## Prepare data for model training

In [2]:
with open("../input/he-ffi-preprocess-data-v2-2/HE_FFI_Dataset.txt", 'rb') as handle: 
    data = handle.read()

processed_data = pickle.loads(data)
train_df = processed_data['train_df']
test_df = processed_data['test_df']

del processed_data
gc.collect()

20

In [3]:
cat_cols = ['Insurance_company','expiry_dt_year','expiry_dt_quarter','Condition',
            'expiry_dt_month','expiry_dt_day_week','expiry_dt_day_weekend']

train_df[cat_cols] = train_df[cat_cols].astype(int)
test_df[cat_cols] = test_df[cat_cols].astype(int)

for col in tqdm(cat_cols):
    dummy_val = pd.get_dummies(train_df[col], prefix='col')
    train_df = pd.concat([train_df, dummy_val], axis=1)
    train_df.drop([col], inplace=True, axis=1)

print("train_df: {}".format(train_df.shape))

for col in tqdm(cat_cols):
    dummy_val = pd.get_dummies(test_df[col], prefix='col')
    test_df = pd.concat([test_df, dummy_val], axis=1)
    test_df.drop([col], inplace=True, axis=1)

print("test_df: {}".format(test_df.shape))

100%|██████████| 7/7 [00:00<00:00, 102.08it/s]
100%|██████████| 7/7 [00:00<00:00, 145.11it/s]

train_df: (1382, 1732)
test_df: (600, 1731)





In [4]:
Xtrain = train_df.loc[:, ~train_df.columns.isin(['Image_path','Amount'])].values
Ytrain = train_df['Amount'].values
Ytrain_strat = pd.qcut(train_df['Amount'].values, q=5, labels=range(0,5))
Xtest = test_df.loc[:, ~test_df.columns.isin(['Image_path'])].values

print("Xtrain: {} \nYtrain: {} \nXtest: {}".format(Xtrain.shape, Ytrain.shape, Xtest.shape))

del train_df
del test_df
gc.collect()

Xtrain: (1382, 1730) 
Ytrain: (1382,) 
Xtest: (600, 1730)


72

## Build and validate the model

In [5]:
FOLD = 5
NUM_SEED = 3

np.random.seed(3)
seeds = np.random.randint(0, 100, size=NUM_SEED)

oof_score = 0
y_pred_meta_xgb = np.zeros((Ytrain.shape[0], 1))
y_pred_final_xgb = 0
counter = 0


for sidx, seed in enumerate(seeds):
    seed_score = 0
    
    kfold = StratifiedKFold(n_splits=FOLD, shuffle=True, random_state=seed)

    for idx, (train, val) in enumerate(kfold.split(Xtrain, Ytrain_strat)):
        counter += 1

        train_x, train_y = Xtrain[train], Ytrain[train]
        val_x, val_y = Xtrain[val], Ytrain[val]
        
        model = XGBRegressor(
            objective='reg:squarederror',
            eval_metric='rmse',
            booster='gbtree',
            sample_type='weighted',
            tree_method='gpu_hist',
            grow_policy='lossguide',
            use_label_encoder=False,
            num_round=5000,
            max_depth=6, 
            max_leaves=983,
            learning_rate=0.0423,
            subsample=0.9267,
            colsample_bytree=0.8243,
            min_child_weight=8,
            reg_lambda=0.2859,
            verbosity=0
        )

        model.fit(train_x, train_y, eval_set=[(train_x, train_y), (val_x, val_y)], 
                  early_stopping_rounds=200, verbose=50)

        y_pred = model.predict(val_x, iteration_range=(0, model.best_iteration))
        y_pred_meta_xgb[val] += np.array([y_pred]).T
        y_pred_final_xgb += model.predict(Xtest, iteration_range=(0, model.best_iteration))
        
        score = 100 * r2_score(val_y, y_pred)
        oof_score += score
        seed_score += score
        print("\nSeed-{} | Fold-{} | OOF Score: {}\n".format(seed, idx, score))
    
    print("\nSeed: {} | Aggregate OOF Score: {}\n\n".format(seed, (seed_score / FOLD)))


y_pred_meta_xgb = y_pred_meta_xgb / float(NUM_SEED)
y_pred_final_xgb = y_pred_final_xgb / float(counter)
oof_score /= float(counter)
print("Aggregate OOF Score: {}".format(oof_score))

[0]	validation_0-rmse:4718.38721	validation_1-rmse:4703.30713
[50]	validation_0-rmse:1515.21558	validation_1-rmse:2490.67749
[99]	validation_0-rmse:909.45233	validation_1-rmse:2460.43701

Seed-24 | Fold-0 | OOF Score: 18.71126452590306

[0]	validation_0-rmse:4720.63232	validation_1-rmse:4695.21484
[50]	validation_0-rmse:1518.04675	validation_1-rmse:2445.36328
[99]	validation_0-rmse:896.34448	validation_1-rmse:2424.23364

Seed-24 | Fold-1 | OOF Score: 17.284151400313174

[0]	validation_0-rmse:4712.61719	validation_1-rmse:4731.92725
[50]	validation_0-rmse:1518.38843	validation_1-rmse:2483.24268
[99]	validation_0-rmse:860.86505	validation_1-rmse:2461.87207

Seed-24 | Fold-2 | OOF Score: 15.297085289355783

[0]	validation_0-rmse:4711.09033	validation_1-rmse:4748.74365
[50]	validation_0-rmse:1475.87427	validation_1-rmse:2470.92163
[99]	validation_0-rmse:858.29462	validation_1-rmse:2414.27222

Seed-24 | Fold-3 | OOF Score: 22.97234105303392

[0]	validation_0-rmse:4701.42725	validation_1-rmse

In [6]:
np.savez_compressed('./XGB_Meta_Features.npz',
                    y_pred_meta_xgb=y_pred_meta_xgb, 
                    oof_score=oof_score,
                    y_pred_final_xgb=y_pred_final_xgb)

## Create submission file

In [7]:
with open("../input/he-ffi-preprocess-data/HE_FFI_Dataset.txt", 'rb') as handle: 
    data = handle.read()

processed_data = pickle.loads(data)
test_df = processed_data['test_df']

del processed_data
gc.collect()

733

In [8]:
test_df1 = pd.read_csv("../input/he-fast-furious-insured/dataset/test.csv")

submit_df = pd.DataFrame()
submit_df['Image_path'] = test_df1['Image_path']
submit_df['Condition'] = test_df['Condition']
submit_df['Amount'] = y_pred_final_xgb

submit_df.to_csv("./XGB_Submission_wo_Adjustment.csv", index=False)
submit_df.head()

Unnamed: 0,Image_path,Condition,Amount
0,img_4538519.jpg,1.0,2959.886963
1,img_7766002.jpg,1.0,3864.057373
2,img_4637390.jpg,1.0,4116.391113
3,img_4516108.jpg,1.0,4265.087891
4,img_4517008.jpg,1.0,3620.954102


In [9]:
test_df1 = pd.read_csv("../input/he-fast-furious-insured/dataset/test.csv")

submit_df = pd.DataFrame()
submit_df['Image_path'] = test_df1['Image_path']
submit_df['Condition'] = test_df['Condition']
submit_df['Amount'] = y_pred_final_xgb
submit_df.loc[test_df['Condition']==0, 'Amount'] = 0

submit_df.to_csv("./XGB_Submission_w_Adjustment.csv", index=False)
submit_df.head()

Unnamed: 0,Image_path,Condition,Amount
0,img_4538519.jpg,1.0,2959.886963
1,img_7766002.jpg,1.0,3864.057373
2,img_4637390.jpg,1.0,4116.391113
3,img_4516108.jpg,1.0,4265.087891
4,img_4517008.jpg,1.0,3620.954102
