In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import klib

from catboost import CatBoostClassifier, Pool
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

# Silence every warning category for the rest of the kernel session.
# NOTE(review): a blanket 'ignore' also hides deprecation warnings from pandas
# and other libraries used below - consider narrowing the filter to specific
# categories or modules.
warnings.filterwarnings('ignore')

In [2]:
# Read both competition splits; the first CSV column is used as the row index.
train_path, test_path = 'data/train.csv', 'data/test.csv'
train_df, test = (
    pd.read_csv(csv_path, index_col=[0]) for csv_path in (train_path, test_path)
)

# Preview the first rows of the training split.
train_df.head()

Unnamed: 0_level_0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,Male,21,1,35.0,0,1-2 Year,Yes,65101.0,124.0,187,0
1,Male,43,1,28.0,0,> 2 Years,Yes,58911.0,26.0,288,1
2,Female,25,1,14.0,1,< 1 Year,No,38043.0,152.0,254,0
3,Female,35,1,1.0,0,1-2 Year,Yes,2630.0,156.0,76,0
4,Female,36,1,15.0,1,1-2 Year,No,31951.0,152.0,294,0


In [3]:
# Add a constant Response column (all zeros) to the test split so it carries the
# same columns as the training split; the values are placeholders, not labels.
test = test.assign(Response=0)
test.head()

Unnamed: 0_level_0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
11504798,Female,20,1,47.0,0,< 1 Year,No,2630.0,160.0,228,0
11504799,Male,47,1,28.0,0,1-2 Year,Yes,37483.0,124.0,123,0
11504800,Male,47,1,43.0,0,1-2 Year,Yes,2630.0,26.0,271,0
11504801,Female,22,1,47.0,1,< 1 Year,No,24502.0,152.0,115,0
11504802,Male,51,1,19.0,0,1-2 Year,No,34115.0,124.0,148,0


In [4]:
# Stack the training rows on top of the test rows (row-wise concatenation) so
# both splits can be transformed together; training rows come first.
stacked_parts = [train_df, test]
df = pd.concat(stacked_parts, axis='index')
df

Unnamed: 0_level_0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,Male,21,1,35.0,0,1-2 Year,Yes,65101.0,124.0,187,0
1,Male,43,1,28.0,0,> 2 Years,Yes,58911.0,26.0,288,1
2,Female,25,1,14.0,1,< 1 Year,No,38043.0,152.0,254,0
3,Female,35,1,1.0,0,1-2 Year,Yes,2630.0,156.0,76,0
4,Female,36,1,15.0,1,1-2 Year,No,31951.0,152.0,294,0
...,...,...,...,...,...,...,...,...,...,...,...
19174659,Male,57,1,28.0,0,1-2 Year,Yes,51661.0,124.0,109,0
19174660,Male,28,1,50.0,1,< 1 Year,No,25651.0,152.0,184,0
19174661,Male,47,1,33.0,1,1-2 Year,No,2630.0,138.0,63,0
19174662,Male,30,1,28.0,0,< 1 Year,Yes,38866.0,124.0,119,0


In [5]:
# List the distinct Vehicle_Age values, in order of first appearance.
pd.unique(df['Vehicle_Age'])

array(['1-2 Year', '> 2 Years', '< 1 Year'], dtype=object)

In [6]:
# Encode the string columns as integers and cast the float-typed code columns to int.
#
# NOTE: this cell overwrites columns of `df`, so it is not idempotent - re-run the
# cell that builds `df` before running this one a second time.

# Gender: 'Female' -> 0, anything else -> 1. A vectorised comparison replaces the
# row-wise df.apply(axis=1), which runs a Python lambda once per row.
df['Gender'] = df['Gender'].ne('Female').astype('int64')

# Ordinal / binary encodings. The result is assigned back to the column instead of
# calling .replace(..., inplace=True) on df[col]: that chained in-place form acts on
# a temporary under pandas Copy-on-Write and would leave `df` unchanged.
# .map turns any value missing from the mapping into NaN, so the int64 cast raises
# on an unexpected category rather than letting a stray string through.
vehicle_age_codes = {'< 1 Year': 0, '1-2 Year': 1, '> 2 Years': 2}
vehicle_damage_codes = {'Yes': 1, 'No': 0}
df['Vehicle_Age'] = df['Vehicle_Age'].map(vehicle_age_codes).astype('int64')
df['Vehicle_Damage'] = df['Vehicle_Damage'].map(vehicle_damage_codes).astype('int64')

# These columns are read as floats (e.g. 35.0) in the CSV preview; cast them to int.
trans_to_int = ['Region_Code', 'Annual_Premium', 'Policy_Sales_Channel']
df[trans_to_int] = df[trans_to_int].astype(int)
df.dtypes

Gender                  int64
Age                     int64
Driving_License         int64
Region_Code             int32
Previously_Insured      int64
Vehicle_Age             int64
Vehicle_Damage          int64
Annual_Premium          int32
Policy_Sales_Channel    int32
Vintage                 int64
Response                int64
dtype: object

In [7]:
# Split the combined frame back into its two parts by position. `df` was built as
# concat([train_df, test]), so exactly the first len(train_df) rows are training
# rows. (The previous `+ 2` offset moved two test rows - carrying the placeholder
# Response of 0 - into the training set and dropped them from the test set.)
n_train = train_df.shape[0]
train = df.iloc[:n_train]
test = df.iloc[n_train:]

aucs = []   # validation AUC of each fold
preds = []  # per-fold positive-class probabilities for the test rows
kold_data = StratifiedKFold(shuffle=True, n_splits=5, random_state=34)

CatBoostmodel = CatBoostClassifier(loss_function='Logloss', eval_metric='AUC',
                                learning_rate=0.05, iterations=5000, depth=9,
                                random_strength=0, l2_leaf_reg=0.5, task_type='CPU',
                                random_seed=42, verbose=False)

# Every column except the target is a feature, and every feature is declared
# categorical to CatBoost (same choice as before, now stated once).
feature_cols = train.columns.drop('Response')
X_all = train[feature_cols]
y_all = train['Response']

# The test features do not depend on the fold, so build the frame and Pool once
# instead of on every iteration.
X_test = test[feature_cols]
X_test_pool = Pool(X_test, cat_features=feature_cols.values)

for fold, (train_idx, valid_idx) in enumerate(kold_data.split(X_all, y_all)):
    # StratifiedKFold.split yields positional row numbers, so select with .iloc;
    # label-based .loc is only correct when the index happens to be 0..n-1.
    X_train = X_all.iloc[train_idx]
    y_train = y_all.iloc[train_idx]
    X_valid = X_all.iloc[valid_idx]
    y_valid = y_all.iloc[valid_idx]

    X_train_pool = Pool(X_train, y_train, cat_features=feature_cols.values)
    X_valid_pool = Pool(X_valid, y_valid, cat_features=feature_cols.values)

    # The validation fold is the eval set; training stops early after 200 rounds
    # without improvement of the eval metric.
    CatBoostmodel.fit(X=X_train_pool, eval_set=X_valid_pool, verbose=1000, early_stopping_rounds=200)
    pred_valid = CatBoostmodel.predict_proba(X_valid_pool)[:, 1]
    preds.append(CatBoostmodel.predict_proba(X_test_pool)[:, 1])

    auc = roc_auc_score(y_valid, pred_valid)
    aucs.append(auc)

print(f'\nOverall AUC: {np.mean(aucs):.5f} +/- {np.std(aucs):.5f}')



0:	test: 0.8684675	best: 0.8684675 (0)	total: 23.4s	remaining: 1d 8h 25m 27s


KeyboardInterrupt: 

In [None]:
## create submission
# Fail loudly if no fold finished (e.g. training was interrupted): averaging an
# empty list would otherwise give NaN and silently write an all-NaN submission.
if not preds:
    raise RuntimeError('No fold predictions available - run the training cell to completion first.')

# `id` is the index of `test` (the CSVs were read with index_col=[0]), not a column,
# so test[['id']] raises KeyError. Build the frame from the index instead, which
# also avoids assigning a new column onto a slice of `test`.
submission = pd.DataFrame({
    'id': test.index.to_numpy(),
    # Average the per-fold positive-class probabilities row by row.
    'Response': np.mean(preds, axis=0),
})

submission.to_csv('submission.csv', index=False)
submission