In [1]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt

from utils import create_client_profile_features, get_input, one_hot_encode, catboost_cross_validation, calculate_feature_separating_ability
from sklearn.preprocessing import Normalizer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold, StratifiedKFold



In [2]:
# Load the pre-processed source tables in a fixed order.
# The cell output shows that get_input reports each file's shape as it is read.
history, bki, client_profile, payments, prev_count = (
    get_input(csv_path)
    for csv_path in (
        './data/history_prep.csv',
        './data/bki_prep2.csv',
        './data/client_prep2.csv',
        './data/payments_prep2.csv',
        './data/app_prev_number.csv',
    )
)

./data/history_prep.csv: shape = 1670214 rows, 89 cols
./data/bki_prep2.csv: shape = 945234 rows, 36 cols
./data/client_prep2.csv: shape = 250000 rows, 59 cols
./data/payments_prep2.csv: shape = 1023932 rows, 8 cols
./data/app_prev_number.csv: shape = 338857 rows, 2 cols


In [3]:
def _keep_last_per_application(frame):
    """Return ``frame`` reduced to one row per ``application_number``.

    The result is ordered by ``application_number``; for each number the row
    that came last in the input is the one kept.

    A stable sort is required for that guarantee: the default quicksort may
    reorder rows that share a key, which would make ``keep="last"`` pick an
    arbitrary duplicate.
    """
    return (
        frame
        .sort_values(by="application_number", kind="mergesort")
        .drop_duplicates(subset=["application_number"], keep="last")
    )


history = _keep_last_per_application(history)
bki = _keep_last_per_application(bki)
payments = _keep_last_per_application(payments)

### Train / test preparation

In [4]:
# Read the labelled (train) and unlabelled (test) application lists.
train, test = (
    get_input(csv_path)
    for csv_path in ("./data/train.csv", "./data/test.csv")
)

# Stack train on top of test under a fresh 0..n-1 index so that both parts
# go through the same feature pipeline.
data = pd.concat([train, test], axis=0, ignore_index=True)

./data/train.csv: shape = 110093 rows, 3 cols
./data/test.csv: shape = 165141 rows, 2 cols


In [5]:
# Sanity check: the row count should equal the train and test row counts combined.
data.shape

(275234, 3)

In [6]:
# Attach each auxiliary table to the applications, one after another.
# Left joins keep every row of `data` even when a table has no match for it.
for aux_table in (client_profile, payments, bki, prev_count):
    data = data.merge(aux_table, on='application_number', how='left')

In [7]:
# Run the project's one_hot_encode helper (imported from utils) over the merged frame.
# NOTE(review): presumably it expands categorical columns into indicator columns — confirm in utils.
data = one_hot_encode(data)

In [8]:
# Inspect the frame's dimensions after merging and encoding.
data.shape

(275234, 105)

In [9]:
# Rows that came from test.csv have no label, so a missing target marks the test part.
mask = data["target"].isnull()
features_to_drop = ["application_number", "target"]

train, test = data.loc[~mask], data.loc[mask]

# Keep the labels and the submission ids before those columns are dropped.
target, test_id = train["target"], test["application_number"]
train = train.drop(features_to_drop, axis=1)
test = test.drop(features_to_drop, axis=1)

categorial = train.dtypes[train.dtypes == "object"].index
# Follow the frame's own column order; a set difference would return the names
# in an arbitrary, hash-dependent order that can change between kernel runs.
numerical = [column for column in train.columns if column not in categorial]

# Turn +/- infinity into NaN so they are handled together with the other missing values.
train = train.replace([np.inf, -np.inf], np.nan)
test = test.replace([np.inf, -np.inf], np.nan)

In [10]:
# Count the non-object feature columns and the object-dtype feature columns.
len(numerical), len(categorial)

(103, 0)

In [11]:
# Replace every remaining missing value with 0 in both parts.
train, test = train.fillna(0), test.fillna(0)

In [12]:
# norm = Normalizer().fit(train)

In [13]:
# train = norm.transform(train)
# test = norm.transform(test)

In [13]:
# Score every feature against the target with the project helper from utils.
# NOTE(review): the metric is defined in utils and is not visible here; the cell
# output further down shows a float Series indexed by feature name.
scores = calculate_feature_separating_ability(train, target)

In [16]:
# Display the per-feature scores.
scores

x0_m                                 0.071453
x1_secondary / secondary special     0.068214
ratio_annuity_to_age                 0.052216
childrens                            0.035298
x0_Cash                              0.033864
                                       ...   
external_scoring_rating_2           -0.208180
external_scoring_rating_max         -0.250446
external_scoring_rating_min         -0.255171
external_scoring_rating_nanmedian   -0.281325
external_scoring_rating_mean        -0.288431
Length: 103, dtype: float64

In [19]:
# Keep only the features whose absolute score is above 0.01.
feats = [name for name, score in scores.items() if abs(score) > 0.01]

In [None]:
# Compare how many features passed the filter with how many were available.
len(feats), len(train.columns)

In [20]:
%%time

# One seed shared by the model and the fold splitter.
seed = 1234123

# CatBoost settings handed to the project's cross-validation helper.
cb_params = dict(
    n_estimators=2500,
    learning_rate=0.01,
    loss_function="Logloss",
    eval_metric="AUC",
    task_type="CPU",
    max_bin=30,
    verbose=50,
    max_depth=5,
    l2_leaf_reg=10,
    early_stopping_rounds=100,
    thread_count=6,
    random_seed=seed,
)

# Five shuffled, stratified folds.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

# The helper returns the fitted estimators and the out-of-fold predictions;
# later cells average the former on the test set and score the latter.
estimators, oof_preds = catboost_cross_validation(
    X=train[feats], y=target, params=cb_params, cv=cv, categorical=[]
)

Tue Oct  6 19:13:05 2020, Cross-Validation, 110093 rows, 53 cols
0:	test: 0.6491432	test1: 0.6423007	best: 0.6423007 (0)	total: 86.3ms	remaining: 3m 35s
50:	test: 0.7011023	test1: 0.7013670	best: 0.7017893 (49)	total: 1.93s	remaining: 1m 32s
100:	test: 0.7043676	test1: 0.7042131	best: 0.7044559 (98)	total: 3.76s	remaining: 1m 29s
150:	test: 0.7080530	test1: 0.7083834	best: 0.7083842 (149)	total: 5.68s	remaining: 1m 28s
200:	test: 0.7110771	test1: 0.7104400	best: 0.7104400 (200)	total: 7.54s	remaining: 1m 26s
250:	test: 0.7136025	test1: 0.7121117	best: 0.7121117 (250)	total: 9.38s	remaining: 1m 24s
300:	test: 0.7168445	test1: 0.7140811	best: 0.7140811 (300)	total: 11.3s	remaining: 1m 22s
350:	test: 0.7197259	test1: 0.7159582	best: 0.7159582 (350)	total: 13.2s	remaining: 1m 21s
400:	test: 0.7222793	test1: 0.7178941	best: 0.7178941 (400)	total: 15.2s	remaining: 1m 19s
450:	test: 0.7244333	test1: 0.7194545	best: 0.7194545 (450)	total: 17.1s	remaining: 1m 17s
500:	test: 0.7260478	test1: 0.7

600:	test: 0.7295632	test1: 0.7217320	best: 0.7217648 (599)	total: 23.2s	remaining: 1m 13s
650:	test: 0.7309797	test1: 0.7221344	best: 0.7221344 (650)	total: 25.2s	remaining: 1m 11s
700:	test: 0.7321957	test1: 0.7225595	best: 0.7225595 (700)	total: 27.2s	remaining: 1m 9s
750:	test: 0.7333473	test1: 0.7228583	best: 0.7228583 (750)	total: 29.3s	remaining: 1m 8s
800:	test: 0.7344777	test1: 0.7230472	best: 0.7230747 (799)	total: 31.3s	remaining: 1m 6s
850:	test: 0.7353558	test1: 0.7232051	best: 0.7232188 (831)	total: 33.2s	remaining: 1m 4s
900:	test: 0.7362481	test1: 0.7234674	best: 0.7234674 (900)	total: 35.3s	remaining: 1m 2s
950:	test: 0.7371888	test1: 0.7236065	best: 0.7236065 (950)	total: 37.3s	remaining: 1m
1000:	test: 0.7381596	test1: 0.7238908	best: 0.7239178 (996)	total: 39.3s	remaining: 58.9s
1050:	test: 0.7390948	test1: 0.7238847	best: 0.7239178 (996)	total: 41.3s	remaining: 57s
1100:	test: 0.7400186	test1: 0.7240954	best: 0.7240954 (1100)	total: 43.4s	remaining: 55.1s
1150:	tes

1600:	test: 0.7483776	test1: 0.7246328	best: 0.7246499 (1599)	total: 1m 4s	remaining: 36.2s
1650:	test: 0.7490790	test1: 0.7247206	best: 0.7247279 (1649)	total: 1m 6s	remaining: 34.1s
1700:	test: 0.7497773	test1: 0.7248399	best: 0.7248399 (1700)	total: 1m 8s	remaining: 32.1s
1750:	test: 0.7504609	test1: 0.7249739	best: 0.7249904 (1749)	total: 1m 10s	remaining: 30s
1800:	test: 0.7511202	test1: 0.7250865	best: 0.7250950 (1777)	total: 1m 12s	remaining: 27.9s
1850:	test: 0.7517938	test1: 0.7251160	best: 0.7251160 (1850)	total: 1m 13s	remaining: 25.9s
1900:	test: 0.7525159	test1: 0.7251575	best: 0.7252077 (1871)	total: 1m 15s	remaining: 23.9s
1950:	test: 0.7531811	test1: 0.7251926	best: 0.7252261 (1936)	total: 1m 17s	remaining: 21.9s
2000:	test: 0.7537516	test1: 0.7252094	best: 0.7252541 (1972)	total: 1m 19s	remaining: 19.8s
2050:	test: 0.7543701	test1: 0.7253077	best: 0.7253151 (2044)	total: 1m 21s	remaining: 17.8s
2100:	test: 0.7550942	test1: 0.7254295	best: 0.7254662 (2090)	total: 1m 23s

In [21]:
# ROC AUC of the out-of-fold predictions against the training labels.
oof_score = roc_auc_score(target, oof_preds)
print(f"OOF-score = {round(oof_score, 5)}")
# NOTE(review): presumably scores recorded from an earlier run — confirm:
# [0.71874, 0.72732, 0.73022, 0.71888, 0.7242]

OOF-score = 0.72575


In [22]:
# Average the positive-class probability over all cross-validation estimators.
test_features = test[feats]  # the same feature subset the estimators were trained on
y_pred = np.zeros(len(test))

for model in estimators:
    y_pred = y_pred + model.predict_proba(test_features)[:, 1]

y_pred = y_pred / len(estimators)

In [23]:
# y_pred = estimators[3].predict_proba(test[feats])[:, 1]

In [24]:
# Pair each test application id with its averaged prediction and write the submission file.
df_pred = test_id.to_frame(name="APPLICATION_NUMBER").assign(TARGET=y_pred)
df_pred.to_csv("./data/submit.csv", index=False)