__Описание источников данных:__

* train.csv - пары "заявка - целевая переменная", для этой выборки нужно собрать признаки и обучить модель;
* test.csv - пары "заявки - прогнозное значение", для этой выборки нужно собрать признаки и построить прогнозы;
* bki.csv - данные БКИ о предыдущих кредитах клиента;
* client_profile.csv - клиентский профиль, некоторые знания, которые есть у компании о клиенте;
* payments.csv - история платежей клиента;
* applications_history.csv - история предыдущих заявок клиента.

In [1]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt

import seaborn as sns

import catboost as cb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, KFold

from typing import List, Optional

In [2]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

print("train.shape = {} rows, {} cols".format(*train.shape))
print("test.shape = {} rows, {} cols".format(*test.shape))

train.shape = 110093 rows, 3 cols
test.shape = 165141 rows, 2 cols


__train.csv__

In [3]:
train.head()

Unnamed: 0,APPLICATION_NUMBER,TARGET,NAME_CONTRACT_TYPE
0,123687442,0,Cash
1,123597908,1,Cash
2,123526683,0,Cash
3,123710391,1,Cash
4,123590329,1,Cash


Номер заявки уникальный идентификатор в таблице

In [4]:
train["APPLICATION_NUMBER"].nunique()

110093

In [5]:
train["APPLICATION_NUMBER"].min()

123423341

In [6]:
train["APPLICATION_NUMBER"].max()

123730843

In [7]:
123730843-123423341

307502

In [8]:
train["NAME_CONTRACT_TYPE"].nunique()

2

In [9]:
train["NAME_CONTRACT_TYPE"].value_counts()

Cash           99551
Credit Card    10542
Name: NAME_CONTRACT_TYPE, dtype: int64

In [10]:
train["TARGET"].value_counts()

0    101196
1      8897
Name: TARGET, dtype: int64

__test.csv__

In [11]:
test.head()

Unnamed: 0,APPLICATION_NUMBER,NAME_CONTRACT_TYPE
0,123724268,Cash
1,123456549,Cash
2,123428178,Credit Card
3,123619984,Cash
4,123671104,Cash


Номер заявки уникальный идентификатор в таблице

In [12]:
test["APPLICATION_NUMBER"].nunique()

165141

In [13]:
test["APPLICATION_NUMBER"].min()

123423342

In [14]:
test["APPLICATION_NUMBER"].max()

123730851

In [15]:
123730851-123423342

307509

In [16]:
test["NAME_CONTRACT_TYPE"].value_counts()

Cash           149432
Credit Card     15709
Name: NAME_CONTRACT_TYPE, dtype: int64

In [17]:
set(test["APPLICATION_NUMBER"]) & set(train["APPLICATION_NUMBER"])

set()

__bki.csv__

In [18]:
bki = pd.read_csv("bki.csv")
print("bki.shape = {} rows, {} cols".format(*bki.shape))

bki.shape = 945234 rows, 17 cols


In [19]:
bki.head()

Unnamed: 0,APPLICATION_NUMBER,BUREAU_ID,CREDIT_ACTIVE,CREDIT_CURRENCY,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,DAYS_ENDDATE_FACT,AMT_CREDIT_MAX_OVERDUE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_LIMIT,AMT_CREDIT_SUM_OVERDUE,CREDIT_TYPE,DAYS_CREDIT_UPDATE,AMT_ANNUITY
0,123538884,5223613,Active,currency 1,718.0,0,377.0,,19386.81,0,675000.0,320265.495,0.0,0.0,Consumer credit,39.0,
1,123436670,6207544,Closed,currency 1,696.0,0,511.0,511.0,0.0,0,93111.66,0.0,0.0,0.0,Consumer credit,505.0,
2,123589020,6326395,Closed,currency 1,165.0,0,149.0,160.0,,0,36000.0,0.0,0.0,0.0,Consumer credit,150.0,0.0
3,123494590,6606618,Active,currency 1,55.0,0,310.0,,,0,38664.0,37858.5,,0.0,Consumer credit,15.0,
4,123446603,5046832,Active,currency 1,358.0,0,35.0,,,0,67500.0,0.0,0.0,0.0,Credit card,116.0,


In [20]:
bki["APPLICATION_NUMBER"].nunique()

273131

In [21]:
bki["APPLICATION_NUMBER"].value_counts()

123444199    63
123493043    51
123641404    50
123603494    39
123604794    36
             ..
123465619     1
123459476     1
123449243     1
123756771     1
123734017     1
Name: APPLICATION_NUMBER, Length: 273131, dtype: int64

In [22]:
bki["CREDIT_ACTIVE"].value_counts()

Closed      594315
Active      347323
Sold          3583
Bad debt        13
Name: CREDIT_ACTIVE, dtype: int64

In [23]:
bki[bki["APPLICATION_NUMBER"]==123604794]

Unnamed: 0,APPLICATION_NUMBER,BUREAU_ID,CREDIT_ACTIVE,CREDIT_CURRENCY,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,DAYS_ENDDATE_FACT,AMT_CREDIT_MAX_OVERDUE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_LIMIT,AMT_CREDIT_SUM_OVERDUE,CREDIT_TYPE,DAYS_CREDIT_UPDATE,AMT_ANNUITY
2441,123604794,5022625,Closed,currency 1,276.0,0,214.0,234.0,,0,36000.0,0.0,,0.0,Consumer credit,234.0,
25276,123604794,5026176,Closed,currency 1,515.0,0,454.0,486.0,,0,40500.0,0.0,,0.0,Consumer credit,486.0,
26902,123604794,5095767,Closed,currency 1,663.0,0,602.0,640.0,,0,18000.0,0.0,,0.0,Consumer credit,640.0,
62594,123604794,5079117,Active,currency 1,220.0,0,26.0,,0.0,0,67765.5,19903.5,0.0,0.0,Consumer credit,10.0,10505.7
120928,123604794,5050396,Closed,currency 1,914.0,0,853.0,899.0,,0,13500.0,0.0,,0.0,Consumer credit,899.0,0.0
149705,123604794,5018643,Closed,currency 1,234.0,0,173.0,185.0,,0,40500.0,0.0,,0.0,Consumer credit,185.0,0.0
150803,123604794,5066395,Closed,currency 1,821.0,0,762.0,797.0,,0,27000.0,0.0,,0.0,Consumer credit,797.0,0.0
187962,123604794,5026552,Closed,currency 1,30.0,0,,1.0,,0,13500.0,0.0,,0.0,Microloan,1.0,0.0
210264,123604794,5032011,Closed,currency 1,1124.0,0,1063.0,1108.0,,0,4500.0,0.0,,0.0,Consumer credit,1108.0,0.0
219874,123604794,5054762,Closed,currency 1,365.0,0,304.0,346.0,,0,40500.0,0.0,,0.0,Consumer credit,346.0,0.0


Отношения между таблицей train и bki один ко многим, аналогично test и bki

In [24]:
len(set(test["APPLICATION_NUMBER"]) & set(bki["APPLICATION_NUMBER"]))

126469

In [25]:
len(set(train["APPLICATION_NUMBER"]) & set(bki["APPLICATION_NUMBER"]))

84508

Не все заявки из трейна и теста имеют кредитную историю в бюро кредитных историй. Проверим гипотезу о том, что вероятность не выплаты кредита больше у тех заявок, по которым нет кредитной истории

In [26]:
app_without_bki = set(train["APPLICATION_NUMBER"]) - set(bki["APPLICATION_NUMBER"])

In [27]:
print("Число заявок, по которым нет кредитной истории: {}".format(len(app_without_bki)))

Число заявок, по которым нет кредитной истории: 25585


In [28]:
train[train["APPLICATION_NUMBER"].isin(app_without_bki)]["TARGET"].value_counts()

0    23592
1     1993
Name: TARGET, dtype: int64

Вывод: стоит сделать признак "наличие кредитной истории", если у заявки нет кредитной истории вероятность дефолта по ней 22.4%

__client_profile.csv__

In [29]:
client_profile = pd.read_csv("client_profile.csv")
print("client_profile.shape = {} rows, {} cols".format(*client_profile.shape))

client_profile.shape = 250000 rows, 24 cols


In [30]:
client_profile.head()

Unnamed: 0,APPLICATION_NUMBER,GENDER,CHILDRENS,TOTAL_SALARY,AMOUNT_CREDIT,AMOUNT_ANNUITY,EDUCATION_LEVEL,FAMILY_STATUS,REGION_POPULATION,AGE,...,FAMILY_SIZE,EXTERNAL_SCORING_RATING_1,EXTERNAL_SCORING_RATING_2,EXTERNAL_SCORING_RATING_3,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,123666076,F,0,157500.0,270000.0,13500.0,Incomplete higher,Civil marriage,0.008068,8560,...,2.0,0.329471,0.236315,0.678568,0.0,0.0,0.0,0.0,1.0,2.0
1,123423688,F,0,270000.0,536917.5,28467.0,Secondary / secondary special,Married,0.020246,23187,...,2.0,,0.442295,0.802745,0.0,0.0,0.0,0.0,1.0,1.0
2,123501780,M,1,427500.0,239850.0,23850.0,Incomplete higher,Married,0.072508,14387,...,3.0,0.409017,0.738159,,,,,,,
3,123588799,M,0,112500.0,254700.0,17149.5,Secondary / secondary special,Married,0.019101,14273,...,2.0,,0.308994,0.590233,0.0,0.0,0.0,0.0,0.0,3.0
4,123647485,M,0,130500.0,614574.0,19822.5,Lower secondary,Married,0.022625,22954,...,2.0,,0.739408,0.15664,0.0,0.0,1.0,0.0,0.0,6.0


In [31]:
df = pd.merge(train, client_profile, on='APPLICATION_NUMBER', how='inner')
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 89539 entries, 0 to 89538
Data columns (total 26 columns):
APPLICATION_NUMBER            89539 non-null int64
TARGET                        89539 non-null int64
NAME_CONTRACT_TYPE            89539 non-null object
GENDER                        89539 non-null object
CHILDRENS                     89539 non-null int64
TOTAL_SALARY                  89539 non-null float64
AMOUNT_CREDIT                 89539 non-null float64
AMOUNT_ANNUITY                89534 non-null float64
EDUCATION_LEVEL               89539 non-null object
FAMILY_STATUS                 89539 non-null object
REGION_POPULATION             89539 non-null float64
AGE                           89539 non-null int64
DAYS_ON_LAST_JOB              89539 non-null int64
OWN_CAR_AGE                   30533 non-null float64
FLAG_PHONE                    89539 non-null int64
FLAG_EMAIL                    89539 non-null int64
FAMILY_SIZE                   89538 non-null float64
EXTERNAL

In [32]:
app_without_cf = set(train["APPLICATION_NUMBER"]) - set(client_profile["APPLICATION_NUMBER"])

In [33]:
len(app_without_cf)

20554

In [34]:
#те заявки, по которым нет не редитной истории ни профиля клиента
len(app_without_cf & app_without_bki)

4769

In [35]:
#те заявки, по которым либо нет кредитной истории либо нет профиля клиента
len(app_without_cf | app_without_bki)

41370

In [36]:
train[train["APPLICATION_NUMBER"].isin(app_without_cf)]["TARGET"].value_counts()

0    18865
1     1689
Name: TARGET, dtype: int64

In [37]:
train[train["APPLICATION_NUMBER"].isin(app_without_cf | app_without_bki)]["TARGET"].value_counts()

0    38049
1     3321
Name: TARGET, dtype: int64

__payments.csv__

In [38]:
pay = pd.read_csv("payments.csv")
print("payments.shape = {} rows, {} cols".format(*pay.shape))

payments.shape = 1023932 rows, 8 cols


In [39]:
pay.head()

Unnamed: 0,PREV_APPLICATION_NUMBER,APPLICATION_NUMBER,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT
0,49011181,123664960,1.0,5,1002.0,1015.0,12156.615,12156.615
1,48683432,123497205,1.0,13,442.0,432.0,18392.535,10047.645
2,48652024,123749925,1.0,10,8.0,23.0,5499.945,5499.945
3,48398897,123550846,0.0,82,398.0,398.0,7082.145,7082.145
4,49867197,123562174,0.0,63,1359.0,1359.0,156.735,156.735


In [40]:
pay.describe()

Unnamed: 0,PREV_APPLICATION_NUMBER,APPLICATION_NUMBER,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT
count,1023932.0,1023932.0,1023932.0,1023932.0,1023932.0,1023729.0,1023932.0,1023729.0
mean,49172530.0,123601800.0,0.8569114,18.88052,1042.505,1051.378,17142.63,17339.31
std,536309.3,102741.2,1.05318,26.69996,801.358,800.9974,51306.5,55573.38
min,48268220.0,123423300.0,0.0,1.0,2.0,2.0,0.0,0.0
25%,48703250.0,123513000.0,0.0,4.0,361.0,370.0,4217.445,3386.205
50%,49166020.0,123602000.0,1.0,8.0,817.0,826.0,8878.5,8115.39
75%,49637970.0,123691000.0,1.0,19.0,1655.0,1664.0,16712.29,16111.4
max,50111710.0,123779600.0,178.0,275.0,2922.0,3054.0,2860377.0,2860377.0


In [41]:
df = pd.merge(train, pay, on='APPLICATION_NUMBER', how='inner')
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 316939 entries, 0 to 316938
Data columns (total 10 columns):
APPLICATION_NUMBER         316939 non-null int64
TARGET                     316939 non-null int64
NAME_CONTRACT_TYPE         316939 non-null object
PREV_APPLICATION_NUMBER    316939 non-null int64
NUM_INSTALMENT_VERSION     316939 non-null float64
NUM_INSTALMENT_NUMBER      316939 non-null int64
DAYS_INSTALMENT            316939 non-null float64
DAYS_ENTRY_PAYMENT         316882 non-null float64
AMT_INSTALMENT             316939 non-null float64
AMT_PAYMENT                316882 non-null float64
dtypes: float64(5), int64(4), object(1)
memory usage: 26.6+ MB


__applications_history.csv__

In [42]:
history = pd.read_csv("applications_history.csv")
print("applications_history.shape = {} rows, {} cols".format(*history.shape))

applications_history.shape = 1670214 rows, 26 cols


In [43]:
history.head()

Unnamed: 0,PREV_APPLICATION_NUMBER,APPLICATION_NUMBER,NAME_CONTRACT_TYPE,AMOUNT_ANNUITY,AMT_APPLICATION,AMOUNT_CREDIT,AMOUNT_PAYMENT,AMOUNT_GOODS_PAYMENT,NAME_CONTRACT_STATUS,DAYS_DECISION,...,NAME_PRODUCT_TYPE,SELLERPLACE_AREA,CNT_PAYMENT,NAME_YIELD_GROUP,DAYS_FIRST_DRAWING,DAYS_FIRST_DUE,DAYS_LAST_DUE_1ST_VERSION,DAYS_LAST_DUE,DAYS_TERMINATION,NFLAG_INSURED_ON_APPROVAL
0,49298709,123595216,,1730.43,17145.0,17145.0,0.0,17145.0,Approved,73,...,XNA,35,12.0,middle,365243.0,42.0,300.0,42.0,37.0,0.0
1,50070639,123431468,Cash,25188.615,607500.0,679671.0,,607500.0,Approved,164,...,x-sell,-1,36.0,low_action,365243.0,134.0,916.0,365243.0,365243.0,1.0
2,49791680,123445379,Cash,15060.735,112500.0,136444.5,,112500.0,Approved,301,...,x-sell,-1,12.0,high,365243.0,271.0,59.0,365243.0,365243.0,1.0
3,50087457,123499497,Cash,47041.335,450000.0,470790.0,,450000.0,Approved,512,...,x-sell,-1,12.0,middle,365243.0,482.0,152.0,182.0,177.0,1.0
4,49052479,123525393,Cash,31924.395,337500.0,404055.0,,337500.0,Refused,781,...,walk-in,-1,24.0,high,,,,,,


__baseline__

In [44]:
data = pd.merge(train, client_profile, on='APPLICATION_NUMBER', how='left')

In [45]:
#data["EDUCATION_FAMILY_STATUS"] = data["EDUCATION_LEVEL"] + " | " + data["FAMILY_STATUS"]
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 110093 entries, 0 to 110092
Data columns (total 26 columns):
APPLICATION_NUMBER            110093 non-null int64
TARGET                        110093 non-null int64
NAME_CONTRACT_TYPE            110093 non-null object
GENDER                        89539 non-null object
CHILDRENS                     89539 non-null float64
TOTAL_SALARY                  89539 non-null float64
AMOUNT_CREDIT                 89539 non-null float64
AMOUNT_ANNUITY                89534 non-null float64
EDUCATION_LEVEL               89539 non-null object
FAMILY_STATUS                 89539 non-null object
REGION_POPULATION             89539 non-null float64
AGE                           89539 non-null float64
DAYS_ON_LAST_JOB              89539 non-null float64
OWN_CAR_AGE                   30533 non-null float64
FLAG_PHONE                    89539 non-null float64
FLAG_EMAIL                    89539 non-null float64
FAMILY_SIZE                   89538 non-null f

In [46]:
categorical = data.select_dtypes(include=["object"]).columns.tolist()

In [47]:
categorical

['NAME_CONTRACT_TYPE', 'GENDER', 'EDUCATION_LEVEL', 'FAMILY_STATUS']

In [48]:
data[categorical] = data[categorical].astype(str)

In [49]:
x_train, x_valid = train_test_split(
    data, train_size=0.7, random_state=27, shuffle=True
)
y_train, y_valid = train_test_split(
    data["TARGET"], train_size=0.7, random_state=27, shuffle=True
)

In [50]:
to_drop = [
    "TARGET",
    "APPLICATION_NUMBER"
]

In [51]:
x_train = x_train.drop(to_drop, axis=1)
x_valid = x_valid.drop(to_drop, axis=1)

In [52]:
x_valid.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 33028 entries, 6863 to 44012
Data columns (total 24 columns):
NAME_CONTRACT_TYPE            33028 non-null object
GENDER                        33028 non-null object
CHILDRENS                     26917 non-null float64
TOTAL_SALARY                  26917 non-null float64
AMOUNT_CREDIT                 26917 non-null float64
AMOUNT_ANNUITY                26915 non-null float64
EDUCATION_LEVEL               33028 non-null object
FAMILY_STATUS                 33028 non-null object
REGION_POPULATION             26917 non-null float64
AGE                           26917 non-null float64
DAYS_ON_LAST_JOB              26917 non-null float64
OWN_CAR_AGE                   9194 non-null float64
FLAG_PHONE                    26917 non-null float64
FLAG_EMAIL                    26917 non-null float64
FAMILY_SIZE                   26917 non-null float64
EXTERNAL_SCORING_RATING_1     11713 non-null float64
EXTERNAL_SCORING_RATING_2     26857 non-null 

In [53]:
print("x_train.shape = {} rows, {} cols".format(*x_train.shape))
print("x_valid.shape = {} rows, {} cols".format(*x_valid.shape))
print(*y_valid.shape)

x_train.shape = 77065 rows, 24 cols
x_valid.shape = 33028 rows, 24 cols
33028


__Добавлю еще один источник данных__

In [64]:
def create_numerical_aggs(data: pd.DataFrame,
                          groupby_id: str,
                          aggs: dict,
                          prefix: Optional[str] = None,
                          suffix: Optional[str] = None,
                          ) -> pd.DataFrame:
    """
    Построение агрегаций для числовых признаков.

    Parameters
    ----------
    data: pandas.core.frame.DataFrame
        Выборка для построения агрегаций.

    groupby_id: str
        Название ключа, по которому нужно произвести группировку.

    aggs: dict
        Словарь с названием признака и списка функций.
        Ключ словаря - название признака, который используется для
        вычисления агрегаций, значение словаря - список с названием
        функций для вычисления агрегаций.

    prefix: str, optional, default = None
        Префикс для названия признаков.
        Опциональный параметр, по умолчанию, не используется.

    suffix: str, optional, default = None
        Суффикс для названия признаков.
        Опциональный параметр, по умолчанию, не используется.

    Returns
    -------
    stats: pandas.core.frame.DataFrame
        Выборка с рассчитанными агрегациями.

    """
    if not prefix:
        prefix = ""
    if not suffix:
        suffix = ""

    data_grouped = data.groupby(groupby_id)
    stats = data_grouped.agg(aggs)
    stats.columns = [f"{prefix}{feature}_{stat}{suffix}".upper() for feature, stat in stats]
    stats = stats.reset_index()

    return stats

In [65]:
history.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1670214 entries, 0 to 1670213
Data columns (total 26 columns):
PREV_APPLICATION_NUMBER      1670214 non-null int64
APPLICATION_NUMBER           1670214 non-null int64
NAME_CONTRACT_TYPE           940717 non-null object
AMOUNT_ANNUITY               1297979 non-null float64
AMT_APPLICATION              1670214 non-null float64
AMOUNT_CREDIT                1670213 non-null float64
AMOUNT_PAYMENT               774370 non-null float64
AMOUNT_GOODS_PAYMENT         1284699 non-null float64
NAME_CONTRACT_STATUS         1670214 non-null object
DAYS_DECISION                1670214 non-null int64
NAME_PAYMENT_TYPE            1670214 non-null object
CODE_REJECT_REASON           1670214 non-null object
NAME_TYPE_SUITE              849809 non-null object
NAME_CLIENT_TYPE             1670214 non-null object
NAME_GOODS_CATEGORY          1670214 non-null object
NAME_PORTFOLIO               1670214 non-null object
NAME_PRODUCT_TYPE            1670214 non

In [66]:
aggs = {
    "AMT_APPLICATION": [np.mean, np.std, np.sum, "max"],
    "AMOUNT_ANNUITY": [np.mean, np.std, np.sum, "max"],
    "AMOUNT_CREDIT": [np.mean, np.std, np.sum, "max"],
    "AMOUNT_PAYMENT": [np.mean, np.std, np.sum, "max"],
    "AMOUNT_GOODS_PAYMENT": [np.mean, np.std, np.sum, "max"],
    "DAYS_FIRST_DRAWING": [np.mean, np.std, np.sum, "max"],
    "DAYS_DECISION": [np.mean, np.std, np.sum, "max"],
    "DAYS_FIRST_DUE": [np.mean, np.std, np.sum, "max"],
    "DAYS_LAST_DUE_1ST_VERSION": [np.mean, np.std, np.sum, "max"],
    "DAYS_TERMINATION": [np.mean, np.std, np.sum, "max"],
    "DAYS_FIRST_DUE": [np.mean, np.std, np.sum, "max"],
    "NFLAG_INSURED_ON_APPROVAL": [np.mean, np.std, np.sum, "max"],
    "CNT_PAYMENT": [np.mean]
}

stats = create_numerical_aggs(
    history, groupby_id="APPLICATION_NUMBER", aggs=aggs, prefix="history_"
)
stats.head(n=2)

Unnamed: 0,APPLICATION_NUMBER,HISTORY_AMT_APPLICATION_MEAN,HISTORY_AMT_APPLICATION_STD,HISTORY_AMT_APPLICATION_SUM,HISTORY_AMT_APPLICATION_MAX,HISTORY_AMOUNT_ANNUITY_MEAN,HISTORY_AMOUNT_ANNUITY_STD,HISTORY_AMOUNT_ANNUITY_SUM,HISTORY_AMOUNT_ANNUITY_MAX,HISTORY_AMOUNT_CREDIT_MEAN,...,HISTORY_DAYS_LAST_DUE_1ST_VERSION_MAX,HISTORY_DAYS_TERMINATION_MEAN,HISTORY_DAYS_TERMINATION_STD,HISTORY_DAYS_TERMINATION_SUM,HISTORY_DAYS_TERMINATION_MAX,HISTORY_NFLAG_INSURED_ON_APPROVAL_MEAN,HISTORY_NFLAG_INSURED_ON_APPROVAL_STD,HISTORY_NFLAG_INSURED_ON_APPROVAL_SUM,HISTORY_NFLAG_INSURED_ON_APPROVAL_MAX,HISTORY_CNT_PAYMENT_MEAN
0,123423340,24835.5,,24835.5,24835.5,3951.0,,3951.0,3951.0,23787.0,...,1499.0,1612.0,,1612.0,1612.0,0.0,,0.0,0.0,8.0
1,123423341,179055.0,,179055.0,179055.0,9251.775,,9251.775,9251.775,179055.0,...,125.0,17.0,,17.0,17.0,0.0,,0.0,0.0,24.0


In [67]:
data = pd.merge(data, stats, on='APPLICATION_NUMBER', how='left')
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 110093 entries, 0 to 110092
Data columns (total 71 columns):
APPLICATION_NUMBER                        110093 non-null int64
TARGET                                    110093 non-null int64
NAME_CONTRACT_TYPE                        110093 non-null object
GENDER                                    110093 non-null object
CHILDRENS                                 89539 non-null float64
TOTAL_SALARY                              89539 non-null float64
AMOUNT_CREDIT                             89539 non-null float64
AMOUNT_ANNUITY                            89534 non-null float64
EDUCATION_LEVEL                           110093 non-null object
FAMILY_STATUS                             110093 non-null object
REGION_POPULATION                         89539 non-null float64
AGE                                       89539 non-null float64
DAYS_ON_LAST_JOB                          89539 non-null float64
OWN_CAR_AGE                               3053

In [68]:
data[categorical] = data[categorical].astype(str)

In [69]:
x_train, x_valid = train_test_split(
    data, train_size=0.7, random_state=27, shuffle=True
)
y_train, y_valid = train_test_split(
    data["TARGET"], train_size=0.7, random_state=27, shuffle=True
)

In [70]:
x_train = x_train.drop(to_drop, axis=1)
x_valid = x_valid.drop(to_drop, axis=1)

In [71]:
eval_set = [(x_train, y_train)]
model = cb.CatBoostClassifier(**cb_params_1000)
model.fit(X=x_train, y=y_train, eval_set=eval_set, cat_features=categorical)

0:	test: 0.5181136	best: 0.5181136 (0)	total: 162ms	remaining: 2m 42s
10:	test: 0.6522426	best: 0.6522426 (10)	total: 1.55s	remaining: 2m 19s
20:	test: 0.6583787	best: 0.6583787 (20)	total: 3s	remaining: 2m 19s
30:	test: 0.6749141	best: 0.6750537 (25)	total: 4.31s	remaining: 2m 14s
40:	test: 0.6787671	best: 0.6787671 (40)	total: 5.63s	remaining: 2m 11s
50:	test: 0.6832798	best: 0.6832798 (50)	total: 6.93s	remaining: 2m 8s
60:	test: 0.6863612	best: 0.6866289 (58)	total: 8.26s	remaining: 2m 7s
70:	test: 0.6885305	best: 0.6885323 (69)	total: 9.57s	remaining: 2m 5s
80:	test: 0.6912159	best: 0.6912159 (80)	total: 10.9s	remaining: 2m 3s
90:	test: 0.6926580	best: 0.6926580 (90)	total: 12.2s	remaining: 2m 1s
100:	test: 0.6954728	best: 0.6954728 (100)	total: 13.5s	remaining: 2m
110:	test: 0.6959925	best: 0.6960144 (109)	total: 14.8s	remaining: 1m 58s
120:	test: 0.6968800	best: 0.6968800 (120)	total: 16.2s	remaining: 1m 57s
130:	test: 0.6982128	best: 0.6982951 (129)	total: 17.5s	remaining: 1m 56

<catboost.core.CatBoostClassifier at 0x1a687e796c8>

In [72]:
y_pred_valid = model.predict_proba(x_valid)[:, 1]

In [73]:
roc_auc_score(y_valid, y_pred_valid)

0.7255344451177922

In [74]:
def calculate_permutation_importance(estimator, 
                                     metric: callable,
                                     x_valid: pd.DataFrame,
                                     y_valid: pd.Series) -> pd.Series:
    """
    Расчет пермутированной важности признаков.
    """
    scores = {}
    y_pred = estimator.predict(x_valid)
    base_score = metric(y_valid, y_pred)

    for feature in x_valid.columns:
        x_valid_copy = x_valid.copy()
        x_valid_copy[feature] = np.random.permutation(x_valid_copy[feature])

        y_pred = estimator.predict(x_valid_copy)
        score = metric(y_valid, y_pred)
        scores[feature] = base_score - score

    scores = pd.Series(scores)
    scores = scores.sort_values(ascending=False)

    return scores

In [75]:
perm_importance = calculate_permutation_importance(
    estimator=model, metric=roc_auc_score, x_valid=x_valid, y_valid=y_valid
)

In [76]:
len(perm_importance)

69

In [77]:
colums_perm_importance = list(perm_importance[perm_importance > 0].index)
colums_perm_importance

['EXTERNAL_SCORING_RATING_3',
 'DAYS_ON_LAST_JOB',
 'AMOUNT_ANNUITY',
 'EDUCATION_LEVEL',
 'NAME_CONTRACT_TYPE',
 'FLAG_EMAIL',
 'EXTERNAL_SCORING_RATING_1',
 'AMOUNT_CREDIT',
 'EXTERNAL_SCORING_RATING_2',
 'FLAG_PHONE',
 'CHILDRENS',
 'FAMILY_STATUS',
 'GENDER',
 'HISTORY_DAYS_TERMINATION_MAX',
 'HISTORY_AMOUNT_GOODS_PAYMENT_SUM',
 'HISTORY_AMOUNT_ANNUITY_SUM',
 'HISTORY_AMT_APPLICATION_MAX']

In [78]:
data = data[colums_perm_importance]

In [79]:
data.head()

Unnamed: 0,EXTERNAL_SCORING_RATING_3,DAYS_ON_LAST_JOB,AMOUNT_ANNUITY,EDUCATION_LEVEL,NAME_CONTRACT_TYPE,FLAG_EMAIL,EXTERNAL_SCORING_RATING_1,AMOUNT_CREDIT,EXTERNAL_SCORING_RATING_2,FLAG_PHONE,CHILDRENS,FAMILY_STATUS,GENDER,HISTORY_DAYS_TERMINATION_MAX,HISTORY_AMOUNT_GOODS_PAYMENT_SUM,HISTORY_AMOUNT_ANNUITY_SUM,HISTORY_AMT_APPLICATION_MAX
0,0.71657,1719.0,25128.0,Secondary / secondary special,Cash,0.0,0.700784,855000.0,0.645914,0.0,1.0,Married,M,365243.0,206361.54,23111.415,72634.14
1,,,,,Cash,,,,,,,,,518.0,1327635.0,111676.005,495000.0
2,0.267869,3618.0,42660.0,Higher education,Cash,0.0,,1006920.0,0.682149,1.0,0.0,Married,F,2440.0,2123145.0,130153.905,1395000.0
3,0.170446,365243.0,22972.5,Secondary / secondary special,Cash,0.0,,518562.0,0.171299,0.0,0.0,Married,M,365243.0,122413.5,8475.39,100858.5
4,,,,,Cash,,,,,,,,,1335.0,2401578.0,116668.98,697500.0


In [80]:
x_train, x_valid = train_test_split(
    data, train_size=0.7, random_state=27, shuffle=True
)
y_train, y_valid = train_test_split(
    train["TARGET"], train_size=0.7, random_state=27, shuffle=True
)

In [81]:
cb_params_1000 = {
    "n_estimators": 1000,
    "learning_rate": 0.01,
    "loss_function": "Logloss",
    "eval_metric": "AUC",
    "task_type": "CPU",
    "max_bin": 20,
    "verbose": 10,
    "max_depth": 6,
    "l2_leaf_reg": 100,
    "early_stopping_rounds": 50,
    "thread_count": 6,
    "random_seed": 27
}

eval_set = [(x_train, y_train)]
model = cb.CatBoostClassifier(**cb_params_1000)
model.fit(X=x_train, y=y_train, eval_set=eval_set, cat_features=categorical)

0:	test: 0.6289087	best: 0.6289087 (0)	total: 143ms	remaining: 2m 23s
10:	test: 0.6881825	best: 0.6881825 (10)	total: 1.41s	remaining: 2m 7s
20:	test: 0.6935209	best: 0.6937566 (15)	total: 2.69s	remaining: 2m 5s
30:	test: 0.6933546	best: 0.6952175 (26)	total: 3.77s	remaining: 1m 58s
40:	test: 0.6949591	best: 0.6952175 (26)	total: 4.99s	remaining: 1m 56s
50:	test: 0.6958059	best: 0.6958059 (50)	total: 6.16s	remaining: 1m 54s
60:	test: 0.6956067	best: 0.6958059 (50)	total: 7.38s	remaining: 1m 53s
70:	test: 0.6958665	best: 0.6959787 (68)	total: 8.53s	remaining: 1m 51s
80:	test: 0.6964354	best: 0.6966691 (78)	total: 9.73s	remaining: 1m 50s
90:	test: 0.6965709	best: 0.6967160 (89)	total: 10.9s	remaining: 1m 49s
100:	test: 0.6972193	best: 0.6976414 (96)	total: 12.1s	remaining: 1m 47s
110:	test: 0.6985831	best: 0.6986060 (109)	total: 13.3s	remaining: 1m 46s
120:	test: 0.6991266	best: 0.6991266 (120)	total: 14.5s	remaining: 1m 45s
130:	test: 0.6995427	best: 0.6995427 (130)	total: 15.8s	remaini

<catboost.core.CatBoostClassifier at 0x1a686ec3a08>

In [83]:
y_pred_valid = model.predict_proba(x_valid)[:, 1]
roc_auc_score(y_valid, y_pred_valid)

0.723181923431726

In [89]:
def fit_catBoost(X, categorical):
    
    x_train, x_valid = train_test_split(X, train_size=0.7, random_state=27, shuffle=True)
    y_train, y_valid = train_test_split(X["TARGET"], train_size=0.7, random_state=27, shuffle=True)
    cb_params_1000 = {
    "n_estimators": 1000,
    "learning_rate": 0.01,
    "loss_function": "Logloss",
    "eval_metric": "AUC",
    "task_type": "CPU",
    "max_bin": 20,
    "verbose": 10,
    "max_depth": 6,
    "l2_leaf_reg": 100,
    "early_stopping_rounds": 50,
    "thread_count": 6,
    "random_seed": 27
    }

    eval_set = [(x_train, y_train)]
    model = cb.CatBoostClassifier(**cb_params_1000)
    model.fit(X=x_train, y=y_train, eval_set=eval_set, cat_features=categorical)
    return model

In [90]:
def prepare_data(X, categorical, to_drop):
    """
    Преобразование данных для передачи в модель.

    Parameters
    ----------
    X: pandas.core.frame.DataFrame
        Матрица признаков для передачи в модель.

    categorical: List[str]
        Список с названием категориальных признаков.

    drop_features: List[str]
        Список с названием признаков, которые не должны
        участвовать в обучении.

    Returns
    -------
    X_transformed: pandas.core.frame.DataFrame
        Матрица признаков, подготовленная для передачи в модель.

    """
    data = pd.merge(X, client_profile, on='APPLICATION_NUMBER', how='left')
    
    aggs = {
    "AMT_APPLICATION": [np.mean, np.std, np.sum, "max"],
    "AMOUNT_ANNUITY": [np.mean, np.std, np.sum, "max"],
    "AMOUNT_CREDIT": [np.mean, np.std, np.sum, "max"],
    "AMOUNT_PAYMENT": [np.mean, np.std, np.sum, "max"],
    "AMOUNT_GOODS_PAYMENT": [np.mean, np.std, np.sum, "max"],
    "DAYS_FIRST_DRAWING": [np.mean, np.std, np.sum, "max"],
    "DAYS_DECISION": [np.mean, np.std, np.sum, "max"],
    "DAYS_FIRST_DUE": [np.mean, np.std, np.sum, "max"],
    "DAYS_LAST_DUE_1ST_VERSION": [np.mean, np.std, np.sum, "max"],
    "DAYS_TERMINATION": [np.mean, np.std, np.sum, "max"],
    "DAYS_FIRST_DUE": [np.mean, np.std, np.sum, "max"],
    "NFLAG_INSURED_ON_APPROVAL": [np.mean, np.std, np.sum, "max"],
    "CNT_PAYMENT": [np.mean]
    }

    stats = create_numerical_aggs(history, groupby_id="APPLICATION_NUMBER", aggs=aggs, prefix="history_")

    data = pd.merge(data, stats, on='APPLICATION_NUMBER', how='left')
    
    to_drop = set(X.columns) & set(to_drop)

    if to_drop:
        data = data.drop(to_drop, axis=1)

    data[categorical] = data[categorical].astype(str)
    
    return data

In [91]:
def features_selection(X, categorical):
    data = X

    
    model = fit_catBoost(data, categorical)
    
    perm_importance = calculate_permutation_importance(estimator=model, metric=roc_auc_score, x_valid=x_valid, y_valid=y_valid)
    colums_perm_importance = list(perm_importance[perm_importance > 0].index)
    
    return colums_perm_importance    

In [94]:
#для теста
data = prepare_data(train, categorical, to_drop)
categorical = data.select_dtypes(include=["object"]).columns.tolist()
data_test = prepare_data(test, categorical, to_drop)
categorical = data.select_dtypes(include=["object"]).columns.tolist()
cols = features_selection(train, categorical)
data = data[cols]
categorical = data.select_dtypes(include=["object"]).columns.tolist()

model = fit_catBoost(data, categorical)
y_pred_test = model.predict_proba(data_test)[:, 1]
sample_submission = pd.DataFrame()
sample_submission["APPLICATION_NUMBER"] = app_nums
sample_submission["TARGET"] = y_pred_test
sample_submission.info()
sample_submission.to_csv('sample_submission.csv', index=False)

ValueError: 'GENDER' is not in list