## Import libraries

In [38]:
import pandas as pd
import numpy as np
from pandas_profiling import ProfileReport
from datetime import datetime
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder
import torch
import random
import os
from category_encoders import TargetEncoder
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
import optuna
from optuna.samplers import TPESampler

## Constants

In [39]:
SEED_VALUE = 666

## Set all seeds

In [40]:
def InitSeeds(seed_value):
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    os.environ["PYTHONHASHSEED"] = str(seed_value)

InitSeeds(SEED_VALUE)

## Import data

In [41]:
data_train = pd.read_csv("data/train_dataset_hackathon_mkb.csv", sep = ';', encoding = 'cp1251')
data_test = pd.read_csv("data/test_dataset_hackathon_mkb.csv", sep = ';', encoding = 'cp1251')
print("Train data shape without Nans:", data_train.dropna(axis = 0).shape)
print("Test data shape without Nans:", data_test.dropna(axis = 0).shape)
data_train.head()

Train data shape without Nans: (0, 124)
Test data shape without Nans: (0, 123)


Unnamed: 0,id_contract,id_client,SIGN_DATE,IP_flag,TARGET,F1100,F1110,F1150,F1160,F1170,...,WINNERNUMBER_95_EVER,SIGNEDNUMBER_95_EVER,SUM_95_EVER,FLAG_DISQUALIFICATION,COUNT_CHANGE_YEAR,COUNT_CHANGE_EVER,BIRTHDATE,AGE,SEX_NAME,CITIZENSHIP_NAME
0,1,1847,01JAN2018:00:00:00,0,0,1298961000.0,2154000.0,1125573000.0,,150010000.0,...,,,,,,,,,,
1,2,4650,01JAN2018:00:00:00,1,0,,,,,,...,,,,,,,,,,
2,3,4770,01JAN2018:00:00:00,0,0,73374000.0,,73374000.0,,,...,169.0,168.0,18351739.0,,,1.0,,,,
3,4,12237,01JAN2018:00:00:00,0,0,1937488000.0,122828000.0,610328000.0,,809426000.0,...,,,,,,,,,,
4,5,9988,01JAN2018:00:00:00,1,0,,,,,,...,,,,,,,,,,


## EDA

In [42]:
#profile = ProfileReport(data_train, title = "Train Data Stats")
#profile.to_file("TrainDataReport.html")

In [43]:
#profile = ProfileReport(data_test, title = "Test Data Stats")
#profile.to_file("TestDataReport.html")

## Smoothed (regularized) target encoding

In [44]:
def smoothed_target_encoding(alpha: float, mean_prob: float, y: pd.Series) -> pd.Series:
    """Реализация регуляризованного таргед энкодинга.

    Принцип такой - чем меньше исходных данных, тем сильнее будет регуляризация
    Параметр регуляризации регуляризует мин. кол-во необходимых данных
    :param y: pd.Series с ценой
    :return: pd.Series с регуляризованной ценой
    """
    nrows = y.notnull().sum()
    return (y.mean() * nrows + alpha * mean_prob) / (nrows + alpha)

## Data processing and feature engineering

In [45]:
def ProcessData(df_train: pd.DataFrame,\
                df_test: pd.DataFrame,\
                seed_value: int) -> tuple([pd.DataFrame, pd.DataFrame, pd.DataFrame, list]):
    
    TARGET_COLUMNS = ['TARGET']
    TMP_COLUMNS = ['id_client', 'id_contract']
    EXCLUDE_COLUMNS = ['BIRTHDATE', 'CITIZENSHIP_NAME', 'COUNT_CHANGE_EVER', 'COUNT_CHANGE_YEAR',\
                       'F1110', 'F1150', 'F1150_LAG1', 'F1160', 'F1170', 'F1180', 'F1190', 'F1210',\
                       'F1220', 'F1230', 'F1230_LAG1', 'F1240', 'F1300', 'F1320', 'F1350', 'F1360',\
                       'F1400', 'F1410', 'F1410_LAG1', 'F1420', 'F1450', 'F1500', 'F1510_LAG1',\
                       'F1520', 'F1520_LAG1', 'F1530', 'F1550', 'F1600', 'F1700', 'F2100', 'F2110_LAG1',\
                       'F2120', 'F2120_LAG1', 'F2200', 'F2200_LAG1', 'F2210', 'F2210_LAG1', 'F2220',\
                       'F2220_LAG1', 'F2310', 'F2320', 'F2320_LAG1', 'F2330', 'F2330_LAG1', 'F2340',\
                       'F2400', 'F2400_LAG1', 'FLAG_DISQUALIFICATION',\
                       'NOTADMITTEDNUMBER_233_EVER', 'NOTADMITTEDNUMBER_233_YEAR', 'OKTMO_CODE', 'OKTMO_FED',\
                       'PLAINTIFF_SUM_EVER', 'PLAINTIFF_SUM_YEAR', 'SIGNEDNUMBER_233_EVER',\
                       'SIGNEDNUMBER_233_YEAR', 'SIGNEDNUMBER_95_EVER', 'SIGNEDNUMBER_95_YEAR',\
                       'SUM_233_YEAR', 'SUM_95_EVER', 'SUM_95_YEAR', 'WINNERNUMBER_233_EVER',\
                       'WINNERNUMBER_233_YEAR', 'WINNERNUMBER_95_EVER', 'WINNERNUMBER_95_YEAR',\
                       'EGRPOINCLUDED', 'COUNTBRANCH']
    DATE_COLUMNS = ['SIGN_DATE', 'TAXREG_REGDATE', 'DATEFIRSTREG', 'TAXREGPAY_REGDATE']
    BINARY_COLUMNS = ['IP_flag']
    CAT_COLUNMS = ['OKATO_REGIONCODE', 'OKATO_FED']
    NUM_COLUMNS = []
    
    df_train_fixed = df_train.drop(columns = EXCLUDE_COLUMNS)
    df_test_fixed = df_test.drop(columns = EXCLUDE_COLUMNS)
    
    # Parse DATE columns into datetime format
    for col in DATE_COLUMNS:
        df_train_fixed[col] = [datetime.strptime(x[:9], '%d%b%Y') if type(x) != float else x for x in list(df_train_fixed[col])]
        df_test_fixed[col] = [datetime.strptime(x[:9], '%d%b%Y') if type(x) != float else x for x in list(df_test_fixed[col])]
        
    # Calculate some date diffs
    df_train_fixed['DATE_DELTA_1'] = [
        abs((t1 - t2).days) for t1, t2 in zip(list(df_train_fixed['SIGN_DATE']),\
                                              list(df_train_fixed['TAXREG_REGDATE']))
    ]
    df_test_fixed['DATE_DELTA_1'] = [
        abs((t1 - t2).days) for t1, t2 in zip(list(df_test_fixed['SIGN_DATE']),\
                                              list(df_test_fixed['TAXREG_REGDATE']))
    ]
    df_train_fixed['DATE_DELTA_2'] = [
        abs((t1 - t2).days) for t1, t2 in zip(list(df_train_fixed['SIGN_DATE']),\
                                              list(df_train_fixed['DATEFIRSTREG']))
    ]
    df_test_fixed['DATE_DELTA_2'] = [
        abs((t1 - t2).days) for t1, t2 in zip(list(df_test_fixed['SIGN_DATE']),\
                                              list(df_test_fixed['DATEFIRSTREG']))
    ]
    df_train_fixed['DATE_DELTA_3'] = [
        abs((t1 - t2).days) for t1, t2 in zip(list(df_train_fixed['DATEFIRSTREG']),\
                                              list(df_train_fixed['TAXREGPAY_REGDATE']))
    ]
    df_test_fixed['DATE_DELTA_3'] = [
        abs((t1 - t2).days) for t1, t2 in zip(list(df_test_fixed['DATEFIRSTREG']),\
                                              list(df_test_fixed['TAXREGPAY_REGDATE']))
    ]
    df_train_fixed = df_train_fixed.drop(columns = DATE_COLUMNS)
    df_test_fixed = df_test_fixed.drop(columns = DATE_COLUMNS)
    
    # Count contracts for each client
    df_train_fixed['contract_count'] = 1
    df_train_fixed_contr_count = df_train_fixed.groupby(by = ['id_client']).sum().reset_index()[['id_client',\
                                                                                                 'contract_count']]
    df_train_fixed = df_train_fixed.drop(columns = ['contract_count'])
    df_train_fixed = df_train_fixed.merge(df_train_fixed_contr_count, on = ['id_client'])
    
    df_test_fixed['contract_count'] = 1
    df_test_fixed_contr_count = df_test_fixed.groupby(by = ['id_client']).sum().reset_index()[['id_client',\
                                                                                                 'contract_count']]
    df_test_fixed = df_test_fixed.drop(columns = ['contract_count'])
    df_test_fixed = df_test_fixed.merge(df_test_fixed_contr_count, on = ['id_client'])
    
    # Make some categorical columns binary
    df_train_fixed['OKVED_CODE'] = [x.split('.')[0] if type(x) != float else x for x in list(df_train_fixed['OKVED_CODE'])]
    df_test_fixed['OKVED_CODE'] = [x.split('.')[0] if type(x) != float else x for x in list(df_test_fixed['OKVED_CODE'])]
    
    tmp_list = []
    for x in df_train_fixed['OKFS_GROUP']:
        if x == x:
            if x == 'Частная собственность':
                tmp_list.append('1')
            else:
                tmp_list.append('0')
        else:
            tmp_list.append(x)
    df_train_fixed['OKFS_GROUP'] = tmp_list
    tmp_list = []
    for x in df_test_fixed['OKFS_GROUP']:
        if x == x:
            if x == 'Частная собственность':
                tmp_list.append('1')
            else:
                tmp_list.append('0')
        else:
            tmp_list.append(x)
    df_test_fixed['OKFS_GROUP'] = tmp_list
    
    tmp_list = []
    for x in df_train_fixed['OKOGU_GROUP']:
        if x == x:
            if x == 'Группировки хозяйствующих субъектов и общественных объединений':
                tmp_list.append('1')
            else:
                tmp_list.append('0')
        else:
            tmp_list.append(x)
    df_train_fixed['OKOGU_GROUP'] = tmp_list
    tmp_list = []
    for x in df_test_fixed['OKOGU_GROUP']:
        if x == x:
            if x == 'Группировки хозяйствующих субъектов и общественных объединений':
                tmp_list.append('1')
            else:
                tmp_list.append('0')
        else:
            tmp_list.append(x)
    df_test_fixed['OKOGU_GROUP'] = tmp_list
    
    tmp_list = []
    for x in df_train_fixed['OKOPF_GROUP']:
        if x == x:
            if x.split()[0] == 'Коммерческая':
                tmp_list.append('1')
            else:
                tmp_list.append('0')
        else:
            tmp_list.append(x)
    df_train_fixed['OKOPF_GROUP'] = tmp_list
    tmp_list = []
    for x in df_test_fixed['OKOPF_GROUP']:
        if x == x:
            if x.split()[0] == 'Коммерческая':
                tmp_list.append('1')
            else:
                tmp_list.append('0')
        else:
            tmp_list.append(x)
    df_test_fixed['OKOPF_GROUP'] = tmp_list
    
    # Sort columns into categorical, binary and numerical
    for col in list(df_test_fixed.columns):
        #print(df_test_fixed[col].dtype)
        if (df_test_fixed[col].dtype == 'object') and (col not in TMP_COLUMNS+BINARY_COLUMNS+CAT_COLUNMS):
            categories = set(list(df_test_fixed[col]))
            categories = [x for x in categories if x==x]
            if (len(categories) > 2):
                CAT_COLUNMS.append(col)
            else:
                BINARY_COLUMNS.append(col)
        elif col not in CAT_COLUNMS+BINARY_COLUMNS+TMP_COLUMNS:
            NUM_COLUMNS.append(col)
    
    # Label encode categorical and binary columns
    for col in CAT_COLUNMS+BINARY_COLUMNS:        
        orig_vals = [x for x in set(list(df_train_fixed[col])) if x == x]
        encoded_vals = list(range(len(orig_vals)))
        replace_dict = {key: val for key, val in zip(orig_vals, encoded_vals)}
        df_train_fixed[col] = df_train_fixed[col].replace(replace_dict)
        
        orig_vals_test = [x for x in set(list(df_test_fixed[col])) if x == x]
        unseen_vals = [x for x in orig_vals_test if x not in orig_vals]
        unseen_dict = {val: len(orig_vals) for val in unseen_vals}
        full_dict = replace_dict.copy()
        full_dict.update(unseen_dict)
        df_test_fixed[col] = df_test_fixed[col].replace(full_dict)
    
    # Impute missing values
    imputer = KNNImputer(n_neighbors=3, weights="distance")
    X_train = imputer.fit_transform(df_train_fixed[CAT_COLUNMS+BINARY_COLUMNS+NUM_COLUMNS])
    df_train_fixed = pd.concat([
        pd.DataFrame(data = X_train, columns = CAT_COLUNMS+BINARY_COLUMNS+NUM_COLUMNS),
        df_train_fixed['TARGET']
    ], axis = 1)
    for col in CAT_COLUNMS+BINARY_COLUMNS:
        df_train_fixed[col] = [int(round(x, 0)) for x in list(df_train_fixed[col])]
    
    X_test = imputer.fit_transform(df_test_fixed[CAT_COLUNMS+BINARY_COLUMNS+NUM_COLUMNS])
    df_test_fixed = pd.concat([
        pd.DataFrame(data = X_test, columns = CAT_COLUNMS+BINARY_COLUMNS+NUM_COLUMNS),
        df_test_fixed[TMP_COLUMNS]
    ], axis = 1)
    for col in CAT_COLUNMS+BINARY_COLUMNS:
        df_test_fixed[col] = [int(round(x, 0)) for x in list(df_test_fixed[col])]
        
    # Smoothed (regularized) target encoding for high-cardinality features
    #train_mean_prob = df_test_fixed['TARGET'].mean()
    #mean_prob_by_cat = {}
    #encoded_preffix = 'STE_'
    #for col in CAT_COLUNMS:
    #   self.mean_price_by_cat[col] = (
    #            df_train_fixed.groupby(col)['TARGET'].apply(lambda x: smoothed_target_encoding(x)).fillna(mean_price)
    #        )
    
    
    
    
    # Normalize numerical features
    for col in NUM_COLUMNS:
        df_train_fixed[col] = (df_train_fixed[col] - df_train_fixed[col].mean())/df_train_fixed[col].std()
        df_test_fixed[col] = (df_test_fixed[col] - df_test_fixed[col].mean())/df_test_fixed[col].std()
        
    # Split training data into train a validation datasets
    df_train_fixed, df_val_fixed = train_test_split(df_train_fixed, test_size = 0.1, random_state = seed_value,\
                                                    stratify = df_train_fixed['TARGET'])
    
    print("Categorical columns:", CAT_COLUNMS)
    print("Binary columns:", BINARY_COLUMNS)
    print("Numerical columns:", NUM_COLUMNS)
    
    return df_train_fixed, df_val_fixed, df_test_fixed, [CAT_COLUNMS, BINARY_COLUMNS, NUM_COLUMNS]

In [46]:
train_df, val_df, test_df, columns_df = ProcessData(data_train, data_test, SEED_VALUE)
train_df.head()

Categorical columns: ['OKATO_REGIONCODE', 'OKATO_FED', 'WORKERSRANGE', 'OKVED_CODE']
Binary columns: ['IP_flag', 'OKFS_GROUP', 'OKOPF_GROUP', 'OKOGU_GROUP', 'SEX_NAME']
Numerical columns: ['F1100', 'F1200', 'F1250', 'F1260', 'F1310', 'F1370', 'F1510', 'F2110', 'F2300', 'F2350', 'F2410', 'F2300_LAG1', 'COUNTCOOWNERFCSM', 'COUNTCOOWNERROSSTAT', 'COUNTCOOWNEREGRUL', 'COUNTBRANCHROSSTAT', 'COUNTBRANCHEGRUL', 'TELEPHONECOUNT', 'MANAGERCOUNTINCOUNTRY', 'MANAGERCOUNTINREGION', 'MANAGERINNCOUNT', 'PLAINTIFF_CASESNUMBER_YEAR', 'DEFENDANT_CASESNUMBER_YEAR', 'DEFENDANT_SUM_YEAR', 'THIRDOROTHERPERSON_YEAR', 'PLAINTIFF_CASESNUMBER_EVER', 'DEFENDANT_CASESNUMBER_EVER', 'DEFENDANT_SUM_EVER', 'THIRDOROTHERPERSON_EVER', 'ADMITTEDNUMBER_233_YEAR', 'ADMITTEDNUMBER_233_EVER', 'SUM_233_EVER', 'ADMITTEDNUMBER_95_YEAR', 'NOTADMITTEDNUMBER_95_YEAR', 'ADMITTEDNUMBER_95_EVER', 'NOTADMITTEDNUMBER_EVER', 'AGE', 'DATE_DELTA_1', 'DATE_DELTA_2', 'DATE_DELTA_3', 'contract_count']


Unnamed: 0,OKATO_REGIONCODE,OKATO_FED,WORKERSRANGE,OKVED_CODE,IP_flag,OKFS_GROUP,OKOPF_GROUP,OKOGU_GROUP,SEX_NAME,F1100,...,ADMITTEDNUMBER_95_YEAR,NOTADMITTEDNUMBER_95_YEAR,ADMITTEDNUMBER_95_EVER,NOTADMITTEDNUMBER_EVER,AGE,DATE_DELTA_1,DATE_DELTA_2,DATE_DELTA_3,contract_count,TARGET
7799,66,7,11,43,0,0,0,0,1,-0.328329,...,-0.210514,-0.175744,-0.239445,-0.198036,-1.22802,-0.39152,-0.484171,0.567152,-0.473024,0
11495,40,2,1,81,0,0,0,0,0,-0.328381,...,-0.263365,-0.253247,-0.292652,-0.233535,0.732025,-1.148349,-0.969747,0.819641,1.557409,1
5931,2,3,1,13,0,0,0,0,0,-0.328381,...,-0.268421,-0.2926,-0.293954,-0.243812,0.730573,0.437738,0.047875,0.315287,-0.473024,0
9311,20,0,11,81,0,0,0,0,0,-0.328362,...,-0.245638,-0.2926,-0.266822,-0.213294,0.730825,-1.061749,0.754688,0.026728,-0.430723,0
14105,0,3,5,3,1,0,0,0,1,-0.320255,...,-0.223804,-0.2926,-0.224209,-0.238725,-0.937575,0.851667,1.898811,-0.549765,-0.473024,0


## Models training and evaluation

In [47]:
# CatBosot

model_catboost = CatBoostClassifier(
    iterations = 75,
    learning_rate = 1e-2,
    cat_features = columns_df[0],
    loss_function = 'Logloss'
)

def objective(trial):
    param = {
        "loss_function": trial.suggest_categorical("loss_function", ["Logloss"]),
        "l2_leaf_reg": trial.suggest_loguniform("l2_leaf_reg", 1e-2, 1e0),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.2),
        #"iterations": trial.suggest_int("depth", 50, 75),
        "depth": trial.suggest_int("depth", 5, 12),
        "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        "bootstrap_type": trial.suggest_categorical("bootstrap_type", ["Bayesian", "Bernoulli", "MVS"])
        #"min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 2, 20), 
    }
    # Conditional Hyper-Parameters
    if param["bootstrap_type"] == "Bayesian":
        param["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    elif param["bootstrap_type"] == "Bernoulli":
        param["subsample"] = trial.suggest_float("subsample", 0.1, 1)

    reg = CatBoostClassifier(**param, cat_features=columns_df[0])
    reg.fit(train_df[columns_df[0]+columns_df[1]+columns_df[2]], train_df['TARGET'],\
            eval_set=[(val_df[columns_df[0]+columns_df[1]+columns_df[2]], val_df['TARGET'])],\
            verbose=0, early_stopping_rounds=100)
    y_pred = reg.predict_proba(val_df[columns_df[0]+columns_df[1]+columns_df[2]])
    score = roc_auc_score(val_df['TARGET'], y_pred[:, 1])
    return score

study = optuna.create_study(sampler=TPESampler(), direction="maximize")
study.optimize(objective, n_trials=15, timeout=600) # Run for 10 minutes
print("Number of completed trials: {}".format(len(study.trials)))
print("Best trial:")
trial = study.best_trial

print("\tBest Score: {}".format(trial.value))
print("\tBest Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))


'''
params_catboost = {'depth': [10]}
GS_catboost = GridSearchCV(
    model_catboost,
    params_catboost,
    cv = 5,
    n_jobs = -1, 
    scoring = 'roc_auc',
    verbose = 2
)
GS_catboost.fit(train_df[columns_df[0]+columns_df[1]+columns_df[2]], train_df['TARGET'])
print("Best params:\n", GS_catboost.best_params_)
print(f"Best ROC AUC score: {GS_catboost.best_score_}")
'''

[32m[I 2021-11-19 13:15:57,324][0m A new study created in memory with name: no-name-94975bbb-f1a9-4242-b823-8070668cc8b5[0m
[32m[I 2021-11-19 13:16:17,806][0m Trial 0 finished with value: 0.9546871398976939 and parameters: {'loss_function': 'Logloss', 'l2_leaf_reg': 0.08677609425099944, 'colsample_bylevel': 0.09100740868237198, 'depth': 6, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bernoulli', 'subsample': 0.8264191123372424}. Best is trial 0 with value: 0.9546871398976939.[0m
[32m[I 2021-11-19 13:16:32,998][0m Trial 1 finished with value: 0.9377060831297984 and parameters: {'loss_function': 'Logloss', 'l2_leaf_reg': 0.3230829192867728, 'colsample_bylevel': 0.054125280981806766, 'depth': 5, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 6.81254245684007}. Best is trial 0 with value: 0.9546871398976939.[0m
[32m[I 2021-11-19 13:18:31,883][0m Trial 2 finished with value: 0.9557532036632422 and parameters: {'loss_function': 'Logloss', 'l2_le

Number of completed trials: 12
Best trial:
	Best Score: 0.9567793544134138
	Best Params: 
    loss_function: Logloss
    l2_leaf_reg: 0.12066279477313135
    colsample_bylevel: 0.11865572990538993
    depth: 12
    boosting_type: Ordered
    bootstrap_type: Bernoulli
    subsample: 0.8407888862212045


'\nparams_catboost = {\'depth\': [10]}\nGS_catboost = GridSearchCV(\n    model_catboost,\n    params_catboost,\n    cv = 5,\n    n_jobs = -1, \n    scoring = \'roc_auc\',\n    verbose = 2\n)\nGS_catboost.fit(train_df[columns_df[0]+columns_df[1]+columns_df[2]], train_df[\'TARGET\'])\nprint("Best params:\n", GS_catboost.best_params_)\nprint(f"Best ROC AUC score: {GS_catboost.best_score_}")\n'

In [48]:
print(trial.params)
clf = CatBoostClassifier(**trial.params, iterations = 1500,cat_features=columns_df[0])
clf.fit(
    pd.concat([
        train_df[columns_df[0]+columns_df[1]+columns_df[2]],
        val_df[columns_df[0]+columns_df[1]+columns_df[2]]
    ], axis = 0),
    pd.concat([
        train_df['TARGET'],\
        val_df['TARGET']
    ], axis = 0)
)

{'loss_function': 'Logloss', 'l2_leaf_reg': 0.12066279477313135, 'colsample_bylevel': 0.11865572990538993, 'depth': 12, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bernoulli', 'subsample': 0.8407888862212045}
0:	learn: 0.6653812	total: 54ms	remaining: 1m 20s
1:	learn: 0.6288762	total: 538ms	remaining: 6m 42s
2:	learn: 0.6004052	total: 971ms	remaining: 8m 4s
3:	learn: 0.5670362	total: 1.48s	remaining: 9m 12s
4:	learn: 0.5441190	total: 1.85s	remaining: 9m 12s
5:	learn: 0.5194893	total: 2.31s	remaining: 9m 34s
6:	learn: 0.4968973	total: 2.73s	remaining: 9m 41s
7:	learn: 0.4814355	total: 3.22s	remaining: 10m
8:	learn: 0.4642809	total: 3.58s	remaining: 9m 53s
9:	learn: 0.4484225	total: 4.03s	remaining: 9m 59s
10:	learn: 0.4357497	total: 4.47s	remaining: 10m 5s
11:	learn: 0.4233311	total: 4.87s	remaining: 10m 4s
12:	learn: 0.4194300	total: 4.89s	remaining: 9m 19s
13:	learn: 0.4066634	total: 5.43s	remaining: 9m 36s
14:	learn: 0.3936435	total: 5.84s	remaining: 9m 38s
15:	learn: 0.3823002	to

159:	learn: 0.1878840	total: 48s	remaining: 6m 42s
160:	learn: 0.1878809	total: 48s	remaining: 6m 39s
161:	learn: 0.1875742	total: 48.5s	remaining: 6m 40s
162:	learn: 0.1872226	total: 48.9s	remaining: 6m 40s
163:	learn: 0.1871457	total: 48.9s	remaining: 6m 38s
164:	learn: 0.1868031	total: 49.4s	remaining: 6m 39s
165:	learn: 0.1867904	total: 49.4s	remaining: 6m 36s
166:	learn: 0.1862105	total: 49.8s	remaining: 6m 37s
167:	learn: 0.1860784	total: 50.2s	remaining: 6m 37s
168:	learn: 0.1856040	total: 50.5s	remaining: 6m 37s
169:	learn: 0.1855840	total: 50.5s	remaining: 6m 35s
170:	learn: 0.1853767	total: 50.6s	remaining: 6m 33s
171:	learn: 0.1853393	total: 50.6s	remaining: 6m 30s
172:	learn: 0.1852229	total: 50.6s	remaining: 6m 28s
173:	learn: 0.1845851	total: 51.1s	remaining: 6m 29s
174:	learn: 0.1845142	total: 51.1s	remaining: 6m 26s
175:	learn: 0.1843553	total: 51.2s	remaining: 6m 25s
176:	learn: 0.1838136	total: 51.5s	remaining: 6m 25s
177:	learn: 0.1838094	total: 51.5s	remaining: 6m 2

314:	learn: 0.1621792	total: 1m 17s	remaining: 4m 53s
315:	learn: 0.1621340	total: 1m 17s	remaining: 4m 52s
316:	learn: 0.1621307	total: 1m 18s	remaining: 4m 51s
317:	learn: 0.1621240	total: 1m 18s	remaining: 4m 49s
318:	learn: 0.1620697	total: 1m 18s	remaining: 4m 50s
319:	learn: 0.1618370	total: 1m 18s	remaining: 4m 49s
320:	learn: 0.1617341	total: 1m 18s	remaining: 4m 49s
321:	learn: 0.1615399	total: 1m 19s	remaining: 4m 49s
322:	learn: 0.1614405	total: 1m 19s	remaining: 4m 48s
323:	learn: 0.1614291	total: 1m 19s	remaining: 4m 47s
324:	learn: 0.1611718	total: 1m 19s	remaining: 4m 47s
325:	learn: 0.1611680	total: 1m 19s	remaining: 4m 46s
326:	learn: 0.1610023	total: 1m 20s	remaining: 4m 47s
327:	learn: 0.1607925	total: 1m 20s	remaining: 4m 47s
328:	learn: 0.1605531	total: 1m 20s	remaining: 4m 48s
329:	learn: 0.1605350	total: 1m 20s	remaining: 4m 47s
330:	learn: 0.1603417	total: 1m 21s	remaining: 4m 47s
331:	learn: 0.1602686	total: 1m 21s	remaining: 4m 46s
332:	learn: 0.1600541	total:

467:	learn: 0.1441586	total: 1m 59s	remaining: 4m 23s
468:	learn: 0.1441513	total: 1m 59s	remaining: 4m 22s
469:	learn: 0.1440700	total: 1m 59s	remaining: 4m 22s
470:	learn: 0.1439284	total: 2m	remaining: 4m 22s
471:	learn: 0.1436286	total: 2m	remaining: 4m 22s
472:	learn: 0.1436137	total: 2m 1s	remaining: 4m 22s
473:	learn: 0.1435195	total: 2m 1s	remaining: 4m 22s
474:	learn: 0.1435169	total: 2m 1s	remaining: 4m 22s
475:	learn: 0.1434619	total: 2m 2s	remaining: 4m 22s
476:	learn: 0.1434202	total: 2m 2s	remaining: 4m 22s
477:	learn: 0.1432619	total: 2m 2s	remaining: 4m 22s
478:	learn: 0.1430023	total: 2m 3s	remaining: 4m 22s
479:	learn: 0.1430016	total: 2m 3s	remaining: 4m 22s
480:	learn: 0.1429310	total: 2m 3s	remaining: 4m 22s
481:	learn: 0.1426981	total: 2m 4s	remaining: 4m 22s
482:	learn: 0.1426714	total: 2m 4s	remaining: 4m 21s
483:	learn: 0.1425061	total: 2m 4s	remaining: 4m 21s
484:	learn: 0.1424405	total: 2m 5s	remaining: 4m 21s
485:	learn: 0.1422662	total: 2m 5s	remaining: 4m 

621:	learn: 0.1284416	total: 2m 51s	remaining: 4m 2s
622:	learn: 0.1282790	total: 2m 52s	remaining: 4m 2s
623:	learn: 0.1282779	total: 2m 52s	remaining: 4m 1s
624:	learn: 0.1282615	total: 2m 52s	remaining: 4m 1s
625:	learn: 0.1282597	total: 2m 52s	remaining: 4m 1s
626:	learn: 0.1281295	total: 2m 53s	remaining: 4m
627:	learn: 0.1279785	total: 2m 53s	remaining: 4m
628:	learn: 0.1279136	total: 2m 53s	remaining: 4m
629:	learn: 0.1278807	total: 2m 54s	remaining: 4m
630:	learn: 0.1277922	total: 2m 54s	remaining: 4m
631:	learn: 0.1274453	total: 2m 54s	remaining: 4m
632:	learn: 0.1274403	total: 2m 55s	remaining: 4m
633:	learn: 0.1274282	total: 2m 55s	remaining: 3m 59s
634:	learn: 0.1274238	total: 2m 55s	remaining: 3m 59s
635:	learn: 0.1273730	total: 2m 55s	remaining: 3m 59s
636:	learn: 0.1273670	total: 2m 56s	remaining: 3m 58s
637:	learn: 0.1272072	total: 2m 56s	remaining: 3m 58s
638:	learn: 0.1272070	total: 2m 56s	remaining: 3m 58s
639:	learn: 0.1271016	total: 2m 57s	remaining: 3m 58s
640:	le

775:	learn: 0.1182367	total: 3m 42s	remaining: 3m 27s
776:	learn: 0.1182311	total: 3m 42s	remaining: 3m 26s
777:	learn: 0.1182305	total: 3m 42s	remaining: 3m 26s
778:	learn: 0.1182284	total: 3m 42s	remaining: 3m 26s
779:	learn: 0.1181708	total: 3m 43s	remaining: 3m 25s
780:	learn: 0.1180623	total: 3m 43s	remaining: 3m 25s
781:	learn: 0.1179853	total: 3m 44s	remaining: 3m 25s
782:	learn: 0.1179823	total: 3m 44s	remaining: 3m 25s
783:	learn: 0.1178086	total: 3m 44s	remaining: 3m 25s
784:	learn: 0.1176400	total: 3m 45s	remaining: 3m 25s
785:	learn: 0.1176345	total: 3m 45s	remaining: 3m 24s
786:	learn: 0.1173199	total: 3m 45s	remaining: 3m 24s
787:	learn: 0.1173085	total: 3m 46s	remaining: 3m 24s
788:	learn: 0.1173083	total: 3m 46s	remaining: 3m 23s
789:	learn: 0.1172298	total: 3m 46s	remaining: 3m 23s
790:	learn: 0.1171750	total: 3m 46s	remaining: 3m 23s
791:	learn: 0.1170054	total: 3m 47s	remaining: 3m 23s
792:	learn: 0.1170044	total: 3m 47s	remaining: 3m 22s
793:	learn: 0.1166670	total:

929:	learn: 0.1081198	total: 4m 35s	remaining: 2m 48s
930:	learn: 0.1081066	total: 4m 36s	remaining: 2m 48s
931:	learn: 0.1081043	total: 4m 36s	remaining: 2m 48s
932:	learn: 0.1079836	total: 4m 36s	remaining: 2m 48s
933:	learn: 0.1079825	total: 4m 36s	remaining: 2m 47s
934:	learn: 0.1078357	total: 4m 37s	remaining: 2m 47s
935:	learn: 0.1078341	total: 4m 37s	remaining: 2m 47s
936:	learn: 0.1078095	total: 4m 37s	remaining: 2m 46s
937:	learn: 0.1078007	total: 4m 38s	remaining: 2m 46s
938:	learn: 0.1077390	total: 4m 38s	remaining: 2m 46s
939:	learn: 0.1077365	total: 4m 38s	remaining: 2m 46s
940:	learn: 0.1077358	total: 4m 39s	remaining: 2m 45s
941:	learn: 0.1077258	total: 4m 39s	remaining: 2m 45s
942:	learn: 0.1077253	total: 4m 40s	remaining: 2m 45s
943:	learn: 0.1076460	total: 4m 40s	remaining: 2m 45s
944:	learn: 0.1076167	total: 4m 40s	remaining: 2m 44s
945:	learn: 0.1075682	total: 4m 41s	remaining: 2m 44s
946:	learn: 0.1075556	total: 4m 41s	remaining: 2m 44s
947:	learn: 0.1074107	total:

1081:	learn: 0.1001986	total: 5m 26s	remaining: 2m 6s
1082:	learn: 0.1001793	total: 5m 27s	remaining: 2m 6s
1083:	learn: 0.1001793	total: 5m 27s	remaining: 2m 5s
1084:	learn: 0.1001421	total: 5m 27s	remaining: 2m 5s
1085:	learn: 0.1001408	total: 5m 28s	remaining: 2m 5s
1086:	learn: 0.0999884	total: 5m 28s	remaining: 2m 4s
1087:	learn: 0.0999835	total: 5m 28s	remaining: 2m 4s
1088:	learn: 0.0998742	total: 5m 29s	remaining: 2m 4s
1089:	learn: 0.0995844	total: 5m 29s	remaining: 2m 3s
1090:	learn: 0.0994794	total: 5m 29s	remaining: 2m 3s
1091:	learn: 0.0993624	total: 5m 30s	remaining: 2m 3s
1092:	learn: 0.0993573	total: 5m 30s	remaining: 2m 3s
1093:	learn: 0.0992949	total: 5m 31s	remaining: 2m 2s
1094:	learn: 0.0992785	total: 5m 31s	remaining: 2m 2s
1095:	learn: 0.0992761	total: 5m 31s	remaining: 2m 2s
1096:	learn: 0.0991884	total: 5m 32s	remaining: 2m 2s
1097:	learn: 0.0991864	total: 5m 32s	remaining: 2m 1s
1098:	learn: 0.0991843	total: 5m 32s	remaining: 2m 1s
1099:	learn: 0.0991735	total

1232:	learn: 0.0931761	total: 6m 17s	remaining: 1m 21s
1233:	learn: 0.0931086	total: 6m 17s	remaining: 1m 21s
1234:	learn: 0.0929956	total: 6m 18s	remaining: 1m 21s
1235:	learn: 0.0929837	total: 6m 18s	remaining: 1m 20s
1236:	learn: 0.0929784	total: 6m 19s	remaining: 1m 20s
1237:	learn: 0.0929759	total: 6m 19s	remaining: 1m 20s
1238:	learn: 0.0929742	total: 6m 19s	remaining: 1m 19s
1239:	learn: 0.0929157	total: 6m 20s	remaining: 1m 19s
1240:	learn: 0.0929155	total: 6m 20s	remaining: 1m 19s
1241:	learn: 0.0929140	total: 6m 20s	remaining: 1m 19s
1242:	learn: 0.0927588	total: 6m 21s	remaining: 1m 18s
1243:	learn: 0.0927580	total: 6m 21s	remaining: 1m 18s
1244:	learn: 0.0927573	total: 6m 21s	remaining: 1m 18s
1245:	learn: 0.0926539	total: 6m 21s	remaining: 1m 17s
1246:	learn: 0.0924260	total: 6m 22s	remaining: 1m 17s
1247:	learn: 0.0923867	total: 6m 22s	remaining: 1m 17s
1248:	learn: 0.0923815	total: 6m 22s	remaining: 1m 16s
1249:	learn: 0.0923795	total: 6m 23s	remaining: 1m 16s
1250:	lear

1385:	learn: 0.0863243	total: 7m 10s	remaining: 35.4s
1386:	learn: 0.0862513	total: 7m 11s	remaining: 35.1s
1387:	learn: 0.0862156	total: 7m 11s	remaining: 34.8s
1388:	learn: 0.0862085	total: 7m 11s	remaining: 34.5s
1389:	learn: 0.0861177	total: 7m 12s	remaining: 34.2s
1390:	learn: 0.0860599	total: 7m 12s	remaining: 33.9s
1391:	learn: 0.0860558	total: 7m 13s	remaining: 33.6s
1392:	learn: 0.0860540	total: 7m 13s	remaining: 33.3s
1393:	learn: 0.0860536	total: 7m 13s	remaining: 32.9s
1394:	learn: 0.0860532	total: 7m 13s	remaining: 32.6s
1395:	learn: 0.0860505	total: 7m 13s	remaining: 32.3s
1396:	learn: 0.0860457	total: 7m 13s	remaining: 32s
1397:	learn: 0.0860443	total: 7m 14s	remaining: 31.7s
1398:	learn: 0.0860415	total: 7m 14s	remaining: 31.4s
1399:	learn: 0.0859712	total: 7m 15s	remaining: 31.1s
1400:	learn: 0.0859640	total: 7m 15s	remaining: 30.8s
1401:	learn: 0.0859352	total: 7m 15s	remaining: 30.5s
1402:	learn: 0.0858906	total: 7m 16s	remaining: 30.2s
1403:	learn: 0.0858813	total: 

<catboost.core.CatBoostClassifier at 0x7fd3b093b6a0>

In [49]:
preds = list(clf.predict_proba(test_df[columns_df[0]+columns_df[1]+columns_df[2]])[:, 1])
ids = list(test_df['id_contract'])
submit_data = {'id_contract': ids, 'TARGET': preds}
submit_df = pd.DataFrame.from_dict(submit_data, )
submit_df.to_csv('submit_file.csv', sep=';', index=False)

In [50]:
test_df.shape

(7330, 52)

In [51]:
feat_imp = pd.DataFrame.from_dict(
    {
        'feature': columns_df[0]+columns_df[1]+columns_df[2],
        'importance': list(clf.feature_importances_)
    }
)
feat_imp

Unnamed: 0,feature,importance
0,OKATO_REGIONCODE,3.887928
1,OKATO_FED,2.083656
2,WORKERSRANGE,3.509903
3,OKVED_CODE,8.393652
4,IP_flag,0.308181
5,OKFS_GROUP,0.768512
6,OKOPF_GROUP,0.501657
7,OKOGU_GROUP,0.376004
8,SEX_NAME,0.406509
9,F1100,1.207475
