## Import libraries

In [5]:
import pandas as pd
import numpy as np
from pandas_profiling import ProfileReport
from datetime import datetime
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder
import torch
import random
import os
from category_encoders import TargetEncoder
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
import optuna
from optuna.samplers import TPESampler
import lightgbm as lgbm
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

import warnings
warnings.filterwarnings("ignore")

## Constants

In [2]:
SEED_VALUE = 666

## Set all seeds

In [3]:
def InitSeeds(seed_value):
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    os.environ["PYTHONHASHSEED"] = str(seed_value)

InitSeeds(SEED_VALUE)

## Import data

In [14]:
data_train = pd.read_csv("data/train_dataset_hackathon_mkb.csv", sep = ';', encoding = 'cp1251')
data_test = pd.read_csv("data/test_dataset_hackathon_mkb.csv", sep = ';', encoding = 'cp1251')
print("Train data shape without Nans:", data_train.dropna(axis = 0).shape)
print("Test data shape without Nans:", data_test.dropna(axis = 0).shape)
data_train.head()

Train data shape without Nans: (0, 124)
Test data shape without Nans: (0, 123)


Unnamed: 0,id_contract,id_client,SIGN_DATE,IP_flag,TARGET,F1100,F1110,F1150,F1160,F1170,...,WINNERNUMBER_95_EVER,SIGNEDNUMBER_95_EVER,SUM_95_EVER,FLAG_DISQUALIFICATION,COUNT_CHANGE_YEAR,COUNT_CHANGE_EVER,BIRTHDATE,AGE,SEX_NAME,CITIZENSHIP_NAME
0,1,1847,01JAN2018:00:00:00,0,0,1298961000.0,2154000.0,1125573000.0,,150010000.0,...,,,,,,,,,,
1,2,4650,01JAN2018:00:00:00,1,0,,,,,,...,,,,,,,,,,
2,3,4770,01JAN2018:00:00:00,0,0,73374000.0,,73374000.0,,,...,169.0,168.0,18351739.0,,,1.0,,,,
3,4,12237,01JAN2018:00:00:00,0,0,1937488000.0,122828000.0,610328000.0,,809426000.0,...,,,,,,,,,,
4,5,9988,01JAN2018:00:00:00,1,0,,,,,,...,,,,,,,,,,


## EDA

In [5]:
#profile = ProfileReport(data_train, title = "Train Data Stats")
#profile.to_file("TrainDataReport.html")

In [6]:
#profile = ProfileReport(data_test, title = "Test Data Stats")
#profile.to_file("TestDataReport.html")

## Smoothed (regularized) target encoding

In [7]:
def smoothed_target_encoding(alpha: float, mean_prob: float, y: pd.Series) -> pd.Series:
    """Реализация регуляризованного таргед энкодинга.

    Принцип такой - чем меньше исходных данных, тем сильнее будет регуляризация
    Параметр регуляризации регуляризует мин. кол-во необходимых данных
    :param y: pd.Series с ценой
    :return: pd.Series с регуляризованной ценой
    """
    nrows = y.notnull().sum()
    return (y.mean() * nrows + alpha * mean_prob) / (nrows + alpha)

## Data processing and feature engineering

In [8]:
def ProcessData(df_train: pd.DataFrame,\
                df_test: pd.DataFrame,\
                seed_value: int) -> tuple([pd.DataFrame, pd.DataFrame, pd.DataFrame, list]):
    
    TARGET_COLUMNS = ['TARGET']
    TMP_COLUMNS = ['id_client', 'id_contract']
    EXCLUDE_COLUMNS = ['BIRTHDATE', 'CITIZENSHIP_NAME', 'COUNT_CHANGE_EVER', 'COUNT_CHANGE_YEAR',\
                       'F1110', 'F1150', 'F1150_LAG1', 'F1160', 'F1170', 'F1180', 'F1190', 'F1210',\
                       'F1220', 'F1230', 'F1230_LAG1', 'F1240', 'F1300', 'F1320', 'F1350', 'F1360',\
                       'F1400', 'F1410', 'F1410_LAG1', 'F1420', 'F1450', 'F1500', 'F1510_LAG1',\
                       'F1520', 'F1520_LAG1', 'F1530', 'F1550', 'F1600', 'F1700', 'F2100', 'F2110_LAG1',\
                       'F2120', 'F2120_LAG1', 'F2200', 'F2200_LAG1', 'F2210', 'F2210_LAG1', 'F2220',\
                       'F2220_LAG1', 'F2310', 'F2320', 'F2320_LAG1', 'F2330', 'F2330_LAG1', 'F2340',\
                       'F2400', 'F2400_LAG1', 'FLAG_DISQUALIFICATION',\
                       'NOTADMITTEDNUMBER_233_EVER', 'NOTADMITTEDNUMBER_233_YEAR', 'OKTMO_CODE', 'OKTMO_FED',\
                       'PLAINTIFF_SUM_EVER', 'PLAINTIFF_SUM_YEAR', 'SIGNEDNUMBER_233_EVER',\
                       'SIGNEDNUMBER_233_YEAR', 'SIGNEDNUMBER_95_EVER', 'SIGNEDNUMBER_95_YEAR',\
                       'SUM_233_YEAR', 'SUM_95_EVER', 'SUM_95_YEAR', 'WINNERNUMBER_233_EVER',\
                       'WINNERNUMBER_233_YEAR', 'WINNERNUMBER_95_EVER', 'WINNERNUMBER_95_YEAR',\
                       'EGRPOINCLUDED', 'COUNTBRANCH']
    
    DATE_COLUMNS = ['SIGN_DATE', 'DATEFIRSTREG', 'TAXREG_REGDATE', 'TAXREGPAY_REGDATE', 'BIRTHDATE']
    BINARY_COLUMNS = ['IP_flag']
    CAT_COLUNMS = ['OKATO_REGIONCODE', 'OKATO_FED', 'OKFS_GROUP', 'OKOPF_GROUP',\
                   'OKOGU_GROUP', 'WORKERSRANGE', 'OKVED_CODE']
    NUM_COLUMNS = []
    
    # Parse DATE columns into datetime format
    for col in DATE_COLUMNS+['SIGN_DATE']:
        df_train_fixed[col] = [datetime.strptime(x[:9], '%d%b%Y') if type(x) != float else x for x in list(df_train_fixed[col])]
        df_test_fixed[col] = [datetime.strptime(x[:9], '%d%b%Y') if type(x) != float else x for x in list(df_test_fixed[col])]
        
    # Calculate some date diffs
    df_train_fixed['DATE_DELTA_1'] = [
        abs((t1 - t2).days) for t1, t2 in zip(list(df_train_fixed['SIGN_DATE']),\
                                              list(df_train_fixed['TAXREG_REGDATE']))
    ]
    df_test_fixed['DATE_DELTA_1'] = [
        abs((t1 - t2).days) for t1, t2 in zip(list(df_test_fixed['SIGN_DATE']),\
                                              list(df_test_fixed['TAXREG_REGDATE']))
    ]
    df_train_fixed['DATE_DELTA_2'] = [
        abs((t1 - t2).days) for t1, t2 in zip(list(df_train_fixed['SIGN_DATE']),\
                                              list(df_train_fixed['DATEFIRSTREG']))
    ]
    df_test_fixed['DATE_DELTA_2'] = [
        abs((t1 - t2).days) for t1, t2 in zip(list(df_test_fixed['SIGN_DATE']),\
                                              list(df_test_fixed['DATEFIRSTREG']))
    ]
    df_train_fixed['DATE_DELTA_3'] = [
        abs((t1 - t2).days) for t1, t2 in zip(list(df_train_fixed['DATEFIRSTREG']),\
                                              list(df_train_fixed['TAXREGPAY_REGDATE']))
    ]
    df_test_fixed['DATE_DELTA_3'] = [
        abs((t1 - t2).days) for t1, t2 in zip(list(df_test_fixed['DATEFIRSTREG']),\
                                              list(df_test_fixed['TAXREGPAY_REGDATE']))
    ]
    df_train_fixed = df_train_fixed.drop(columns = DATE_COLUMNS+['SIGN_DATE'])
    df_test_fixed = df_test_fixed.drop(columns = DATE_COLUMNS+['SIGN_DATE'])
    
    # Count contracts for each client
    df_train_fixed['contract_count'] = 1
    df_train_fixed_contr_count = df_train_fixed.groupby(by = ['id_client']).sum().reset_index()[['id_client',\
                                                                                                 'contract_count']]
    df_train_fixed = df_train_fixed.drop(columns = ['contract_count'])
    df_train_fixed = df_train_fixed.merge(df_train_fixed_contr_count, on = ['id_client'])
    
    df_test_fixed['contract_count'] = 1
    df_test_fixed_contr_count = df_test_fixed.groupby(by = ['id_client']).sum().reset_index()[['id_client',\
                                                                                                 'contract_count']]
    df_test_fixed = df_test_fixed.drop(columns = ['contract_count'])
    df_test_fixed = df_test_fixed.merge(df_test_fixed_contr_count, on = ['id_client'])
    
    # Count client ids
    #tmp = data[name].value_counts()
    #tmp = tmp + 0.1 * np.random.randn(len(tmp))
    #data[name] = data[name].map(tmp)
    
    # Make some categorical columns binary
    df_train_fixed['OKVED_CODE'] = [x.split('.')[0] if type(x) != float else x for x in list(df_train_fixed['OKVED_CODE'])]
    df_test_fixed['OKVED_CODE'] = [x.split('.')[0] if type(x) != float else x for x in list(df_test_fixed['OKVED_CODE'])]
    
    tmp_list = []
    for x in df_train_fixed['OKFS_GROUP']:
        if x == x:
            if x == 'Частная собственность':
                tmp_list.append('1')
            else:
                tmp_list.append('0')
        else:
            tmp_list.append(x)
    df_train_fixed['OKFS_GROUP'] = tmp_list
    tmp_list = []
    for x in df_test_fixed['OKFS_GROUP']:
        if x == x:
            if x == 'Частная собственность':
                tmp_list.append('1')
            else:
                tmp_list.append('0')
        else:
            tmp_list.append(x)
    df_test_fixed['OKFS_GROUP'] = tmp_list
    
    tmp_list = []
    for x in df_train_fixed['OKOGU_GROUP']:
        if x == x:
            if x == 'Группировки хозяйствующих субъектов и общественных объединений':
                tmp_list.append('1')
            else:
                tmp_list.append('0')
        else:
            tmp_list.append(x)
    df_train_fixed['OKOGU_GROUP'] = tmp_list
    tmp_list = []
    for x in df_test_fixed['OKOGU_GROUP']:
        if x == x:
            if x == 'Группировки хозяйствующих субъектов и общественных объединений':
                tmp_list.append('1')
            else:
                tmp_list.append('0')
        else:
            tmp_list.append(x)
    df_test_fixed['OKOGU_GROUP'] = tmp_list
    
    tmp_list = []
    for x in df_train_fixed['OKOPF_GROUP']:
        if x == x:
            if x.split()[0] == 'Коммерческая':
                tmp_list.append('1')
            else:
                tmp_list.append('0')
        else:
            tmp_list.append(x)
    df_train_fixed['OKOPF_GROUP'] = tmp_list
    tmp_list = []
    for x in df_test_fixed['OKOPF_GROUP']:
        if x == x:
            if x.split()[0] == 'Коммерческая':
                tmp_list.append('1')
            else:
                tmp_list.append('0')
        else:
            tmp_list.append(x)
    df_test_fixed['OKOPF_GROUP'] = tmp_list
    
     tmp_list = []
    for x in df_train_fixed['CITIZENSHIP_NAME']:
        if x == x:
            if x.split()[0] == 'Российская Федерация':
                tmp_list.append('1')
            else:
                tmp_list.append('0')
        else:
            tmp_list.append(x)
    df_train_fixed['CITIZENSHIP_NAME'] = tmp_list
    tmp_list = []
    for x in df_test_fixed['CITIZENSHIP_NAME']:
        if x == x:
            if x.split()[0] == 'Российская Федерация':
                tmp_list.append('1')
            else:
                tmp_list.append('0')
        else:
            tmp_list.append(x)
    df_test_fixed['CITIZENSHIP_NAME'] = tmp_list
    
    # Sort columns into categorical, binary and numerical
    for col in list(df_test_fixed.columns):
        #print(df_test_fixed[col].dtype)
        if (df_test_fixed[col].dtype == 'object') and (col not in TMP_COLUMNS+BINARY_COLUMNS+CAT_COLUNMS):
            categories = set(list(df_test_fixed[col]))
            categories = [x for x in categories if x==x]
            if (len(categories) > 2):
                CAT_COLUNMS.append(col)
            else:
                BINARY_COLUMNS.append(col)
        elif col not in CAT_COLUNMS+BINARY_COLUMNS+TMP_COLUMNS:
            NUM_COLUMNS.append(col)
    
    # Label encode categorical and binary columns
    for col in CAT_COLUNMS+BINARY_COLUMNS:        
        orig_vals = [x for x in set(list(df_train_fixed[col])) if x == x]
        encoded_vals = list(range(len(orig_vals)))
        replace_dict = {key: val for key, val in zip(orig_vals, encoded_vals)}
        df_train_fixed[col] = df_train_fixed[col].replace(replace_dict)
        
        orig_vals_test = [x for x in set(list(df_test_fixed[col])) if x == x]
        unseen_vals = [x for x in orig_vals_test if x not in orig_vals]
        unseen_dict = {val: len(orig_vals) for val in unseen_vals}
        full_dict = replace_dict.copy()
        full_dict.update(unseen_dict)
        df_test_fixed[col] = df_test_fixed[col].replace(full_dict)
    
    
    # Impute missing values
    imputer = KNNImputer(n_neighbors=3, weights="distance")
    X_train = imputer.fit_transform(df_train_fixed[CAT_COLUNMS+BINARY_COLUMNS+NUM_COLUMNS])
    df_train_fixed = pd.concat([
        pd.DataFrame(data = X_train, columns = CAT_COLUNMS+BINARY_COLUMNS+NUM_COLUMNS),
        df_train_fixed['TARGET'],
        df_train_fixed[['id_contract', 'id_client']]
    ], axis = 1)
    for col in CAT_COLUNMS+BINARY_COLUMNS:
        df_train_fixed[col] = [int(round(x, 0)) for x in list(df_train_fixed[col])]
    
    X_test = imputer.fit_transform(df_test_fixed[CAT_COLUNMS+BINARY_COLUMNS+NUM_COLUMNS])
    df_test_fixed = pd.concat([
        pd.DataFrame(data = X_test, columns = CAT_COLUNMS+BINARY_COLUMNS+NUM_COLUMNS),
        df_test_fixed[TMP_COLUMNS]
    ], axis = 1)
    for col in CAT_COLUNMS+BINARY_COLUMNS:
        df_test_fixed[col] = [int(round(x, 0)) for x in list(df_test_fixed[col])]
    
        
    # Smoothed (regularized) target encoding for high-cardinality features
    #train_mean_prob = df_test_fixed['TARGET'].mean()
    #mean_prob_by_cat = {}
    #encoded_preffix = 'STE_'
    #for col in CAT_COLUNMS:
    #   self.mean_price_by_cat[col] = (
    #            df_train_fixed.groupby(col)['TARGET'].apply(lambda x: smoothed_target_encoding(x)).fillna(mean_price)
    #        )
    
    # Target encode high-cardinality features
    TE_cols = ['OKATO_REGIONCODE', 'OKATO_FED', 'OKVED_CODE']
    enc = TargetEncoder(cols = TE_cols, handle_unknown = 'value', handle_missing = 'value')
    df_train_target = df_train_fixed[['TARGET']]
    df_train_fixed = enc.fit_transform(df_train_fixed.drop(columns = ['TARGET']), df_train_fixed['TARGET'])
    df_train_fixed['TARGET'] = df_train_target['TARGET']
    df_test_fixed = enc.transform(df_test_fixed)
    
    # Normalize numerical features
    for col in NUM_COLUMNS:
        df_train_fixed[col] = (df_train_fixed[col] - df_train_fixed[col].mean())/df_train_fixed[col].std()
        df_test_fixed[col] = (df_test_fixed[col] - df_test_fixed[col].mean())/df_test_fixed[col].std()
        
    # Split training data into train a validation datasets
    df_train_fixed, df_val_fixed = train_test_split(df_train_fixed, test_size = 0.1, random_state = seed_value,\
                                                    stratify = df_train_fixed['TARGET'])
    
    NUM_COLUMNS += TE_cols
    CAT_COLUNMS = [x for x in CAT_COLUNMS if x not in TE_cols]
    print("Categorical columns:", CAT_COLUNMS)
    print("Binary columns:", BINARY_COLUMNS)
    print("Numerical columns:", NUM_COLUMNS)
    
    return df_train_fixed, df_val_fixed, df_test_fixed, [CAT_COLUNMS, BINARY_COLUMNS, NUM_COLUMNS]

In [9]:
train_df, val_df, test_df, columns_df = ProcessData(data_train, data_test, SEED_VALUE)
train_df.head()

Categorical columns: ['WORKERSRANGE']
Binary columns: ['IP_flag', 'OKFS_GROUP', 'OKOPF_GROUP', 'OKOGU_GROUP', 'SEX_NAME']
Numerical columns: ['F1100', 'F1200', 'F1250', 'F1260', 'F1310', 'F1370', 'F1510', 'F2110', 'F2300', 'F2350', 'F2410', 'F2300_LAG1', 'COUNTCOOWNERFCSM', 'COUNTCOOWNERROSSTAT', 'COUNTCOOWNEREGRUL', 'COUNTBRANCHROSSTAT', 'COUNTBRANCHEGRUL', 'TELEPHONECOUNT', 'MANAGERCOUNTINCOUNTRY', 'MANAGERCOUNTINREGION', 'MANAGERINNCOUNT', 'PLAINTIFF_CASESNUMBER_YEAR', 'DEFENDANT_CASESNUMBER_YEAR', 'DEFENDANT_SUM_YEAR', 'THIRDOROTHERPERSON_YEAR', 'PLAINTIFF_CASESNUMBER_EVER', 'DEFENDANT_CASESNUMBER_EVER', 'DEFENDANT_SUM_EVER', 'THIRDOROTHERPERSON_EVER', 'ADMITTEDNUMBER_233_YEAR', 'ADMITTEDNUMBER_233_EVER', 'SUM_233_EVER', 'ADMITTEDNUMBER_95_YEAR', 'NOTADMITTEDNUMBER_95_YEAR', 'ADMITTEDNUMBER_95_EVER', 'NOTADMITTEDNUMBER_EVER', 'AGE', 'DATE_DELTA_1', 'DATE_DELTA_2', 'DATE_DELTA_3', 'contract_count', 'OKATO_REGIONCODE', 'OKATO_FED', 'OKVED_CODE']


Unnamed: 0,OKATO_REGIONCODE,OKATO_FED,WORKERSRANGE,OKVED_CODE,IP_flag,OKFS_GROUP,OKOPF_GROUP,OKOGU_GROUP,SEX_NAME,F1100,...,ADMITTEDNUMBER_95_EVER,NOTADMITTEDNUMBER_EVER,AGE,DATE_DELTA_1,DATE_DELTA_2,DATE_DELTA_3,contract_count,id_contract,id_client,TARGET
7799,0.166667,0.315087,1,0.342857,0,0,0,0,0,-0.328399,...,-0.250776,-0.205965,-1.222646,-0.39152,-0.484171,0.567149,-0.473024,3426,635,0
11495,0.66582,0.471978,4,0.536959,0,0,0,0,1,-0.328451,...,-0.308651,-0.243031,0.732518,-1.148349,-0.969747,0.819638,1.557409,10193,8677,1
5931,0.109091,0.293062,4,0.268358,0,0,0,0,1,-0.328451,...,-0.309537,-0.253756,0.731061,0.437738,0.047875,0.315285,-0.473024,2355,3171,0
9311,0.567829,0.523735,1,0.536959,0,0,0,0,1,-0.328432,...,-0.280289,-0.221895,0.731324,-1.061749,0.754688,0.026726,-0.430723,11611,12681,0
14105,0.238723,0.293062,11,0.009302,1,0,0,0,0,-0.320325,...,-0.234351,-0.248446,-0.932925,0.851667,1.898811,-0.549767,-0.473024,11355,9449,0


## Models training and evaluation

In [10]:
# CatBosot

model_catboost = CatBoostClassifier(
    iterations = 75,
    learning_rate = 1e-2,
    cat_features = columns_df[0],
    loss_function = 'Logloss'
)

def objective(trial):
    param = {
        "loss_function": trial.suggest_categorical("loss_function", ["Logloss"]),
        "l2_leaf_reg": trial.suggest_loguniform("l2_leaf_reg", 1e-2, 1e0),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.2),
        #"iterations": trial.suggest_int("depth", 50, 75),
        "depth": trial.suggest_int("depth", 5, 12),
        "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        "bootstrap_type": trial.suggest_categorical("bootstrap_type", ["Bayesian", "Bernoulli", "MVS"])
        #"min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 2, 20), 
    }
    # Conditional Hyper-Parameters
    if param["bootstrap_type"] == "Bayesian":
        param["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    elif param["bootstrap_type"] == "Bernoulli":
        param["subsample"] = trial.suggest_float("subsample", 0.1, 1)

    reg = CatBoostClassifier(**param, cat_features=columns_df[0])
    reg.fit(train_df[columns_df[0]+columns_df[1]+columns_df[2]], train_df['TARGET'],\
            eval_set=[(val_df[columns_df[0]+columns_df[1]+columns_df[2]], val_df['TARGET'])],\
            verbose=0, early_stopping_rounds=100)
    y_pred = reg.predict_proba(val_df[columns_df[0]+columns_df[1]+columns_df[2]])
    score = roc_auc_score(val_df['TARGET'], y_pred[:, 1])
    return score

study = optuna.create_study(sampler=TPESampler(), direction="maximize")
study.optimize(objective, n_trials=15, timeout=600) # Run for 10 minutes
print("Number of completed trials: {}".format(len(study.trials)))
print("Best trial:")
trial = study.best_trial

print("\tBest Score: {}".format(trial.value))
print("\tBest Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))


'''
params_catboost = {'depth': [10]}
GS_catboost = GridSearchCV(
    model_catboost,
    params_catboost,
    cv = 5,
    n_jobs = -1, 
    scoring = 'roc_auc',
    verbose = 2
)
GS_catboost.fit(train_df[columns_df[0]+columns_df[1]+columns_df[2]], train_df['TARGET'])
print("Best params:\n", GS_catboost.best_params_)
print(f"Best ROC AUC score: {GS_catboost.best_score_}")
'''

[32m[I 2021-11-24 01:40:43,475][0m A new study created in memory with name: no-name-e4afdf35-3411-4d12-b14b-73db12908027[0m
[32m[I 2021-11-24 01:40:50,122][0m Trial 0 finished with value: 0.9530404061342441 and parameters: {'loss_function': 'Logloss', 'l2_leaf_reg': 0.1568448821338875, 'colsample_bylevel': 0.15106154748334172, 'depth': 5, 'boosting_type': 'Plain', 'bootstrap_type': 'MVS'}. Best is trial 0 with value: 0.9530404061342441.[0m
[32m[I 2021-11-24 01:40:53,857][0m Trial 1 finished with value: 0.9322483001561757 and parameters: {'loss_function': 'Logloss', 'l2_leaf_reg': 0.030810198330573874, 'colsample_bylevel': 0.016140820940956594, 'depth': 7, 'boosting_type': 'Plain', 'bootstrap_type': 'MVS'}. Best is trial 0 with value: 0.9530404061342441.[0m
[32m[I 2021-11-24 01:41:13,137][0m Trial 2 finished with value: 0.9499967168326061 and parameters: {'loss_function': 'Logloss', 'l2_leaf_reg': 0.594481794347021, 'colsample_bylevel': 0.033328644843646185, 'depth': 11, 'boo

Number of completed trials: 15
Best trial:
	Best Score: 0.9554081492077268
	Best Params: 
    loss_function: Logloss
    l2_leaf_reg: 0.433408933172998
    colsample_bylevel: 0.1863999913282285
    depth: 6
    boosting_type: Plain
    bootstrap_type: MVS


'\nparams_catboost = {\'depth\': [10]}\nGS_catboost = GridSearchCV(\n    model_catboost,\n    params_catboost,\n    cv = 5,\n    n_jobs = -1, \n    scoring = \'roc_auc\',\n    verbose = 2\n)\nGS_catboost.fit(train_df[columns_df[0]+columns_df[1]+columns_df[2]], train_df[\'TARGET\'])\nprint("Best params:\n", GS_catboost.best_params_)\nprint(f"Best ROC AUC score: {GS_catboost.best_score_}")\n'

In [24]:
print(trial.params)
clf = CatBoostClassifier(**trial.params, iterations = 1000,cat_features=columns_df[0])
clf.fit(
    pd.concat([
        train_df[columns_df[0]+columns_df[1]+columns_df[2]],
        val_df[columns_df[0]+columns_df[1]+columns_df[2]]
    ], axis = 0),
    pd.concat([
        train_df['TARGET'],\
        val_df['TARGET']
    ], axis = 0)
)

{'loss_function': 'Logloss', 'l2_leaf_reg': 0.433408933172998, 'colsample_bylevel': 0.1863999913282285, 'depth': 6, 'boosting_type': 'Plain', 'bootstrap_type': 'MVS'}
0:	learn: 0.6752733	total: 12.7ms	remaining: 12.7s
1:	learn: 0.6472846	total: 22.9ms	remaining: 11.4s
2:	learn: 0.6328422	total: 41.2ms	remaining: 13.7s
3:	learn: 0.6126193	total: 60.2ms	remaining: 15s
4:	learn: 0.5941514	total: 80.3ms	remaining: 16s
5:	learn: 0.5758940	total: 100ms	remaining: 16.6s
6:	learn: 0.5692096	total: 130ms	remaining: 18.5s
7:	learn: 0.5591056	total: 151ms	remaining: 18.7s
8:	learn: 0.5423258	total: 184ms	remaining: 20.2s
9:	learn: 0.5344730	total: 204ms	remaining: 20.2s
10:	learn: 0.5276653	total: 247ms	remaining: 22.2s
11:	learn: 0.5149143	total: 293ms	remaining: 24.1s
12:	learn: 0.5035810	total: 314ms	remaining: 23.8s
13:	learn: 0.4923054	total: 333ms	remaining: 23.5s
14:	learn: 0.4854776	total: 354ms	remaining: 23.2s
15:	learn: 0.4800325	total: 373ms	remaining: 23s
16:	learn: 0.4718046	total: 

163:	learn: 0.2515066	total: 3.51s	remaining: 17.9s
164:	learn: 0.2512137	total: 3.54s	remaining: 17.9s
165:	learn: 0.2508615	total: 3.56s	remaining: 17.9s
166:	learn: 0.2506600	total: 3.58s	remaining: 17.8s
167:	learn: 0.2503532	total: 3.59s	remaining: 17.8s
168:	learn: 0.2501343	total: 3.61s	remaining: 17.7s
169:	learn: 0.2500056	total: 3.63s	remaining: 17.7s
170:	learn: 0.2498314	total: 3.64s	remaining: 17.6s
171:	learn: 0.2495355	total: 3.66s	remaining: 17.6s
172:	learn: 0.2493361	total: 3.67s	remaining: 17.5s
173:	learn: 0.2491737	total: 3.69s	remaining: 17.5s
174:	learn: 0.2489840	total: 3.71s	remaining: 17.5s
175:	learn: 0.2488549	total: 3.74s	remaining: 17.5s
176:	learn: 0.2484958	total: 3.76s	remaining: 17.5s
177:	learn: 0.2481912	total: 3.78s	remaining: 17.5s
178:	learn: 0.2478231	total: 3.79s	remaining: 17.4s
179:	learn: 0.2475346	total: 3.81s	remaining: 17.4s
180:	learn: 0.2473220	total: 3.83s	remaining: 17.3s
181:	learn: 0.2469795	total: 3.84s	remaining: 17.3s
182:	learn: 

322:	learn: 0.2204071	total: 6.53s	remaining: 13.7s
323:	learn: 0.2202502	total: 6.56s	remaining: 13.7s
324:	learn: 0.2201185	total: 6.61s	remaining: 13.7s
325:	learn: 0.2198875	total: 6.67s	remaining: 13.8s
326:	learn: 0.2197409	total: 6.71s	remaining: 13.8s
327:	learn: 0.2196086	total: 6.78s	remaining: 13.9s
328:	learn: 0.2194127	total: 6.84s	remaining: 13.9s
329:	learn: 0.2192048	total: 6.9s	remaining: 14s
330:	learn: 0.2189756	total: 6.92s	remaining: 14s
331:	learn: 0.2187101	total: 6.95s	remaining: 14s
332:	learn: 0.2186069	total: 6.96s	remaining: 13.9s
333:	learn: 0.2184321	total: 6.98s	remaining: 13.9s
334:	learn: 0.2183044	total: 7s	remaining: 13.9s
335:	learn: 0.2181642	total: 7.04s	remaining: 13.9s
336:	learn: 0.2179858	total: 7.06s	remaining: 13.9s
337:	learn: 0.2178279	total: 7.08s	remaining: 13.9s
338:	learn: 0.2176949	total: 7.11s	remaining: 13.9s
339:	learn: 0.2174277	total: 7.13s	remaining: 13.8s
340:	learn: 0.2172807	total: 7.15s	remaining: 13.8s
341:	learn: 0.2171550	

487:	learn: 0.1961065	total: 10.1s	remaining: 10.6s
488:	learn: 0.1959123	total: 10.1s	remaining: 10.6s
489:	learn: 0.1957883	total: 10.2s	remaining: 10.6s
490:	learn: 0.1956216	total: 10.2s	remaining: 10.6s
491:	learn: 0.1955404	total: 10.2s	remaining: 10.5s
492:	learn: 0.1953922	total: 10.2s	remaining: 10.5s
493:	learn: 0.1953264	total: 10.2s	remaining: 10.5s
494:	learn: 0.1952369	total: 10.3s	remaining: 10.5s
495:	learn: 0.1951186	total: 10.3s	remaining: 10.4s
496:	learn: 0.1949832	total: 10.3s	remaining: 10.4s
497:	learn: 0.1948167	total: 10.3s	remaining: 10.4s
498:	learn: 0.1946481	total: 10.3s	remaining: 10.4s
499:	learn: 0.1945946	total: 10.3s	remaining: 10.3s
500:	learn: 0.1944448	total: 10.4s	remaining: 10.3s
501:	learn: 0.1942851	total: 10.4s	remaining: 10.3s
502:	learn: 0.1942087	total: 10.4s	remaining: 10.3s
503:	learn: 0.1940727	total: 10.4s	remaining: 10.2s
504:	learn: 0.1940018	total: 10.4s	remaining: 10.2s
505:	learn: 0.1938222	total: 10.4s	remaining: 10.2s
506:	learn: 

651:	learn: 0.1767529	total: 13.6s	remaining: 7.23s
652:	learn: 0.1766561	total: 13.6s	remaining: 7.21s
653:	learn: 0.1765908	total: 13.6s	remaining: 7.2s
654:	learn: 0.1764938	total: 13.6s	remaining: 7.18s
655:	learn: 0.1763982	total: 13.6s	remaining: 7.16s
656:	learn: 0.1762676	total: 13.7s	remaining: 7.14s
657:	learn: 0.1761839	total: 13.7s	remaining: 7.12s
658:	learn: 0.1760705	total: 13.7s	remaining: 7.1s
659:	learn: 0.1760000	total: 13.7s	remaining: 7.07s
660:	learn: 0.1759053	total: 13.8s	remaining: 7.05s
661:	learn: 0.1757963	total: 13.8s	remaining: 7.03s
662:	learn: 0.1757223	total: 13.8s	remaining: 7.02s
663:	learn: 0.1756314	total: 13.8s	remaining: 6.99s
664:	learn: 0.1755602	total: 13.8s	remaining: 6.97s
665:	learn: 0.1754743	total: 13.9s	remaining: 6.95s
666:	learn: 0.1753458	total: 13.9s	remaining: 6.93s
667:	learn: 0.1751557	total: 13.9s	remaining: 6.91s
668:	learn: 0.1750447	total: 13.9s	remaining: 6.89s
669:	learn: 0.1749339	total: 13.9s	remaining: 6.87s
670:	learn: 0.

811:	learn: 0.1616052	total: 16.4s	remaining: 3.79s
812:	learn: 0.1615156	total: 16.4s	remaining: 3.77s
813:	learn: 0.1614588	total: 16.4s	remaining: 3.75s
814:	learn: 0.1613278	total: 16.4s	remaining: 3.73s
815:	learn: 0.1612368	total: 16.4s	remaining: 3.71s
816:	learn: 0.1611641	total: 16.4s	remaining: 3.68s
817:	learn: 0.1610798	total: 16.5s	remaining: 3.66s
818:	learn: 0.1609722	total: 16.5s	remaining: 3.64s
819:	learn: 0.1608670	total: 16.5s	remaining: 3.62s
820:	learn: 0.1607725	total: 16.5s	remaining: 3.6s
821:	learn: 0.1606684	total: 16.5s	remaining: 3.58s
822:	learn: 0.1605767	total: 16.5s	remaining: 3.56s
823:	learn: 0.1604749	total: 16.6s	remaining: 3.54s
824:	learn: 0.1604210	total: 16.6s	remaining: 3.52s
825:	learn: 0.1603189	total: 16.6s	remaining: 3.5s
826:	learn: 0.1602378	total: 16.6s	remaining: 3.48s
827:	learn: 0.1601446	total: 16.6s	remaining: 3.46s
828:	learn: 0.1600650	total: 16.7s	remaining: 3.43s
829:	learn: 0.1600032	total: 16.7s	remaining: 3.42s
830:	learn: 0.

978:	learn: 0.1482061	total: 19.3s	remaining: 414ms
979:	learn: 0.1481587	total: 19.3s	remaining: 395ms
980:	learn: 0.1481055	total: 19.4s	remaining: 375ms
981:	learn: 0.1480317	total: 19.4s	remaining: 355ms
982:	learn: 0.1479883	total: 19.4s	remaining: 335ms
983:	learn: 0.1479091	total: 19.4s	remaining: 316ms
984:	learn: 0.1478380	total: 19.4s	remaining: 296ms
985:	learn: 0.1477528	total: 19.4s	remaining: 276ms
986:	learn: 0.1476750	total: 19.5s	remaining: 256ms
987:	learn: 0.1476203	total: 19.5s	remaining: 236ms
988:	learn: 0.1475623	total: 19.5s	remaining: 217ms
989:	learn: 0.1474916	total: 19.5s	remaining: 197ms
990:	learn: 0.1474111	total: 19.5s	remaining: 177ms
991:	learn: 0.1473402	total: 19.5s	remaining: 158ms
992:	learn: 0.1472865	total: 19.6s	remaining: 138ms
993:	learn: 0.1471787	total: 19.6s	remaining: 118ms
994:	learn: 0.1470790	total: 19.6s	remaining: 98.4ms
995:	learn: 0.1470163	total: 19.6s	remaining: 78.7ms
996:	learn: 0.1469523	total: 19.6s	remaining: 59ms
997:	learn:

<catboost.core.CatBoostClassifier at 0x7fdac651d9d0>

In [25]:
preds = list(clf.predict_proba(test_df[columns_df[0]+columns_df[1]+columns_df[2]])[:, 1])
ids = list(test_df['id_contract'])
submit_data = {'id_contract': ids, 'TARGET': preds}
submit_df = pd.DataFrame.from_dict(submit_data, )
submit_df.to_csv('submit_file.csv', sep=';', index=False)

In [22]:
test_df.shape

(7330, 52)

In [23]:
feat_imp = pd.DataFrame.from_dict(
    {
        'feature': columns_df[0]+columns_df[1]+columns_df[2],
        'importance': list(clf.feature_importances_)
    }
)
feat_imp

Unnamed: 0,feature,importance
0,WORKERSRANGE,3.106202
1,IP_flag,0.269859
2,OKFS_GROUP,0.31252
3,OKOPF_GROUP,0.241567
4,OKOGU_GROUP,0.344173
5,SEX_NAME,0.339718
6,F1100,1.404466
7,F1200,1.763597
8,F1250,2.123248
9,F1260,2.222117


## Basic solution

In [15]:
def makeX(data_in):
    # предобработка данных
    data = data_in.copy()
    data['CITIZENSHIP_NAME'] = data['CITIZENSHIP_NAME'].fillna(-1).map({-1: -1, 'Российская Федерация': 4, 'Таджикистан': 3, 'Казахстан': 2, 'Армения': 1})
    data['SEX_NAME'] = data['CITIZENSHIP_NAME'].fillna(0).map({0: 0, 'мужской': 1, 'женский': -1})
    group_names = ['OKFS_GROUP', 'OKOPF_GROUP', 'OKOGU_GROUP'] + ['WORKERSRANGE', 'OKVED_CODE']
    date_names = ['SIGN_DATE', 'DATEFIRSTREG', 'TAXREG_REGDATE', 'TAXREGPAY_REGDATE', 'BIRTHDATE']
    for name in group_names + date_names + ['id_client']:
        data[name] = data[name].fillna(-1)
        tmp = data[name].value_counts()
        tmp = tmp + 0.1 * np.random.randn(len(tmp))
        data[name] = data[name].map(tmp)
    data.fillna(-1, inplace=True)
    return data

data_train_fixed = makeX(data_train)
data_test_fixed = makeX(data_test)
y = data_train_fixed.pop('TARGET').values
data_test_fixed = data_test_fixed[data_train_fixed.columns]

model = lgbm.LGBMClassifier(num_leaves=31,
                           learning_rate=0.05,
                           n_estimators=185)

model.fit(data_train_fixed, y)
preds = model.predict_proba(data_test_fixed)[:, 1]

df = pd.DataFrame({'id_contract': data_test.id_contract.values, 'TARGET': preds})
df.to_csv('submit_file.csv', sep=';', index=False)