In [1]:
import pandas as pd
import dill as dill
import numpy as np
import warnings
import time

from os import listdir
from os.path import isfile, join
from IPython.display import clear_output

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.utils import resample

from sklearn.neural_network import MLPClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm.sklearn import LGBMClassifier

from sklearn.metrics import precision_score, recall_score, confusion_matrix, roc_auc_score

warnings.filterwarnings("ignore")

In [2]:
# Load one parquet file of the training data into df0.
df0 = pd.read_parquet('train_data/train_data_0.pq')

In [3]:
# Display the raw frame for a quick visual check of its columns and values.
df0

Unnamed: 0,id,rn,pre_since_opened,pre_since_confirmed,pre_pterm,pre_fterm,pre_till_pclose,pre_till_fclose,pre_loans_credit_limit,pre_loans_next_pay_summ,...,enc_paym_21,enc_paym_22,enc_paym_23,enc_paym_24,enc_loans_account_holder_type,enc_loans_credit_status,enc_loans_credit_type,enc_loans_account_cur,pclose_flag,fclose_flag
0,0,1,18,9,2,3,16,10,11,3,...,3,3,3,4,1,3,4,1,0,0
1,0,2,18,9,14,14,12,12,0,3,...,0,0,0,4,1,3,4,1,0,0
2,0,3,18,9,4,8,1,11,11,0,...,0,0,0,4,1,2,3,1,1,1
3,0,4,4,1,9,12,16,7,12,2,...,3,3,3,4,1,3,1,1,0,0
4,0,5,5,12,15,2,11,12,10,2,...,3,3,3,4,1,3,4,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1974719,249998,4,1,9,14,8,10,11,8,4,...,3,3,3,4,1,2,3,1,0,1
1974720,249998,5,7,9,4,8,1,11,19,1,...,3,3,3,4,1,2,4,1,1,1
1974721,249999,1,9,0,10,8,10,11,16,2,...,3,3,3,4,1,2,3,1,0,1
1974722,249999,2,9,16,10,13,10,4,12,2,...,3,3,3,4,1,2,3,1,0,0


In [3]:
def dill_dump(path, model):
    """Serialize ``model`` with dill and write it to the file at ``path``.

    The file is opened in binary write mode, so an existing file at
    ``path`` is overwritten.
    """
    with open(path, mode='wb') as sink:
        dill.dump(model, sink)

def dill_load(path):
    """Read a dill-serialized object from the file at ``path`` and return it.

    NOTE(review): like pickle, dill can run arbitrary code while
    deserializing, so only load files that come from a trusted source.
    """
    with open(path, mode='rb') as source:
        return dill.load(source)

In [4]:
def convert_one_features(df0=pd.DataFrame, column=str, len_of_credit_history=int, col_id=False) -> pd.DataFrame:
    """One-hot encode one column and sum the indicator columns per ``id``.

    For every ``id`` only the last ``len_of_credit_history`` rows (in the
    row order of ``df0``) are kept. ``column`` is one-hot encoded on that
    subset and the indicators are summed within each ``id``, so every output
    cell counts how many of the kept rows carried that category value.

    Parameters
    ----------
    df0 : pd.DataFrame
        Source frame; must contain the columns ``'id'`` and ``column``.
    column : str
        Name of the column to encode.
    len_of_credit_history : int
        Number of trailing rows to keep for each ``id``.
    col_id : bool, default False
        When truthy the ``'id'`` column is kept in the result, otherwise it
        is dropped.

    Returns
    -------
    pd.DataFrame
        One row per ``id`` (in groupby order) and one column per category
        found in the kept rows, named by
        ``OneHotEncoder.get_feature_names_out()``.

    Notes
    -----
    The defaults of the first three parameters are type objects standing in
    for annotations; they are not usable values, so these arguments must be
    passed explicitly.
    """
    # Keep only the most recent rows of every id before encoding, so the set
    # of categories (and therefore of output columns) depends on the subset.
    recent = (
        df0[['id', column]]
        .groupby(['id'])
        .tail(len_of_credit_history)
        .reset_index(drop=True)
    )

    # Use the encoder's default (sparse) output and densify it explicitly.
    # The ``sparse`` keyword was renamed to ``sparse_output`` in
    # scikit-learn 1.2 and removed in 1.4, so passing either spelling ties
    # the notebook to a particular scikit-learn version.
    ohe = OneHotEncoder()
    encoded = ohe.fit_transform(recent[[column]]).toarray()

    # Feature names are read only after fitting; the encoder has none before.
    indicators = pd.DataFrame(
        encoded,
        columns=ohe.get_feature_names_out(),
        index=recent.index,
    )

    counts = (
        pd.concat([recent[['id']], indicators], axis=1)
        .groupby(['id'])
        .agg('sum')
        .reset_index()
    )
    if not col_id:
        counts = counts.drop(columns='id')
    return counts

In [5]:
def agg_rn(df=pd.DataFrame) -> pd.DataFrame:
    """Count the ``rn`` entries of every ``id``.

    Returns a frame with the columns ``'id'`` and ``'rn'``, one row per
    distinct ``id``, where ``'rn'`` holds the number of non-null ``rn``
    values found for that ``id``. All other columns of ``df`` are ignored.
    """
    id_and_rn = df.loc[:, ['id', 'rn']]
    per_id_counts = id_and_rn.groupby('id').count()
    return per_id_counts.reset_index()

In [6]:
def convert_df(df0=pd.DataFrame, len_of_credit_history=int) -> pd.DataFrame:
    """Aggregate a per-record frame into one feature row per ``id``.

    The result starts with ``'id'`` and ``'rn'`` (the number of records of
    each id, from ``agg_rn``) as int32. Every other column of ``df0`` is then
    passed through ``convert_one_features`` and its per-id category counts
    are appended as int8 columns.

    Parameters
    ----------
    df0 : pd.DataFrame
        Source frame with ``'id'``, ``'rn'`` and the columns to encode.
    len_of_credit_history : int
        Number of trailing records per ``id`` used for the category counts.

    Returns
    -------
    pd.DataFrame
        One row per ``id``: ``'id'``, ``'rn'`` and the encoded count columns.

    Notes
    -----
    The defaults in the signature are type objects standing in for
    annotations; both arguments must be passed explicitly.
    """
    # New per-id frame: 'rn' becomes the number of records of each id.
    parts = [agg_rn(df0).astype(np.int32)]

    # 'id' and 'rn' are already aggregated above, so they are left out of the
    # encoding. They are excluded by name rather than by position so the
    # function does not depend on the column order of df0.
    feature_columns = [name for name in df0.columns if name not in ('id', 'rn')]

    for column in feature_columns:
        # Every feature except id and rn is stored as int8.
        # NOTE(review): a count cannot exceed len_of_credit_history, so int8
        # is only safe while that value stays at or below 127.
        encoded = convert_one_features(df0, column, len_of_credit_history)
        parts.append(encoded.astype(np.int8))

    # Concatenate once; growing the frame inside the loop would copy all
    # previously added columns on every iteration.
    return pd.concat(parts, axis=1)

# 2.1 Исследование моделей

In [7]:
def train_model(X_train, X_test, y_train, y_test, model):
    """Fit ``model`` on the training split and print evaluation metrics.

    Printed, one value per line and in this order: ROC AUC on the training
    split, ROC AUC on the test split, then precision, recall and the
    confusion matrix on the test split. The ROC AUC values use column 1 of
    ``model.predict_proba``; the other metrics use ``model.predict``.

    Returns
    -------
    tuple
        ``(test_scores, test_labels)``: column 1 of ``predict_proba`` for
        ``X_test`` and the labels predicted for ``X_test``.
    """
    model.fit(X_train, y_train)

    train_scores = model.predict_proba(X_train)[:, 1]
    test_scores = model.predict_proba(X_test)[:, 1]
    test_labels = model.predict(X_test)

    # (metric, ground truth, estimate) triples, evaluated and printed in turn.
    reports = (
        (roc_auc_score, y_train, train_scores),
        (roc_auc_score, y_test, test_scores),
        (precision_score, y_test, test_labels),
        (recall_score, y_test, test_labels),
        (confusion_matrix, y_test, test_labels),
    )
    for metric, truth, estimate in reports:
        print(metric(truth, estimate))

    return test_scores, test_labels

In [10]:
# Largest number of credit records held by a single id.
# It is derived from df0 through agg_rn so that the cell runs on a fresh
# kernel: no earlier cell in the notebook defines `df`.
# NOTE(review): assumes the original `df` came from convert_df, whose 'rn'
# column is exactly this per-id count - confirm.
max(agg_rn(df0)['rn'])

51

In [11]:
# Test-set probabilities of one LightGBM model per credit-history length.
dict_score = dict()

# The target does not depend on the history length, so it is read once
# instead of on every iteration. Only the first 250000 rows are used.
# NOTE(review): the target is matched to the feature rows by position, which
# assumes both are ordered by the same ids - confirm.
target = pd.read_csv('train_target.csv')[0:250000]
y = target['flag']

# History lengths 1..51; 51 is the largest per-id record count found above.
for i in range(1,52):
    df = convert_df(df0, i)
    X = df.drop(columns='id')

    # y and random_state are the same on every pass, so every iteration gets
    # the same train/test rows. Later cells rely on this when they score all
    # stored predictions against the y_test left over from the last pass.
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2, stratify=y)

    lgbm = LGBMClassifier(class_weight={0:1, 1:32}, n_jobs=-1, random_state=42, reg_lambda=1000, reg_alpha=0.4)

    predict_proba_test_lgbm, predict_test_lgbm = train_model(X_train, X_test, y_train, y_test, lgbm)
    print(f' последние {i} кредитов')

    dict_score[i] = predict_proba_test_lgbm

dict_score

[LightGBM] [Info] Number of positive: 6187, number of negative: 193813
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.012942 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 720
[LightGBM] [Info] Number of data points in the train set: 200000, number of used features: 339
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.505323 -> initscore=0.021292
[LightGBM] [Info] Start training from score 0.021292
0.7622940506995411
0.7187286072585471
0.06012770485987939
0.6574014221073045
[[32556 15897]
 [  530  1017]]
 последние 1 кредитов
[LightGBM] [Info] Number of positive: 6187, number of negative: 193813
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.029039 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info

0.8285344121364177
0.7608248330694948
0.0688549814078728
0.6942469295410472
[[33929 14524]
 [  473  1074]]
 последние 13 кредитов
[LightGBM] [Info] Number of positive: 6187, number of negative: 193813
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.047502 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3693
[LightGBM] [Info] Number of data points in the train set: 200000, number of used features: 380
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.505323 -> initscore=0.021292
[LightGBM] [Info] Start training from score 0.021292
0.8278924444116433
0.7630231995924159
0.06938198638057304
0.6981254040077569
[[33967 14486]
 [  467  1080]]
 последние 14 кредитов
[LightGBM] [Info] Number of positive: 6187, number of negative: 193813
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.047115 seconds.
You ca

[LightGBM] [Info] Number of positive: 6187, number of negative: 193813
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.051089 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4925
[LightGBM] [Info] Number of data points in the train set: 200000, number of used features: 380
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.505323 -> initscore=0.021292
[LightGBM] [Info] Start training from score 0.021292
0.8290287975943271
0.7637270384213753
0.06878340954466414
0.6903684550743374
[[33994 14459]
 [  479  1068]]
 последние 26 кредитов
[LightGBM] [Info] Number of positive: 6187, number of negative: 193813
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.075185 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [In

[LightGBM] [Info] Number of positive: 6187, number of negative: 193813
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.048239 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5554
[LightGBM] [Info] Number of data points in the train set: 200000, number of used features: 380
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.505323 -> initscore=0.021292
[LightGBM] [Info] Start training from score 0.021292
0.829188015050334
0.762543096328657
0.06901217861975643
0.6923076923076923
[[34005 14448]
 [  476  1071]]
 последние 38 кредитов
[LightGBM] [Info] Number of positive: 6187, number of negative: 193813
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.048966 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info

[LightGBM] [Info] Number of positive: 6187, number of negative: 193813
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.048499 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5686
[LightGBM] [Info] Number of data points in the train set: 200000, number of used features: 380
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.505323 -> initscore=0.021292
[LightGBM] [Info] Start training from score 0.021292
0.829466243845739
0.762911261769464
0.0696774193548387
0.6981254040077569
[[34033 14420]
 [  467  1080]]
 последние 50 кредитов
[LightGBM] [Info] Number of positive: 6187, number of negative: 193813
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.052396 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info]

{1: array([0.74467353, 0.87880203, 0.53096123, ..., 0.35919521, 0.28110615,
        0.27351942]),
 2: array([0.75861682, 0.86601681, 0.61147829, ..., 0.26215503, 0.16350535,
        0.23636619]),
 3: array([0.75909724, 0.84190624, 0.56716271, ..., 0.30321741, 0.17649239,
        0.21704743]),
 4: array([0.79580972, 0.85528052, 0.53909563, ..., 0.30690355, 0.13043066,
        0.17301539]),
 5: array([0.79568465, 0.84480366, 0.5310801 , ..., 0.32718423, 0.12180779,
        0.22849208]),
 6: array([0.75870527, 0.81983258, 0.48411672, ..., 0.34456447, 0.15690356,
        0.17428588]),
 7: array([0.70850131, 0.82708237, 0.5651721 , ..., 0.35352561, 0.10522935,
        0.17050102]),
 8: array([0.70772254, 0.83567985, 0.53045298, ..., 0.35657876, 0.1150271 ,
        0.16113449]),
 9: array([0.71725026, 0.81377332, 0.47607076, ..., 0.33946677, 0.13394337,
        0.14951563]),
 10: array([0.67253244, 0.78433456, 0.53480993, ..., 0.31502835, 0.12240274,
        0.21929065]),
 11: array([0.66810

In [12]:
# лучшая метрика

In [13]:
# Test ROC AUC of every stored prediction vector, keyed like dict_score.
# y_test is the one left by the last iteration of the scoring loop.
dict_roc = {}
for k in dict_score:
    dict_roc[k] = roc_auc_score(y_test, list(dict_score[k]))

# Best single-model score.
max(dict_roc.values())

0.7649534783312695

In [14]:
# попробуем сделать ансамбль из предсказаний

In [15]:
# After the transpose each row holds one model's predictions, so the
# column-wise mean is the average probability of every test sample across
# all stored models.
predict_proba = pd.DataFrame(dict_score).transpose().mean()
roc_auc_score(y_test, predict_proba.tolist())

0.7679399589024561

In [None]:
# создавать 51 датасет слишком долго. Выберем лучшие модели и создадим ансамбль предсказаний

In [None]:
# выбираем только нужные индексы из словаря 5,8,23,32,41 - наилучшие метрики

In [39]:
# Prediction vectors of the selected history lengths, one list per model.
# NOTE(review): the note before this cell lists 5, 8, 23, 32 and 41, but only
# 5, 8 and 23 are used here - confirm which set is intended.
list_score = [list(dict_score[k]) for k in [5, 8, 23]]

In [40]:
# Each row of the frame is one selected model, so the column-wise mean is the
# average probability of every test sample across the selected models.
predict_proba = pd.DataFrame(list_score).mean(axis=0)
roc_auc_score(y_test, predict_proba.tolist())

0.7697859624220039