In [1]:
import pandas as pd
import dill as dill
import numpy as np
import warnings

from os import listdir
from os.path import isfile, join

from sklearn.pipeline import Pipeline, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE

from sklearn.neural_network import MLPClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm.sklearn import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.base import TransformerMixin, BaseEstimator

from sklearn.metrics import precision_score, recall_score, confusion_matrix, roc_auc_score

warnings.filterwarnings("ignore")

In [None]:
"""Проведем исследование - улучшают ли новые признаки модель???"""

# Load one parquet file of training features into the notebook-global `df0`.
df0 = pd.read_parquet('train_data/train_data_0.pq')
# Keep only the first 250000 rows of the target file.
# NOTE(review): presumably these rows correspond to the ids stored in train_data_0.pq —
# nothing in this cell checks that; TODO confirm.
target = pd.read_csv('train_target.csv')[0:250000]

In [7]:
def convert_one_features(df0=pd.DataFrame, column=str, col_id=False) -> pd.DataFrame:
    """One-hot encode `column` and sum the indicator columns per `id`.

    Every output column is named by the encoder's ``get_feature_names_out()``
    and holds, for one id, the number of rows that carried that value.

    Args:
        df0: frame containing an 'id' column and `column`.
        column: name of the column to encode.
        col_id: when truthy the 'id' column is kept in the result,
            otherwise it is dropped.

    Returns:
        One row per id (sorted by id, the groupby default) with integer counts.

    Note:
        The ``=pd.DataFrame`` / ``=str`` defaults in the signature are type
        objects standing in for annotations; real values must always be passed.
    """
    # `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and removed
    # in 1.4; fall back to the old keyword on releases that predate the rename.
    try:
        ohe = OneHotEncoder(sparse_output=False)
    except TypeError:
        ohe = OneHotEncoder(sparse=False)

    encoded = ohe.fit_transform(df0[[column]])
    # Build a fresh frame instead of assigning into a slice of df0, which
    # triggered SettingWithCopyWarning in the original implementation.
    dummies = pd.DataFrame(encoded, columns=ohe.get_feature_names_out(), index=df0.index)
    dummies.insert(0, 'id', df0['id'].to_numpy())
    counts = dummies.groupby('id').sum()

    # int8 keeps the table small but silently wraps above 127, so widen only
    # when some id really has more rows with one value than int8 can hold.
    largest = counts.to_numpy().max() if counts.size else 0
    count_dtype = np.int8 if largest <= np.iinfo(np.int8).max else np.int32
    counts = counts.astype(count_dtype).reset_index()

    if not col_id:
        counts = counts.drop(columns='id')
    return counts

def agg_rn(df=pd.DataFrame) -> pd.DataFrame:
    """Count the non-null `rn` entries of every `id`.

    Args:
        df: frame containing the columns 'id' and 'rn'.

    Returns:
        Frame with columns 'id' and 'rn', one row per id sorted by id, where
        'rn' is the number of non-null `rn` values of that id.
    """
    counts = df.groupby('id')['rn'].count()
    # int8 keeps the result small, but a cast from a larger count wraps
    # silently above 127; widen only when the data actually needs it.
    largest = counts.max() if len(counts) else 0
    count_dtype = np.int8 if largest <= np.iinfo(np.int8).max else np.int32
    return counts.astype(count_dtype).reset_index()

def del_feature(df=pd.DataFrame, col=str) -> pd.DataFrame:
    """Return a copy of `df` with the column(s) named by `col` removed.

    The input frame is copied first, so `df` itself is never modified.
    """
    return df.copy().drop(columns=col)

def invert(df=pd.DataFrame, columns=list, col_id=False) -> pd.DataFrame:
    """Flip the listed columns: a value equal to 0 becomes 1, anything else becomes 0.

    Args:
        df: frame containing 'id' and every name in `columns`.
        columns: list of column names to flip.
        col_id: when truthy the 'id' column is returned together with the
            flipped columns, otherwise only the flipped columns are returned.

    Returns:
        A new frame; `df` is not modified.
    """
    df_invert = df[['id'] + columns].copy()
    if columns:
        # Vectorized comparison instead of a per-element Python lambda; values
        # that are not equal to 0 (including NaN) map to 0, exactly as before.
        df_invert[columns] = df_invert[columns].eq(0).astype('int64')
    if not col_id:
        return df_invert.drop(columns='id')
    return df_invert
    
def open_loans_credit(df=pd.DataFrame) -> pd.Series:
    """Count, per `id`, the rows whose `is_zero_util` equals 0.

    Args:
        df: frame containing the columns 'id' and 'is_zero_util'.

    Returns:
        Series named 'open_loans' with a fresh 0..n-1 index, one value per id
        in sorted-id order (the groupby default).

    NOTE(review): this appears to equal the per-id count that
    convert_one_features already produces for the value 0 of `is_zero_util`,
    which may be why adding it leaves the model metrics unchanged — confirm.
    """
    # Vectorized flag (1 where is_zero_util == 0) instead of a row-wise lambda.
    is_open = df['is_zero_util'].eq(0).astype('int64')
    open_per_id = is_open.groupby(df['id']).sum()
    return open_per_id.rename('open_loans').reset_index(drop=True)
    
def growth_limit_feature(df=pd.DataFrame) -> pd.Series:
    """Difference between the last and the first `pre_loans_credit_limit` of every `id`.

    "First" and "last" refer to the row order of `df` as passed in
    (presumably rows are ordered by `rn` within an id — TODO confirm).

    Args:
        df: frame containing the columns 'id' and 'pre_loans_credit_limit'.

    Returns:
        Series named 'growth_limit' with a fresh 0..n-1 index, one value per
        id in sorted-id order.
    """
    limits = df[['id', 'pre_loans_credit_limit']]
    first = limits.drop_duplicates(subset='id', keep='first').set_index('id')['pre_loans_credit_limit']
    last = limits.drop_duplicates(subset='id', keep='last').set_index('id')['pre_loans_credit_limit']
    # The subtraction aligns on id. Sorting by id afterwards makes the output
    # order match the groupby-based aggregators it is combined with by row
    # position; previously the order followed first appearance of each id.
    growth = (last - first).sort_index()
    return growth.rename('growth_limit').reset_index(drop=True)

In [8]:
def convert_df0(df0=pd.DataFrame) -> pd.DataFrame: # baseline (stock) processing
    """Build the baseline per-id feature table.

    The table starts with the 'id' and 'rn' columns from agg_rn and is
    followed by the per-id one-hot counts of every remaining column.

    Args:
        df0: raw frame; its first two columns are skipped by position
            (presumably 'id' and 'rn' — TODO confirm).

    Returns:
        One row per id; the pieces are joined side by side on their row index.
    """
    base = agg_rn(df0)
    feature_columns = df0.columns.to_list()[2:]
    encoded = [convert_one_features(df0, column) for column in feature_columns]
    # One concat over all pieces instead of re-concatenating the growing table
    # on every loop iteration.
    return pd.concat([base] + encoded, axis=1)

def convert_df1(df0=pd.DataFrame) -> pd.DataFrame: # drops the all-zero feature
    """Baseline feature table without the `pre_loans_total_overdue` column.

    Args:
        df0: raw frame; after the column is removed, the first two remaining
            columns are skipped by position (presumably 'id' and 'rn' — TODO confirm).

    Returns:
        'id' and 'rn' from agg_rn followed by the per-id one-hot counts of
        every remaining column, joined side by side on their row index.
    """
    base = agg_rn(df0)
    df0 = del_feature(df0, 'pre_loans_total_overdue')

    feature_columns = df0.columns.to_list()[2:]
    encoded = [convert_one_features(df0, column) for column in feature_columns]
    # One concat over all pieces instead of re-concatenating the growing table
    # on every loop iteration.
    return pd.concat([base] + encoded, axis=1)

def convert_df2(df0=pd.DataFrame) -> pd.DataFrame: # feature inversion only
    """Baseline feature table built after flipping the `is_zero_loans*` flags.

    Args:
        df0: raw frame; it is NOT modified. Its first two columns are skipped
            by position (presumably 'id' and 'rn' — TODO confirm).

    Returns:
        'id' and 'rn' from agg_rn followed by the per-id one-hot counts of
        every remaining column, joined side by side on their row index.
    """
    list_to_invert = [
        'is_zero_loans5',
        'is_zero_loans530',
        'is_zero_loans3060',
        'is_zero_loans6090',
        'is_zero_loans90'
    ]

    base = agg_rn(df0)
    # Work on a private copy: writing the flipped columns into the caller's
    # frame would silently change the input of every later experiment.
    df0 = df0.copy()
    df0[list_to_invert] = invert(df0, list_to_invert)

    feature_columns = df0.columns.to_list()[2:]
    encoded = [convert_one_features(df0, column) for column in feature_columns]
    # One concat over all pieces instead of re-concatenating the growing table
    # on every loop iteration.
    return pd.concat([base] + encoded, axis=1)

def convert_df3(df0=pd.DataFrame) -> pd.DataFrame: # open_loans only
    """Baseline feature table plus the `open_loans` feature.

    Args:
        df0: raw frame; its first two columns are skipped by position
            (presumably 'id' and 'rn' — TODO confirm).

    Returns:
        'id', 'rn', 'open_loans', then the per-id one-hot counts of every
        remaining column. All pieces are matched by row index.
    """
    base = agg_rn(df0)
    base['open_loans'] = open_loans_credit(df0)

    feature_columns = df0.columns.to_list()[2:]
    encoded = [convert_one_features(df0, column) for column in feature_columns]
    # One concat over all pieces instead of re-concatenating the growing table
    # on every loop iteration.
    return pd.concat([base] + encoded, axis=1)

def convert_df4(df0=pd.DataFrame) -> pd.DataFrame: # growth_limit only
    """Baseline feature table plus the `growth_limit` feature.

    Args:
        df0: raw frame; its first two columns are skipped by position
            (presumably 'id' and 'rn' — TODO confirm).

    Returns:
        'id', 'rn', 'growth_limit', then the per-id one-hot counts of every
        remaining column. All pieces are matched by row index.
    """
    base = agg_rn(df0)
    base['growth_limit'] = growth_limit_feature(df0)

    feature_columns = df0.columns.to_list()[2:]
    encoded = [convert_one_features(df0, column) for column in feature_columns]
    # One concat over all pieces instead of re-concatenating the growing table
    # on every loop iteration.
    return pd.concat([base] + encoded, axis=1)

In [9]:
def train_model(X_train, X_test, y_train, y_test, model):
    """Fit `model`, print its quality metrics and return the test predictions.

    Printed, one value per line and in this order: ROC AUC on train, ROC AUC
    on test, precision on test, recall on test, confusion matrix on test.

    Returns:
        Tuple of (column 1 of ``predict_proba`` for `X_test`,
        ``predict`` output for `X_test`).
    """
    model.fit(X_train, y_train)
    train_scores = model.predict_proba(X_train)[:, 1]
    test_scores = model.predict_proba(X_test)[:, 1]
    test_labels = model.predict(X_test)

    # Each entry: metric function, ground truth, estimate it is evaluated on.
    reports = (
        (roc_auc_score, y_train, train_scores),
        (roc_auc_score, y_test, test_scores),
        (precision_score, y_test, test_labels),
        (recall_score, y_test, test_labels),
        (confusion_matrix, y_test, test_labels),
    )
    for metric, truth, estimate in reports:
        print(metric(truth, estimate))
    return test_scores, test_labels

In [None]:
'''Только стоковая обработка'''

In [10]:
# Baseline feature table from convert_df0; the bare `df` on the last line
# renders the table as the cell output.
df = convert_df0(df0)
df

Unnamed: 0,id,rn,pre_since_opened_0,pre_since_opened_1,pre_since_opened_2,pre_since_opened_3,pre_since_opened_4,pre_since_opened_5,pre_since_opened_6,pre_since_opened_7,...,enc_loans_credit_type_4,enc_loans_credit_type_5,enc_loans_account_cur_0,enc_loans_account_cur_1,enc_loans_account_cur_2,enc_loans_account_cur_3,pclose_flag_0,pclose_flag_1,fclose_flag_0,fclose_flag_1
0,0,10,0,1,1,1,1,2,0,1,...,7,0,0,10,0,0,9,1,8,2
1,1,14,0,0,1,0,0,0,0,1,...,8,0,0,14,0,0,13,1,12,2
2,2,3,1,0,0,0,0,0,0,0,...,1,0,0,3,0,0,1,2,1,2
3,3,15,0,3,1,0,2,1,3,0,...,9,1,0,15,0,0,10,5,9,6
4,4,1,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
249995,249995,13,2,1,2,0,0,0,1,1,...,7,3,0,13,0,0,12,1,10,3
249996,249996,23,0,1,2,1,2,2,3,1,...,9,3,0,23,0,0,18,5,18,5
249997,249997,7,0,2,0,0,0,0,0,0,...,2,0,0,6,1,0,6,1,5,2
249998,249998,5,0,2,0,0,0,0,0,1,...,2,0,0,5,0,0,3,2,2,3


In [11]:
# Features: every column of the current `df` except the 'id' key.
X = df.drop(columns='id')
# Labels are paired with X by row only — assumes row i of `target` describes
# row i of `df`; TODO confirm.
y = target['flag']

# 80/20 split, stratified on the label, with a fixed random_state.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2, stratify=y)

# LightGBM classifier: class 1 is weighted 32x relative to class 0.
lgbm = LGBMClassifier(class_weight={0:1, 1:32}, n_jobs=-1, random_state=42, reg_lambda=1000, reg_alpha=0.4)

# Fit and print the metrics (see train_model for the order of the printed values).
predict_proba_test_lgbm, predict_test_lgbm = train_model(X_train, X_test, y_train, y_test, lgbm)

[LightGBM] [Info] Number of positive: 6187, number of negative: 193813
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.048295 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5688
[LightGBM] [Info] Number of data points in the train set: 200000, number of used features: 380
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.505323 -> initscore=0.021292
[LightGBM] [Info] Start training from score 0.021292
0.8290689069734112
0.7627924466510313
0.06987407168227316
0.6994182288299935
[[34050 14403]
 [  465  1082]]


In [None]:
'''C удалением фичи с нулевыми значениями'''

In [12]:
# Feature table from convert_df1 (built without `pre_loans_total_overdue`);
# the bare `df` renders it as the cell output.
df = convert_df1(df0)
df

Unnamed: 0,id,rn,pre_since_opened_0,pre_since_opened_1,pre_since_opened_2,pre_since_opened_3,pre_since_opened_4,pre_since_opened_5,pre_since_opened_6,pre_since_opened_7,...,enc_loans_credit_type_4,enc_loans_credit_type_5,enc_loans_account_cur_0,enc_loans_account_cur_1,enc_loans_account_cur_2,enc_loans_account_cur_3,pclose_flag_0,pclose_flag_1,fclose_flag_0,fclose_flag_1
0,0,10,0,1,1,1,1,2,0,1,...,7,0,0,10,0,0,9,1,8,2
1,1,14,0,0,1,0,0,0,0,1,...,8,0,0,14,0,0,13,1,12,2
2,2,3,1,0,0,0,0,0,0,0,...,1,0,0,3,0,0,1,2,1,2
3,3,15,0,3,1,0,2,1,3,0,...,9,1,0,15,0,0,10,5,9,6
4,4,1,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
249995,249995,13,2,1,2,0,0,0,1,1,...,7,3,0,13,0,0,12,1,10,3
249996,249996,23,0,1,2,1,2,2,3,1,...,9,3,0,23,0,0,18,5,18,5
249997,249997,7,0,2,0,0,0,0,0,0,...,2,0,0,6,1,0,6,1,5,2
249998,249998,5,0,2,0,0,0,0,0,1,...,2,0,0,5,0,0,3,2,2,3


In [13]:
# Features: every column of the current `df` except the 'id' key.
X = df.drop(columns='id')
# Labels are paired with X by row only — assumes row i of `target` describes
# row i of `df`; TODO confirm.
y = target['flag']

# 80/20 split, stratified on the label, with a fixed random_state.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2, stratify=y)

# LightGBM classifier: class 1 is weighted 32x relative to class 0.
lgbm = LGBMClassifier(class_weight={0:1, 1:32}, n_jobs=-1, random_state=42, reg_lambda=1000, reg_alpha=0.4)

# Fit and print the metrics (see train_model for the order of the printed values).
predict_proba_test_lgbm, predict_test_lgbm = train_model(X_train, X_test, y_train, y_test, lgbm)

[LightGBM] [Info] Number of positive: 6187, number of negative: 193813
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.050639 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5644
[LightGBM] [Info] Number of data points in the train set: 200000, number of used features: 379
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.505323 -> initscore=0.021292
[LightGBM] [Info] Start training from score 0.021292
0.8290689069734112
0.7627924466510313
0.06987407168227316
0.6994182288299935
[[34050 14403]
 [  465  1082]]


In [None]:
# не влияет на результат, но позволит немного сэкономить ресурсы

In [None]:
'''Только инвертирование признаков'''

In [14]:
# Feature table from convert_df2 (flipped `is_zero_loans*` flags);
# the bare `df` renders it as the cell output.
df = convert_df2(df0)
df

Unnamed: 0,id,rn,pre_since_opened_0,pre_since_opened_1,pre_since_opened_2,pre_since_opened_3,pre_since_opened_4,pre_since_opened_5,pre_since_opened_6,pre_since_opened_7,...,enc_loans_credit_type_4,enc_loans_credit_type_5,enc_loans_account_cur_0,enc_loans_account_cur_1,enc_loans_account_cur_2,enc_loans_account_cur_3,pclose_flag_0,pclose_flag_1,fclose_flag_0,fclose_flag_1
0,0,10,0,1,1,1,1,2,0,1,...,7,0,0,10,0,0,9,1,8,2
1,1,14,0,0,1,0,0,0,0,1,...,8,0,0,14,0,0,13,1,12,2
2,2,3,1,0,0,0,0,0,0,0,...,1,0,0,3,0,0,1,2,1,2
3,3,15,0,3,1,0,2,1,3,0,...,9,1,0,15,0,0,10,5,9,6
4,4,1,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
249995,249995,13,2,1,2,0,0,0,1,1,...,7,3,0,13,0,0,12,1,10,3
249996,249996,23,0,1,2,1,2,2,3,1,...,9,3,0,23,0,0,18,5,18,5
249997,249997,7,0,2,0,0,0,0,0,0,...,2,0,0,6,1,0,6,1,5,2
249998,249998,5,0,2,0,0,0,0,0,1,...,2,0,0,5,0,0,3,2,2,3


In [15]:
# Features: every column of the current `df` except the 'id' key.
X = df.drop(columns='id')
# Labels are paired with X by row only — assumes row i of `target` describes
# row i of `df`; TODO confirm.
y = target['flag']

# 80/20 split, stratified on the label, with a fixed random_state.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2, stratify=y)

# LightGBM classifier: class 1 is weighted 32x relative to class 0.
lgbm = LGBMClassifier(class_weight={0:1, 1:32}, n_jobs=-1, random_state=42, reg_lambda=1000, reg_alpha=0.4)

# Fit and print the metrics (see train_model for the order of the printed values).
predict_proba_test_lgbm, predict_test_lgbm = train_model(X_train, X_test, y_train, y_test, lgbm)

[LightGBM] [Info] Number of positive: 6187, number of negative: 193813
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.048684 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5688
[LightGBM] [Info] Number of data points in the train set: 200000, number of used features: 380
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.505323 -> initscore=0.021292
[LightGBM] [Info] Start training from score 0.021292
0.8290689069734112
0.7627924466510313
0.06987407168227316
0.6994182288299935
[[34050 14403]
 [  465  1082]]


In [None]:
# не влияет на модель

In [None]:
'''Только open_loans'''

In [16]:
# Feature table from convert_df3 (baseline plus `open_loans`);
# the bare `df` renders it as the cell output.
df = convert_df3(df0)
df

Unnamed: 0,id,rn,open_loans,pre_since_opened_0,pre_since_opened_1,pre_since_opened_2,pre_since_opened_3,pre_since_opened_4,pre_since_opened_5,pre_since_opened_6,...,enc_loans_credit_type_4,enc_loans_credit_type_5,enc_loans_account_cur_0,enc_loans_account_cur_1,enc_loans_account_cur_2,enc_loans_account_cur_3,pclose_flag_0,pclose_flag_1,fclose_flag_0,fclose_flag_1
0,0,10,4,0,1,1,1,1,2,0,...,7,0,0,10,0,0,9,1,8,2
1,1,14,4,0,0,1,0,0,0,0,...,8,0,0,14,0,0,13,1,12,2
2,2,3,2,1,0,0,0,0,0,0,...,1,0,0,3,0,0,1,2,1,2
3,3,15,7,0,3,1,0,2,1,3,...,9,1,0,15,0,0,10,5,9,6
4,4,1,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
249995,249995,13,4,2,1,2,0,0,0,1,...,7,3,0,13,0,0,12,1,10,3
249996,249996,23,5,0,1,2,1,2,2,3,...,9,3,0,23,0,0,18,5,18,5
249997,249997,7,0,0,2,0,0,0,0,0,...,2,0,0,6,1,0,6,1,5,2
249998,249998,5,4,0,2,0,0,0,0,0,...,2,0,0,5,0,0,3,2,2,3


In [17]:
# Features: every column of the current `df` except the 'id' key.
X = df.drop(columns='id')
# Labels are paired with X by row only — assumes row i of `target` describes
# row i of `df`; TODO confirm.
y = target['flag']

# 80/20 split, stratified on the label, with a fixed random_state.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2, stratify=y)

# LightGBM classifier: class 1 is weighted 32x relative to class 0.
lgbm = LGBMClassifier(class_weight={0:1, 1:32}, n_jobs=-1, random_state=42, reg_lambda=1000, reg_alpha=0.4)

# Fit and print the metrics (see train_model for the order of the printed values).
predict_proba_test_lgbm, predict_test_lgbm = train_model(X_train, X_test, y_train, y_test, lgbm)

[LightGBM] [Info] Number of positive: 6187, number of negative: 193813
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.049166 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5707
[LightGBM] [Info] Number of data points in the train set: 200000, number of used features: 381
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.505323 -> initscore=0.021292
[LightGBM] [Info] Start training from score 0.021292
0.8290689069734112
0.7627924466510313
0.06987407168227316
0.6994182288299935
[[34050 14403]
 [  465  1082]]


In [None]:
# не влияет на модель

In [None]:
'''Только growth_limit'''

In [18]:
# Feature table from convert_df4 (baseline plus `growth_limit`);
# the bare `df` renders it as the cell output.
df = convert_df4(df0)
df

Unnamed: 0,id,rn,growth_limit,pre_since_opened_0,pre_since_opened_1,pre_since_opened_2,pre_since_opened_3,pre_since_opened_4,pre_since_opened_5,pre_since_opened_6,...,enc_loans_credit_type_4,enc_loans_credit_type_5,enc_loans_account_cur_0,enc_loans_account_cur_1,enc_loans_account_cur_2,enc_loans_account_cur_3,pclose_flag_0,pclose_flag_1,fclose_flag_0,fclose_flag_1
0,0,10,5,0,1,1,1,1,2,0,...,7,0,0,10,0,0,9,1,8,2
1,1,14,10,0,0,1,0,0,0,0,...,8,0,0,14,0,0,13,1,12,2
2,2,3,0,1,0,0,0,0,0,0,...,1,0,0,3,0,0,1,2,1,2
3,3,15,4,0,3,1,0,2,1,3,...,9,1,0,15,0,0,10,5,9,6
4,4,1,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
249995,249995,13,14,2,1,2,0,0,0,1,...,7,3,0,13,0,0,12,1,10,3
249996,249996,23,10,0,1,2,1,2,2,3,...,9,3,0,23,0,0,18,5,18,5
249997,249997,7,5,0,2,0,0,0,0,0,...,2,0,0,6,1,0,6,1,5,2
249998,249998,5,0,0,2,0,0,0,0,0,...,2,0,0,5,0,0,3,2,2,3


In [19]:
# Features: every column of the current `df` except the 'id' key.
X = df.drop(columns='id')
# Labels are paired with X by row only — assumes row i of `target` describes
# row i of `df`; TODO confirm.
y = target['flag']

# 80/20 split, stratified on the label, with a fixed random_state.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2, stratify=y)

# LightGBM classifier: class 1 is weighted 32x relative to class 0.
lgbm = LGBMClassifier(class_weight={0:1, 1:32}, n_jobs=-1, random_state=42, reg_lambda=1000, reg_alpha=0.4)

# Fit and print the metrics (see train_model for the order of the printed values).
predict_proba_test_lgbm, predict_test_lgbm = train_model(X_train, X_test, y_train, y_test, lgbm)

[LightGBM] [Info] Number of positive: 6187, number of negative: 193813
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.074526 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5727
[LightGBM] [Info] Number of data points in the train set: 200000, number of used features: 381
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.505323 -> initscore=0.021292
[LightGBM] [Info] Start training from score 0.021292
0.8299512265830655
0.7639072689224383
0.06971200309258424
0.6994182288299935
[[34014 14439]
 [  465  1082]]


In [None]:
# немного улучшает модель

In [20]:
def convert_df(df0=pd.DataFrame) -> pd.DataFrame:
    """Feature table with every experiment applied at once.

    Steps: remove `pre_loans_total_overdue`, flip the `is_zero_loans*` flags,
    add `open_loans` and `growth_limit`, then append the per-id one-hot counts
    of every remaining column.

    Args:
        df0: raw frame; it is NOT modified. After the column removal, the
            first two remaining columns are skipped by position
            (presumably 'id' and 'rn' — TODO confirm).

    Returns:
        'id', 'rn', 'open_loans', 'growth_limit', then the one-hot counts.
        All pieces are matched by row index.
    """
    list_to_invert = [
        'is_zero_loans5',
        'is_zero_loans530',
        'is_zero_loans3060',
        'is_zero_loans6090',
        'is_zero_loans90'
    ]

    base = agg_rn(df0)
    # del_feature copies its input before dropping, so from here on `df0` is a
    # private frame and the assignment below cannot leak into the caller's
    # data (the flags used to be overwritten in the caller's frame).
    df0 = del_feature(df0, 'pre_loans_total_overdue')
    df0[list_to_invert] = invert(df0, list_to_invert)
    base['open_loans'] = open_loans_credit(df0)
    base['growth_limit'] = growth_limit_feature(df0)

    feature_columns = df0.columns.to_list()[2:]
    encoded = [convert_one_features(df0, column) for column in feature_columns]
    # One concat over all pieces instead of re-concatenating the growing table
    # on every loop iteration.
    return pd.concat([base] + encoded, axis=1)

In [None]:
'''Все вместе'''

In [21]:
# Feature table from convert_df (all feature changes combined);
# the bare `df` renders it as the cell output.
df = convert_df(df0)
df

Unnamed: 0,id,rn,open_loans,growth_limit,pre_since_opened_0,pre_since_opened_1,pre_since_opened_2,pre_since_opened_3,pre_since_opened_4,pre_since_opened_5,...,enc_loans_credit_type_4,enc_loans_credit_type_5,enc_loans_account_cur_0,enc_loans_account_cur_1,enc_loans_account_cur_2,enc_loans_account_cur_3,pclose_flag_0,pclose_flag_1,fclose_flag_0,fclose_flag_1
0,0,10,4,5,0,1,1,1,1,2,...,7,0,0,10,0,0,9,1,8,2
1,1,14,4,10,0,0,1,0,0,0,...,8,0,0,14,0,0,13,1,12,2
2,2,3,2,0,1,0,0,0,0,0,...,1,0,0,3,0,0,1,2,1,2
3,3,15,7,4,0,3,1,0,2,1,...,9,1,0,15,0,0,10,5,9,6
4,4,1,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
249995,249995,13,4,14,2,1,2,0,0,0,...,7,3,0,13,0,0,12,1,10,3
249996,249996,23,5,10,0,1,2,1,2,2,...,9,3,0,23,0,0,18,5,18,5
249997,249997,7,0,5,0,2,0,0,0,0,...,2,0,0,6,1,0,6,1,5,2
249998,249998,5,4,0,0,2,0,0,0,0,...,2,0,0,5,0,0,3,2,2,3


In [22]:
# Features: every column of the current `df` except the 'id' key.
X = df.drop(columns='id')
# Labels are paired with X by row only — assumes row i of `target` describes
# row i of `df`; TODO confirm.
y = target['flag']

# 80/20 split, stratified on the label, with a fixed random_state.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2, stratify=y)

# LightGBM classifier: class 1 is weighted 32x relative to class 0.
lgbm = LGBMClassifier(class_weight={0:1, 1:32}, n_jobs=-1, random_state=42, reg_lambda=1000, reg_alpha=0.4)

# Fit and print the metrics (see train_model for the order of the printed values).
predict_proba_test_lgbm, predict_test_lgbm = train_model(X_train, X_test, y_train, y_test, lgbm)

[LightGBM] [Info] Number of positive: 6187, number of negative: 193813
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.049848 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5702
[LightGBM] [Info] Number of data points in the train set: 200000, number of used features: 381
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.505323 -> initscore=0.021292
[LightGBM] [Info] Start training from score 0.021292
0.8299512265830655
0.7639072689224383
0.06971200309258424
0.6994182288299935
[[34014 14439]
 [  465  1082]]


In [None]:
'''Признак rn, преобразованный в количество кредитов на один id'''

In [23]:
# Baseline feature table with the per-id row count `rn` removed, to check
# whether that column matters; the bare `df` renders it as the cell output.
df = convert_df0(df0)
df = df.drop(columns='rn')
df

Unnamed: 0,id,pre_since_opened_0,pre_since_opened_1,pre_since_opened_2,pre_since_opened_3,pre_since_opened_4,pre_since_opened_5,pre_since_opened_6,pre_since_opened_7,pre_since_opened_8,...,enc_loans_credit_type_4,enc_loans_credit_type_5,enc_loans_account_cur_0,enc_loans_account_cur_1,enc_loans_account_cur_2,enc_loans_account_cur_3,pclose_flag_0,pclose_flag_1,fclose_flag_0,fclose_flag_1
0,0,0,1,1,1,1,2,0,1,0,...,7,0,0,10,0,0,9,1,8,2
1,1,0,0,1,0,0,0,0,1,2,...,8,0,0,14,0,0,13,1,12,2
2,2,1,0,0,0,0,0,0,0,0,...,1,0,0,3,0,0,1,2,1,2
3,3,0,3,1,0,2,1,3,0,0,...,9,1,0,15,0,0,10,5,9,6
4,4,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
249995,249995,2,1,2,0,0,0,1,1,0,...,7,3,0,13,0,0,12,1,10,3
249996,249996,0,1,2,1,2,2,3,1,1,...,9,3,0,23,0,0,18,5,18,5
249997,249997,0,2,0,0,0,0,0,0,4,...,2,0,0,6,1,0,6,1,5,2
249998,249998,0,2,0,0,0,0,0,1,0,...,2,0,0,5,0,0,3,2,2,3


In [24]:
# Features: every column of the current `df` except the 'id' key.
X = df.drop(columns='id')
# Labels are paired with X by row only — assumes row i of `target` describes
# row i of `df`; TODO confirm.
y = target['flag']

# 80/20 split, stratified on the label, with a fixed random_state.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2, stratify=y)

# LightGBM classifier: class 1 is weighted 32x relative to class 0.
lgbm = LGBMClassifier(class_weight={0:1, 1:32}, n_jobs=-1, random_state=42, reg_lambda=1000, reg_alpha=0.4)

# Fit and print the metrics (see train_model for the order of the printed values).
predict_proba_test_lgbm, predict_test_lgbm = train_model(X_train, X_test, y_train, y_test, lgbm)

[LightGBM] [Info] Number of positive: 6187, number of negative: 193813
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.049803 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5644
[LightGBM] [Info] Number of data points in the train set: 200000, number of used features: 379
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.505323 -> initscore=0.021292
[LightGBM] [Info] Start training from score 0.021292
0.8290689069734112
0.7627924466510313
0.06987407168227316
0.6994182288299935
[[34050 14403]
 [  465  1082]]


In [None]:
# признак rn - не влияет на модель

In [None]:
'''Итог - только признак growth_limit из созданных фичей улучшает модель