In [1]:
import pandas as pd

import fastparquet

import gc
import dill
import joblib

In [2]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import roc_auc_score, confusion_matrix
from sklearn.model_selection import cross_validate, GridSearchCV

In [3]:
from lightgbm import LGBMClassifier

In [4]:
from pipe_utils import *

## Load raw data

In [5]:
# Read the 12 training shards and combine them into a single frame.
# The shards are collected in a list and concatenated once: calling
# pd.concat on a growing frame inside the loop re-copies every previously
# loaded row on each iteration.
shards = []
for name in [f"data/train_data/train_data_{i}.pq" for i in range(12)]:
    print(f"Read <{name.split('/')[-1]}> ... ", end='')
    shards.append(pd.read_parquet(full_path(name)))
    print('done')

# ignore_index=True gives the combined frame a fresh 0..N-1 row index.
data = pd.concat(shards, axis=0, ignore_index=True)

# Drop the per-shard references so only the combined frame stays in memory.
del shards

Read <train_data_0.pq> ... done
Read <train_data_1.pq> ... done
Read <train_data_2.pq> ... done
Read <train_data_3.pq> ... done
Read <train_data_4.pq> ... done
Read <train_data_5.pq> ... done
Read <train_data_6.pq> ... done
Read <train_data_7.pq> ... done
Read <train_data_8.pq> ... done
Read <train_data_9.pq> ... done
Read <train_data_10.pq> ... done
Read <train_data_11.pq> ... done


In [6]:
# Show the combined raw frame as the cell output; the rendered table is
# truncated to the first and last rows and columns.
data

Unnamed: 0,id,rn,pre_since_opened,pre_since_confirmed,pre_pterm,pre_fterm,pre_till_pclose,pre_till_fclose,pre_loans_credit_limit,pre_loans_next_pay_summ,...,enc_paym_21,enc_paym_22,enc_paym_23,enc_paym_24,enc_loans_account_holder_type,enc_loans_credit_status,enc_loans_credit_type,enc_loans_account_cur,pclose_flag,fclose_flag
0,0,1,18,9,2,3,16,10,11,3,...,3,3,3,4,1,3,4,1,0,0
1,0,2,18,9,14,14,12,12,0,3,...,0,0,0,4,1,3,4,1,0,0
2,0,3,18,9,4,8,1,11,11,0,...,0,0,0,4,1,2,3,1,1,1
3,0,4,4,1,9,12,16,7,12,2,...,3,3,3,4,1,3,1,1,0,0
4,0,5,5,12,15,2,11,12,10,2,...,3,3,3,4,1,3,4,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26162712,2999999,8,6,5,14,13,1,15,16,2,...,0,0,0,1,1,3,4,1,0,0
26162713,2999999,9,5,3,2,10,15,14,17,2,...,0,0,0,4,1,3,4,1,0,0
26162714,2999999,10,3,16,11,13,14,8,15,5,...,0,0,3,4,1,2,4,1,0,0
26162715,2999999,11,3,6,4,8,1,11,0,5,...,3,3,3,4,1,2,3,1,1,1


## Target loading

In [7]:
# Labels: read the target CSV and keep only its `flag` column as a Series.
# NOTE(review): presumably one row per client id, in the same order in which
# the aggregated features are produced - confirm before relying on
# positional alignment between features and labels.
target = pd.read_csv(full_path('data/train_target.csv'))['flag']
target

0          0
1          0
2          0
3          0
4          0
          ..
2999995    0
2999996    0
2999997    0
2999998    0
2999999    0
Name: flag, Length: 3000000, dtype: int64

## Pipeline

In [8]:
# Run a full garbage collection before building and fitting the pipeline;
# the displayed value is the number of unreachable objects found.
gc.collect()

1305

In [9]:
# Gradient-boosted classifier used as the final pipeline step.
lgbm = LGBMClassifier(
    # Boosting schedule.
    n_estimators=300,
    learning_rate=0.05,
    # Tree shape.
    # NOTE(review): a tree of depth 5 has at most 2**5 = 32 leaves, so
    # num_leaves=33 is presumably never the binding limit - confirm.
    max_depth=5,
    num_leaves=33,
    # Regularisation.
    reg_lambda=10,
    # Positives are a small minority of the training rows (106442 of
    # 3000000 in the fit log), so classes are reweighted.
    class_weight='balanced',
    # Fixed seed for reproducible fits.
    random_state=44,
)

In [10]:
# Feature-building step (Aggregator comes from the pipe_utils star import).
# It is bound to its own name so that its `df` / `df_agg` attributes can be
# reset after fitting.
aggregator = Aggregator()

In [11]:
# Two-stage pipeline: raw per-record rows -> aggregated features -> classifier.
# verbose=True makes the pipeline print the elapsed time of each step.
pipe = Pipeline(
    steps=[
        ('aggregator', aggregator),
        ('classifier', lgbm),
    ],
    verbose=True,
)

In [12]:
# Fit the whole pipeline on the raw records.
# `data` has one row per credit record (about 26M rows) while `target` has
# 3,000,000 rows; the aggregator step reduces the features to
# (3000000, 385) before the classifier sees them.
# NOTE(review): assumes the aggregated rows come out in the same order as
# `target` - confirm against the Aggregator implementation.
# Long-running: the recorded run took ~32 min for aggregation and ~4.4 min
# for the classifier.
pipe.fit(data, target)

Aggregate flags: ▮
Categorial encoding: ▮▮▮▮▮▮▮▮▮▮▮▮▮▮▮▮▮▮▮▮▮▮▮▮▮▮▮▮▮▮▮▮▮▮▮▮▮▮▮▮▮▮▮▮▮▮
Features from enc_paym_N: ▮▮▮123456788▮
Aggregate numerical ▮
Adding has_loans feature
Relative OH-encoded features
Added 3 missed columns
Aggregation completed. Result shape: (3000000, 385)
[Pipeline] ........ (step 1 of 2) Processing aggregator, total=32.0min
[LightGBM] [Info] Number of positive: 106442, number of negative: 2893558
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 6.514989 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 39971
[LightGBM] [Info] Number of data points in the train set: 3000000, number of used features: 368
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
[Pipeline] ........ (step 2 of 2) Processing classifier, total= 4.4min


In [13]:
# Reset the frames the aggregator keeps on itself, then collect garbage.
# The pipeline references this same aggregator instance, so the reset is
# visible through `pipe` as well.
for cached_attr in ('df', 'df_agg'):
    setattr(aggregator, cached_attr, None)
gc.collect()

53

In [15]:
# Persist the fitted pipeline (aggregator + classifier) to disk with dill.
# NOTE(review): dill rather than the standard pickle module is presumably
# needed for objects inside the pipeline - confirm.
model_path = full_path('model/pipe_11.pkl')
with open(model_path, mode='wb') as pkl_out:
    dill.dump(pipe, pkl_out)