In [1]:
import pandas as pd

import fastparquet

import joblib

In [2]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import roc_auc_score, confusion_matrix
from sklearn.model_selection import cross_validate, GridSearchCV

In [3]:
def full_path(path: str) -> str:
    """Return ``path`` prefixed with the project root directory.

    The root defaults to the original author's local checkout. Setting the
    ``DIPLOMA_ML_ROOT`` environment variable overrides it, so the notebook can
    run on another machine without editing this cell.

    Parameters
    ----------
    path : str
        Project-relative path, e.g. ``'data/train_target.csv'``.

    Returns
    -------
    str
        The root (with a trailing separator) concatenated with ``path``.
    """
    import os

    default_root = r'C:\Users\Arseniy\Documents\Skillbox\diploma_ML/'
    path_to_project = os.environ.get('DIPLOMA_ML_ROOT') or default_root
    # Plain string concatenation is kept (instead of os.path.join) so that the
    # default result is character-for-character what it was before.
    if not path_to_project.endswith(('/', '\\')):
        path_to_project += '/'
    return path_to_project + path

## Load raw data

In [4]:
# Read every training shard first and concatenate once at the end.
# Growing a DataFrame with pd.concat inside the loop re-copies all rows read
# so far on every iteration.
frames = []
for name in [f"data/train_data/train_data_{i}.pq" for i in range(12)]:
    print(f"Read <{name.split('/')[-1]}> ... ", end='')
    frames.append(pd.read_parquet(full_path(name)))
    print('done')
data = pd.concat(frames, axis=0, ignore_index=True)
# Drop the per-shard references so only the combined frame stays in memory.
del frames

Read <train_data_0.pq> ... done
Read <train_data_1.pq> ... done
Read <train_data_2.pq> ... done
Read <train_data_3.pq> ... done
Read <train_data_4.pq> ... done
Read <train_data_5.pq> ... done
Read <train_data_6.pq> ... done
Read <train_data_7.pq> ... done
Read <train_data_8.pq> ... done
Read <train_data_9.pq> ... done
Read <train_data_10.pq> ... done
Read <train_data_11.pq> ... done


In [5]:
# Show the concatenated frame; the notebook renders a truncated preview
# (first/last rows, elided middle columns) rather than every row.
data

Unnamed: 0,id,rn,pre_since_opened,pre_since_confirmed,pre_pterm,pre_fterm,pre_till_pclose,pre_till_fclose,pre_loans_credit_limit,pre_loans_next_pay_summ,...,enc_paym_21,enc_paym_22,enc_paym_23,enc_paym_24,enc_loans_account_holder_type,enc_loans_credit_status,enc_loans_credit_type,enc_loans_account_cur,pclose_flag,fclose_flag
0,0,1,18,9,2,3,16,10,11,3,...,3,3,3,4,1,3,4,1,0,0
1,0,2,18,9,14,14,12,12,0,3,...,0,0,0,4,1,3,4,1,0,0
2,0,3,18,9,4,8,1,11,11,0,...,0,0,0,4,1,2,3,1,1,1
3,0,4,4,1,9,12,16,7,12,2,...,3,3,3,4,1,3,1,1,0,0
4,0,5,5,12,15,2,11,12,10,2,...,3,3,3,4,1,3,4,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26162712,2999999,8,6,5,14,13,1,15,16,2,...,0,0,0,1,1,3,4,1,0,0
26162713,2999999,9,5,3,2,10,15,14,17,2,...,0,0,0,4,1,3,4,1,0,0
26162714,2999999,10,3,16,11,13,14,8,15,5,...,0,0,3,4,1,2,4,1,0,0
26162715,2999999,11,3,6,4,8,1,11,0,5,...,3,3,3,4,1,2,3,1,1,1


## Aggregation

In [6]:
def max_loan_months(s: pd.Series) -> int:
    """Return the length of the longest unbroken run of 1/2 values in ``s``.

    Any value other than 1 or 2 ends the current run. An empty series, or one
    containing no 1/2 values, yields 0.
    """
    longest = 0
    current = 0
    for status in s.values:
        if status in (1, 2):
            current += 1
            if current > longest:
                longest = current
        else:
            # Run broken: start counting again from the next element.
            current = 0
    return longest

In [7]:
def frac_loan_months(df: pd.DataFrame) -> pd.Series:
    """Per row, the share of non-3 cells whose value is 1 or 2.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose cells are compared against the codes 1, 2 and 3.

    Returns
    -------
    pd.Series
        One float per row of ``df`` (same index): the count of cells equal to
        1 or 2 divided by the count of cells different from 3. Rows in which
        every cell equals 3 use a denominator of 24, so they evaluate to 0.0
        instead of dividing by zero.
    """
    total_pay_months = (df != 3).sum(axis=1)
    # Vectorised replacement of the zero denominators (no per-element apply).
    total_pay_months = total_pay_months.where(total_pay_months != 0, 24)
    return ((df == 1).sum(axis=1) + (df == 2).sum(axis=1)) / total_pay_months

In [8]:
def credit_history_length(s: pd.Series) -> int:
    """Return the numeric suffix of the last ``enc_paym_<N>`` label whose value is not 3.

    Parameters
    ----------
    s : pd.Series
        One row of payment codes, indexed by labels of the form
        ``enc_paym_<N>``.

    Returns
    -------
    int
        ``N`` of the right-most label whose value differs from 3, or 0 when
        every value equals 3.
    """
    prefix = 'enc_paym_'
    for col in reversed(s.index):
        if s[col] != 3:
            # str.lstrip(prefix) would strip any leading characters found in
            # the *set* of prefix characters, not the prefix itself; slice off
            # the literal prefix instead.
            suffix = col[len(prefix):] if col.startswith(prefix) else col
            return int(suffix)
    return 0

In [9]:
def aggregate(df: pd.DataFrame, verbose=False) -> pd.DataFrame:
    """Collapse the per-record frame into one feature row per unique ``id``.

    Features are produced in this order:

    * per-id sums of the flag columns;
    * for every remaining column (except ``id``, ``rn`` and three explicitly
      skipped columns), per-id counts of each category (one-hot encode, then
      sum);
    * per-id maxima of ``rn`` and of three features engineered from the
      ``enc_paym_*`` columns: ``credit_history_length``, ``num_loan_months``
      and ``frac_loan_months`` (stored with a ``_max`` suffix).

    The input frame is left untouched: the ``enc_paym_*`` adjustments and the
    engineered columns live in local frames, so calling this function again on
    the same ``df`` gives the same result.

    Relies on ``max_loan_months``, ``frac_loan_months`` and
    ``credit_history_length`` being defined in the notebook.

    NOTE(review): the one-hot encoders are re-fitted on every call, so the set
    of output columns depends on the categories present in ``df``. Confirm
    that data passed at prediction time yields the same columns as training.

    Parameters
    ----------
    df : pd.DataFrame
        Record-level frame containing ``id``, ``rn``, the flag columns, the
        ``enc_paym_*`` columns and the categorical columns.
    verbose : bool, default False
        Print a progress trace while aggregating.

    Returns
    -------
    pd.DataFrame
        One row per unique ``id``, indexed by ``id`` in order of first
        appearance in ``df``.
    """

    df_agg = pd.DataFrame(index=df.id.unique())

    ## Flags
    if verbose: print('Aggregate flags: ', end='')
    flags = [
        'is_zero_loans5', 'is_zero_loans530', 'is_zero_loans3060', 'is_zero_loans6090', 'is_zero_loans90',
        'is_zero_util', 'is_zero_over2limit', 'is_zero_maxover2limit', 'pclose_flag', 'fclose_flag',
    ]
    df_agg.loc[:,flags] = df[['id'] + flags].groupby('id').agg('sum')
    if verbose: print('▮')

    ## Categorical encode
    if verbose: print('Categorial encoding: ', end='')

    cat = [x for x in df.columns if x not in ['id', 'rn'] + flags + ['pre_loans6090', 'pre_over2limit', 'enc_loans_account_cur']]

    # Group key as a plain array: grouping is then purely positional and does
    # not depend on the shape of df's index.
    ids = df['id'].to_numpy()

    for col in cat:

        ohe = OneHotEncoder(sparse_output=False, dtype='int8')
        encoded = pd.DataFrame(
            ohe.fit_transform(df[[col]]),
            index=df.index,
            columns=ohe.get_feature_names_out()
        )

        # Summing the indicator columns per id counts how often each category occurs.
        res = encoded.groupby(ids).agg('sum')
        # Both frames are keyed by id, so a plain index join lines them up.
        df_agg = df_agg.join(res)

        if verbose: print('▮', end='')

    if verbose: print()

    ## F.eng from enc_paym_N
    if verbose: print('Feature eng. from enc_paym_N: ', end='')

    enc_paym = [x for x in df.columns if x.startswith('enc_paym')]
    # Work on a private copy so the caller's frame is never modified.
    paym = df[enc_paym].copy()
    # NOTE(review): presumably these three columns are encoded one step higher
    # than the others and are shifted to match — TODO confirm against the data
    # description.
    shifted = ['enc_paym_11', 'enc_paym_20', 'enc_paym_24']
    paym[shifted] = paym[shifted] - 1
    if verbose: print('▮', end='')

    # Engineered record-level features are collected next to id/rn in a
    # separate frame instead of being appended to df.
    feats = pd.DataFrame({'id': df['id'], 'rn': df['rn']}, index=df.index)

    feats['num_loan_months'] = paym.apply(max_loan_months, axis=1)
    if verbose: print('▮', end='')
    feats['frac_loan_months'] = frac_loan_months(paym)
    if verbose: print('▮', end='')

    # The row-wise apply is done in positional chunks; the pieces are stitched
    # back together afterwards.
    chunk_size = 3_000_000
    pieces = []
    n = 0
    while n < paym.shape[0]:
        n_init = n
        n = min(n + chunk_size, paym.shape[0])
        pieces.append(paym.iloc[n_init:n].apply(credit_history_length, axis=1).astype('int'))
        if verbose: print(n // chunk_size, end='')
    # Positional assignment: the pieces cover the rows of df in order.
    feats['credit_history_length'] = pd.concat(pieces).to_numpy()
    if verbose: print('▮', end='')

    if verbose: print()

    ## Aggregate numerical -> max
    if verbose: print("Aggregate numerical ", end='')

    num = ['rn', 'credit_history_length', 'num_loan_months', 'frac_loan_months']
    df_agg[[f"{col}_max" for col in num]] = feats[['id'] + num].groupby('id').agg('max')
    if verbose: print('▮', end='')

    if verbose: print()
    if verbose: print(f"Aggregation completed. Result shape: {df_agg.shape}")

    return df_agg

## Target loading

In [10]:
# Load the label file and keep only its `flag` column as a Series.
target_path = full_path('data/train_target.csv')
target = pd.read_csv(target_path)['flag']
target

0          0
1          0
2          0
3          0
4          0
          ..
2999995    0
2999996    0
2999997    0
2999998    0
2999999    0
Name: flag, Length: 3000000, dtype: int64

## Pipeline

In [14]:
# Gradient-boosting classifier used as the final pipeline step.
hgb = HistGradientBoostingClassifier(
    # Regularisation and class re-weighting.
    l2_regularization=10,
    class_weight='balanced',
    # Early-stopping related settings (see the scikit-learn documentation for
    # how scoring / n_iter_no_change / tol interact).
    scoring='roc_auc',
    n_iter_no_change=30,
    tol=1e-5,
    # Fixed seed for reproducible fits.
    random_state=44,
)

In [15]:
# Two-stage pipeline: `aggregate` (wrapped as a transformer) turns the
# record-level frame into one row per id, then the classifier is trained on it.
pipeline_steps = [
    ('aggregator', FunctionTransformer(aggregate)),
    ('classifier', hgb),
]
pipe = Pipeline(steps=pipeline_steps, verbose=True)

In [16]:
# Fit end to end: the aggregator step runs first, then the classifier step is
# trained on its output together with `target`.
pipe.fit(data, target)

[Pipeline] ........ (step 1 of 2) Processing aggregator, total=31.7min
[Pipeline] ........ (step 2 of 2) Processing classifier, total=10.4min


In [17]:
# Persist the fitted pipeline to the project's model directory.
# NOTE(review): the pipeline wraps the notebook-level `aggregate` function;
# presumably it (and the helpers it calls) must be defined in whatever
# environment loads this file — confirm before deploying.
model_path = full_path('model/hgb_04.pkl')
joblib.dump(pipe, model_path)

['C:\\Users\\Arseniy\\Documents\\Skillbox\\diploma_ML/model/hgb_04.pkl']