- 수치 -> 평균, 분산
- 범주 -> 발생횟수, 비율

In [1]:
from glob import glob
import re

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

In [2]:
data_path = "../take-home-interview/"

In [3]:
credits = pd.read_csv(data_path + "defi_credits.csv")
credit_balance = pd.read_csv(data_path + "defi_credits_balance.csv")
balance = pd.read_csv(data_path + "aave_loan_balance.csv")
payments = pd.read_csv(data_path + "installments_payments.csv")
app_prev = pd.read_csv(data_path + "application_previous.csv")

### merge defi credit balance

In [4]:
defi_status_pivot = (
    credit_balance.groupby(["defi_id", "status"]).size()
    .reset_index(name="defi_status_count")
    .pivot(index='defi_id', columns='status', values='defi_status_count')
)

defi_features = pd.concat([
    credit_balance.groupby("defi_id").months_balance.min(),
    defi_status_pivot
])
defi_features.columns = ['defi_min_month'] + ("defi_status_" + defi_status_pivot.columns).tolist()

In [5]:
assert defi_features.max().max() < 32767
defi_features = defi_features.fillna(0).astype(np.int16)

In [6]:
credits = credits.merge(defi_features.reset_index(), on='defi_id', how='left')
for col in defi_features.columns:
    credits[col] = credits[col].fillna(0).astype(np.int16)

### Extrac simple features

In [7]:
def extract_features(df):
    int_cols = df.dtypes[df.dtypes == int].index.tolist()
    float_cols = df.dtypes[df.dtypes == float].index.tolist()
    cate_cols = df.dtypes[df.dtypes == 'object'].index.tolist()

    ## 칼럼 예외 처리
    int_cols = [col for col in int_cols if 'id' not in col]
    
    num_cols = int_cols + float_cols

    ## numeric -> 평균
    res_mean = []
    res_std = []
    for col in num_cols:
        col_mean = df.groupby("application_id_current")[col].mean()
        col_std = df.groupby("application_id_current")[col].std()
        
        res_mean.append(col_mean)
        res_std.append(col_std)
    
    numeric_mean_features = pd.concat(res_mean, axis=1)
    numeric_std_features = pd.concat(res_std, axis=1)
    numeric_mean_features.columns = [f"{col}_mean" for col in num_cols]
    numeric_std_features.columns = [f"{col}_std" for col in num_cols]

    ## cate -> 카운트해서 pivot
    res = []
    app_size_df = df.groupby("application_id_current").size()
    for col in tqdm(cate_cols):
        count_df = df.groupby(["application_id_current", col]).size().reset_index(name=f"{col}_count")
        count_pivot = count_df.pivot(index="application_id_current", columns=col, values=f"{col}_count").fillna(0)

        assert count_pivot.max().max() < 32767
        count_pivot = count_pivot.astype(np.int16)
        ratio_pivot = pd.concat([count_pivot[col]/app_size_df for col in count_pivot.columns], axis=1)

        count_pivot.columns = col + "_" + count_pivot.columns + "count"
        ratio_pivot.columns = col + "_" + count_pivot.columns + "ratio"
        res.append(count_pivot)
        res.append(ratio_pivot)
        
    if cate_cols:
        cate_count_features = pd.concat(res, axis=1)
    else:
        cate_count_features = None
        
    features = pd.concat([numeric_mean_features, numeric_std_features, cate_count_features], axis=1)
    print(f"{len(features)}개 관측치에 대해 {features.shape[1]}개 피쳐 생성")
    
    return features

In [8]:
%%time
credit_feature = extract_features(credits)
credit_feature.to_pickle(data_path+"credit_feature.pkl")
# del credits, credit_feature

balance_feature = extract_features(balance)
balance_feature.to_pickle(data_path+"balance_feature.pkl")
# del balance, balance_feature

payment_feature = extract_features(payments)
payment_feature.to_pickle(data_path+"payment_feature.pkl")
# del payments, payment_feature

app_prev_feature = extract_features(app_prev)
app_prev_feature.to_pickle(data_path+"app_prev_feature.pkl")
# del app_prev, app_prev_feature

100%|██████████| 2/2 [00:01<00:00,  1.70it/s]


305811개 관측치에 대해 40개 피쳐 생성


100%|██████████| 1/1 [00:00<00:00,  1.96it/s]


103558개 관측치에 대해 54개 피쳐 생성


0it [00:00, ?it/s]


339587개 관측치에 대해 12개 피쳐 생성


100%|██████████| 13/13 [00:08<00:00,  1.46it/s]


338857개 관측치에 대해 272개 피쳐 생성
CPU times: user 27.2 s, sys: 2.89 s, total: 30.1 s
Wall time: 30.1 s
