@curiosity
- 运行环境
    - 系统版本
        - win10
    - 包版本
        - numpy
        - pandas
        - sklearn
        - lightgbm
- 特征工程思路
    - 构造新特征。例如：有效贷款总数、主账户违约比率、二级账户违约比率、总违约比率、总未还贷款金额比率等。
    - 数值特征的统计特征。按各类别特征分组，计算credit_score的均值（mean）作为统计特征。
    - 尝试对age/loan_to_asset_ratio_bin进行较粗粒度的序数编码，希望增强泛化能力（但好像没什么用）。
    - 目标编码（target encoding）。对employee_code_id/supplier_id/branch_id等类别特征做目标编码。
    - 计数编码（count encoding）。对employee_code_id/supplier_id/branch_id等类别特征做计数编码。

# Load data

In [None]:
## 导入第三方包
import pandas as pd
import numpy as np
import lightgbm as lgb
# import xgboost as xgb
# import catboost as cat
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
import gc
import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 100)
# NOTE(review): absolute, machine-specific paths -- adjust before running elsewhere.
train_path = '/Users/dingchaoshun/python-file/iFLYTEK_CarLoan/data/train.csv'
test_path = '/Users/dingchaoshun/python-file/iFLYTEK_CarLoan/data/test.csv'

## Read the datasets (see the operation manual for how to download them)
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)
# Exchange the names of these two columns in both frames.
# NOTE(review): presumably the raw files ship with the two headers interchanged -- confirm.
_swapped_column_names = {
    'sub_account_inactive_loan_no': 'total_inactive_loan_no',
    'total_inactive_loan_no': 'sub_account_inactive_loan_no',
}
train = train.rename(columns=_swapped_column_names)
test = test.rename(columns=_swapped_column_names)
target = 'loan_default'
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the two frames
# the same way (original row indexes are kept, columns are not sorted).
df_feature = pd.concat([train, test], sort=False)

# Feature Engineering

## create_feature

In [None]:
# create feature
def create_feature(df):
    """Add the total active-loan count and five ratio columns to ``df``.

    The frame is modified in place and also returned. Every ratio adds a
    small constant to its denominator so that a zero denominator does not
    divide by zero.
    """
    eps = 1e-5

    # Must exist before the ratios: 'total_overdue_ratio' divides by it.
    df['total_active_loan_no'] = (
        df['main_account_active_loan_no'] + df['sub_account_active_loan_no']
    )

    # (new column, numerator column, denominator column), in creation order.
    ratio_specs = (
        ('total_inactive_loan_ratio', 'total_inactive_loan_no', 'total_account_loan_no'),
        ('main_overdue_ratio', 'main_account_overdue_no', 'main_account_active_loan_no'),
        ('sub_overdue_ratio', 'sub_account_overdue_no', 'sub_account_active_loan_no'),
        ('total_overdue_ratio', 'total_overdue_no', 'total_active_loan_no'),
        ('total_outstanding_ratio', 'total_outstanding_loan', 'total_sanction_loan'),
    )
    for new_col, numerator, denominator in ratio_specs:
        df[new_col] = df[numerator] / (df[denominator] + eps)
    return df

## make-ratio-bins

In [None]:
## make-ratio-bins
def make_data_ratio_bin(df):
    """Add ``loan_to_asset_ratio_bin``: the ordinal quantile bin of ``loan_to_asset_ratio``.

    Up to 100 quantile bins are requested and the integer bin code is stored.
    The frame is modified in place and also returned.

    ``duplicates='drop'`` merges repeated quantile edges; without it
    ``pd.qcut`` raises ``ValueError`` whenever the column has many tied
    values, so fewer than 100 distinct codes may be produced on such data.
    Missing ratios receive the categorical code -1.
    """
    quantile_bins = pd.qcut(df['loan_to_asset_ratio'], 100, duplicates='drop')
    df['loan_to_asset_ratio_bin'] = quantile_bins.cat.codes
    return df

## make_age_bins

In [None]:
# ## cut bins
def make_age_bins(x):
    """Map an age to a coarse ordinal bin from 1 to 5.

    Bins: <=22 -> 1, (22, 35] -> 2, (35, 45] -> 3, (45, 60] -> 4,
    everything else -> 5. NaN fails every comparison and therefore also
    lands in bin 5.
    """
    if x <= 22:
        return 1
    if x <= 35:
        return 2
    if x <= 45:
        return 3
    if x <= 60:
        return 4
    return 5
def make_data_age_bin(df):
    """Add an ``age_bin`` column by applying ``make_age_bins`` to every ``age`` value.

    The frame is modified in place and also returned.
    """
    ages = df['age']
    df['age_bin'] = ages.map(make_age_bins)
    return df

## make_stats_feature

In [None]:
# Statistical features: group-wise statistics of credit_score.
# Other numeric columns that were considered as labels:
# 'loan_to_asset_ratio', 'disbursed_amount'.
label_cols = ['credit_score']
category_cols = [
    'branch_id',
    'supplier_id',
    'manufacturer_id',
    'area_id',
    'employee_code_id',
    'employment_type',
    'Credit_level',
]
group_cols = category_cols  # alias: the very same list object


def make_stats_feature(df, label_cols, group_cols):
    """Add the group-wise mean of each label column for each grouping column.

    For every (label, group) pair a column ``{label}_by_{group}_mean`` is
    written, holding the mean of ``label`` over rows sharing the row's
    ``group`` value. The frame is modified in place and also returned.
    """
    pairs = [
        (value_col, key_col)
        for value_col in label_cols
        for key_col in group_cols
    ]
    for value_col, key_col in pairs:
        group_mean = df.groupby(key_col)[value_col].transform('mean')
        df[f'{value_col}_by_{key_col}_mean'] = group_mean
    return df

## count encoding

In [None]:
## Count (frequency) encoding
count_cols = [
    'branch_id',
    'supplier_id',
    'manufacturer_id',
    'area_id',
    'employee_code_id',
    'employment_type',
    'Credit_level',
]


def count_encoding(df, count_cols):
    """Add a ``{col}_count`` column for every column in ``count_cols``.

    Each new column holds how many rows of ``df`` share the row's value in
    the source column. The frame is modified in place and also returned.
    """
    for name in tqdm(count_cols):
        # value_counts() skips NaN by default, so missing values map to NaN.
        frequencies = df[name].value_counts()
        df[f'{name}_count'] = df[name].map(frequencies)
    return df

In [None]:
## Feature pipeline
def process_data(df):
    """Run the feature-engineering steps on ``df`` in a fixed order.

    Order: derived loan features, age binning, group-mean statistics,
    count encoding. ``make_data_ratio_bin`` is not part of this pipeline.
    """
    steps = (
        (create_feature, ()),
        (make_data_age_bin, ()),
        (make_stats_feature, (label_cols, group_cols)),
        (count_encoding, (count_cols,)),
    )
    for step, extra_args in steps:
        df = step(df, *extra_args)
    return df


df_feature = process_data(df_feature)

## 5-fold target encoding

In [None]:
## Group-by statistic features (any aggregation such as mean/std/min/max/median)
target_encode_cols = [
    'branch_id',
    'supplier_id',
    'manufacturer_id',
    'area_id',
    'employee_code_id',
    'employment_type',
    'Credit_level',
]


def stat(df, df_merge, group_by, agg):
    """Aggregate ``df`` by ``group_by`` and left-join the result onto ``df_merge``.

    Args:
        df: frame the statistics are computed from.
        df_merge: frame that receives the statistics.
        group_by: list of key column names, also used as the join keys.
        agg: mapping ``{column: [aggregation names]}`` passed to ``.agg``.

    Returns:
        A new frame: ``df_merge`` plus one column per (column, aggregation)
        pair, named ``{keys joined by '_'}_{column}_{aggregation}``. Keys
        absent from ``df`` get NaN.
    """
    grouped = df.groupby(group_by).agg(agg)

    # Flatten the two-level column index produced by the dict-of-lists spec.
    prefix = '_'.join(group_by)
    grouped.columns = [
        '{}_{}_{}'.format(prefix, source_col, func_name)
        for source_col, func_names in agg.items()
        for func_name in func_names
    ]
    grouped = grouped.reset_index()
    merged = df_merge.merge(grouped, on=group_by, how='left')

    # Drop the intermediate table before returning to keep memory down.
    del grouped
    gc.collect()

    return merged

def statis_feat(df_know, df_unknow):
    """Target-encode ``df_unknow`` with statistics computed on ``df_know``.

    For every column in ``target_encode_cols`` the mean and std of the
    target within each category of ``df_know`` are joined onto
    ``df_unknow``. Returns the enriched frame.
    """
    target_stats = {target: ['mean', 'std']}
    for key_col in tqdm(target_encode_cols):
        df_unknow = stat(df_know, df_unknow, [key_col], target_stats)
    return df_unknow

## 5-fold out-of-fold target encoding
# Labelled rows get their encoding from the other four folds only, so a row's
# own label never contributes to its encoded value.
train = df_feature[df_feature[target].notnull()].reset_index(drop=True)
test = df_feature[df_feature[target].isnull()]

kfold = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
encoded_folds = []
for tra_index, val_index in kfold.split(train, train[target]):
    encoded_folds.append(
        statis_feat(train.iloc[tra_index], train.iloc[val_index])
    )
    gc.collect()
df_stas_feat = pd.concat(encoded_folds, axis=0)
del encoded_folds

# Unlabelled rows are encoded with statistics from all labelled rows.
test = statis_feat(train, test)
df_feature = pd.concat([df_stas_feat, test], axis=0)

del df_stas_feat, train, test
gc.collect()

# model

## split data

In [None]:
## Prepare the training and test matrices
has_label = df_feature[target].notnull()
train = df_feature[has_label]
test = df_feature[~has_label]

# Columns that are not fed to the model (includes the target itself).
useless_cols = [
    'customer_id',
    'loan_default',
    'mobileno_flag',
    'idcard_flag',
    'disbursed_date',
]
all_cols = [name for name in train.columns if name not in useless_cols]
x_train, x_test = train[all_cols], test[all_cols]
y_train = train[target]

print(x_train.shape)
print(x_test.shape)
x_train.head()

## LGB Train

In [None]:
## Baseline: only the classic LightGBM model is trained here; XGBoost, CatBoost
## and neural networks are further options to try.
def cv_model(clf, train_x, train_y, test_x, clf_name='lgb'):
    """Train with stratified 10-fold cross-validation and predict the test set.

    Args:
        clf: the LightGBM module (must expose ``Dataset``, ``train`` and the
            ``early_stopping`` / ``log_evaluation`` callbacks).
        train_x: training feature frame.
        train_y: training labels, positionally aligned with ``train_x``.
        test_x: test feature frame.
        clf_name: label used only in the printed summary.

    Returns:
        ``(oof_pred, test_pred, mean_auc)``: out-of-fold predictions for the
        training rows, test predictions averaged over the folds, and the
        mean validation AUC rounded to six decimals.
    """
    folds = 10
    seed = 2021
    kfold = StratifiedKFold(n_splits=folds, shuffle=True, random_state=seed)

    # Identical for every fold, so built once outside the loop.
    params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
        'min_child_weight': 5,
        'num_leaves': 2 ** 7,
        'lambda_l2': 10,
        'feature_fraction': 0.9,
        'bagging_fraction': 0.9,
        'bagging_freq': 4,
        'learning_rate': 0.01,
        'seed': seed,
        'n_jobs': -1,
        'silent': True,
        'verbose': -1,
    }

    # The verbose_eval / early_stopping_rounds keyword arguments were removed
    # from train() in LightGBM 4.0; callbacks are the supported replacement.
    # Releases before 3.3 name the logging callback print_evaluation.
    make_log_callback = getattr(clf, 'log_evaluation', None) or clf.print_evaluation

    train_pred_lst = np.zeros(train_x.shape[0])
    test_pred_lst = np.zeros(test_x.shape[0])

    cv_scores = []

    for i, (train_index, valid_index) in enumerate(kfold.split(train_x, train_y)):
        print('************************************ {} ************************************'.format(str(i+1)))
        trn_x, trn_y = train_x.iloc[train_index], train_y.iloc[train_index]
        val_x, val_y = train_x.iloc[valid_index], train_y.iloc[valid_index]

        train_matrix = clf.Dataset(trn_x, label=trn_y)
        valid_matrix = clf.Dataset(val_x, label=val_y)

        # Fresh callbacks per fold so no early-stopping state is carried over.
        callbacks = [clf.early_stopping(200), make_log_callback(500)]
        model = clf.train(
            params,
            train_matrix,
            10000,
            valid_sets=[train_matrix, valid_matrix],
            callbacks=callbacks,
        )
        val_pred = model.predict(val_x, num_iteration=model.best_iteration)
        test_pred = model.predict(test_x, num_iteration=model.best_iteration)
        train_pred_lst[valid_index] = val_pred
        test_pred_lst += test_pred / kfold.n_splits
        cv_scores.append(roc_auc_score(val_y, val_pred))

        print(cv_scores)

    mean_auc = round(np.mean(cv_scores), 6)
    print("%s_score_list:" % clf_name, cv_scores)
    print("%s_score_mean:" % clf_name, np.mean(cv_scores))
    print("%s_score_std:" % clf_name, np.std(cv_scores))
    return train_pred_lst, test_pred_lst, mean_auc


lgb_train, lgb_test, lgb_score = cv_model(lgb, x_train, y_train, x_test)

# Submit

In [None]:
## Write the submission file
# Probabilities strictly above this cut-off are labelled as default (1).
# NOTE(review): presumably tuned for the competition metric -- confirm.
DEFAULT_THRESHOLD = 0.25

# Explicit copy: adding a column to a selection of `test` is the
# SettingWithCopy pattern and must not depend on view/copy semantics.
submit = test[['customer_id']].copy()
submit['loan_default'] = (np.asarray(lgb_test) > DEFAULT_THRESHOLD).astype(int)
submit.to_csv(f'mac_submit_{lgb_score}.csv', index=False)