In [None]:
import pandas as pd
import os
import gc
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor, CatBoostClassifier
from sklearn.linear_model import SGDRegressor, LinearRegression, Ridge
from sklearn.preprocessing import MinMaxScaler
import math
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, log_loss
import matplotlib.pyplot as plt
import time
import warnings
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
warnings.filterwarnings('ignore')

In [None]:
train = pd.read_csv('train.csv')
testA = pd.read_csv('testA.csv')

字段表

Field 	Description

id 	为贷款清单分配的唯一信用证标识

loanAmnt 	贷款金额

term 	贷款期限（year）

interestRate 	贷款利率

installment 	分期付款金额

grade 	贷款等级

subGrade 	贷款等级之子级

employmentTitle 	就业职称

employmentLength 	就业年限（年）

homeOwnership 	借款人在登记时提供的房屋所有权状况

annualIncome 	年收入

verificationStatus 	验证状态

issueDate 	贷款发放的月份

purpose 	借款人在贷款申请时的贷款用途类别

postCode 	借款人在贷款申请中提供的邮政编码的前3位数字

regionCode 	地区编码

dti 	债务收入比

delinquency_2years 	借款人过去2年信用档案中逾期30天以上的违约事件数

ficoRangeLow 	借款人在贷款发放时的fico所属的下限范围

ficoRangeHigh 	借款人在贷款发放时的fico所属的上限范围

openAcc 	借款人信用档案中未结信用额度的数量

pubRec 	贬损公共记录的数量

pubRecBankruptcies 	公开记录清除的数量

revolBal 	信贷周转余额合计

revolUtil 	循环额度利用率，或借款人使用的相对于所有可用循环信贷的信贷金额

totalAcc 	借款人信用档案中当前的信用额度总数

initialListStatus 	贷款的初始列表状态

applicationType 	表明贷款是个人申请还是与两个共同借款人的联合申请

earliesCreditLine 	借款人最早报告的信用额度开立的月份

title 	借款人提供的贷款名称

policyCode 	公开可用的策略_代码=1新产品不公开可用的策略_代码=2

n系列匿名特征 	匿名特征n0-n14，为一些贷款人行为计数特征的处理

In [11]:
train.head()

Unnamed: 0,id,loanAmnt,term,interestRate,installment,grade,subGrade,employmentTitle,employmentLength,homeOwnership,...,n5,n6,n7,n8,n9,n10,n11,n12,n13,n14
0,0,35000.0,5,19.52,917.97,E,E2,320.0,2 years,2,...,9.0,8.0,4.0,12.0,2.0,7.0,0.0,0.0,0.0,2.0
1,1,18000.0,5,18.49,461.9,D,D2,219843.0,5 years,0,...,,,,,,13.0,,,,
2,2,12000.0,5,16.99,298.17,D,D3,31698.0,8 years,0,...,0.0,21.0,4.0,5.0,3.0,11.0,0.0,0.0,0.0,4.0
3,3,11000.0,3,7.26,340.96,A,A4,46854.0,10+ years,1,...,16.0,4.0,7.0,21.0,6.0,9.0,0.0,0.0,0.0,1.0
4,4,3000.0,3,12.99,101.07,C,C2,54.0,,1,...,4.0,9.0,10.0,15.0,7.0,12.0,0.0,0.0,0.0,4.0


In [12]:
# 连接train和test数据
data = pd.concat([train, testA], axis=0, ignore_index=True)

In [13]:
## 数据预处理
# grade、subGrade、employmentLength、issueDate、earliesCreditLine，需要进行预处理
data['employmentLength'].value_counts(dropna=False).sort_index()

1 year        65671
10+ years    328525
2 years       90565
3 years       80163
4 years       59818
5 years       62645
6 years       46582
7 years       44230
8 years       45168
9 years       37866
< 1 year      80226
NaN           58541
Name: employmentLength, dtype: int64

In [14]:
#将employmentLength转换成数值类型
data['employmentLength'].replace(to_replace='10+ years', value='10 years', inplace=True)
data['employmentLength'].replace('< 1 year', '0 years', inplace=True)

def employmentLength_to_int(s):
    if pd.isnull(s):
        return s
    else:
        return np.int8(s.split()[0])
    
data['employmentLength'] = data['employmentLength'].apply(employmentLength_to_int)

In [15]:
data['employmentLength'].value_counts(dropna=False).sort_index()

0.0      80226
1.0      65671
2.0      90565
3.0      80163
4.0      59818
5.0      62645
6.0      46582
7.0      44230
8.0      45168
9.0      37866
10.0    328525
NaN      58541
Name: employmentLength, dtype: int64

In [16]:
# 对earliesCreditLine进行预处理，只取后四位年份
data['earliesCreditLine'].sample(5)

558274    Jun-1997
632445    Oct-1997
388464    Dec-1989
73653     Nov-1997
958962    Sep-2003
Name: earliesCreditLine, dtype: object

In [17]:
data['earliesCreditLine'] = data['earliesCreditLine'].apply(lambda s: int(s[-4:]))

In [18]:
data['earliesCreditLine'].describe()

count    1000000.000000
mean        1998.688632
std            7.606231
min         1944.000000
25%         1995.000000
50%         2000.000000
75%         2004.000000
max         2015.000000
Name: earliesCreditLine, dtype: float64

In [19]:
data.head()

Unnamed: 0,annualIncome,applicationType,delinquency_2years,dti,earliesCreditLine,employmentLength,employmentTitle,ficoRangeHigh,ficoRangeLow,grade,...,pubRecBankruptcies,purpose,regionCode,revolBal,revolUtil,subGrade,term,title,totalAcc,verificationStatus
0,110000.0,0,0.0,17.05,2001,2.0,320.0,734.0,730.0,E,...,0.0,1,32,24178.0,48.9,E2,5,1.0,27.0,2
1,46000.0,0,0.0,27.83,2002,5.0,219843.0,704.0,700.0,D,...,0.0,0,18,15096.0,38.9,D2,5,1723.0,18.0,2
2,74000.0,0,0.0,22.77,2006,8.0,31698.0,679.0,675.0,D,...,0.0,0,14,4606.0,51.8,D3,5,0.0,27.0,2
3,118000.0,0,0.0,17.21,1999,10.0,46854.0,689.0,685.0,A,...,0.0,4,11,9948.0,52.6,A4,3,4.0,28.0,1
4,29000.0,0,0.0,32.16,1977,,54.0,694.0,690.0,C,...,0.0,10,21,2942.0,32.0,C2,3,11.0,27.0,2


In [20]:
# 类别类型数据处理
cate_features = ['grade', 'subGrade', 'employmentTitle', 'homeOwnership', 'verificationStatus', 'purpose', 'postCode', 'regionCode', \
                 'applicationType', 'initialListStatus', 'title', 'policyCode']
for f in cate_features:
    print(f, '类型数：', data[f].nunique())

grade 类型数： 7
subGrade 类型数： 35
employmentTitle 类型数： 298101
homeOwnership 类型数： 6
verificationStatus 类型数： 3
purpose 类型数： 14
postCode 类型数： 935
regionCode 类型数： 51
applicationType 类型数： 2
initialListStatus 类型数： 2
title 类型数： 47903
policyCode 类型数： 1


In [21]:
# 类型数在2之上，又不是高维稀疏的
data = pd.get_dummies(data, columns=['grade', 'subGrade', 'homeOwnership', 'verificationStatus', 'purpose', 'regionCode'], drop_first=True)

In [22]:
# 高维类别特征需要进行转换
for f in ['employmentTitle', 'postCode', 'title']:
    data[f+'_cnts'] = data.groupby([f])['id'].transform('count')
    data[f+'_rank'] = data.groupby([f])['id'].rank(ascending=False).astype(int)
    del data[f]

In [23]:
# 准备训练集测试集
features = [f for f in data.columns if f not in ['id','issueDate','isDefault']]

train = data[data.isDefault.notnull()].reset_index(drop=True)
test = data[data.isDefault.isnull()].reset_index(drop=True)

x_train = train[features]
x_test = test[features]

y_train = train['isDefault']

In [52]:
# 模型训练
def cv_model(clf, train_x, train_y, test_x, clf_name):
    folds = 5
    seed = 2020
    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)

    train = np.zeros(train_x.shape[0])
    test = np.zeros(test_x.shape[0])

    cv_scores = []

    for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
        print('************************************ {} ************************************'.format(str(i+1)))
        trn_x, trn_y, val_x, val_y = train_x.iloc[train_index], train_y[train_index], train_x.iloc[valid_index], train_y[valid_index]

        if clf_name == "lgb":
            train_matrix = clf.Dataset(trn_x, label=trn_y)
            valid_matrix = clf.Dataset(val_x, label=val_y)

            params = {
                'boosting_type': 'gbdt',
                'objective': 'binary',
                'metric': 'auc',
                'min_child_weight': 5,
                'num_leaves': 2 ** 5,
                'lambda_l2': 10,
                'feature_fraction': 0.8,
                'bagging_fraction': 0.8,
                'bagging_freq': 4,
                'learning_rate': 0.1,
                'seed': 2020,
                'nthread': 28,
                'n_jobs':24,
                'silent': True,
                'verbose': -1,
            }

            model = clf.train(params, train_matrix, 50000, valid_sets=[train_matrix, valid_matrix], verbose_eval=200,early_stopping_rounds=200)
            val_pred = model.predict(val_x, num_iteration=model.best_iteration)
            test_pred = model.predict(test_x, num_iteration=model.best_iteration)
            
            # print(list(sorted(zip(features, model.feature_importance("gain")), key=lambda x: x[1], reverse=True))[:20])
                
        if clf_name == "xgb":
            train_matrix = clf.DMatrix(trn_x , label=trn_y)
            valid_matrix = clf.DMatrix(val_x , label=val_y)
            test_matrix = clf.DMatrix(test_x)
            
            params = {'booster': 'gbtree',
                      'objective': 'binary:logistic',
                      'eval_metric': 'auc',
                      'gamma': 1,
                      'min_child_weight': 1.5,
                      'max_depth': 5,
                      'lambda': 10,
                      'subsample': 0.7,
                      'colsample_bytree': 0.7,
                      'colsample_bylevel': 0.7,
                      'eta': 0.04,
                      'tree_method': 'exact',
                      'seed': 2020,
                      'nthread': 36,
                      "silent": True,
                      }
            
            watchlist = [(train_matrix, 'train'),(valid_matrix, 'eval')]
            
            model = clf.train(params, train_matrix, num_boost_round=50000, evals=watchlist, verbose_eval=200, early_stopping_rounds=200)
            val_pred  = model.predict(valid_matrix, ntree_limit=model.best_ntree_limit)
            test_pred = model.predict(test_matrix , ntree_limit=model.best_ntree_limit)
                 
        if clf_name == "cat":
            params = {'learning_rate': 0.05, 'depth': 5, 'l2_leaf_reg': 10, 'bootstrap_type': 'Bernoulli',
                      'od_type': 'Iter', 'od_wait': 50, 'random_seed': 11, 'allow_writing_files': False}
            
            model = clf(iterations=20000, **params)
            model.fit(trn_x, trn_y, eval_set=(val_x, val_y),
                      cat_features=[], use_best_model=True, verbose=500)
            
            val_pred  = model.predict(val_x)
            test_pred = model.predict(test_x)
            
        train[valid_index] = val_pred
        test = test_pred / kf.n_splits
        cv_scores.append(roc_auc_score(val_y, val_pred))
        
        print(cv_scores)
       
    print("%s_scotrainre_list:" % clf_name, cv_scores)
    print("%s_score_mean:" % clf_name, np.mean(cv_scores))
    print("%s_score_std:" % clf_name, np.std(cv_scores))
    return train, test


In [53]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

In [54]:
def lgb_model(x_train, y_train, x_test):
    lgb_train, lgb_test = cv_model(lgb, x_train, y_train, x_test, "lgb")
    return lgb_train, lgb_test

def xgb_model(x_train, y_train, x_test):
    xgb_train, xgb_test = cv_model(xgb, x_train, y_train, x_test, "xgb")
    return xgb_train, xgb_test

def cat_model(x_train, y_train, x_test):
    cat_train, cat_test = cv_model(CatBoostRegressor, x_train, y_train, x_test, "cat") 
    return cat_train, cat_test
def vot_model(x_train, y_train, x_test):
    vot_train, vote_test = cv_model(VotingClassifier, x_train, y_train, x_test, "vote")
    return vot_train, vot_test

In [55]:
# 使用lgb进行训练
lgb_train, lgb_test = lgb_model(x_train, y_train, x_test)

************************************ 1 ************************************
Training until validation scores don't improve for 200 rounds
[200]	training's auc: 0.742591	valid_1's auc: 0.730355
[400]	training's auc: 0.755118	valid_1's auc: 0.731459
[600]	training's auc: 0.766238	valid_1's auc: 0.731846
[800]	training's auc: 0.776359	valid_1's auc: 0.731758
Early stopping, best iteration is:
[669]	training's auc: 0.76993	valid_1's auc: 0.731896
[0.7318963815365425]
************************************ 2 ************************************
Training until validation scores don't improve for 200 rounds
[200]	training's auc: 0.743824	valid_1's auc: 0.726889
[400]	training's auc: 0.756668	valid_1's auc: 0.728177
[600]	training's auc: 0.767686	valid_1's auc: 0.728401
Early stopping, best iteration is:
[476]	training's auc: 0.761101	valid_1's auc: 0.72858
[0.7318963815365425, 0.7285796452663349]
************************************ 3 ************************************
Training until validati

In [None]:
xgb_train, xgb_test = xgb_model(x_train, y_train, x_test)

************************************ 1 ************************************
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	train-auc:0.67282	eval-auc:0.67359
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.

Will train until eval-auc hasn't improved in 200 rounds.
[200]	train-auc:0.72758	eval-auc:0.72385
[400]	train-auc:0.73515	eval-auc:0.72767
[600]	train-auc:0.74066	eval-auc:0.72967
[800]	train-auc:0.74509	eval-auc:0.73068
[1000]	train-auc:0.74879	eval-auc:0.73140
[1200]	train-auc:0.75220	eval-auc:0.73192
[1400]	train-auc:0.75546	eval-auc:0.73223
[1600]	train-auc:0.75865	eval-auc:0.73260


In [None]:
cat_train, cat_test = cat_model(x_train, y_train, x_test)

In [None]:
rh_test = lgb_test*0.3 + xgb_test*0.4 + cat_test*0.3
testA['isDefault'] = rh_test


In [None]:
testA[['id','isDefault']].to_csv('test_sub.csv', index=False)