In [1]:
import pandas as pd
import os
import gc
import datetime
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn.linear_model import SGDRegressor, LinearRegression, Ridge
from sklearn.preprocessing import MinMaxScaler
import math
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, log_loss
import matplotlib.pyplot as plt
import time
import warnings
warnings.filterwarnings('ignore')

In [2]:
train_internet = pd.read_csv('E:/TianChi/个贷违约预测/datas/train_internet.csv')
data_train  = pd.read_csv('E:/TianChi/个贷违约预测/datas/train_public.csv')
data_test = pd.read_csv('E:/TianChi/个贷违约预测/datas/test_public (1).csv')
submit_example = pd.read_csv('E:/TianChi/个贷违约预测/datas/submit_example.csv')

In [3]:
data = pd.concat([data_train, data_test], axis=0, ignore_index=True)
data.shape

(15000, 39)

# <font color=gree>数据预处理</font>

In [4]:
numerical_fea = list(data.select_dtypes(exclude=['object']).columns)
category_fea = list(filter(lambda x: x not in numerical_fea,list(data.columns)))
label = 'isDefault'
numerical_fea.remove(label)

In [5]:
print(numerical_fea)

category_fea

['loan_id', 'user_id', 'total_loan', 'year_of_loan', 'interest', 'monthly_payment', 'house_exist', 'censor_status', 'use', 'post_code', 'region', 'debt_loan_ratio', 'del_in_18month', 'scoring_low', 'scoring_high', 'known_outstanding_loan', 'known_dero', 'pub_dero_bankrup', 'recircle_b', 'recircle_u', 'initial_list_status', 'app_type', 'title', 'policy_code', 'f0', 'f1', 'f2', 'f3', 'f4', 'early_return', 'early_return_amount', 'early_return_amount_3mon']


['class',
 'employer_type',
 'industry',
 'work_year',
 'issue_date',
 'earlies_credit_mon']

In [6]:
#按照中位数填充数值型特征
data[numerical_fea] = data[numerical_fea].fillna(data[numerical_fea].median())
#按照众数填充类别型特征
data[category_fea] = data[category_fea].fillna(data[category_fea].mode())

# <font color=yellow>时间特征处理'issue_date'</font>

In [7]:
data['issue_date'].value_counts().sort_index()

2007/10/1     2
2007/12/1     1
2007/8/1      1
2008/1/1      4
2008/10/1     1
             ..
2018/5/1     64
2018/6/1     61
2018/7/1     42
2018/8/1     36
2018/9/1     22
Name: issue_date, Length: 131, dtype: int64

In [8]:
data['issue_date'] = pd.to_datetime(data['issue_date'], format='%Y-%m-%d')
startdate = datetime.datetime.strptime('2007-10-01', '%Y-%m-%d')
data['issueDateDT'] = data['issue_date'].apply(lambda x: x-startdate).dt.days
data[['issueDateDT']]
data.drop('issue_date', axis=1, inplace=True)

In [9]:
data['issueDateDT'].value_counts().sort_index()

-61       1
 0        2
 61       1
 92       4
 152      3
         ..
 3957    36
 3988    22
 4018    20
 4049    20
 4079    14
Name: issueDateDT, Length: 131, dtype: int64

# <font color=yellow>'work_year'处理</font>

In [10]:
data['work_year'].value_counts(dropna=False).sort_index()

1 year       1015
10+ years    5000
2 years      1311
3 years      1161
4 years       884
5 years       957
6 years       706
7 years       660
8 years       682
9 years       577
< 1 year     1147
NaN           900
Name: work_year, dtype: int64

In [11]:
data['work_year'].replace(to_replace='10+ years', value='10 years', inplace=True)
data['work_year'].replace('< 1 year', '0 years', inplace=True)

def employmentLength_to_int(s):
    if pd.isnull(s):
        return s
    else:
        return np.int8(s.split()[0])
    
data['work_year'] = data['work_year'].apply(employmentLength_to_int)

In [12]:
data['work_year'].value_counts(dropna=False).sort_index()

0.0     1147
1.0     1015
2.0     1311
3.0     1161
4.0      884
5.0      957
6.0      706
7.0      660
8.0      682
9.0      577
10.0    5000
NaN      900
Name: work_year, dtype: int64

# <font color=yellow>'earlies_credit_mon'处理</font>

In [13]:
data['earlies_credit_mon'].sample(5)

10655    Jul-92
12473     2-Nov
13004    Nov-99
14915    Aug-82
13358    Jan-98
Name: earlies_credit_mon, dtype: object

In [14]:
data.drop('earlies_credit_mon', axis=1, inplace=True)
data.shape

(15000, 38)

# <font color=pink>类别特征处理</font>

In [15]:
# 部分类别特征
category_fea.remove('earlies_credit_mon')
category_fea.remove('issue_date')
for f in category_fea:
    print(f, '类型数：', data[f].nunique())

class 类型数： 7
employer_type 类型数： 6
industry 类型数： 14
work_year 类型数： 11


In [16]:
data[category_fea]

Unnamed: 0,class,employer_type,industry,work_year
0,C,政府机构,金融业,3.0
1,C,政府机构,金融业,10.0
2,A,政府机构,公共服务、社会组织,10.0
3,A,世界五百强,文化和体育业,6.0
4,C,政府机构,信息传输、软件和信息技术服务业,0.0
...,...,...,...,...
14995,C,政府机构,信息传输、软件和信息技术服务业,0.0
14996,A,政府机构,房地产业,5.0
14997,A,上市企业,房地产业,10.0
14998,A,政府机构,文化和体育业,10.0


# <font color=yellow>硬编码</font>

In [17]:
data['class'] = data['class'].map({'A':1,'B':2,'C':3,'D':4,'E':5,'F':6,'G':7})

In [18]:
from sklearn.preprocessing import LabelEncoder
for col in category_fea:
    lbl = LabelEncoder().fit(data[col])
    data[col] = lbl.transform(data[col])
data[category_fea]

Unnamed: 0,class,employer_type,industry,work_year
0,2,3,13,3
1,2,3,13,10
2,0,3,3,10
3,0,1,10,6
4,2,3,2,0
...,...,...,...,...
14995,2,3,2,0
14996,0,3,8,5
14997,0,0,8,10
14998,0,3,10,10


## 训练数据/测试数据准备

In [19]:
data.columns

Index(['loan_id', 'user_id', 'total_loan', 'year_of_loan', 'interest',
       'monthly_payment', 'class', 'employer_type', 'industry', 'work_year',
       'house_exist', 'censor_status', 'use', 'post_code', 'region',
       'debt_loan_ratio', 'del_in_18month', 'scoring_low', 'scoring_high',
       'known_outstanding_loan', 'known_dero', 'pub_dero_bankrup',
       'recircle_b', 'recircle_u', 'initial_list_status', 'app_type', 'title',
       'policy_code', 'f0', 'f1', 'f2', 'f3', 'f4', 'early_return',
       'early_return_amount', 'early_return_amount_3mon', 'isDefault',
       'issueDateDT'],
      dtype='object')

In [20]:
features = [f for f in data.columns if f not in ['loan_id', 'user_id', 'issueDate','isDefault']]

train = data[data.isDefault.notnull()].reset_index(drop=True)
test = data[data.isDefault.isnull()].reset_index(drop=True)
print(train['isDefault'])

x_train = train[features]
x_test = test[features]

y_train = train['isDefault']

0       0.0
1       0.0
2       0.0
3       0.0
4       0.0
       ... 
9995    0.0
9996    0.0
9997    0.0
9998    0.0
9999    0.0
Name: isDefault, Length: 10000, dtype: float64


In [21]:
y_train.shape

(10000,)

## 模型训练

* 直接构建了一个函数，可以调用三种树模型，方便快捷

In [22]:
def cv_model(clf, train_x, train_y, test_x, clf_name):
    folds = 5
    seed = 2020
    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)

    train = np.zeros(train_x.shape[0])
    test = np.zeros(test_x.shape[0])

    cv_scores = []

    for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
        print('************************************ {} ************************************'.format(str(i+1)))
        trn_x, trn_y, val_x, val_y = train_x.iloc[train_index], train_y[train_index], train_x.iloc[valid_index], train_y[valid_index]

        if clf_name == "lgb":
            train_matrix = clf.Dataset(trn_x, label=trn_y)
            valid_matrix = clf.Dataset(val_x, label=val_y)

            params = {
                'boosting_type': 'gbdt',
                'objective': 'binary',
                'metric': 'auc',
                'min_child_weight': 5,
                'num_leaves': 2 ** 5,
                'lambda_l2': 10,
                'feature_fraction': 0.8,
                'bagging_fraction': 0.8,
                'bagging_freq': 4,
                'learning_rate': 0.1,
                'seed': 2020,
                'nthread': 28,
                'n_jobs':24,
                'silent': True,
                'verbose': -1,
            }

            model = clf.train(params, train_matrix, 50000, valid_sets=[train_matrix, valid_matrix], verbose_eval=200,early_stopping_rounds=200)
            val_pred = model.predict(val_x, num_iteration=model.best_iteration)
            test_pred = model.predict(test_x, num_iteration=model.best_iteration)
            
            # print(list(sorted(zip(features, model.feature_importance("gain")), key=lambda x: x[1], reverse=True))[:20])
                
        if clf_name == "xgb":
            train_matrix = clf.DMatrix(trn_x , label=trn_y)
            valid_matrix = clf.DMatrix(val_x , label=val_y)
            test_matrix = clf.DMatrix(test_x)
            
            params = {'booster': 'gbtree',
                      'objective': 'binary:logistic',
                      'eval_metric': 'auc',
                      'gamma': 1,
                      'min_child_weight': 1.5,
                      'max_depth': 5,
                      'lambda': 10,
                      'subsample': 0.7,
                      'colsample_bytree': 0.7,
                      'colsample_bylevel': 0.7,
                      'eta': 0.04,
                      'tree_method': 'exact',
                      'seed': 2020,
                      'nthread': 36,
                      "silent": True,
                      }
            
            watchlist = [(train_matrix, 'train'),(valid_matrix, 'eval')]
            
            model = clf.train(params, train_matrix, num_boost_round=50000, evals=watchlist, verbose_eval=200, early_stopping_rounds=200)
            val_pred  = model.predict(valid_matrix, ntree_limit=model.best_ntree_limit)
            test_pred = model.predict(test_matrix , ntree_limit=model.best_ntree_limit)
                 
        if clf_name == "cat":
            params = {'learning_rate': 0.05, 'depth': 5, 'l2_leaf_reg': 10, 'bootstrap_type': 'Bernoulli',
                      'od_type': 'Iter', 'od_wait': 50, 'random_seed': 11, 'allow_writing_files': False}
            
            model = clf(iterations=20000, **params)
            model.fit(trn_x, trn_y, eval_set=(val_x, val_y),
                      cat_features=[], use_best_model=True, verbose=500)
            
            val_pred  = model.predict(val_x)
            test_pred = model.predict(test_x)
            
        train[valid_index] = val_pred
        test = test_pred / kf.n_splits
        cv_scores.append(roc_auc_score(val_y, val_pred))
        
        print(cv_scores)
       
    print("%s_scotrainre_list:" % clf_name, cv_scores)
    print("%s_score_mean:" % clf_name, np.mean(cv_scores))
    print("%s_score_std:" % clf_name, np.std(cv_scores))
    return train, test

In [23]:
def lgb_model(x_train, y_train, x_test):
    lgb_train, lgb_test = cv_model(lgb, x_train, y_train, x_test, "lgb")
    return lgb_train, lgb_test

def xgb_model(x_train, y_train, x_test):
    xgb_train, xgb_test = cv_model(xgb, x_train, y_train, x_test, "xgb")
    return xgb_train, xgb_test

def cat_model(x_train, y_train, x_test):
    cat_train, cat_test = cv_model(CatBoostRegressor, x_train, y_train, x_test, "cat") 
    return cat_train, cat_test

In [24]:
lgb_train, lgb_test = lgb_model(x_train, y_train, x_test)

************************************ 1 ************************************
Training until validation scores don't improve for 200 rounds
[200]	training's auc: 0.994107	valid_1's auc: 0.868743
Early stopping, best iteration is:
[35]	training's auc: 0.931233	valid_1's auc: 0.878123
[0.8781231723658007]
************************************ 2 ************************************
Training until validation scores don't improve for 200 rounds
[200]	training's auc: 0.99392	valid_1's auc: 0.880656
Early stopping, best iteration is:
[54]	training's auc: 0.947523	valid_1's auc: 0.892191
[0.8781231723658007, 0.8921910664239693]
************************************ 3 ************************************
Training until validation scores don't improve for 200 rounds
[200]	training's auc: 0.995632	valid_1's auc: 0.860456
Early stopping, best iteration is:
[49]	training's auc: 0.945953	valid_1's auc: 0.871299
[0.8781231723658007, 0.8921910664239693, 0.8712993590605531]
********************************

In [25]:
xgb_train, xgb_test = xgb_model(x_train, y_train, x_test)

************************************ 1 ************************************
Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	train-auc:0.86040	eval-auc:0.85525
[200]	train-auc:0.93492	eval-auc:0.87669
[362]	train-auc:0.95951	eval-auc:0.87463
[0.8774161786806238]
************************************ 2 ************************************
Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	train-auc:0.85992	eval-auc:0.86898
[200]	train-auc:0.93410	eval-auc:0.89263
[301]	train-auc:0.95127	eval-auc:0.88867
[0.87741617868062

In [26]:
cat_train, cat_test = cat_model(x_train, y_train, x_test)

************************************ 1 ************************************
0:	learn: 0.3706852	test: 0.3628935	best: 0.3628935 (0)	total: 65.5ms	remaining: 21m 50s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.3124762561
bestIteration = 65

Shrink model to first 66 iterations.
[0.8749417007825051]
************************************ 2 ************************************
0:	learn: 0.3696463	test: 0.3669898	best: 0.3669898 (0)	total: 3.88ms	remaining: 1m 17s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.3075200572
bestIteration = 245

Shrink model to first 246 iterations.
[0.8749417007825051, 0.8928519834734623]
************************************ 3 ************************************
0:	learn: 0.3692642	test: 0.3684896	best: 0.3684896 (0)	total: 3.32ms	remaining: 1m 6s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.3142670668
bestIteration = 183

Shrink model to first 184 iterations.
[0.8749417007825051, 0.89285198347

# 模型调参

## 贪心调参
先使用当前对模型影响最大的参数进行调优，达到当前参数下的模型最优化，再使用对模型影响次之的参数进行调优，如此下去，直到所有的参数调整完毕。
这个方法的缺点就是可能会调到局部最优而不是全局最优，但是只需要一步一步的进行参数最优化调试即可，容易理解。

需要注意的是在树模型中参数调整的顺序，也就是各个参数对模型的影响程度，这里列举一下日常调参过程中常用的参数和调参顺序：
* ①：max_depth、num_leaves
* ②：min_data_in_leaf、min_child_weight
* ③：bagging_fraction、 feature_fraction、bagging_freq
* ④：reg_lambda、reg_alpha
* ⑤：min_split_gain

In [27]:
# from sklearn.model_selection import cross_val_score

# # 调objective
# best_obj = dict()
# for obj in objective:
#     model = LGBMRegressor(objective=obj)
#     """预测并计算roc的相关指标"""
#     score = cross_val_score(model, X_train, y_train, cv=5, scoring='roc_auc').mean()
#     best_obj[obj] = score

# # num_leaves
# best_leaves = dict()
# for leaves in num_leaves:
#     model = LGBMRegressor(objective=min(best_obj.items(), key=lambda x:x[1])[0], num_leaves=leaves)
#     """预测并计算roc的相关指标"""
#     score = cross_val_score(model, X_train, y_train, cv=5, scoring='roc_auc').mean()
#     best_leaves[leaves] = score

# # max_depth
# best_depth = dict()
# for depth in max_depth:
#     model = LGBMRegressor(objective=min(best_obj.items(), key=lambda x:x[1])[0],
#                           num_leaves=min(best_leaves.items(), key=lambda x:x[1])[0],
#                           max_depth=depth)
#     """预测并计算roc的相关指标"""
#     score = cross_val_score(model, X_train, y_train, cv=5, scoring='roc_auc').mean()
#     best_depth[depth] = score

In [27]:
rh_test = lgb_test*0.35 + xgb_test*0.35 + cat_test*0.3

In [28]:
rh_test

array([0.00334499, 0.01356397, 0.0015958 , ..., 0.00182963, 0.00459576,
       0.00225359])

In [29]:
data_test['isDefault'] = rh_test

In [30]:
data_test.rename({'loan_id': 'id'}, axis=1)[['id','isDefault']].to_csv('E:/TianChi/个贷违约预测/result/result00.csv', index=None)

In [33]:
data_test['loan_id']

0       1000575
1       1028125
2       1010694
3       1026712
4       1002895
         ...   
4995    1008856
4996    1016651
4997    1024140
4998    1014316
4999    1012946
Name: loan_id, Length: 5000, dtype: int64

In [34]:
data_test['isDefault']

0       0.003345
1       0.013564
2       0.001596
3       0.000997
4       0.002541
          ...   
4995    0.068051
4996    0.003975
4997    0.001830
4998    0.004596
4999    0.002254
Name: isDefault, Length: 5000, dtype: float64