In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import  StratifiedKFold
train_data = pd.read_csv('./train_public.csv')
submit_example = pd.read_csv('./submit_example.csv')
test_public = pd.read_csv('./test_public.csv')
train_inte = pd.read_csv('./train_internet.csv')

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

In [3]:
pd.set_option('max_columns', None)
pd.set_option('max_rows', 200)
pd.set_option('float_format', lambda x: '%.3f' % x)

In [4]:
work_year_dict = {'< 1 year':0,
                  '1 year':1,
                  '2 years':2,
                  '3 years':3,
                  '4 years':4,
                  '5 years':5,
                  '6 years':6,
                  '7 years':7,
                  '8 years':8,
                  '9 years':9,
                  '10+ years':10 }
train_data['work_year'] = train_data['work_year'].map(work_year_dict)
test_public['work_year'] = test_public['work_year'].map(work_year_dict)

class_dict = {
            'A':1,
            'B':2,
            'C':3,
            'D':4,
            'E':5,
            'F':6,
            'G':7,
}
train_data['class'] = train_data['class'].map(class_dict)
test_public['class'] = test_public['class'].map(class_dict)

In [5]:
# 时间转换
train_data['issue_date'] = pd.to_datetime(train_data['issue_date'])
test_public['issue_date'] = pd.to_datetime(test_public['issue_date'])

In [6]:
# 提取月份
train_data['issue_date_month'] = train_data['issue_date'].dt.month
test_public['issue_date_month'] = test_public['issue_date'].dt.month
# # 提取日期
# train_data['issue_date_dayofweek'] = train_data['issue_date'].dt.dayofweek
# test_public['issue_date_dayofweek'] = test_public['issue_date'].dt.dayofweek
# 提取年份
train_data['issue_date_year'] = train_data['issue_date'].dt.year
test_public['issue_date_year'] = test_public['issue_date'].dt.year

In [7]:
import re

def findDig(val):
    fd = re.search('(-\d+)', val)
    if fd is None:
        return val[-3:] + "-" + val[0:val.find("-")]
    return val

In [8]:
month_num = {
                  'Jan':1,
                  'Feb':2,
                  'Mar':3,
                  'Apr':4,
                  'May':5,
                  'Jun':6,
                  'Jul':7,
                  'Aug':8,
                  'Sep':9,
                  'Oct':10,
                  'Nov':11,
                  'Dec':12 
}

In [9]:
def findMonth(val):
    return val[0:3]

def findYear(val):
    return val[val.find("-") +1:]


In [10]:
train_data['earlies_credit_mon_mon'] = train_data['earlies_credit_mon'].map(findDig).map(findMonth).map(month_num)
test_public['earlies_credit_mon_mon'] = test_public['earlies_credit_mon'].map(findDig).map(findMonth).map(month_num)

In [11]:
train_data['earlies_credit_mon_year'] = train_data['earlies_credit_mon'].map(findDig).map(findYear).astype("int")
test_public['earlies_credit_mon_year'] = test_public['earlies_credit_mon'].map(findDig).map(findYear).astype("int")

In [12]:
train_data['earlies_credit_mon_year'] = train_data['earlies_credit_mon_year'].apply(lambda x: x+100 if x < 50 else x)
test_public['earlies_credit_mon_year'] = test_public['earlies_credit_mon_year'].apply(lambda x: x+100 if x < 50 else x)

In [13]:
def gen_target_encoding_feats(train, test, encode_cols, target_col, n_fold = 1):
    
    '''生成target encoding特征'''
    tg_feats = np.zeros((train.shape[0], len(encode_cols)))
    kfold = StratifiedKFold(n_splits=n_fold, random_state=1024, shuffle=True)
    
    for _, (train_index, val_index) in enumerate(kfold.split(train[encode_cols], train[target_col])):
        df_train, df_val = train.iloc[train_index], train.iloc[val_index]
        for idx, col in enumerate(encode_cols):
            target_mean_dict = df_train.groupby(col)[target_col].mean()
            df_val[f'{col}_mean_target'] = df_val[col].map(target_mean_dict)
            tg_feats[val_index, idx] = df_val[f'{col}_mean_target'].values

    for idx, encode_col in enumerate(encode_cols):
        train[f'{encode_col}_mean_target'] = tg_feats[:, idx]

    for col in encode_cols:
        target_mean_dict = train.groupby(col)[target_col].mean()
        test[f'{col}_mean_target'] = test[col].map(target_mean_dict)

    return train, test

In [14]:
TARGET_ENCODING_FETAS = [
#                             'employer_type',
#                              'industry',
#                             'region',
#                             'use',
                         ]


In [15]:
train1, test1 = gen_target_encoding_feats(train_data, test_public, TARGET_ENCODING_FETAS, target_col='isDefault', n_fold=10)

In [16]:
train1

Unnamed: 0,loan_id,user_id,total_loan,year_of_loan,interest,monthly_payment,class,employer_type,industry,work_year,house_exist,censor_status,issue_date,use,post_code,region,debt_loan_ratio,del_in_18month,scoring_low,scoring_high,known_outstanding_loan,known_dero,pub_dero_bankrup,recircle_b,recircle_u,initial_list_status,app_type,earlies_credit_mon,title,policy_code,f0,f1,f2,f3,f4,early_return,early_return_amount,early_return_amount_3mon,isDefault,issue_date_month,issue_date_year,earlies_credit_mon_mon,earlies_credit_mon_year
0,1040418,240418,31818.182,3,11.466,1174.910,3,政府机构,金融业,3.000,0,1,2016-10-01,2,193,13,2.430,0,556.364,649.091,3,0,0.000,7734.231,91.800,0,0,1-Dec,5,1,1.000,0.000,4.000,5.000,4.000,3,9927,0.000,0,10,2016,12,101
1,1025197,225197,28000.000,5,16.841,670.690,3,政府机构,金融业,10.000,0,2,2013-06-01,0,491,30,11.005,1,715.000,893.750,3,0,0.000,31329.000,54.800,1,0,Apr-90,40642,1,7.000,0.000,4.000,45.000,22.000,0,0,0.000,0,6,2013,4,90
2,1009360,209360,17272.727,3,8.900,603.320,1,政府机构,公共服务、社会组织,10.000,1,0,2014-01-01,4,459,8,6.409,0,774.545,903.636,5,0,0.000,18514.000,57.692,1,0,Oct-91,154,1,6.000,0.000,6.000,28.000,19.000,0,0,0.000,0,1,2014,10,91
3,1039708,239708,20000.000,3,4.788,602.300,1,世界五百强,文化和体育业,6.000,0,1,2015-07-01,0,157,8,9.205,0,750.000,875.000,3,0,0.000,20707.000,42.600,0,0,1-Jun,0,1,5.000,0.000,10.000,15.000,9.000,0,0,0.000,0,7,2015,6,101
4,1027483,227483,15272.727,3,12.790,470.310,3,政府机构,信息传输、软件和信息技术服务业,0.000,2,1,2016-07-01,0,38,21,15.578,0,609.091,710.606,15,0,0.000,14016.154,30.462,0,0,2-May,0,1,10.000,0.000,6.000,15.000,4.000,0,0,0.000,0,7,2016,5,102
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,1028093,228093,17727.273,3,15.037,510.270,2,普通企业,建筑业,7.000,1,1,2013-11-01,4,49,12,10.484,0,720.000,780.000,9,0,0.000,19864.923,45.508,1,0,6-Feb,234,1,4.000,0.000,4.000,11.000,7.000,2,5287,0.000,0,11,2013,2,106
9996,1043911,243911,13636.364,3,6.534,464.950,1,政府机构,农、林、牧、渔业,2.000,1,1,2015-12-01,0,143,8,0.655,0,642.273,749.318,4,0,0.000,2025.000,10.300,0,0,May-97,0,1,2.000,0.000,2.000,7.000,6.000,3,7182,0.000,0,12,2015,5,97
9997,1023503,223503,24818.182,3,14.421,708.690,2,普通企业,信息传输、软件和信息技术服务业,10.000,0,0,2012-12-01,4,66,14,20.422,1,780.000,910.000,9,0,0.000,12641.000,21.877,1,0,Feb-87,120,1,6.000,0.000,5.000,15.000,11.000,1,8540,2562.000,0,12,2012,2,87
9998,1024616,224616,20000.000,3,18.450,727.580,4,政府机构,农、林、牧、渔业,10.000,0,0,2018-03-01,2,19,14,14.048,0,552.273,690.341,11,0,0.000,24642.692,60.646,0,0,Oct-92,5,1,7.000,0.000,5.000,17.000,10.000,2,6161,616.100,0,3,2018,10,92


In [17]:
def gen_fre_encoding_feats(train, test, encode_cols, target_col, n_fold=10):

    tg_feats = np.zeros((train.shape[0], len(encode_cols)))
    kfold = StratifiedKFold(n_splits=n_fold, random_state=1024, shuffle=True)
    for _, (train_index, val_index) in enumerate(kfold.split(train[encode_cols], train[target_col])):
        df_train, df_val = train.iloc[train_index], train.iloc[val_index]
        for idx, col in enumerate(encode_cols):
            target_fre_dict = df_train.groupby(col)[target_col].count()
            df_val[f'{col}_fre_target'] = df_val[col].map(target_fre_dict)
            tg_feats[val_index, idx] = df_val[f'{col}_fre_target'].values

    for idx, encode_col in enumerate(encode_cols):
        train[f'{encode_col}_fre_target'] = tg_feats[:, idx]

    for col in encode_cols:
        target_mean_dict = train.groupby(col)[target_col].mean()
        test[f'{col}_fre_target'] = test[col].map(target_mean_dict)

    return train, test

In [18]:
FREQUENCY_ENCODING_FETAS = [
                            'employer_type',
                             'industry',
                            'region',
                            'use',
                         ]


In [19]:
train2, test2 = gen_target_encoding_feats(train1, test1, FREQUENCY_ENCODING_FETAS, target_col='isDefault', n_fold=10)

In [20]:
Need_01_FETAS = [
                    'known_dero',
                         ]

In [21]:
# 特征工程 一些后处理
for col in Need_01_FETAS:
    train2[f'{col}_01_target'] = train2[col].apply(lambda x: 1 if x > 1 else 0)
    test2[f'{col}_01_target'] = test2[col].apply(lambda x: 1 if x > 1 else 0)

In [22]:
train2['early_return_amount_average'] = (train2['early_return_amount']+1) / (train2['early_return']+1)
test2['early_return_amount_average'] = (test2['early_return_amount']+1) / (test2['early_return']+1)

In [23]:
train2['early_return_amount_rate'] = train2['early_return_amount'] / train2['total_loan']
test2['early_return_amount_rate'] = test2['early_return_amount'] / test2['total_loan']

In [24]:
# train2['early_return_amount_3mon_rate'] = train2['early_return_amount_3mon'] / train2['total_loan']
# test2['early_return_amount_3mon_rate'] = test2['early_return_amount_3mon'] / test2['total_loan']

In [25]:
# train2['early_return_amount_3mon_rate2'] = train2['early_return_amount_3mon'] / (train2['early_return_amount']+1)
# test2['early_return_amount_3mon_rate2'] = test2['early_return_amount_3mon'] / (test2['early_return_amount']+1)

In [26]:
drop_feature = [
#     'loan_id'
    'user_id'
    ,'employer_type'
    ,'industry'
    ,'f1'
#     ,'post_code'
    ,'issue_date'
    ,'earlies_credit_mon',
    'policy_code',
    'app_type',
    'initial_list_status',
    ]

In [27]:
def gen_drop_feats(train, test, drop_cols):
    return train.drop(columns = drop_cols), test.drop(columns = drop_cols)

In [28]:
train3, test3 = gen_drop_feats(train2, test2, drop_feature)

In [29]:
train3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 41 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   loan_id                      10000 non-null  int64  
 1   total_loan                   10000 non-null  float64
 2   year_of_loan                 10000 non-null  int64  
 3   interest                     10000 non-null  float64
 4   monthly_payment              10000 non-null  float64
 5   class                        10000 non-null  int64  
 6   work_year                    9378 non-null   float64
 7   house_exist                  10000 non-null  int64  
 8   censor_status                10000 non-null  int64  
 9   use                          10000 non-null  int64  
 10  post_code                    10000 non-null  int64  
 11  region                       10000 non-null  int64  
 12  debt_loan_ratio              10000 non-null  float64
 13  del_in_18month   

In [30]:
NULL_FEATURE = [
    'f0',
    'f2',
    'f3',
    'f4',
    'pub_dero_bankrup',
    'work_year',
]


In [31]:
features_mode = {}
for f in NULL_FEATURE:
    features_mode[f] = list(train3[f].dropna().mode().values)[0]
train3.fillna(features_mode,inplace=True)

features_mode = {}
for f in NULL_FEATURE:
    features_mode[f] = list(test3[f].dropna().mode().values)[0]
test3.fillna(features_mode,inplace=True)

In [32]:
train3['f0-f2'] = train3['f0'] - train3['f2']
test3['f0-f2'] = test3['f0'] - test3['f2']

In [33]:
train3['new1'] = train3['class'] * train3['employer_type_mean_target']
test3['new1'] =  test3['class'] * test3['employer_type_mean_target']

In [34]:
train3['new2'] = train3['recircle_b'] / train3['total_loan']
test3['new2'] =  test3['recircle_b'] / test3['total_loan']

In [35]:
import lightgbm as lgb
from sklearn.metrics import roc_auc_score

def k_fold_serachParmaters(model,train_data, train_label, test_data):
    n_splits=5

    sk = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=2020)
    pred_Test = np.zeros(len(test_data))
    
    auc_train, auc_val = 0, 0
    for tr_idx, val_idx in sk.split(train_data, train_label):
        x_train = train_data.iloc[tr_idx]
        y_train = train_label.iloc[tr_idx]
        x_val = train_data.iloc[val_idx]
        y_val = train_label.iloc[val_idx]

        model.fit(x_train, y_train, 
                  eval_set=[(x_val, y_val)], 

                 early_stopping_rounds=100,
                 verbose=False)

        pred_Test += model.predict_proba(test_data)[:, 1]/n_splits

        pred = model.predict(x_val)
        auc_val += roc_auc_score(y_val,pred)/n_splits
                
    return auc_val, pred_Test



In [36]:
import warnings
warnings.filterwarnings("ignore")

score_tta = None
score_list = []

tta_fold = 1
for _ in range(tta_fold):
    clf = lgb.LGBMClassifier(objective='binary',
                           boosting_type='gbdt',
                           tree_learner='serial',
                           num_leaves= 25,
                           max_depth= 6,
                           learning_rate= 0.1,
                           n_estimators=10000,
                           subsample=0.5,
                           feature_fraction= 1,
                           reg_alpha=0.4,
                           reg_lambda=0.4,
                           random_state=2021,
                           is_unbalance=True,
                           metric='auc')

    score, test_pred = k_fold_serachParmaters(clf,
                           train3.drop(['loan_id','isDefault'], axis=1),
                           train3['isDefault'],
                           test3.drop(['loan_id'], axis=1),
                          )

    print(score)
    if score_tta is None:
        score_tta = test_pred/tta_fold
    else:
        score_tta += test_pred/tta_fold

0.8172646062992887


In [37]:
test_public['isDefault'] = score_tta
test_public.rename({'loan_id': 'id'}, axis=1)[['id', 'isDefault']].to_csv('aaa.csv', index=None)

In [38]:
pd.DataFrame({
        'column': clf.feature_name_,
        'importance': clf.feature_importances_,
    }).sort_values(by=  'importance')


Unnamed: 0,column,importance
36,known_dero_01_target,0
4,class,1
17,pub_dero_bankrup,1
8,use,3
12,del_in_18month,3
25,early_return,4
1,year_of_loan,8
7,censor_status,10
6,house_exist,17
20,title,19


In [40]:
from sklearn.ensemble import RandomForestRegressor
# 定义模型
model = RandomForestRegressor(n_estimators=20,
                              random_state=42,
                              max_depth=5,
                              n_jobs=-1)
model.fit(train3.drop(['loan_id','isDefault'], axis=1), train3['isDefault'])  # 训练模型


RandomForestRegressor(max_depth=5, n_estimators=20, n_jobs=-1, random_state=42)

In [47]:
plt.figure(figsize=(14, 4))
y_pred = model.predict( test3.drop(['loan_id'], axis=1))

<Figure size 1008x288 with 0 Axes>

array([0.0156189 , 0.0262349 , 0.01220287, ..., 0.01343289, 0.0156189 ,
       0.01220287])

In [49]:
test_public['isDefault'] = y_pred
test_public.rename({'loan_id': 'id'}, axis=1)[['id', 'isDefault']].to_csv('aaa.csv', index=None)

In [52]:
pd.DataFrame({
        'column': model.feature_names_in_,
        'importance': model.feature_importances_,
    }).sort_values(by=  'importance')


Unnamed: 0,column,importance
25,early_return,0.0
1,year_of_loan,0.0
17,pub_dero_bankrup,0.001
27,early_return_amount_3mon,0.001
4,class,0.001
7,censor_status,0.001
6,house_exist,0.001
28,issue_date_month,0.001
8,use,0.001
30,earlies_credit_mon_mon,0.001
