In [None]:
import pandas as pd
import numpy as np
# Read the example submission CSV into `sub`.
sub = pd.read_csv('../input/ccf2021bankdefault/submit_example.csv')

本赛题要求利用已有的与目标客群稍有差异的另一批信贷数据，辅助目标业务风控模型的创建，两者数据集之间存在大量相同的字段和极少的共同用户。此处希望大家可以利用迁移学习捕捉不同业务中用户基本信息与违约行为之间的关联，帮助实现对新业务的用户违约预测。

In [None]:
# Download train_dataset.zip into the current working directory.
!wget https://awscdn.datafountain.cn/cometition_data2/Files/BDCI2021/530ZhongYuan/train_dataset.zip

In [None]:
# Extract the downloaded archive; later cells read ./train_public.csv and ./train_internet.csv.
!unzip ./train_dataset.zip

In [None]:
# Download test_public.csv into the current working directory.
!wget https://awscdn.datafountain.cn/cometition_data2/Files/BDCI2021/530ZhongYuan/test_public.csv

In [None]:
# Random seed passed as `random_state` to the KFold splitters in the LightGBM steps below.
SEED = 6666

## step 1:

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import gc
import re
import pandas as pd
import lightgbm as lgb
import numpy as np
from sklearn.metrics import roc_auc_score, precision_recall_curve, roc_curve, average_precision_score
from sklearn.model_selection import KFold
from lightgbm import LGBMClassifier
import matplotlib.pyplot as plt
import seaborn as sns
import gc
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold
from dateutil.relativedelta import relativedelta
# Load the raw tables used by step 1: the public train/test sets, the example
# submission and the internet training table.
train_data = pd.read_csv('./train_public.csv')
submit_example = pd.read_csv('../input/ccf2021bankdefault/submit_example.csv')
test_public = pd.read_csv('./test_public.csv')
train_inte = pd.read_csv('./train_internet.csv')

def train_model(data_, test_, y_, folds_):
    """Fit one LightGBM classifier per fold and average its predictions on ``test_``.

    Parameters
    ----------
    data_ : pandas.DataFrame
        Training rows; every column except ``loan_id``, ``user_id`` and
        ``isDefault`` is used as a feature.
    test_ : pandas.DataFrame
        Rows to score; must hold the same feature columns plus ``loan_id``.
        Side effect: its ``isDefault`` column is (over)written in place with
        the fold-averaged prediction.
    y_ : pandas.Series
        Target values, positionally aligned with ``data_``.
    folds_ : object
        Splitter exposing ``split(X)`` and ``n_splits`` (a ``KFold`` in this
        notebook).

    Returns
    -------
    tuple
        ``(oof_preds, test_[['loan_id', 'isDefault']], feature_importance_df)``:
        out-of-fold probabilities for ``data_``, the scored test ids, and the
        per-fold feature importances stacked into one frame.
    """
    oof_preds = np.zeros(data_.shape[0])
    sub_preds = np.zeros(test_.shape[0])
    fold_importances = []
    feats = [f for f in data_.columns if f not in ['loan_id', 'user_id', 'isDefault']]
    for n_fold, (trn_idx, val_idx) in enumerate(folds_.split(data_)):
        trn_x, trn_y = data_[feats].iloc[trn_idx], y_.iloc[trn_idx]
        val_x, val_y = data_[feats].iloc[val_idx], y_.iloc[val_idx]
        clf = LGBMClassifier(
            n_estimators=4000,
            learning_rate=0.08,
            num_leaves=2**5+1,
            colsample_bytree=.65,
            subsample=.9,
            max_depth=5,
            reg_alpha=.3,
            reg_lambda=.3,
            min_split_gain=.01,
            min_child_weight=2,
            silent=-1,
            verbose=-1,
        )
        # categorical_feature is left at its default here.
        # NOTE(review): `verbose` / `early_stopping_rounds` as fit() keywords
        # depend on the installed LightGBM version - confirm before upgrading.
        clf.fit(trn_x, trn_y,
                eval_set=[(trn_x, trn_y), (val_x, val_y)],
                eval_metric='auc', verbose=100, early_stopping_rounds=40
               )

        oof_preds[val_idx] = clf.predict_proba(val_x, num_iteration=clf.best_iteration_)[:, 1]
        sub_preds += clf.predict_proba(test_[feats], num_iteration=clf.best_iteration_)[:, 1] / folds_.n_splits

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = clf.feature_importances_
        fold_importance_df["fold"] = n_fold + 1
        fold_importances.append(fold_importance_df)

        print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(val_y, oof_preds[val_idx])))
        del clf, trn_x, trn_y, val_x, val_y
        gc.collect()
    # Score against the labels that were passed in, not a same-named global.
    print('Full AUC score %.6f' % roc_auc_score(y_, oof_preds))

    # Stack the per-fold importances once instead of re-concatenating every fold.
    feature_importance_df = pd.concat(fold_importances, axis=0)

    test_['isDefault'] = sub_preds

    return oof_preds, test_[['loan_id', 'isDefault']], feature_importance_df
def display_importances(feature_importance_df_):
    """Bar-plot the 50 features with the highest mean importance and save the figure.

    ``feature_importance_df_`` needs ``feature`` and ``importance`` columns.
    The plot is written to ``lgbm_importances.png``.
    """
    mean_importance = (
        feature_importance_df_[["feature", "importance"]]
        .groupby("feature")
        .mean()
        .sort_values(by="importance", ascending=False)
    )
    top_features = mean_importance.index[:50]

    # Keep every per-fold row of the selected features, largest importance first.
    is_top = feature_importance_df_["feature"].isin(top_features)
    plot_rows = feature_importance_df_.loc[is_top].sort_values(by="importance", ascending=False)

    plt.figure(figsize=(8, 10))
    sns.barplot(x="importance", y="feature", data=plot_rows)
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()
    plt.savefig('lgbm_importances.png')
def workYearDIc(x):
    """Convert a raw ``work_year`` value to an integer.

    Values whose ``str()`` is ``'nan'`` map to -1. Otherwise ``'< 1'`` is
    rewritten to ``'0'`` and the first run of digits is returned as an int.
    Text that contains no digits also maps to -1 instead of raising
    AttributeError.
    """
    if str(x) == 'nan':
        return -1
    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    found = re.search(r'\d+', x.replace('< 1', '0'))
    return int(found.group()) if found else -1
def findDig(val):
    """Pad a two-part date string to three parts.

    If ``val`` contains digits immediately followed by a hyphen, ``'-01'`` is
    appended; otherwise ``'1-'`` is prepended.
    """
    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    if re.search(r'\d+-', val) is None:
        return '1-' + val
    return val + '-01'
# Maps the letters 'A'..'G' to the integers 1..7.
class_dict = {letter: rank for rank, letter in enumerate('ABCDEFG', start=1)}
#public+inte 双表

# Apply the same cleaning and date-derived features to the public train,
# public test and internet tables.
timeMax = pd.to_datetime('1-Dec-21')
for frame in (train_data, test_public, train_inte):
    # work_year: text -> integer (-1 when missing)
    frame['work_year'] = frame['work_year'].map(workYearDIc)

    # class: letter -> integer rank
    frame['class'] = frame['class'].map(class_dict)

    # earlies_credit_mon: pad the partial date string, then parse it
    frame['earlies_credit_mon'] = pd.to_datetime(frame['earlies_credit_mon'].map(findDig))

    # dates parsed later than timeMax are shifted back by 100 years
    after_max = frame['earlies_credit_mon'] > timeMax
    frame.loc[after_max, 'earlies_credit_mon'] = (
        frame.loc[after_max, 'earlies_credit_mon'] + pd.offsets.DateOffset(years=-100)
    )

    # issue_date and the calendar features derived from both date columns
    frame['issue_date'] = pd.to_datetime(frame['issue_date'])
    frame['issue_date_month'] = frame['issue_date'].dt.month
    frame['issue_date_dayofweek'] = frame['issue_date'].dt.dayofweek
    frame['earliesCreditMon'] = frame['earlies_credit_mon'].dt.month
    frame['earliesCreditYear'] = frame['earlies_credit_mon'].dt.year
# Drop the loop helpers so they do not keep extra references alive.
del frame, after_max

#特征构造

#月收入
# train_data['monthly_income'] = train_data['monthly_payment'] / train_data['debt_loan_ratio'] *100
# test_public['monthly_income'] = test_public['monthly_payment'] / test_public['debt_loan_ratio'] *100
# train_inte['monthly_income'] = train_inte['monthly_payment'] / train_inte['debt_loan_ratio'] *100

# Per table: mean `interest` of each post_code, blanked out (NaN) for post
# codes that occur 5 times or fewer in that table.
for frame in (train_data, test_public, train_inte):
    frame['post_code_to_mean_interst'] = frame.groupby(['post_code'])['interest'].transform('mean')
    frame['post_code_cnt'] = frame['post_code'].map(frame['post_code'].value_counts())
    frame.loc[frame['post_code_cnt'] <= 5, 'post_code_to_mean_interst'] = np.nan
    # The count is only a helper for the mask above.
    del frame['post_code_cnt']
del frame
# Label-encode the categorical columns with encoders fitted on train_data only.
from sklearn.preprocessing import LabelEncoder
cat_cols = ['employer_type', 'industry']
for col in cat_cols:
    lbl = LabelEncoder()
    lbl.fit(train_data[col])
    for frame in (train_data, test_public, train_inte):
        frame[col] = lbl.transform(frame[col])
del frame
# Remove the raw date columns from all three tables.
col_to_drop = ['issue_date', 'earlies_credit_mon']
train_data = train_data.drop(columns=col_to_drop)
test_public = test_public.drop(columns=col_to_drop)
train_inte = train_inte.drop(columns=col_to_drop)
# Take the columns train_data and train_inte have in common; columns that
# train_inte lacks are added to the copy and filled with NaN.
tr_cols = set(train_data.columns)
same_col = list(tr_cols.intersection(set(train_inte.columns)))
train_inteSame = train_inte[same_col].copy()
Inte_add_cos = list(tr_cols.difference(set(same_col)))
for col in Inte_add_cos:
    train_inteSame[col] = np.nan
# Pseudo-labelling: train on the train_data features, then predict train_inteSame.
y = train_data['isDefault']
train_data.shape
folds = KFold(n_splits=5, shuffle=True, random_state=SEED)
# train_model also writes its averaged prediction into train_inteSame['isDefault'].
oof_preds, IntePre, importances = train_model(train_data, train_inteSame, y, folds)

In [None]:
# AUC of the step-1 predictions on the internet table against its `is_default` column.
from sklearn.metrics import roc_auc_score
# Assignment aligns on the row index shared by IntePre and train_inte.
IntePre['isDef'] = train_inte['is_default']
roc_auc_score(IntePre['isDef'],IntePre.isDefault)

## step 2：

In [None]:
# import pandas as pd
# train_data = pd.read_csv('./train_public.csv')
# test_data = pd.read_csv('./test_public.csv')
# sub=pd.read_csv("nn2.csv")
# sub=sub.rename(columns={'id': 'loan_id'})
# thr = sub['isDefault'].quantile(0.9)
# sub.loc[sub['isDefault']>thr,'isDefault'] = 1
# sub.loc[sub['isDefault']<0.5,'isDefault'] = 0
# nw_sub=sub[(sub['isDefault']==1)|(sub['isDefault']==0)]
# nw_test_data=test_data.merge(nw_sub,on='loan_id',how='inner')
# nw_train_data = pd.concat([train_data,nw_test_data]).reset_index(drop=True)
# nw_train_data.to_csv("./nw_train_public.csv",index=0)
## Threshold 0.05: take the internet-table samples whose predicted probability is
## below it, and tag every sample with a value marking its source.
InteId = IntePre.loc[IntePre.isDefault<0.05, 'loan_id'].tolist()
# Add a source-domain indicator feature (1 = public tables, 0 = internet table)
train_data['dataSourse'] = 1
test_public['dataSourse'] = 1
train_inteSame['dataSourse'] = 0
# Replace the predicted isDefault with the internet table's own is_default column.
train_inteSame['isDefault'] = train_inte['is_default']
use_te = train_inteSame[train_inteSame.loan_id.isin( InteId )].copy()
# Concatenate the tables
data = pd.concat([ train_data,test_public,use_te]).reset_index(drop=True)

In [None]:
# Histogram of the step-1 predicted default probability on the internet table.
plt.figure(figsize=(16,6))
plt.title("Distribution of Default values IntePre")
sns.distplot(IntePre['isDefault'],color="black", kde=True,bins=120, label='train_data')
plt.legend();plt.show()
# Rows with a label form the training set, rows without one the test set.
# Take explicit copies: train_model later writes an `isDefault` column into
# `test`, which would otherwise be an assignment on a filtered slice of `data`.
train = data[data['isDefault'].notna()].copy()
test  = data[data['isDefault'].isna()].copy()

del data
del train_data,test_public

In [None]:
# Retrain on the combined training rows and write the test predictions to nn2.csv,
# with loan_id renamed to id.
y = train['isDefault']
folds = KFold(n_splits=5, shuffle=True, random_state=SEED)
oof_preds, test_preds, importances = train_model(train, test, y, folds)
test_preds.rename({'loan_id': 'id'}, axis=1)[['id', 'isDefault']].to_csv('./nn2.csv', index=False)

## step 3:

In [None]:
import pandas as pd
# Reload the raw public tables and the step-2 predictions (nn2.csv),
# renaming the prediction id column back to loan_id.
train_data = pd.read_csv('./train_public.csv')
test_data = pd.read_csv('./test_public.csv')
sub=pd.read_csv("nn2.csv")
sub=sub.rename(columns={'id': 'loan_id'})

In [None]:
# Set every prediction below 0.5 to 0 and keep the rows whose value is now 0.
sub.loc[sub['isDefault']<0.5,'isDefault'] = 0
nw_sub=sub[(sub['isDefault']==0)]
# Attach those labels to the raw test rows (inner join on loan_id), append them
# to the public training data and save the result.
nw_test_data=test_data.merge(nw_sub,on='loan_id',how='inner')
nw_train_data = pd.concat([train_data,nw_test_data]).reset_index(drop=True)
nw_train_data.to_csv("./nw_train_public.csv",index=0)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import gc
import re
import pandas as pd
import lightgbm as lgb
import numpy as np
from sklearn.metrics import roc_auc_score, precision_recall_curve, roc_curve, average_precision_score
from sklearn.model_selection import KFold
from lightgbm import LGBMClassifier
import matplotlib.pyplot as plt
import seaborn as sns
import gc
from sklearn.model_selection import StratifiedKFold
from sklearn.cluster import KMeans
from dateutil.relativedelta import relativedelta
# Step 3 trains on nw_train_public.csv (written by the previous cell) instead of
# the raw public training file; the other tables are reloaded from disk.
train_data = pd.read_csv('./nw_train_public.csv')
submit_example = pd.read_csv('../input/ccf2021bankdefault/submit_example.csv')
test_public = pd.read_csv('./test_public.csv')
train_inte = pd.read_csv('./train_internet.csv')

# Display options, spelled with their full `display.` names: abbreviated keys
# such as 'max_columns' can match more than one pandas option and raise OptionError.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 200)
pd.set_option('display.float_format', lambda x: '%.3f' % x)
def train_model(data_, test_, y_, folds_):
    """Fit one LightGBM classifier per fold and average its predictions on ``test_``.

    Parameters
    ----------
    data_ : pandas.DataFrame
        Training rows; every column except ``loan_id``, ``user_id`` and
        ``isDefault`` is used as a feature.
    test_ : pandas.DataFrame
        Rows to score; must hold the same feature columns plus ``loan_id``.
        Side effect: its ``isDefault`` column is (over)written in place with
        the fold-averaged prediction.
    y_ : pandas.Series
        Target values, positionally aligned with ``data_``.
    folds_ : object
        Splitter exposing ``split(X)`` and ``n_splits`` (a ``KFold`` in this
        notebook).

    Returns
    -------
    tuple
        ``(oof_preds, test_[['loan_id', 'isDefault']], feature_importance_df)``:
        out-of-fold probabilities for ``data_``, the scored test ids, and the
        per-fold feature importances stacked into one frame.
    """
    oof_preds = np.zeros(data_.shape[0])
    sub_preds = np.zeros(test_.shape[0])
    fold_importances = []
    feats = [f for f in data_.columns if f not in ['loan_id', 'user_id', 'isDefault']]
    for n_fold, (trn_idx, val_idx) in enumerate(folds_.split(data_)):
        trn_x, trn_y = data_[feats].iloc[trn_idx], y_.iloc[trn_idx]
        val_x, val_y = data_[feats].iloc[val_idx], y_.iloc[val_idx]
        clf = LGBMClassifier(
            n_estimators=4000,
            learning_rate=0.08,
            num_leaves=2**5+1,
            colsample_bytree=.65,
            subsample=.9,
            max_depth=5,
            max_bin=250,
            reg_alpha=.3,
            reg_lambda=.3,
            min_split_gain=.01,
            min_child_weight=2,
            silent=-1,
            verbose=-1,
        )
        # categorical_feature is left at its default here.
        # NOTE(review): `verbose` / `early_stopping_rounds` as fit() keywords
        # depend on the installed LightGBM version - confirm before upgrading.
        clf.fit(trn_x, trn_y,
                eval_set=[(trn_x, trn_y), (val_x, val_y)],
                eval_metric='auc', verbose=100, early_stopping_rounds=40
               )

        oof_preds[val_idx] = clf.predict_proba(val_x, num_iteration=clf.best_iteration_)[:, 1]
        sub_preds += clf.predict_proba(test_[feats], num_iteration=clf.best_iteration_)[:, 1] / folds_.n_splits

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = clf.feature_importances_
        fold_importance_df["fold"] = n_fold + 1
        fold_importances.append(fold_importance_df)

        print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(val_y, oof_preds[val_idx])))
        del clf, trn_x, trn_y, val_x, val_y
        gc.collect()

    # Score against the labels that were passed in, not a same-named global.
    print('Full AUC score %.6f' % roc_auc_score(y_, oof_preds))

    # Stack the per-fold importances once instead of re-concatenating every fold.
    feature_importance_df = pd.concat(fold_importances, axis=0)

    test_['isDefault'] = sub_preds

    return oof_preds, test_[['loan_id', 'isDefault']], feature_importance_df
def workYearDIc(x):
    """Convert a raw ``work_year`` value to an integer.

    Values whose ``str()`` is ``'nan'`` map to -1. Otherwise ``'< 1'`` is
    rewritten to ``'0'`` and the first run of digits is returned as an int.
    Text that contains no digits also maps to -1 instead of raising
    AttributeError.
    """
    if str(x) == 'nan':
        return -1
    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    found = re.search(r'\d+', x.replace('< 1', '0'))
    return int(found.group()) if found else -1

def findDig(val):
    """Pad a two-part date string to three parts.

    If ``val`` contains digits immediately followed by a hyphen, ``'-01'`` is
    appended; otherwise ``'1-'`` is prepended.
    """
    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    if re.search(r'\d+-', val) is None:
        return '1-' + val
    return val + '-01'


# Maps the letters 'A'..'G' to the integers 1..7.
class_dict = {letter: rank for rank, letter in enumerate('ABCDEFG', start=1)}

#public+inte 双表

# Apply the same cleaning and date-derived features to the training table,
# the public test table and the internet table.
timeMax = pd.to_datetime('1-Dec-21')
for frame in (train_data, test_public, train_inte):
    # work_year: text -> integer (-1 when missing)
    frame['work_year'] = frame['work_year'].map(workYearDIc)

    # class: letter -> integer rank
    frame['class'] = frame['class'].map(class_dict)

    # earlies_credit_mon: pad the partial date string, then parse it
    frame['earlies_credit_mon'] = pd.to_datetime(frame['earlies_credit_mon'].map(findDig))

    # dates parsed later than timeMax are shifted back by 100 years
    after_max = frame['earlies_credit_mon'] > timeMax
    frame.loc[after_max, 'earlies_credit_mon'] = (
        frame.loc[after_max, 'earlies_credit_mon'] + pd.offsets.DateOffset(years=-100)
    )

    # issue_date and the calendar features derived from both date columns
    frame['issue_date'] = pd.to_datetime(frame['issue_date'])
    frame['issue_date_month'] = frame['issue_date'].dt.month
    frame['issue_date_dayofweek'] = frame['issue_date'].dt.dayofweek
    frame['earliesCreditMon'] = frame['earlies_credit_mon'].dt.month
    frame['earliesCreditYear'] = frame['earlies_credit_mon'].dt.year
# Drop the loop helpers so they do not keep extra references alive.
del frame, after_max

#特征构造

#月收入
# train_data['monthly_income'] = train_data['monthly_payment'] / train_data['debt_loan_ratio'] *100
# test_public['monthly_income'] = test_public['monthly_payment'] / test_public['debt_loan_ratio'] *100
# train_inte['monthly_income'] = train_inte['monthly_payment'] / train_inte['debt_loan_ratio'] *100

# Per table: mean `interest` of each post_code, blanked out (NaN) for post
# codes that occur 5 times or fewer in that table.
for frame in (train_data, test_public, train_inte):
    frame['post_code_to_mean_interst'] = frame.groupby(['post_code'])['interest'].transform('mean')
    frame['post_code_cnt'] = frame['post_code'].map(frame['post_code'].value_counts())
    frame.loc[frame['post_code_cnt'] <= 5, 'post_code_to_mean_interst'] = np.nan
    # The count is only a helper for the mask above.
    del frame['post_code_cnt']
del frame

from sklearn.preprocessing import LabelEncoder
# Define the column list BEFORE iterating over it. It used to be assigned inside
# the loop, so the loop only ran if `cat_cols` was left over from an earlier cell.
cat_cols = ['employer_type', 'industry']
for col in cat_cols:
    # The encoder is fitted on train_data only and reused for the other tables.
    lbl = LabelEncoder().fit(train_data[col])
    train_data[col] = lbl.transform(train_data[col])
    test_public[col] = lbl.transform(test_public[col])

    # Internet table
    train_inte[col] = lbl.transform(train_inte[col])

# Remove the raw date columns from all three tables.
col_to_drop = ['issue_date', 'earlies_credit_mon']
train_data = train_data.drop(col_to_drop, axis=1)
test_public = test_public.drop(col_to_drop, axis=1 )
train_inte = train_inte.drop(col_to_drop, axis=1 )

tr_cols = set(train_data.columns)
same_col = list(tr_cols.intersection(set(train_inte.columns)))
train_inteSame = train_inte[same_col].copy()

Inte_add_cos = list(tr_cols.difference(set(same_col)))
for col in Inte_add_cos:
    train_inteSame[col] = np.nan

y = train_data['isDefault']
folds = KFold(n_splits=5, shuffle=True, random_state=SEED)
oof_preds, IntePre, importances = train_model(train_data, train_inteSame, y, folds)

IntePre['isDef'] = train_inte['is_default']
from sklearn.metrics import roc_auc_score
roc_auc_score(IntePre['isDef'],IntePre.isDefault)
## 选择阈值0.05，从internet表中提取预测小于该概率的样本，并对不同来源的样本赋予来源值
InteId = IntePre.loc[IntePre.isDefault<0.5, 'loan_id'].tolist()

train_data['dataSourse'] = 1
test_public['dataSourse'] = 1
train_inteSame['dataSourse'] = 0
train_inteSame['isDefault'] = train_inte['is_default']
use_te = train_inteSame[train_inteSame.loan_id.isin( InteId )].copy()
data = pd.concat([ train_data,test_public,use_te]).reset_index(drop=True)

####****新增数据######
# interest#当前贷款利率
# debt_loan_ratio#债务收入比

# For each grouping column, broadcast the group's mean/std/sum/median of isDefault
# back onto every row of `data` as a new `label_<method>_<col>` feature.
# NOTE(review): each labelled row's own isDefault is part of its group's statistic,
# and the features are computed before the K-fold split - validation scores may be
# optimistic (target leakage); confirm this is intended.
for method in ['mean','std','sum','median']:
    for col in ['employer_type', 'industry','issue_date_month','issue_date_dayofweek','earliesCreditMon','earliesCreditYear','region']:
        data[f'label_{method}_'+str(col)] = data.groupby(col)['isDefault'].transform(method)
        
# for method in ['mean', 'max', 'min', 'std','sum','median']:
#     for col in ['employer_type', 'industry','issue_date_month','issue_date_dayofweek','earliesCreditMon','earliesCreditYear','region']:
#         data[f'interest_{method}_'+str(col)] = data.groupby(col)['known_outstanding_loan'].transform(method)
# for method in ['mean', 'max', 'min', 'std','sum','median']:
#     for col in ['employer_type', 'industry','issue_date_month','issue_date_dayofweek','earliesCreditMon','earliesCreditYear','region']:
#         data[f'debt_loan_ratio_{method}_'+str(col)] = data.groupby(col)['debt_loan_ratio'].transform(method)


# data = data.replace([-np.inf,np.inf],np.NaN)
# Kdf = data[['known_outstanding_loan','debt_loan_ratio','monthly_payment','f2','total_loan','f4','f3','interest']]
# Kdf = Kdf.fillna(0)
# kmeans = KMeans(3,random_state=2021)
# kmeans.fit(Kdf)
# identified_clusters = kmeans.predict(Kdf)
# data['k_cluster'] = identified_clusters

# for method in ['mean', 'max', 'min', 'std']:
#     for col in ['known_outstanding_loan','debt_loan_ratio','monthly_payment','f2','total_loan','f4','f3','interest']:
#         data[f'k_cluster_{method}_'+str(col)] = data.groupby('k_cluster')[col].transform(method)
        
# Histogram of the predicted default probability on the internet table.
plt.figure(figsize=(16,6))
plt.title("Distribution of Default values IntePre")
sns.distplot(IntePre['isDefault'],color="black", kde=True,bins=120, label='train_data')
# sns.distplot(train_inte[col],color="red", kde=True,bins=120, label='train_inte')
plt.legend();plt.show()
# Rows with a label form the training set, rows without one the test set.
# Take explicit copies: later cells assign into `test['isDefault']`, which would
# otherwise be an assignment on a filtered slice of `data`.
train = data[data['isDefault'].notna()].copy()
test  = data[data['isDefault'].isna()].copy()

In [None]:
# Plot the feature importances returned by the most recent train_model call.
# NOTE(review): within step 3 `display_importances` is only defined by the step-1 cell.
display_importances(importances)

In [None]:
#lgb_model_0
# Feature list: every column of `train` except the id columns and the target.
feats =[f for f in train.columns if f not in ['loan_id', 'user_id', 'isDefault']]
#lgb_model_1
#feats =[f for f in train.columns if f not in ['loan_id', 'user_id', 'isDefault','f1']]
x_train = train[feats]
y_train = train['isDefault']
x_test = test[feats]

In [None]:
# Blend buffers with one row per sample and 4 columns; the visible cells fill column 0 only.
# lgb_model_0 writes into the module-level dataset_blend_train directly.
dataset_blend_train = np.zeros((x_train.shape[0], 4))
dataset_blend_test = np.zeros((x_test.shape[0], 4))

In [None]:
def lgb_model_0(X_train, y_train, X_test, y_test=None):
    """Train a 5-fold LightGBM model and return min-max-scaled blend predictions.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features.
    y_train : pandas.Series
        Target values, positionally aligned with ``X_train``.
    X_test : pandas.DataFrame
        Features of the rows to score.
    y_test : optional
        Unused; kept so existing calls keep working.

    Returns
    -------
    tuple
        ``(dataset_blend_train, dataset_blend_test_0)``. The first element is
        the module-level ``dataset_blend_train`` array, whose column 0 is filled
        in place with each fold's out-of-fold predictions rescaled to [0, 1]
        per fold; that array must already exist with one row per training row.
        The second is a new ``(n_test, n_folds)`` array holding each fold's
        test predictions, rescaled to [0, 1] per fold.
    """
    def _min_max(values):
        # Rescale an array to [0, 1] using its own minimum and maximum.
        lowest, highest = values.min(), values.max()
        return (values - lowest) / (highest - lowest)

    folds_ = KFold(n_splits=5, shuffle=True, random_state=SEED)
    oof_preds = np.zeros(X_train.shape[0])
    cat_feats = {'industry', 'employer_type'}
    dataset_blend_test_0 = np.zeros((X_test.shape[0], folds_.n_splits))
    for n_fold, (trn_idx, val_idx) in enumerate(folds_.split(X_train)):
        trn_x, trn_y = X_train.iloc[trn_idx], y_train.iloc[trn_idx]
        val_x, val_y = X_train.iloc[val_idx], y_train.iloc[val_idx]

        clf = LGBMClassifier(
            n_estimators=4000,
            learning_rate=0.08,
            num_leaves=2**5+1,
            colsample_bytree=.65,
            subsample=.9,
            max_depth=5,
            reg_alpha=.3,
            reg_lambda=.3,
            min_split_gain=.01,
            min_child_weight=2,
            silent=-1,
            verbose=-1,
        )

        # NOTE(review): `verbose` / `early_stopping_rounds` as fit() keywords
        # depend on the installed LightGBM version - confirm before upgrading.
        clf.fit(trn_x, trn_y,
                eval_set=[(trn_x, trn_y), (val_x, val_y)], categorical_feature=cat_feats,
                eval_metric='auc', verbose=100, early_stopping_rounds=40
               )

        fold_oof = clf.predict_proba(val_x, num_iteration=clf.best_iteration_)[:, 1]
        oof_preds[val_idx] = fold_oof
        # Writes into the module-level blend array (column 0).
        dataset_blend_train[val_idx, 0] = _min_max(fold_oof)

        fold_test = clf.predict_proba(X_test, num_iteration=clf.best_iteration_)[:, 1]
        dataset_blend_test_0[:, n_fold] = _min_max(fold_test)
        print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(val_y, oof_preds[val_idx])))
    print('Full AUC score %.6f' % roc_auc_score(y_train, oof_preds))
    return dataset_blend_train, dataset_blend_test_0

In [None]:
# Run the first-round model; column 0 of the test blend is the mean over the fold columns.
dataset_blend_train,dataset_blend_test_0 = lgb_model_0(x_train, y_train, x_test)
dataset_blend_test[:,0]= dataset_blend_test_0.mean(1)

In [None]:
# Attach the blended prediction to `test`, then hard-label the two tails:
# above the 0.9 quantile -> 1, below the 0.1 quantile -> 0.
test['isDefault'] = dataset_blend_test[:,0]
thr1 = test['isDefault'].quantile(0.9)
thr2 = test['isDefault'].quantile(0.1)
# Single .loc assignments instead of chained indexing (test['isDefault'][mask] = ...),
# which is not guaranteed to write back into `test`.
test.loc[dataset_blend_test[:,0]>thr1, 'isDefault'] = 1
test.loc[dataset_blend_test[:,0]<thr2, 'isDefault'] = 0
test.shape

In [None]:
# Append the test rows whose isDefault is exactly 0 or 1 to the training set.
new_dat = test[(test['isDefault']==1) | (test['isDefault']==0)]
train = pd.concat([train,new_dat])

In [None]:
# Display the 0.9 quantile of test['isDefault'] as it stands after the hard-labelling above.
test['isDefault'].quantile(0.9)

In [None]:
#lgb_model_0
# Rebuild the feature matrices from the current `train` and `test` frames.
feats =[f for f in train.columns if f not in ['loan_id', 'user_id', 'isDefault']]
#lgb_model_1
#feats =[f for f in train.columns if f not in ['loan_id', 'user_id', 'isDefault','f1']]
x_train = train[feats]
y_train = train['isDefault']
x_test = test[feats]

In [None]:
# Re-create the blend buffers sized to the current x_train / x_test;
# lgb_model_1 writes into the module-level dataset_blend_train directly.
dataset_blend_train = np.zeros((x_train.shape[0], 4))
dataset_blend_test = np.zeros((x_test.shape[0], 4))

In [None]:
def lgb_model_1(X_train, y_train, X_test, y_test=None):
    """Train a 5-fold LightGBM model and return min-max-scaled blend predictions.

    The body works on its own arguments only; it previously read the
    module-level ``x_train`` / ``x_test`` instead of ``X_train`` / ``X_test``.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features.
    y_train : pandas.Series
        Target values, positionally aligned with ``X_train``.
    X_test : pandas.DataFrame
        Features of the rows to score.
    y_test : optional
        Unused; kept so existing calls keep working.

    Returns
    -------
    tuple
        ``(dataset_blend_train, dataset_blend_test_0)``. The first element is
        the module-level ``dataset_blend_train`` array, whose column 0 is filled
        in place with each fold's out-of-fold predictions rescaled to [0, 1]
        per fold; that array must already exist with one row per training row.
        The second is a new ``(n_test, n_folds)`` array holding each fold's
        test predictions, rescaled to [0, 1] per fold.
    """
    def _min_max(values):
        # Rescale an array to [0, 1] using its own minimum and maximum.
        lowest, highest = values.min(), values.max()
        return (values - lowest) / (highest - lowest)

    folds_ = KFold(n_splits=5, shuffle=True, random_state=SEED)
    oof_preds = np.zeros(X_train.shape[0])
    cat_feats = {'industry', 'employer_type'}
    dataset_blend_test_0 = np.zeros((X_test.shape[0], folds_.n_splits))
    for n_fold, (trn_idx, val_idx) in enumerate(folds_.split(X_train)):
        trn_x, trn_y = X_train.iloc[trn_idx], y_train.iloc[trn_idx]
        val_x, val_y = X_train.iloc[val_idx], y_train.iloc[val_idx]

        clf = LGBMClassifier(
            n_estimators=4000,
            learning_rate=0.08,
            num_leaves=2**5+1,
            colsample_bytree=.65,
            subsample=.9,
            max_depth=5,
            reg_alpha=.3,
            reg_lambda=.3,
            min_split_gain=.01,
            min_child_weight=2,
            silent=-1,
            verbose=-1,
        )

        # NOTE(review): `verbose` / `early_stopping_rounds` as fit() keywords
        # depend on the installed LightGBM version - confirm before upgrading.
        clf.fit(trn_x, trn_y,
                eval_set=[(trn_x, trn_y), (val_x, val_y)], categorical_feature=cat_feats,
                eval_metric='auc', verbose=100, early_stopping_rounds=40
               )

        fold_oof = clf.predict_proba(val_x, num_iteration=clf.best_iteration_)[:, 1]
        oof_preds[val_idx] = fold_oof
        # Writes into the module-level blend array (column 0).
        dataset_blend_train[val_idx, 0] = _min_max(fold_oof)

        fold_test = clf.predict_proba(X_test, num_iteration=clf.best_iteration_)[:, 1]
        dataset_blend_test_0[:, n_fold] = _min_max(fold_test)
        print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(val_y, oof_preds[val_idx])))
    print('Full AUC score %.6f' % roc_auc_score(y_train, oof_preds))
    return dataset_blend_train, dataset_blend_test_0

In [None]:
# Run the second-round model; column 0 of the test blend is the mean over the fold columns.
dataset_blend_train,dataset_blend_test_0 = lgb_model_1(x_train, y_train, x_test)
dataset_blend_test[:,0]= dataset_blend_test_0.mean(1)

In [None]:
# Write the blended test predictions (column 0) with the test loan ids to lgb_submit.csv.
submit = pd.DataFrame([])
submit['id']=test['loan_id']
submit['isDefault']=dataset_blend_test[:,0]
submit.to_csv('lgb_submit.csv', index=False)
# Status / score
#0.89358266770 

In [None]:
# NOTE(review): `gg` is not defined anywhere in the visible notebook, so this cell
# raises NameError and halts "Run All" here - presumably a deliberate stop before
# the ensemble section; confirm.
gg

# Ensemble

## CatBoost

### Step 1.

In [None]:
# Rebinds the module-level SEED for the cells that follow.
SEED = 7777

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import gc
import re
import pandas as pd
import lightgbm as lgb
from catboost import CatBoostClassifier
import numpy as np
from sklearn.metrics import roc_auc_score, precision_recall_curve, roc_curve, average_precision_score
from sklearn.model_selection import KFold
from lightgbm import LGBMClassifier
import matplotlib.pyplot as plt
import seaborn as sns
import gc
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold
from dateutil.relativedelta import relativedelta
# Reload the raw tables from disk for the CatBoost pipeline.
train_data = pd.read_csv('./train_public.csv')
submit_example = pd.read_csv('../input/ccf2021bankdefault/submit_example.csv')
test_public = pd.read_csv('./test_public.csv')
train_inte = pd.read_csv('./train_internet.csv')

def train_model(data_, test_, y_, folds_):
    """Fit one CatBoost classifier per fold and average its predictions on ``test_``.

    Parameters
    ----------
    data_ : pandas.DataFrame
        Training rows; every column except ``loan_id``, ``user_id`` and
        ``isDefault`` is used as a feature.
    test_ : pandas.DataFrame
        Rows to score; must hold the same feature columns plus ``loan_id``.
        Side effect: its ``isDefault`` column is (over)written in place with
        the fold-averaged prediction.
    y_ : pandas.Series
        Target values, positionally aligned with ``data_``.
    folds_ : object
        Splitter exposing ``split(X)`` and ``n_splits``.

    Returns
    -------
    tuple
        ``(oof_preds, test_[['loan_id', 'isDefault']], feature_importance_df)``:
        out-of-fold probabilities for ``data_``, the scored test ids, and the
        per-fold feature importances stacked into one frame.
    """
    oof_preds = np.zeros(data_.shape[0])
    sub_preds = np.zeros(test_.shape[0])
    fold_importances = []
    feats = [f for f in data_.columns if f not in ['loan_id', 'user_id', 'isDefault']]
    for n_fold, (trn_idx, val_idx) in enumerate(folds_.split(data_)):
        trn_x, trn_y = data_[feats].iloc[trn_idx], y_.iloc[trn_idx]
        val_x, val_y = data_[feats].iloc[val_idx], y_.iloc[val_idx]
        clf = CatBoostClassifier(
            n_estimators=5000,
            learning_rate=0.07,
            colsample_bylevel=.55,
            subsample=.8,
            logging_level='Verbose', loss_function='Logloss', eval_metric='AUC'
        )
        clf.fit(trn_x, trn_y,
                eval_set=[(trn_x, trn_y), (val_x, val_y)], use_best_model=True, early_stopping_rounds=40, verbose=100
               )
        oof_preds[val_idx] = clf.predict_proba(val_x)[:, 1]
        sub_preds += clf.predict_proba(test_[feats])[:, 1] / folds_.n_splits

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = clf.feature_importances_
        fold_importance_df["fold"] = n_fold + 1
        fold_importances.append(fold_importance_df)

        print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(val_y, oof_preds[val_idx])))
        del clf, trn_x, trn_y, val_x, val_y
        gc.collect()
    # Score against the labels that were passed in, not a same-named global.
    print('Full AUC score %.6f' % roc_auc_score(y_, oof_preds))

    # Stack the per-fold importances once instead of re-concatenating every fold.
    feature_importance_df = pd.concat(fold_importances, axis=0)

    test_['isDefault'] = sub_preds

    return oof_preds, test_[['loan_id', 'isDefault']], feature_importance_df
def display_importances(feature_importance_df_):
    """Bar-plot the 50 features with the highest mean importance and save the figure.

    ``feature_importance_df_`` needs ``feature`` and ``importance`` columns.
    The plot is written to ``lgbm_importances.png``.
    """
    mean_importance = (
        feature_importance_df_[["feature", "importance"]]
        .groupby("feature")
        .mean()
        .sort_values(by="importance", ascending=False)
    )
    top_features = mean_importance.index[:50]

    # Keep every per-fold row of the selected features, largest importance first.
    is_top = feature_importance_df_["feature"].isin(top_features)
    plot_rows = feature_importance_df_.loc[is_top].sort_values(by="importance", ascending=False)

    plt.figure(figsize=(8, 10))
    sns.barplot(x="importance", y="feature", data=plot_rows)
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()
    plt.savefig('lgbm_importances.png')
def workYearDIc(x):
    """Convert a raw ``work_year`` value to an integer.

    Values whose ``str()`` is ``'nan'`` map to -1. Otherwise ``'< 1'`` is
    rewritten to ``'0'`` and the first run of digits is returned as an int.
    Text that contains no digits also maps to -1 instead of raising
    AttributeError.
    """
    if str(x) == 'nan':
        return -1
    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    found = re.search(r'\d+', x.replace('< 1', '0'))
    return int(found.group()) if found else -1
def findDig(val):
    """Pad a two-part date string to three parts.

    If ``val`` contains digits immediately followed by a hyphen, ``'-01'`` is
    appended; otherwise ``'1-'`` is prepended.
    """
    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    if re.search(r'\d+-', val) is None:
        return '1-' + val
    return val + '-01'
# Maps the letters 'A'..'G' to the integers 1..7.
class_dict = {letter: rank for rank, letter in enumerate('ABCDEFG', start=1)}
#public+inte 双表

# Apply the same cleaning and date-derived features to the public train,
# public test and internet tables.
timeMax = pd.to_datetime('1-Dec-21')
for frame in (train_data, test_public, train_inte):
    # work_year: text -> integer (-1 when missing)
    frame['work_year'] = frame['work_year'].map(workYearDIc)

    # class: letter -> integer rank
    frame['class'] = frame['class'].map(class_dict)

    # earlies_credit_mon: pad the partial date string, then parse it
    frame['earlies_credit_mon'] = pd.to_datetime(frame['earlies_credit_mon'].map(findDig))

    # dates parsed later than timeMax are shifted back by 100 years
    after_max = frame['earlies_credit_mon'] > timeMax
    frame.loc[after_max, 'earlies_credit_mon'] = (
        frame.loc[after_max, 'earlies_credit_mon'] + pd.offsets.DateOffset(years=-100)
    )

    # issue_date and the calendar features derived from both date columns
    frame['issue_date'] = pd.to_datetime(frame['issue_date'])
    frame['issue_date_month'] = frame['issue_date'].dt.month
    frame['issue_date_dayofweek'] = frame['issue_date'].dt.dayofweek
    frame['earliesCreditMon'] = frame['earlies_credit_mon'].dt.month
# Drop the loop helpers so they do not keep extra references alive.
del frame, after_max

train_data['earliesCreditYear'] = train_data['earlies_credit_mon'].dt.year
test_public['earliesCreditYear'] = test_public['earlies_credit_mon'].dt.year
train_inte['earliesCreditYear'] = train_inte['earlies_credit_mon'].dt.year

#特征构造

#月收入
# train_data['monthly_income'] = train_data['monthly_payment'] / train_data['debt_loan_ratio'] *100
# test_public['monthly_income'] = test_public['monthly_payment'] / test_public['debt_loan_ratio'] *100
# train_inte['monthly_income'] = train_inte['monthly_payment'] / train_inte['debt_loan_ratio'] *100

train_data['post_code_to_mean_interst'] = train_data.groupby(['post_code'])['interest'].transform('mean')
test_public['post_code_to_mean_interst'] = test_public.groupby(['post_code'])['interest'].transform('mean')
train_inte['post_code_to_mean_interst'] = train_inte.groupby(['post_code'])['interest'].transform('mean')

train_data['post_code_cnt'] = train_data['post_code'].map(train_data['post_code'].value_counts())
train_data.loc[train_data['post_code_cnt']<=5,'post_code_to_mean_interst'] = np.nan
test_public['post_code_cnt'] = test_public['post_code'].map(test_public['post_code'].value_counts())
test_public.loc[test_public['post_code_cnt']<=5,'post_code_to_mean_interst'] = np.nan
train_inte['post_code_cnt'] = train_inte['post_code'].map(train_inte['post_code'].value_counts())
train_inte.loc[train_inte['post_code_cnt']<=5,'post_code_to_mean_interst'] = np.nan


del train_data['post_code_cnt']
del test_public['post_code_cnt']
del train_inte['post_code_cnt']
# Encode the categorical features as integers.
from sklearn.preprocessing import LabelEncoder
cat_cols = ['employer_type', 'industry']
for col in cat_cols:
    # The encoder is fitted on train_data only and then reused for the other frames.
    # NOTE(review): transform() presumably fails on categories that never occur in
    # train_data — confirm all three tables share the same category values.
    lbl = LabelEncoder().fit(train_data[col])
    train_data[col] = lbl.transform(train_data[col])
    test_public[col] = lbl.transform(test_public[col])
    train_inte[col] = lbl.transform(train_inte[col])
# The raw date columns are dropped now that the derived features exist.
col_to_drop = ['issue_date', 'earlies_credit_mon']
train_data = train_data.drop(col_to_drop, axis=1)
test_public = test_public.drop(col_to_drop, axis=1 )
train_inte = train_inte.drop(col_to_drop, axis=1 )
# Keep the columns train_data and train_inte have in common; columns that exist
# only in train_data are added to the internet frame filled with NaN.
tr_cols = set(train_data.columns)
same_col = list(tr_cols.intersection(set(train_inte.columns)))
train_inteSame = train_inte[same_col].copy()
Inte_add_cos = list(tr_cols.difference(set(same_col)))
for col in Inte_add_cos:
    train_inteSame[col] = np.nan
# Pseudo-label learning: train on train_data, then predict train_inteSame.
y = train_data['isDefault']
# Bare expression in the middle of a cell: its value is not displayed.
train_data.shape
folds = KFold(n_splits=5, shuffle=True, random_state=SEED)
# IntePre holds loan_id plus the predicted default probability for each internet row.
oof_preds, IntePre, importances = train_model(train_data, train_inteSame, y, folds)

In [None]:
# AUC of the step-1 predictions against the internet table's own labels.
from sklearn.metrics import roc_auc_score
# Assignment aligns on the index; assumes IntePre and train_inte still share
# the same row index — TODO confirm.
IntePre['isDef'] = train_inte['is_default']
roc_auc_score(IntePre['isDef'],IntePre.isDefault)

### Step 2.

In [None]:
# Disabled experiment: build nw_train_public.csv from an earlier submission (nn2.csv).
# import pandas as pd
# train_data = pd.read_csv('./train_public.csv')
# test_data = pd.read_csv('./test_public.csv')
# sub=pd.read_csv("nn2.csv")
# sub=sub.rename(columns={'id': 'loan_id'})
# thr = sub['isDefault'].quantile(0.9)
# sub.loc[sub['isDefault']>thr,'isDefault'] = 1
# sub.loc[sub['isDefault']<0.5,'isDefault'] = 0
# nw_sub=sub[(sub['isDefault']==1)|(sub['isDefault']==0)]
# nw_test_data=test_data.merge(nw_sub,on='loan_id',how='inner')
# nw_train_data = pd.concat([train_data,nw_test_data]).reset_index(drop=True)
# nw_train_data.to_csv("./nw_train_public.csv",index=0)
## Threshold 0.05: take the internet-table rows whose predicted probability is
## below it, and tag every row with the table it came from.
InteId = IntePre.loc[IntePre.isDefault<0.05, 'loan_id'].tolist()
# New categorical feature marking the source domain (1 = public tables, 0 = internet table).
train_data['dataSourse'] = 1
test_public['dataSourse'] = 1
train_inteSame['dataSourse'] = 0
# Replaces the predictions train_model wrote into this column with the real labels.
train_inteSame['isDefault'] = train_inte['is_default']
use_te = train_inteSame[train_inteSame.loan_id.isin( InteId )].copy()
# Stack the three tables; rows are later split on whether isDefault is NaN.
data = pd.concat([ train_data,test_public,use_te]).reset_index(drop=True)

In [None]:
# Distribution of the step-1 default probabilities predicted for the internet table.
plt.figure(figsize=(16,6))
plt.title("Distribution of Default values IntePre")
# histplot replaces the deprecated sns.distplot; stat="density" keeps the
# normalised y-axis that distplot drew together with its KDE curve.
sns.histplot(IntePre['isDefault'], color="black", kde=True, bins=120, stat="density", label='train_data')
plt.legend();plt.show()
# Labelled rows form the training set, unlabelled rows the test set.
# copy() makes both frames independent of `data`: train_model later assigns
# an 'isDefault' column on its test frame, which must not happen on a slice.
train = data[data['isDefault'].notna()].copy()
test  = data[data['isDefault'].isna()].copy()

# Release the intermediate frames; this cell cannot be re-run without rebuilding them.
del data
del train_data,test_public

In [None]:
# Step-2 model: cross-validated training on the merged table, scoring `test`.
y = train['isDefault']
folds = KFold(n_splits=5, shuffle=True, random_state=SEED)
oof_preds, test_preds, importances = train_model(train, test, y, folds)
# Save the predictions in submission layout (id, isDefault) as nn3.csv.
(
    test_preds
    .rename(columns={'loan_id': 'id'})
    .loc[:, ['id', 'isDefault']]
    .to_csv('./nn3.csv', index=False)
)

### Step 3.

In [None]:
import pandas as pd
# Reload the unprocessed public tables from disk.
train_data = pd.read_csv('./train_public.csv')
test_data = pd.read_csv('./test_public.csv')
# Step-2 predictions; the key is renamed back so it can be joined to test_data.
sub = pd.read_csv("nn3.csv").rename(columns={'id': 'loan_id'})

In [None]:
# Pseudo-labelling: every step-2 prediction below 0.5 becomes a hard label 0.
sub.loc[sub['isDefault'] < 0.5, 'isDefault'] = 0
nw_sub = sub[sub['isDefault'] == 0]
# Attach the label to the matching test rows (rows without a hard label drop out).
nw_test_data = pd.merge(test_data, nw_sub, on='loan_id', how='inner')
# Append those rows to the public training table and store the result.
nw_train_data = pd.concat([train_data, nw_test_data]).reset_index(drop=True)
nw_train_data.to_csv("./nw_train_public.csv", index=False)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import gc
import re
import pandas as pd
import lightgbm as lgb
import numpy as np
from sklearn.metrics import roc_auc_score, precision_recall_curve, roc_curve, average_precision_score
from sklearn.model_selection import KFold
from lightgbm import LGBMClassifier
import matplotlib.pyplot as plt
import seaborn as sns
import gc
from sklearn.model_selection import StratifiedKFold
from sklearn.cluster import KMeans
from dateutil.relativedelta import relativedelta
# Step-3 inputs. train_data is the augmented table saved as ./nw_train_public.csv
# (public training rows plus pseudo-labelled test rows); the other tables are
# read unprocessed.
train_data = pd.read_csv('./nw_train_public.csv')
submit_example = pd.read_csv('../input/ccf2021bankdefault/submit_example.csv')
test_public = pd.read_csv('./test_public.csv')
train_inte = pd.read_csv('./train_internet.csv')

# Display settings. The fully qualified option names are required: short
# patterns such as 'max_columns' also match 'styler.render.max_columns' on
# recent pandas releases and raise OptionError ("Pattern matched multiple keys").
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 200)
pd.set_option('display.float_format', lambda x: '%.3f' % x)
def train_model(data_, test_, y_, folds_):
    """Train one CatBoost classifier per fold and average the test predictions.

    Args:
        data_: Training frame. Every column except 'loan_id', 'user_id' and
            'isDefault' is used as a feature.
        test_: Frame to score; must contain the feature columns and
            'loan_id'. Its 'isDefault' column is created or overwritten in
            place with the averaged predictions.
        y_: Target Series, positionally aligned with ``data_``.
        folds_: Splitter providing ``split(data_)`` and ``n_splits``.

    Returns:
        tuple: ``(oof_preds, test_[['loan_id', 'isDefault']], feature_importance_df)``
        where ``oof_preds`` holds the out-of-fold probability of every
        training row and ``feature_importance_df`` has one row per feature
        and fold.
    """
    # NOTE(review): CatBoostClassifier is not imported in this cell — confirm
    # that `from catboost import CatBoostClassifier` runs somewhere earlier.
    oof_preds = np.zeros(data_.shape[0])
    sub_preds = np.zeros(test_.shape[0])
    fold_importances = []
    feats = [f for f in data_.columns if f not in ['loan_id', 'user_id', 'isDefault'] ]
    for n_fold, (trn_idx, val_idx) in enumerate(folds_.split(data_)):
        trn_x, trn_y = data_[feats].iloc[trn_idx], y_.iloc[trn_idx]
        val_x, val_y = data_[feats].iloc[val_idx], y_.iloc[val_idx]
        clf = CatBoostClassifier(
            n_estimators=5000,
            learning_rate=0.07,
            colsample_bylevel=.55,
            subsample=.8,
            logging_level='Verbose', loss_function='Logloss', eval_metric='AUC'
        )
        clf.fit(trn_x, trn_y,
                eval_set=[(trn_x, trn_y), (val_x, val_y)],
                use_best_model=True, early_stopping_rounds=40, verbose=100)
        oof_preds[val_idx] = clf.predict_proba(val_x)[:, 1]
        # Each fold contributes an equal share to the final test prediction.
        sub_preds += clf.predict_proba(test_[feats])[:, 1] / folds_.n_splits

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = clf.feature_importances_
        fold_importance_df["fold"] = n_fold + 1
        fold_importances.append(fold_importance_df)

        print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(val_y, oof_preds[val_idx])))
        del clf, trn_x, trn_y, val_x, val_y
        gc.collect()

    # Score against the target that was passed in, not a module-level `y`.
    print('Full AUC score %.6f' % roc_auc_score(y_, oof_preds))

    # Concatenate once instead of growing a frame inside the loop.
    if fold_importances:
        feature_importance_df = pd.concat(fold_importances, axis=0)
    else:
        feature_importance_df = pd.DataFrame()

    test_['isDefault'] = sub_preds

    return oof_preds, test_[['loan_id', 'isDefault']], feature_importance_df
def workYearDIc(x):
    """Convert a raw ``work_year`` cell into an integer number of years.

    Args:
        x: Raw value; text containing a number of years (the prefix
            ``'< 1'`` is treated as 0), or NaN for a missing entry.

    Returns:
        int: The first run of digits found in ``x``, or -1 when ``x`` is
        missing or contains no digits at all.
    """
    # Missing values arrive as float NaN, whose string form is 'nan'.
    if str(x)=='nan':
        return -1
    x = str(x).replace('< 1','0')
    # Raw string: '\d' inside a plain literal is an invalid escape sequence.
    found = re.search(r'\d+', x)
    if found is None:
        # No digits: report it as missing instead of failing on None.group().
        return -1
    return int(found.group())

def findDig(val):
    """Pad a two-part date string so that it has three dash-separated parts.

    Args:
        val (str): Raw ``earlies_credit_mon`` text.

    Returns:
        str: ``val + '-01'`` when ``val`` contains digits immediately
        followed by a dash, otherwise ``'1-' + val``.
    """
    # Raw string: '\d' inside a plain literal is an invalid escape sequence.
    if re.search(r'\d+-', val) is None:
        return '1-'+val
    return val + '-01'


# Ordinal code for the loan grade letters: 'A' -> 1 ... 'G' -> 7.
class_dict = {grade: rank for rank, grade in enumerate('ABCDEFG', start=1)}

# Shared preprocessing for the two public tables and the internet table.
# Each frame goes through exactly the same steps and is modified in place.
timeMax = pd.to_datetime('1-Dec-21')
for _frame in (train_data, test_public, train_inte):
    # work_year: raw text -> integer years
    _frame['work_year'] = _frame['work_year'].map(workYearDIc)

    # class: grade letter -> ordinal code
    _frame['class'] = _frame['class'].map(class_dict)

    # earlies_credit_mon: pad the raw text, then parse it as a timestamp
    _frame['earlies_credit_mon'] = pd.to_datetime(_frame['earlies_credit_mon'].map(findDig))

    # Timestamps later than timeMax are moved back by 100 years
    _in_future = _frame['earlies_credit_mon'] > timeMax
    _frame.loc[_in_future, 'earlies_credit_mon'] = (
        _frame.loc[_in_future, 'earlies_credit_mon'] + pd.offsets.DateOffset(years=-100)
    )

    # issue_date: parse, then derive calendar features
    _frame['issue_date'] = pd.to_datetime(_frame['issue_date'])
    _frame['issue_date_month'] = _frame['issue_date'].dt.month
    _frame['issue_date_dayofweek'] = _frame['issue_date'].dt.dayofweek

    # Month and year of the earliest credit record
    _frame['earliesCreditMon'] = _frame['earlies_credit_mon'].dt.month
    _frame['earliesCreditYear'] = _frame['earlies_credit_mon'].dt.year

    # Feature construction.
    # (A monthly_income = monthly_payment / debt_loan_ratio * 100 feature is left disabled.)

    # Mean interest per post_code, computed inside this frame only
    _frame['post_code_to_mean_interst'] = _frame.groupby(['post_code'])['interest'].transform('mean')

    # Blank the mean for post codes that occur at most 5 times; the count column is temporary
    _frame['post_code_cnt'] = _frame['post_code'].map(_frame['post_code'].value_counts())
    _frame.loc[_frame['post_code_cnt'] <= 5, 'post_code_to_mean_interst'] = np.nan
    del _frame['post_code_cnt']

from sklearn.preprocessing import LabelEncoder
# The list has to exist before the loop iterates over it. It used to be
# assigned inside the loop body, so the cell only ran when an earlier cell
# had already left a `cat_cols` variable in the kernel.
cat_cols = ['employer_type', 'industry']
for col in cat_cols:
    # The encoder is fitted on train_data only and reused for the other frames.
    lbl = LabelEncoder().fit(train_data[col])
    train_data[col] = lbl.transform(train_data[col])
    test_public[col] = lbl.transform(test_public[col])

    # Internet table
    train_inte[col] = lbl.transform(train_inte[col])

# The raw date columns are dropped now that the derived features exist.
col_to_drop = ['issue_date', 'earlies_credit_mon']
train_data = train_data.drop(col_to_drop, axis=1)
test_public = test_public.drop(col_to_drop, axis=1 )
train_inte = train_inte.drop(col_to_drop, axis=1 )

# Keep the columns both tables share; columns that exist only in train_data
# are added to the internet frame filled with NaN.
tr_cols = set(train_data.columns)
same_col = list(tr_cols.intersection(set(train_inte.columns)))
train_inteSame = train_inte[same_col].copy()

Inte_add_cos = list(tr_cols.difference(set(same_col)))
for col in Inte_add_cos:
    train_inteSame[col] = np.nan

# Train on the augmented public table and predict the internet rows.
y = train_data['isDefault']
folds = KFold(n_splits=5, shuffle=True, random_state=SEED)
oof_preds, IntePre, importances = train_model(train_data, train_inteSame, y, folds)

# Compare the predictions for the internet rows with that table's own labels.
IntePre['isDef'] = train_inte['is_default']
from sklearn.metrics import roc_auc_score
# Bare expression in the middle of a cell: the AUC is computed but not displayed.
roc_auc_score(IntePre['isDef'],IntePre.isDefault)
## Take the internet-table rows whose predicted probability is below the
## threshold, and tag every row with the table it came from.
## NOTE(review): the threshold used here is 0.5, while the original comment
## (and the step-2 cell) said 0.05 — confirm which value is intended.
InteId = IntePre.loc[IntePre.isDefault<0.5, 'loan_id'].tolist()

# Source-domain flag: 1 = public tables, 0 = internet table.
train_data['dataSourse'] = 1
test_public['dataSourse'] = 1
train_inteSame['dataSourse'] = 0
# Replaces the predictions train_model wrote into this column with the real labels.
train_inteSame['isDefault'] = train_inte['is_default']
use_te = train_inteSame[train_inteSame.loan_id.isin( InteId )].copy()
data = pd.concat([ train_data,test_public,use_te]).reset_index(drop=True)

####**** Added features ######
# interest: current loan interest rate
# debt_loan_ratio: debt-to-income ratio

# Per-category statistics of the target, broadcast back onto every row of `data`.
# NOTE(review): the statistics are computed over all rows of `data` at once, so
# every labelled row's own isDefault value is part of its own feature and the
# cross-validation folds are not respected — presumably this inflates the
# validation AUC; confirm whether out-of-fold encoding is wanted.
for method in ['mean','std','sum','median']:
    for col in ['employer_type', 'industry','issue_date_month','issue_date_dayofweek','earliesCreditMon','earliesCreditYear','region']:
        data[f'label_{method}_'+str(col)] = data.groupby(col)['isDefault'].transform(method)
        
# Disabled experiments: group statistics of other numeric columns.
# for method in ['mean', 'max', 'min', 'std','sum','median']:
#     for col in ['employer_type', 'industry','issue_date_month','issue_date_dayofweek','earliesCreditMon','earliesCreditYear','region']:
#         data[f'interest_{method}_'+str(col)] = data.groupby(col)['known_outstanding_loan'].transform(method)
# for method in ['mean', 'max', 'min', 'std','sum','median']:
#     for col in ['employer_type', 'industry','issue_date_month','issue_date_dayofweek','earliesCreditMon','earliesCreditYear','region']:
#         data[f'debt_loan_ratio_{method}_'+str(col)] = data.groupby(col)['debt_loan_ratio'].transform(method)


# Disabled experiment: KMeans cluster id and per-cluster statistics.
# data = data.replace([-np.inf,np.inf],np.NaN)
# Kdf = data[['known_outstanding_loan','debt_loan_ratio','monthly_payment','f2','total_loan','f4','f3','interest']]
# Kdf = Kdf.fillna(0)
# kmeans = KMeans(3,random_state=2021)
# kmeans.fit(Kdf)
# identified_clusters = kmeans.predict(Kdf)
# data['k_cluster'] = identified_clusters

# for method in ['mean', 'max', 'min', 'std']:
#     for col in ['known_outstanding_loan','debt_loan_ratio','monthly_payment','f2','total_loan','f4','f3','interest']:
#         data[f'k_cluster_{method}_'+str(col)] = data.groupby('k_cluster')[col].transform(method)
        
# Distribution of the default probabilities predicted for the internet table.
plt.figure(figsize=(16,6))
plt.title("Distribution of Default values IntePre")
# histplot replaces the deprecated sns.distplot; stat="density" keeps the
# normalised y-axis that distplot drew together with its KDE curve.
sns.histplot(IntePre['isDefault'], color="black", kde=True, bins=120, stat="density", label='train_data')
# sns.distplot(train_inte[col],color="red", kde=True,bins=120, label='train_inte')
plt.legend();plt.show()
# Labelled rows form the training set, unlabelled rows the test set.
# copy() makes both frames independent of `data`, because a later cell
# assigns test['isDefault'] and that must not happen on a slice.
train = data[data['isDefault'].notna()].copy()
test  = data[data['isDefault'].isna()].copy()

In [None]:
# Feature set for lgb_model_0: every column except the identifiers and the target.
# (The lgb_model_1 variant additionally excludes 'f1'; it is not active here.)
feats = [name for name in train.columns if name not in ('loan_id', 'user_id', 'isDefault')]
x_train = train.loc[:, feats]
y_train = train.loc[:, 'isDefault']
x_test = test.loc[:, feats]

In [None]:
# Buffers for the blended predictions: one row per sample, four columns.
# cat_model_0 writes into column 0 of dataset_blend_train through the module-level name.
dataset_blend_train = np.zeros((x_train.shape[0], 4))
dataset_blend_test = np.zeros((x_test.shape[0], 4))

In [None]:
def _min_max_scale(values):
    """Rescale a 1-D array linearly so that its minimum maps to 0 and its maximum to 1."""
    lowest = values.min()
    highest = values.max()
    return (values - lowest) / (highest - lowest)


def cat_model_0(X_train, y_train, X_test, y_test=None):
    """Cross-validated CatBoost model producing min-max scaled blend features.

    Args:
        X_train: Training feature frame.
        y_train: Target Series, positionally aligned with ``X_train``.
        X_test: Test feature frame with the same columns as ``X_train``.
        y_test: Unused; kept so existing calls stay valid.

    Returns:
        tuple: ``(dataset_blend_train, dataset_blend_test_0)``. The first is
        the module-level buffer of that name, whose column 0 is filled with
        the scaled out-of-fold predictions (scaled per fold). The second has
        one column of scaled test predictions per fold.

    Side effects:
        Writes into the module-level ``dataset_blend_train`` array, which has
        to exist with one row per training sample before the call.
    """
    # NOTE(review): CatBoostClassifier is not imported in the visible cells —
    # confirm that `from catboost import CatBoostClassifier` runs earlier.
    folds_ = KFold(n_splits=5, shuffle=True, random_state=SEED)
    oof_preds = np.zeros(X_train.shape[0])
    # One column per fold, sized from the splitter instead of a literal 5.
    dataset_blend_test_0 = np.zeros((X_test.shape[0], folds_.n_splits))
    for n_fold, (trn_idx, val_idx) in enumerate(folds_.split(X_train)):
        trn_x, trn_y = X_train.iloc[trn_idx], y_train.iloc[trn_idx]
        val_x, val_y = X_train.iloc[val_idx], y_train.iloc[val_idx]

        clf = CatBoostClassifier(
            n_estimators=5000,
            learning_rate=0.07,
            colsample_bylevel=.55,
            subsample=.8,
            logging_level='Verbose', loss_function='Logloss', eval_metric='AUC'
        )
        clf.fit(trn_x, trn_y,
                eval_set=[(trn_x, trn_y), (val_x, val_y)],
                use_best_model=True, early_stopping_rounds=40, verbose=100)

        # Raw probabilities are kept for the AUC; the blend buffers receive
        # the values rescaled to [0, 1].
        fold_oof = clf.predict_proba(val_x)[:, 1]
        oof_preds[val_idx] = fold_oof
        dataset_blend_train[val_idx,0] = _min_max_scale(fold_oof)

        sub_preds = clf.predict_proba(X_test)[:, 1]
        dataset_blend_test_0[:,n_fold] = _min_max_scale(sub_preds)
        print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(val_y, oof_preds[val_idx])))
    print('Full AUC score %.6f' % roc_auc_score(y_train, oof_preds))
    return dataset_blend_train,dataset_blend_test_0

In [None]:
# Run the CatBoost model; the test prediction is the mean over the per-fold columns.
dataset_blend_train,dataset_blend_test_0 = cat_model_0(x_train, y_train, x_test)
dataset_blend_test[:,0]= dataset_blend_test_0.mean(1)

In [None]:
# Turn the most confident test predictions into hard pseudo-labels:
# above the 90th percentile -> 1, below the 10th percentile -> 0.
test['isDefault'] = dataset_blend_test[:,0]
thr1 = test['isDefault'].quantile(0.9)
thr2 = test['isDefault'].quantile(0.1)
# A single .loc assignment replaces the chained test['isDefault'][mask] = ...,
# which triggers SettingWithCopyWarning and is not guaranteed to write through.
test.loc[dataset_blend_test[:,0]>thr1, 'isDefault'] = 1
test.loc[dataset_blend_test[:,0]<thr2, 'isDefault'] = 0
test.shape

In [None]:
# Test rows whose isDefault is exactly 0 or 1 are appended to the training set.
# Re-running this cell appends them again.
new_dat = test[test['isDefault'].isin([0, 1])]
train = pd.concat([train, new_dat])

In [None]:
# Rebuild the matrices from the enlarged training set.
# Feature set for lgb_model_0: every column except the identifiers and the target.
# (The lgb_model_1 variant additionally excludes 'f1'; it is not active here.)
feats = [name for name in train.columns if name not in ('loan_id', 'user_id', 'isDefault')]
x_train = train.loc[:, feats]
y_train = train.loc[:, 'isDefault']
x_test = test.loc[:, feats]

In [None]:
# Re-create the blend buffers so that they match the enlarged training set.
dataset_blend_train = np.zeros((x_train.shape[0], 4))
dataset_blend_test = np.zeros((x_test.shape[0], 4))

In [None]:
# NOTE(review): lgb_model_1 is not defined in the cells shown here — confirm it
# is defined earlier and returns the same pair as cat_model_0.
dataset_blend_train,dataset_blend_test_0 = lgb_model_1(x_train, y_train, x_test)
# The test prediction is the mean over the per-fold columns.
dataset_blend_test[:,0]= dataset_blend_test_0.mean(1)

In [None]:
# Submission file: loan ids of the test rows with the blended prediction.
# NOTE(review): the file is named cat_submit.csv although the preceding cell
# calls lgb_model_1 — confirm the intended model.
submit = pd.DataFrame({
    'id': test['loan_id'],
    'isDefault': dataset_blend_test[:,0],
})
submit.to_csv('cat_submit.csv', index=False)
# Status / score
# 0.89358266770

In [None]:
# Load the submissions to be blended.
# NOTE(review): lgb_submit.csv is not written by any cell shown here — confirm where it comes from.
sub1 = pd.read_csv('./lgb_submit.csv')
sub2 = pd.read_csv('./cat_submit.csv')
sub3 = pd.read_csv('../input/ccf2021bankdefault/sub897.csv')
sub2.head()

In [None]:
# Weighted blend: 0.6 * sub3 + 0.4 * sub1, stored in sub2's frame.
# NOTE(review): sub2's own predictions are overwritten and take no part in the
# blend, and rows are combined by index — confirm that this is intended and
# that all three files list the ids in the same order.
sub2.isDefault = (sub3.isDefault*0.6+sub1.isDefault*0.4)
sub2.to_csv('final_sub.csv',index=False)

In [None]:
# Distribution of the blended predictions written to final_sub.csv.
sns.displot(sub2.isDefault)

In [None]:
# Distribution of the sub897.csv predictions, for comparison.
sns.displot(sub3.isDefault)

In [None]:
# Stacking experiment: a logistic regression on transformed predictions of two models.
# NOTE(review): oof_preds2 and test_preds2 are not defined in the cells shown
# here — confirm an earlier cell creates them before running this one.
lgb1_train = oof_preds
lgb2_train = oof_preds2
lgb1_test = dataset_blend_test[:,0]
lgb2_test = test_preds2.isDefault
# Meta-feature matrices with six columns; only columns 0-3 are filled below,
# columns 4 and 5 stay zero.
train_1 = np.zeros((oof_preds.shape[0],6))
test_1 = np.zeros((test.shape[0],6))
# Model 1: squared prediction and exp(prediction)
train_1[:,0] = lgb1_train**2
test_1[:,0] = lgb1_test**2
train_1[:,1] = np.exp(lgb1_train)
test_1[:,1] = np.exp(lgb1_test)

# Model 2: squared prediction and exp(prediction)
train_1[:,2] = lgb2_train**2
test_1[:,2] = lgb2_test**2
train_1[:,3] = np.exp(lgb2_train)
test_1[:,3] = np.exp(lgb2_test)

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
#train_ = pd.DataFrame(train_1)
#test_ = pd.DataFrame(test_1)
clf = LogisticRegression()
#clf = GradientBoostingClassifier(learning_rate=0.02, subsample=0.5, max_depth=6, n_estimators=30)
#clf= RandomForestClassifier(n_estimators=50, n_jobs=-1, criterion='gini')
# NOTE(review): train_1 has oof_preds.shape[0] rows while y_train comes from the
# enlarged `train` frame — confirm that both have the same length.
clf.fit(train_1, y_train)
y_emb = clf.predict_proba(test_1)[:, 1]
# Min-max scaling; the lambda reads minmin/maxmax when it is called, so they
# only need to be assigned before vfunc(y_emb) runs.
vfunc = np.vectorize(lambda x:(x-minmin)/(maxmax-minmin))
minmin= min(y_emb)
maxmax= max(y_emb)
y_emb1=vfunc(y_emb)

In [None]:
# sub1 = pd.read_csv('baseline_catboost.csv')
# sub2 = pd.read_csv('baseline_lgb.csv')
# sub1['isDefault'] = (sub1['isDefault']+sub2['isDefault'])/2
# sub1.to_csv('cat_lgb_sub.csv',index=False)
