In [3]:
!pwd
!ls

/d/GH/GitWorkSpace/bank_model_competiton/data/v26
loan_predict_rank_2.py
model_gbdt_v26.ipynb
model_lgb_v26.ipynb
model_xgb_v26.ipynb
process_v26.ipynb


In [9]:
import pandas as pd 
import matplotlib.pyplot as plt
import statistics
import datetime
import seaborn as sns
import os
import numpy as np
import time 
from sklearn.preprocessing import OrdinalEncoder

pd.set_option('display.max_columns',200)
pd.set_option('display.max_rows',200)


plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

suffix = os.path.split(os.getcwd())[-1]

root_dir='../../'

train_path=root_dir+'train.csv'
train_tx_path=root_dir+'train_bank_statement.csv'
output_train_path = "train.dat.%s" % suffix

test_path=root_dir+'testaa.csv'
test_tx_path=root_dir+'testaa_bank_statement.csv'
output_test_path = "test.dat.%s" % suffix

print(suffix)
print('process time : ',time.strftime( '%Y-%m-%d %H:%M:%S', time.localtime()))

v26
process time :  2025-09-01 21:33:33


In [55]:
# debug 模式
NROWS = 10000000000000000000000

#  基础特征

In [70]:
def get_sub_grade(grade, sub):
    return grade*10+int(sub[1])

def trans_date(issueDate):
    year,month,day = issueDate.split('-')
    return int(year)*12+int(month)-1

In [75]:
#  读入数据
df_train  = pd.read_csv(train_path, index_col=['id'], nrows=NROWS)
df_test   = pd.read_csv(test_path,  index_col=['id'], nrows=NROWS)

df_bank_train  = pd.read_csv(train_tx_path, index_col=['id'], nrows=NROWS)
df_bank_test   = pd.read_csv(test_tx_path,  index_col=['id'], nrows=NROWS)

df_train_test    = pd.concat([df_train,    df_test],    axis=0)
df_bank_train_test = pd.concat([df_bank_train, df_bank_test], axis=0)
print(df_train.shape)
print(df_test.shape)

#  特征加工
df_train_test['balance_account_avg'] = df_train_test['balance'] / df_train_test['balance_accounts']
df_train_test['loan_term_avg'] = df_train_test['loan'] / df_train_test['term']
df_train_test['balance_accounts_ratio'] = df_train_test['balance_accounts'] / df_train_test['total_accounts']

#log 处理
df_train_test['loan_log'] = df_train_test['loan'].apply(lambda x : np.log(x))
df_train_test['balance_account_avg_log'] = df_train_test['balance_account_avg'].apply(lambda x : np.log(x))
df_train_test['loan_term_avg_log'] = df_train_test['loan_term_avg'].apply(lambda x : np.log(x))
df_train_test['balance_accounts_ratio_log'] = df_train_test['balance_accounts_ratio'].apply(lambda x : np.log(x))
df_train_test['interest_rate_log'] = df_train_test['interest_rate'].apply(lambda x : np.log(x))

df_train_test['balance_log'] = df_train_test['balance'].apply(lambda x : np.log(x))
df_train_test['balance_limit_log'] = df_train_test['balance_limit'].apply(lambda x : np.log(x))
df_train_test['balance_accounts_log'] = df_train_test['balance_accounts'].apply(lambda x : np.log(x))

# zip code
df_train_test['zip_province']  = df_train_test['zip_code'].apply(lambda x : str(x)[:2])
df_train_test['zip_city']      = df_train_test['zip_code'].apply(lambda x : str(x)[:4])

#level 处理
df_train_test['level_hash'] = df_train_test['level'].apply(lambda x : hash(x) % 1000 )
encoder = OrdinalEncoder(categories=[['A0','A1','A2','A3','A4','A5','B0','B1','B2','B3','B4','B5','C0','C1','C2','C3','C4','C5','D0','D1','D2','D3','D4','D5','E0','E1','E2','E3','E4','E5']]) 
df_train_test['level_ord'] = encoder.fit_transform(df_train_test[['level']].values)

df_train_test['grade'] = df_train_test['level'].apply( lambda x : str(x)[0])  #提取ABCDE
df_train_test['grade'] = df_train_test['grade'].map({'A':1 , 'B':2, 'C':3, 'D':4, 'E':5})


# 分桶处理
bins_num = 50
tmp_labels = ['%d' % i for i in range(bins_num)]
df_train_test['interest_rate_cut'] = pd.cut(df_train_test['interest_rate'], bins=bins_num, labels = tmp_labels)

tmp_labels = ['%d' % i for i in range(bins_num)]
df_train_test['interest_rate_log_cut'] = pd.cut(df_train_test['interest_rate_log'], bins=bins_num, labels = tmp_labels)

# 等频
tmp_labels = ['%d' % i for i in range(bins_num)]
df_train_test['balance_cut'] = pd.qcut(df_train_test['balance'], q=bins_num, labels = tmp_labels, duplicates='drop')

df_train_test['loan_cut'] = pd.qcut(df_train_test['loan'], q=bins_num,  duplicates='drop')
loan_labels = np.unique(df_train_test['loan_cut'].values.codes).tolist()
df_train_test['loan_cut'] = pd.qcut(df_train_test['loan'], q=bins_num,  labels = loan_labels,  duplicates='drop')

tmp_labels = ['%d' % i for i in range(bins_num)]
df_train_test['balance_limit_cut'] = pd.qcut(df_train_test['balance_limit'], q=bins_num, labels = tmp_labels, duplicates='drop')

df_train_test['loan_term_avg_cut'] = pd.qcut(df_train_test['loan_term_avg'], q=bins_num,  duplicates='drop')
loan_labels = np.unique(df_train_test['loan_term_avg_cut'].values.codes).tolist()
df_train_test['loan_term_avg_cut'] = pd.qcut(df_train_test['loan_term_avg'], q=bins_num,  labels = loan_labels,  duplicates='drop')

tmp_labels = ['%d' % i for i in range(bins_num)]
df_train_test['balance_account_avg_cut'] = pd.qcut(df_train_test['balance_account_avg'], q=bins_num, labels = tmp_labels, duplicates='drop')

# 时间处理
df_train_test['record_time_format'] = df_train_test['record_time'].apply(lambda x: datetime.datetime.fromtimestamp(x))
df_train_test['record_time_year'] = df_train_test['record_time_format'].map(lambda x : x.year)
df_train_test['record_time_month'] = df_train_test['record_time_format'].map(lambda x : x.month)      # 探索周期性
df_train_test['record_time_week'] = df_train_test['record_time_format'].map(lambda x : x.week)      # 探索周期性
df_train_test['record_time_year_month'] = df_train_test['record_time_format'].map(lambda x : x.strftime('%Y%m'))

#RANK2 
df_train_test['issueDate'] = df_train_test['issue_time'].apply(lambda x:  datetime.datetime.fromtimestamp(x).strftime('%Y-%m-%d'))
df_train_test['issueDate'] = df_train_test['issueDate'].apply(lambda x: trans_date(x))        
df_train_test['historyDate'] = df_train_test['history_time'].apply(lambda x: datetime.datetime.fromtimestamp(x).strftime('%Y-%m-%d'))
df_train_test['historyDate'] = df_train_test['historyDate'].apply(lambda x: trans_date(x))  

# 后验违约概率 
df_tmp_stat = pd.DataFrame()
df_tmp_stat['level_cnt']  = df_train_test[['level','label']].groupby('level').apply(lambda x : x['label'].count())
df_tmp_stat['level_default_cnt']  = df_train_test[['level','label']].groupby('level')[['label']].apply(lambda x : x[x['label'] == 1].count())
df_tmp_stat['level_default_ratio'] = df_tmp_stat['level_default_cnt'] / df_tmp_stat['level_cnt']
df_tmp_stat = df_tmp_stat.reset_index()
# print(df_tmp_stat)
df_train_test = pd.merge(df_train_test, df_tmp_stat[['level','level_default_ratio']], on='level', how='left',sort=False)

df_stat_tmp = pd.DataFrame()
df_stat_tmp['interest_rate_cnt']  = df_train_test[['interest_rate_cut','label']].groupby('interest_rate_cut').apply(lambda x : x['label'].count())
df_stat_tmp['interest_rate_default_cnt']  = df_train_test[['interest_rate_cut','label']].groupby('interest_rate_cut')[['label']].apply(lambda x : x[x['label'] == 1].count())
df_stat_tmp['interest_rate_default_ratio'] = df_stat_tmp['interest_rate_default_cnt'] / df_stat_tmp['interest_rate_cnt']
df_stat_tmp = df_stat_tmp.reset_index()
# print(df_stat_tmp)
df_train_test = pd.merge(df_train_test, df_stat_tmp[['interest_rate_cut','interest_rate_default_ratio']], on='interest_rate_cut', how='left',sort=False)

df_stat_tmp = pd.DataFrame()
df_stat_tmp['term_cnt']  = df_train_test[['term','label']].groupby('term').apply(lambda x : x['label'].count())
df_stat_tmp['term_default_cnt']  = df_train_test[['term','label']].groupby('term')[['label']].apply(lambda x : x[x['label'] == 1].count())
df_stat_tmp['term_default_ratio'] = df_stat_tmp['term_default_cnt'] / df_stat_tmp['term_cnt']
df_stat_tmp = df_stat_tmp.reset_index()
# print(df_stat_tmp)
df_train_test = pd.merge(df_train_test, df_stat_tmp[['term','term_default_ratio']], on='term', how='left',sort=False)



df_train_test.drop(['record_time_format'], axis=1, inplace=True)
df_train_test

(53480, 18)
(20054, 17)


Unnamed: 0,title,career,zip_code,residence,loan,term,interest_rate,issue_time,syndicated,installment,record_time,history_time,total_accounts,balance_accounts,balance_limit,balance,level,label,balance_account_avg,loan_term_avg,balance_accounts_ratio,loan_log,balance_account_avg_log,loan_term_avg_log,balance_accounts_ratio_log,interest_rate_log,balance_log,balance_limit_log,balance_accounts_log,zip_province,zip_city,level_hash,level_ord,grade,interest_rate_cut,interest_rate_log_cut,balance_cut,loan_cut,balance_limit_cut,loan_term_avg_cut,balance_account_avg_cut,record_time_year,record_time_month,record_time_week,record_time_year_month,issueDate,historyDate,level_default_ratio,interest_rate_default_ratio,term_default_ratio
0,9,0.0,221373,1,7200,36,10.95,1238631967,0,1,1238630622,472006661,17.0,9.0,36200.0,13856.00,A4,0.0,1539.555556,200.000000,0.529412,8.881836,7.339249,5.298317,-0.635989,2.393339,9.536474,10.496814,2.197225,22,2213,483,4.0,1,10,20,40,4,43,6,42,2009,4,14,200904,24111,23819,0.167864,0.166035,0.148731
1,8,10.0,311681,0,21300,36,12.95,1128212052,0,0,1161907665,763779041,17.0,9.0,20400.0,13773.00,B0,1.0,1530.333333,591.666667,0.529412,9.966462,7.333241,6.382943,-0.635989,2.561096,9.530465,9.923290,2.197225,31,3116,550,6.0,2,14,25,40,19,35,20,42,2006,10,43,200610,24069,23930,0.183134,0.184066,0.148731
2,8,7.0,271562,1,10400,60,21.05,1249171509,0,0,1383958593,727143443,17.0,9.0,10800.0,2023.00,B4,0.0,224.777778,173.333333,0.529412,9.249561,5.415112,5.155217,-0.635989,3.046901,7.612337,9.287301,2.197225,27,2715,338,10.0,2,30,38,19,10,29,4,1,2013,11,45,201311,24115,23916,0.238525,0.241333,0.316692
3,7,2.0,522083,0,33050,36,16.40,1172882234,0,1,1214353935,687660346,17.0,9.0,24700.0,21992.00,B3,0.0,2443.555556,918.055556,0.529412,10.405777,7.801209,6.822258,-0.635989,2.797281,9.998434,10.114559,2.197225,52,5220,173,9.0,2,21,31,45,22,38,27,46,2008,6,26,200806,24086,23901,0.233343,0.229479,0.148731
4,8,3.0,101026,1,5200,36,14.35,1172882384,0,0,1240274527,322012875,17.0,9.0,5100.0,1669.00,B2,1.0,185.444444,144.444444,0.529412,8.556414,5.222755,4.972895,-0.635989,2.663750,7.419980,8.536996,2.197225,10,1010,17,8.0,2,17,28,16,1,24,1,1,2009,4,17,200904,24086,23762,0.215479,0.221494,0.148731
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73529,0,8.0,601107,1,10000,12,18.85,1130976000,0,0,1125964800,1018224000,6.0,3.0,3818.0,2224.69,A4,,741.563333,833.333333,0.500000,9.210340,6.608761,6.725434,-0.693147,2.936513,7.707373,8.247482,1.098612,60,6011,483,4.0,1,26,35,20,9,21,25,20,2005,9,36,200509,24070,24027,0.167864,0.274146,0.182842
73530,0,10.0,601102,1,10000,12,29.30,1156204800,0,0,1157068800,1054425600,6.0,6.0,5502.0,4126.71,B4,,687.785000,833.333333,1.000000,9.210340,6.533476,6.725434,0.000000,3.377588,8.325236,8.612867,1.791759,60,6011,338,10.0,2,46,48,28,9,24,25,16,2006,9,35,200609,24079,24041,0.238525,0.190647,0.182842
73531,0,4.0,601408,1,11000,12,24.75,1144108800,0,0,1111622400,1037404800,8.0,3.0,4844.0,2710.96,A3,,903.653333,916.666667,0.375000,9.305651,6.806446,6.820744,-0.980829,3.208825,7.905058,8.485496,1.098612,60,6014,812,3.0,1,37,43,23,11,23,26,32,2005,3,12,200503,24075,24034,0.149012,0.218543,0.182842
73532,0,3.0,601904,1,8000,12,22.00,1163808000,0,0,1116892800,1057017600,6.0,3.0,3495.0,1834.93,A3,,611.643333,666.666667,0.500000,8.987197,6.416149,6.502290,-0.693147,3.091042,7.514762,8.159089,1.098612,60,6019,812,3.0,1,32,40,18,5,20,21,13,2005,5,21,200505,24082,24042,0.149012,0.219020,0.182842


#  交易特征处理

In [76]:
df_bank_train_test['time_format'] = df_bank_train_test['time'].apply(lambda x: datetime.datetime.fromtimestamp(x))
print(df_bank_train_test)

#days_diff, tm_count, total_amount, amount_1, amount_0, total_amount_day_avg, amount_1_day_avg, amount_0_day_avg
print(df_bank_train_test.shape)

df_bank_stat = pd.DataFrame()
df_bank_stat['tx_time_max'] = df_bank_train_test.groupby('id')['time_format'].agg('max')
df_bank_stat['tx_time_min'] = df_bank_train_test.groupby('id')['time_format'].agg('min')

df_bank_stat['tx_max_min_days'] = (df_bank_stat['tx_time_max'] - df_bank_stat['tx_time_min'])
df_bank_stat['tx_max_min_days'] = df_bank_stat['tx_max_min_days'].map(lambda x : x.days)

df_bank_stat['tx_count'] = df_bank_train_test.groupby('id')['amount'].agg('count')
df_bank_stat['total_amount'] = df_bank_train_test.groupby('id')['amount'].agg('sum')

df_bank_stat['1_amount'] = df_bank_train_test.groupby('id').apply( lambda x : x [ x['direction'] == 1]['amount'].sum())
df_bank_stat['0_amount'] = df_bank_train_test.groupby('id').apply( lambda x : x [ x['direction'] == 0]['amount'].sum())
df_bank_stat['total_income']  =  df_bank_stat['0_amount'] - df_bank_stat['1_amount']

#TODO :  0 - 1 amount
df_bank_stat['annual_income']  =  360*(df_bank_stat['0_amount'] - df_bank_stat['1_amount'])/df_bank_stat['tx_max_min_days']

df_bank_stat['total_amount_avg']  = df_bank_stat['total_amount'] / df_bank_stat['tx_max_min_days'] 
df_bank_stat['1_amount_avg']  = df_bank_stat['1_amount'] / df_bank_stat['tx_max_min_days'] 
df_bank_stat['0_amount_avg']  = df_bank_stat['0_amount'] / df_bank_stat['tx_max_min_days'] 

df_bank_stat['total_amount_avg2']  = df_bank_stat['total_amount'] / df_bank_stat['tx_count'] 
df_bank_stat['1_amount_avg2']  = df_bank_stat['1_amount'] / df_bank_stat['tx_count'] 
df_bank_stat['0_amount_avg2']  = df_bank_stat['0_amount'] / df_bank_stat['tx_count'] 

#交易活跃度： 效果次数/天数
df_bank_stat['tx_count_avg']  = df_bank_stat['tx_count'] / df_bank_stat['tx_max_min_days'] 
df_bank_stat['tx_tmstp_max'] = df_bank_train_test.groupby('id')['time'].agg('max')
df_bank_stat['tx_tmstp_min'] = df_bank_train_test.groupby('id')['time'].agg('min')
# df_bank_stat['tx_max_min_days'] = (df_bank_stat['tx_time_max'] - df_bank_stat['tx_time_min'])
# df_bank_stat['tx_max_min_days'] = df_bank_stat['tx_max_min_days'].map(lambda x : x.days)

df_bank_stat.drop([ 'tx_time_max', 'tx_time_min'], axis = 1, inplace=True)
df_bank_stat.head()

             time  direction       amount         time_format
id                                                           
0      1224115200          0  8771.350000 2008-10-16 08:00:00
0      1224288000          1   310.650000 2008-10-18 08:00:00
0      1224460800          1   152.620000 2008-10-20 08:00:00
0      1225152000          1    20.490000 2008-10-28 08:00:00
0      1226793600          1   173.170000 2008-11-16 08:00:00
...           ...        ...          ...                 ...
71870  1160956800          1   493.403945 2006-10-16 08:00:00
71870  1161043200          0     9.462382 2006-10-17 08:00:00
71870  1161388800          1   222.936072 2006-10-21 08:00:00
71870  1161475200          0   222.936072 2006-10-22 08:00:00
71870  1161475200          1    20.908367 2006-10-22 08:00:00

[2364084 rows x 4 columns]
(2364084, 4)


Unnamed: 0_level_0,tx_max_min_days,tx_count,total_amount,1_amount,0_amount,total_income,annual_income,total_amount_avg,1_amount_avg,0_amount_avg,total_amount_avg2,1_amount_avg2,0_amount_avg2,tx_count_avg,tx_tmstp_max,tx_tmstp_min
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,163,48,71787.0,12079.5,59707.5,47628.0,105190.674847,440.411043,74.107362,366.303681,1495.5625,251.65625,1243.90625,0.294479,1238198400,1224115200
2,180,48,22406.1,15883.72,6522.38,-9361.34,-18722.68,124.478333,88.242889,36.235444,466.79375,330.910833,135.882917,0.266667,1383955200,1368403200
4,169,93,51163.0,30823.1,20339.9,-10483.2,-22331.076923,302.739645,182.385207,120.354438,550.139785,331.431183,218.708602,0.550296,1238284800,1223683200
6,179,61,41733.77,15385.27,26348.5,10963.23,22048.95419,233.149553,85.951229,147.198324,684.160164,252.217541,431.942623,0.340782,1220227200,1204761600
7,175,66,59958.01,22642.76,37315.25,14672.49,30183.408,342.6172,129.3872,213.23,908.454697,343.072121,565.382576,0.377143,1201996800,1186876800


# 合并特征、处理缺失值、保存结果

In [119]:
# 合并交易特征
df_concat = pd.concat([df_train_test, df_bank_stat], axis=1)
print(df_concat.shape)
df_concat.reset_index(inplace=True)
df_concat = df_concat.rename(columns={'index':'id'})
print(df_concat.columns)

# 债务收入比 ; 待确认计算方式
df_concat['dti'] = df_concat['loan'] / df_concat['annual_income']

# 缺失值处理
df_concat = df_concat.replace([np.inf, -np.inf], np.nan)

col_str = 'career,balance_limit_cut'
for col_name in col_str.split(','):
    value = df_concat[col_name].mode()[0]
    print(col_name, value)
    df_concat[col_name].fillna(value, inplace=True)
    
col_str = 'balance_limit,balance_log,balance_limit_log,balance_account_avg_log,tx_max_min_days,tx_count,total_amount,1_amount,0_amount,total_amount_avg,1_amount_avg,0_amount_avg,total_amount_avg2,1_amount_avg2,0_amount_avg2,tx_count_avg,tx_tmstp_max,tx_tmstp_min' 
for col_name in col_str.split(',') : 
#     value = df_concat[col_name].mean(skipna=True)
    # 替换为0值
    value = 0
    df_concat[col_name].fillna(value,inplace=True)
# print('isna \n',df_concat.isna().sum())

#TODO 离群点处理

df_result_train = df_concat.iloc[:df_train.shape[0],:]
df_result_test = df_concat.iloc[df_train.shape[0]:,:]
print('df_result_train ', df_result_train.shape)
print('df_result_test ', df_result_test.shape)



(73534, 66)
Index(['id', 'title', 'career', 'zip_code', 'residence', 'loan', 'term',
       'interest_rate', 'issue_time', 'syndicated', 'installment',
       'record_time', 'history_time', 'total_accounts', 'balance_accounts',
       'balance_limit', 'balance', 'level', 'label', 'balance_account_avg',
       'loan_term_avg', 'balance_accounts_ratio', 'loan_log',
       'balance_account_avg_log', 'loan_term_avg_log',
       'balance_accounts_ratio_log', 'interest_rate_log', 'balance_log',
       'balance_limit_log', 'balance_accounts_log', 'zip_province', 'zip_city',
       'level_hash', 'level_ord', 'grade', 'interest_rate_cut',
       'interest_rate_log_cut', 'balance_cut', 'loan_cut', 'balance_limit_cut',
       'loan_term_avg_cut', 'balance_account_avg_cut', 'record_time_year',
       'record_time_month', 'record_time_week', 'record_time_year_month',
       'issueDate', 'historyDate', 'level_default_ratio',
       'interest_rate_default_ratio', 'term_default_ratio', 'tx_max_min_day

# RANK2新版特征

In [120]:
import warnings
warnings.filterwarnings('ignore')

grade_dict = {'A':0, 'B':1, 'C':2, 'D':3, 'E':4, 'F':5, 'G':6}

cate_features = ['title', 'career', 'zip_code', 'level_ord', 'historyDate_bin', \
                 'zip_province', 'issueDate_bin', 'term_bin',\
                 'interest_rate_bin', 'annual_income_bin', 'loan_bin','residence_bin',\
                 'balance_bin', 'dti_bin', 'loan_term_avg_bin', 'balance_bin']

ratio_feat_lst = ['loan', 'loan_term_avg', 'interest_rate', 'annual_income', 'dti', 'balance_accounts', \
                  'balance', 'balance_limit', 'total_accounts']


In [121]:
# level : subGrade
# career : employmentLength
# history_time : earliesCreditLine
# ballrevolBal
# total_accounts : totalAcc
# balance_accounts : openAcc
# 1_amount_avg  :  annualIncome

dfs=[df_result_train, df_result_test]

# def process(dfs):
for df in dfs:
    print(df.shape)
    df['grade'] = df['grade'].apply(lambda x: x if x not in grade_dict else grade_dict[x])
    df['career'] = df['career'].apply(lambda x : x)   
    df['date_Diff'] = df['issueDate'] - df['historyDate']
    df['term_balance'] = df['term']/(df['balance']+0.1)
    df['balance_limit_balance'] = df['balance_limit']/(df['balance']+0.1)   
    df['balance_account_ratio'] = df['balance_accounts']/df['total_accounts'] 
    df['loan_dti_annual_income'] = df['loan']/(np.abs(df['dti'])*df['annual_income']+0.1)
    df['career_bin'] = df['career']
    df['issueDate_bin'] = df['issueDate']
    df['historyDate_bin'] = df['historyDate']
    df['term_bin'] = df['term']
    df['residence_bin'] = df['residence']
    df['annual_income_loan'] = df['annual_income']/(df['loan']+0.1)
    df['balance_loan'] = df['balance']/(df['loan']+0.1)
    df['balance_term_repay'] = df['balance']/(df['loan_term_avg']+0.1)
    df['annual_income_term_repay'] = df['annual_income']/(df['loan_term_avg']+0.1)

concated_df = pd.concat(dfs)
label_lst = []
# 把分箱后的特征做为类别特征处理
bin_number = 10
label_lst = [i for i in range(bin_number)]
tmp_bins = pd.qcut(concated_df['loan'], q=bin_number,  duplicates='drop')
tmp_labels = np.unique(tmp_bins.values.codes).tolist()
dfs[0]['loan_bin'] = pd.qcut(concated_df['loan'], q=bin_number,  labels=tmp_labels, duplicates='drop')[:dfs[0].shape[0]]
dfs[1]['loan_bin'] = pd.qcut(concated_df['loan'], q=bin_number,  labels=tmp_labels, duplicates='drop')[dfs[0].shape[0]:]
dfs[0]['annual_income_bin'] = pd.qcut(concated_df['annual_income'], bin_number, labels=label_lst,duplicates='drop')[:dfs[0].shape[0]]
dfs[1]['annual_income_bin'] = pd.qcut(concated_df['annual_income'], bin_number, labels=label_lst,duplicates='drop')[dfs[0].shape[0]:]

label_lst = []
bin_number = 100
for i in range(bin_number):
    label_lst.append(i)

tmp_bins = pd.qcut(concated_df['balance'], q=bin_number,  duplicates='drop')
tmp_labels = np.unique(tmp_bins.values.codes).tolist()
dfs[0]['balance_bin'] = pd.qcut(concated_df['balance'], bin_number, labels=tmp_labels, duplicates='drop')[:dfs[0].shape[0]]
dfs[1]['balance_bin'] = pd.qcut(concated_df['balance'], bin_number, labels=tmp_labels, duplicates='drop')[:dfs[0].shape[0]]

tmp_bins = pd.qcut(concated_df['loan_term_avg'], q=bin_number,  duplicates='drop')
tmp_labels = np.unique(tmp_bins.values.codes).tolist()
dfs[0]['loan_term_avg_bin'] = pd.qcut(concated_df['loan_term_avg'], bin_number, labels=tmp_labels, duplicates='drop')[:dfs[0].shape[0]]
dfs[1]['loan_term_avg_bin'] = pd.qcut(concated_df['loan_term_avg'], bin_number, labels=tmp_labels, duplicates='drop')[:dfs[0].shape[0]]

dfs[0]['interest_rate_bin'] = pd.qcut(concated_df['interest_rate'], bin_number, labels=label_lst,duplicates='drop')[:dfs[0].shape[0]]
dfs[0]['dti_bin'] = pd.qcut(concated_df['dti'], bin_number, labels=label_lst,duplicates='drop')[:dfs[0].shape[0]]
dfs[0]['balance_limit_bin'] = pd.qcut(concated_df['balance_limit'], bin_number, labels=label_lst,duplicates='drop')[:dfs[0].shape[0]]

dfs[1]['interest_rate_bin'] = pd.qcut(concated_df['interest_rate'], bin_number, labels=label_lst,duplicates='drop')[:dfs[0].shape[0]]
dfs[1]['dti_bin'] = pd.qcut(concated_df['dti'], bin_number, labels=label_lst,duplicates='drop')[:dfs[0].shape[0]]
dfs[1]['balance_limit_bin'] = pd.qcut(concated_df['balance_limit'], bin_number, labels=label_lst,duplicates='drop')[:dfs[0].shape[0]]

for df in dfs:
    for cate in cate_features:
        df[cate] = df[cate].fillna(0).astype('int')
        
issueDate_lst = list(set(concated_df['issueDate']))
career_lst    = list(set(concated_df['career']))
residence_lst = list(set(concated_df['residence']))

#  ------------------------------------------------------- #

for feat in ratio_feat_lst:
    #  --------------------------------------  #
    issueDate_median = {}
    issueDate_item_rank = {}
    issueDate_label_mean = {}
    for dt in issueDate_lst:
        # 取最近6个月
        mask = (concated_df['issueDate'] >= dt-3)&(concated_df['issueDate'] <= dt+3)
        # 取最近6个月除去当前月份
        mask_1 = (concated_df['issueDate'] >= dt-3)&(concated_df['issueDate'] <= dt+3)&(concated_df['issueDate'] != dt)
        item_series = concated_df.loc[mask, feat]
        label_series = concated_df.loc[mask_1, 'label']
        # 取最近6个月的中位数
        issueDate_median[dt] = item_series.median()
        issueDate_label_mean[dt] = label_series.mean()
        item_rank = item_series.rank()/len(item_series)
        issueDate_item_rank[dt] = {}
        for item,rank in zip(item_series, item_rank):
            issueDate_item_rank[dt][item] = rank
    #  ------------------------------------------------------- #
    career_median = {}
    for et in career_lst:
        mask = concated_df['career'] == et
        item_series = concated_df.loc[mask, feat]
        career_median[et] = item_series.median()
    #  -------------------------------------------------------  #
    residence_median = {}
    for ho in residence_lst:
        mask = concated_df['residence'] == ho
        item_series = concated_df.loc[mask, feat]
        residence_median[ho] = item_series.median()
    #  -------------------------------------------------------  #
    for df in dfs:
        print(feat, df.shape)
        df['label_issueDate_mean'] = df['issueDate'].apply(lambda x: issueDate_label_mean[x])
        df[feat+'_issueDate_median'] = df['issueDate'].apply(lambda x: issueDate_median[x])
        #df['interest_rate_ratio'] = df['interest_rate']/df['interest_rate_median']
#         df[feat+'_issueDate_ratio'] = df.fillna(0).apply(lambda r: issueDate_item_rank[r['issueDate']][r[feat]], axis=1)
#         df[feat+'_career_ratio'] = df.fillna(0).apply(lambda r: r[feat]/career_median[r['career']], axis=1)
#         df[feat+'_residence_ratio'] = df.fillna(0).apply(lambda r: r[feat]/residence_median[r['residence']], axis=1)
        print(feat, df.shape)

(53480, 68)
(20054, 68)
loan (53480, 89)
loan (53480, 91)
loan (20054, 89)
loan (20054, 91)
loan_term_avg (53480, 91)
loan_term_avg (53480, 92)
loan_term_avg (20054, 91)
loan_term_avg (20054, 92)
interest_rate (53480, 92)
interest_rate (53480, 93)
interest_rate (20054, 92)
interest_rate (20054, 93)
annual_income (53480, 93)
annual_income (53480, 94)
annual_income (20054, 93)
annual_income (20054, 94)
dti (53480, 94)
dti (53480, 95)
dti (20054, 94)
dti (20054, 95)
balance_accounts (53480, 95)
balance_accounts (53480, 96)
balance_accounts (20054, 95)
balance_accounts (20054, 96)
balance (53480, 96)
balance (53480, 97)
balance (20054, 96)
balance (20054, 97)
balance_limit (53480, 97)
balance_limit (53480, 98)
balance_limit (20054, 97)
balance_limit (20054, 98)
total_accounts (53480, 98)
total_accounts (53480, 99)
total_accounts (20054, 98)
total_accounts (20054, 99)


In [124]:
df_result_test = df_result_test.drop(['label'], axis=1)
print(df_result_train.shape)
print(df_result_test.shape)

(53480, 99)
(20054, 98)


# 保存结果

In [125]:
#保存结果
df_result_train.to_csv(output_train_path, index=False)
df_result_test.to_csv(output_test_path, index=False)
print('train result', df_result_train.shape)
print('test result', df_result_test.shape)
print('output_path : ', output_train_path, output_test_path)
print('process time : ',time.strftime( '%Y-%m-%d %H:%M:%S', time.localtime()))
# !ls

train result (53480, 99)
test result (20054, 98)
output_path :  train.dat.v26 test.dat.v26
process time :  2025-09-02 00:01:28


In [14]:
print('done  time : ',time.strftime( '%Y-%m-%d %H:%M:%S', time.localtime()))
!wc -l $output_train_path
!wc -l $output_test_path
df_result_train

done  time :  2025-09-01 16:34:35
53481 train.dat.v24
20055 test.dat.v24


Unnamed: 0,id,title,career,zip_code,residence,loan,term,interest_rate,issue_time,syndicated,installment,record_time,history_time,total_accounts,balance_accounts,balance_limit,balance,level,label,balance_account_avg,loan_term_avg,balance_accounts_ratio,loan_log,balance_account_avg_log,loan_term_avg_log,balance_accounts_ratio_log,interest_rate_log,balance_log,balance_limit_log,balance_accounts_log,zip_province,zip_city,level_hash,level_ord,grade,interest_rate_cut,interest_rate_log_cut,balance_cut,loan_cut,balance_limit_cut,loan_term_avg_cut,balance_account_avg_cut,record_time_year,record_time_month,record_time_week,record_time_year_month,level_default_ratio,interest_rate_default_ratio,term_default_ratio,tx_max_min_days,tx_count,total_amount,1_amount,0_amount,total_amount_avg,1_amount_avg,0_amount_avg,total_amount_avg2,1_amount_avg2,0_amount_avg2,tx_count_avg,tx_tmstp_max,tx_tmstp_min
0,0,9,0.0,221373,1,7200,36,10.95,1238631967,0,1,1238630622,472006661,17.0,9.0,36200.0,13856.00,A4,0.0,1539.555556,200.000000,0.529412,8.881836,7.339249,5.298317,-0.635989,2.393339,9.536474,10.496814,2.197225,22,2213,71,4.0,1,10,20,40,4,43,6,42,2009,4,14,200904,0.167864,0.166035,0.148731,163.0,48.0,71787.000000,12079.500000,59707.500000,440.411043,74.107362,366.303681,1495.562500,251.656250,1243.906250,0.294479,1.238198e+09,1.224115e+09
1,1,8,10.0,311681,0,21300,36,12.95,1128212052,0,0,1161907665,763779041,17.0,9.0,20400.0,13773.00,B0,1.0,1530.333333,591.666667,0.529412,9.966462,7.333241,6.382943,-0.635989,2.561096,9.530465,9.923290,2.197225,31,3116,137,6.0,2,14,25,40,19,35,20,42,2006,10,43,200610,0.183134,0.184066,0.148731,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000e+00,0.000000e+00
2,2,8,7.0,271562,1,10400,60,21.05,1249171509,0,0,1383958593,727143443,17.0,9.0,10800.0,2023.00,B4,0.0,224.777778,173.333333,0.529412,9.249561,5.415112,5.155217,-0.635989,3.046901,7.612337,9.287301,2.197225,27,2715,369,10.0,2,30,38,19,10,29,4,1,2013,11,45,201311,0.238525,0.241333,0.316692,180.0,48.0,22406.100000,15883.720000,6522.380000,124.478333,88.242889,36.235444,466.793750,330.910833,135.882917,0.266667,1.383955e+09,1.368403e+09
3,3,7,2.0,522083,0,33050,36,16.40,1172882234,0,1,1214353935,687660346,17.0,9.0,24700.0,21992.00,B3,0.0,2443.555556,918.055556,0.529412,10.405777,7.801209,6.822258,-0.635989,2.797281,9.998434,10.114559,2.197225,52,5220,812,9.0,2,21,31,45,22,38,27,46,2008,6,26,200806,0.233343,0.229479,0.148731,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000e+00,0.000000e+00
4,4,8,3.0,101026,1,5200,36,14.35,1172882384,0,0,1240274527,322012875,17.0,9.0,5100.0,1669.00,B2,1.0,185.444444,144.444444,0.529412,8.556414,5.222755,4.972895,-0.635989,2.663750,7.419980,8.536996,2.197225,10,1010,228,8.0,2,17,28,16,1,24,1,1,2009,4,17,200904,0.215479,0.221494,0.148731,169.0,93.0,51163.000000,30823.100000,20339.900000,302.739645,182.385207,120.354438,550.139785,331.431183,218.708602,0.550296,1.238285e+09,1.223683e+09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53475,53475,2,2.0,603000,1,9000,12,23.55,1172880000,0,0,1157587200,1061769600,12.0,5.0,3535.0,2595.73,A4,0.0,519.146000,750.000000,0.416667,9.104980,6.252185,6.620073,-0.875469,3.159126,7.861623,8.170469,1.609438,60,6030,71,4.0,1,35,42,23,7,20,23,9,2006,9,36,200609,0.167864,0.218524,0.182842,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000e+00,0.000000e+00
53476,53476,0,10.0,601702,1,8000,12,30.70,1160092800,0,0,1138665600,1038268800,5.0,2.0,1965.0,1433.34,B2,0.0,716.670000,666.666667,0.400000,8.987197,6.574615,6.502290,-0.916291,3.424263,7.267763,7.583248,0.693147,60,6017,228,8.0,2,48,49,13,5,12,21,18,2006,1,5,200601,0.215479,0.178876,0.182842,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000e+00,0.000000e+00
53477,53477,2,10.0,602808,1,10000,12,9.40,1180310400,0,0,1108771200,1087603200,12.0,5.0,7253.0,3813.79,B2,0.0,762.758000,833.333333,0.416667,9.210340,6.636941,6.725434,-0.875469,2.240710,8.246379,8.889170,1.609438,60,6028,228,8.0,2,7,16,27,9,26,25,21,2005,2,7,200502,0.215479,0.140234,0.182842,180.0,251.0,113461.244335,69300.836223,44160.408112,630.340246,385.004646,245.335601,452.036830,276.098949,175.937881,1.394444,1.161475e+09,1.145923e+09
53478,53478,0,10.0,602102,2,9000,12,24.40,1176768000,0,0,1159660800,1071792000,3.0,3.0,2045.0,1006.40,A2,0.0,335.466667,750.000000,1.000000,9.104980,5.815523,6.620073,0.000000,3.194583,6.914135,7.623153,1.098612,60,6021,687,2.0,1,36,43,10,7,12,23,3,2006,10,39,200610,0.132637,0.192063,0.182842,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000e+00,0.000000e+00
