In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectKBest,SelectPercentile

In [2]:
# 读取文件，并查看前5行
df = pd.read_csv('data.csv',encoding='gb18030')
df.head()

Unnamed: 0.1,Unnamed: 0,custid,trade_no,bank_card_no,low_volume_percent,middle_volume_percent,take_amount_in_later_12_month_highest,trans_amount_increase_rate_lately,trans_activity_month,trans_activity_day,...,loans_max_limit,loans_avg_limit,consfin_credit_limit,consfin_credibility,consfin_org_count_current,consfin_product_count,consfin_max_limit,consfin_avg_limit,latest_query_day,loans_latest_day
0,5,2791858,20180507115231274000000023057383,卡号1,0.01,0.99,0,0.9,0.55,0.313,...,2900.0,1688.0,1200.0,75.0,1.0,2.0,1200.0,1200.0,12.0,18.0
1,10,534047,20180507121002192000000023073000,卡号1,0.02,0.94,2000,1.28,1.0,0.458,...,3500.0,1758.0,15100.0,80.0,5.0,6.0,22800.0,9360.0,4.0,2.0
2,12,2849787,20180507125159718000000023114911,卡号1,0.04,0.96,0,1.0,1.0,0.114,...,1600.0,1250.0,4200.0,87.0,1.0,1.0,4200.0,4200.0,2.0,6.0
3,13,1809708,20180507121358683000000388283484,卡号1,0.0,0.96,2000,0.13,0.57,0.777,...,3200.0,1541.0,16300.0,80.0,5.0,5.0,30000.0,12180.0,2.0,4.0
4,14,2499829,20180507115448545000000388205844,卡号1,0.01,0.99,0,0.46,1.0,0.175,...,2300.0,1630.0,8300.0,79.0,2.0,2.0,8400.0,8250.0,22.0,120.0


### Task1 

+ 数据类型的分析
+ 无关特征删除
+ 数据类型转换
+ 缺失值处理

In [3]:
# 查看特征的数据类型
t = df.dtypes
t[t=='object']

trade_no                   object
bank_card_no               object
reg_preference_for_trad    object
source                     object
id_name                    object
latest_query_time          object
loans_latest_time          object
dtype: object

In [4]:
# 查看每个特征
df.columns

Index(['Unnamed: 0', 'custid', 'trade_no', 'bank_card_no',
       'low_volume_percent', 'middle_volume_percent',
       'take_amount_in_later_12_month_highest',
       'trans_amount_increase_rate_lately', 'trans_activity_month',
       'trans_activity_day', 'transd_mcc', 'trans_days_interval_filter',
       'trans_days_interval', 'regional_mobility', 'student_feature',
       'repayment_capability', 'is_high_user', 'number_of_trans_from_2011',
       'first_transaction_time', 'historical_trans_amount',
       'historical_trans_day', 'rank_trad_1_month', 'trans_amount_3_month',
       'avg_consume_less_12_valid_month', 'abs',
       'top_trans_count_last_1_month', 'avg_price_last_12_month',
       'avg_price_top_last_12_valid_month', 'reg_preference_for_trad',
       'trans_top_time_last_1_month', 'trans_top_time_last_6_month',
       'consume_top_time_last_1_month', 'consume_top_time_last_6_month',
       'cross_consume_count_last_1_month',
       'trans_fail_top_count_enum_last_1_mont

In [5]:
# 删除无关特征
# 删除单一的和种类过多的离散特征
# 首先删除非数值型的无关特征
df = df.drop(['trade_no','bank_card_no','source','id_name'],axis=1)

In [6]:
# 处理缺失值, 用0填充缺失值
df = df.fillna(0)

In [7]:
# 数据类型转换
df['reg_preference_for_trad'].head() # 文字
df['latest_query_time'].head() # 时间
df['latest_query_time'] = pd.to_datetime(df['latest_query_time'])
df['loans_latest_time'] = pd.to_datetime(df['loans_latest_time'])

In [8]:
set(df['reg_preference_for_trad'])

{0, '一线城市', '三线城市', '二线城市', '其他城市', '境外'}

In [9]:
# 赋值字典
d1 = {0:0, '一线城市':5, '三线城市':3, '二线城市':4, '其他城市':2, '境外':1}

In [10]:
# 利用map函数进行调换
df['reg_preference_for_trad'] = df['reg_preference_for_trad'].map(d1)

In [11]:
# 由于并不清楚特征的确切含义，暂时不删除其余数值型特征。

In [12]:
from sklearn.ensemble import RandomForestRegressor

In [23]:
y = df['status'].values
X = df.drop(['status','latest_query_time','loans_latest_time'],axis=1)

In [24]:
X,y

(      Unnamed: 0   custid  low_volume_percent  middle_volume_percent  \
 0              5  2791858                0.01                   0.99   
 1             10   534047                0.02                   0.94   
 2             12  2849787                0.04                   0.96   
 3             13  1809708                0.00                   0.96   
 4             14  2499829                0.01                   0.99   
 5             15   518072                0.02                   0.98   
 6             16  1205125                0.02                   0.98   
 7             18  1129897                0.02                   0.98   
 8             20  2599411                0.03                   0.65   
 9             26  1413051                0.01                   0.99   
 10            28   321061                0.01                   0.99   
 11            29   750939                0.01                   0.99   
 12            30  1169544                0.00     

In [25]:
# 拆分数据集
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X,y,train_size=0.7,random_state=2018)



In [26]:
rf = RandomForestRegressor()

In [27]:
names = df.drop(['status','latest_query_time','loans_latest_time'],axis=1).columns
len(names),X.shape

(83, (4754, 83))

In [28]:
# 使用随机森林对特征进行打分
rf.fit(X_train, y_train)
print ("Features sorted by their score:")
print (sorted(zip(map(lambda x: round(x, 4), rf.feature_importances_), names), 
             reverse=True))




Features sorted by their score:
[(0.1044, 'trans_fail_top_count_enum_last_1_month'), (0.0754, 'history_fail_fee'), (0.0272, 'apply_score'), (0.0269, 'loans_score'), (0.0251, 'trans_amount_3_month'), (0.0229, 'abs'), (0.0228, 'trans_day_last_12_month'), (0.0209, 'custid'), (0.0209, 'Unnamed: 0'), (0.0208, 'repayment_capability'), (0.0198, 'loans_latest_day'), (0.0192, 'avg_price_last_12_month'), (0.0191, 'trans_amount_increase_rate_lately'), (0.0191, 'loans_overdue_count'), (0.0155, 'max_cumulative_consume_later_1_month'), (0.0142, 'historical_trans_amount'), (0.0141, 'trans_days_interval_filter'), (0.0139, 'pawns_auctions_trusts_consume_last_1_month'), (0.0138, 'latest_query_day'), (0.0137, 'consume_top_time_last_6_month'), (0.0133, 'pawns_auctions_trusts_consume_last_6_month'), (0.0132, 'trans_days_interval'), (0.013, 'max_consume_count_later_6_month'), (0.0126, 'loans_long_time'), (0.0125, 'consume_mini_time_last_1_month'), (0.0121, 'trans_activity_month'), (0.012, 'latest_one_month_

In [59]:
def CalcIV(Xvar,Yvar,k):
    # 防止除数为0，作平滑处理
    if len(np.unique(Xvar))<=10:
        N_0=np.sum(Yvar==0)+len(np.unique(Xvar))*0.01
        N_1=np.sum(Yvar==1)+len(np.unique(Xvar))*0.01
        N_0_group=np.zeros(np.unique(Xvar).shape)
        N_1_group=np.zeros(np.unique(Xvar).shape)
    
        for i in range(len(np.unique(Xvar))):       
            N_0_group[i] = len(Yvar[(Xvar==np.unique(Xvar)[i])&(Yvar==0)])+0.01
            N_1_group[i] = len(Yvar[(Xvar==np.unique(Xvar)[i])&(Yvar==1)])+0.01
        iv = np.sum((N_0_group/N_0-N_1_group/N_1)*np.log((N_0_group/N_0)/(N_1_group/N_1)))
    # 连续型特征，分成k组
    else:
        N_0=np.sum(Yvar==0)+k*0.01
        N_1=np.sum(Yvar==1)+k*0.01
        N_0_group=np.zeros(k)
        N_1_group=np.zeros(k)
        for i in range(k-1):
            n = len(np.unique(Xvar))//k
            N_0_group[i] = len(Yvar[(Xvar>=np.unique(Xvar)[i*n])&(Xvar<np.unique(Xvar)[(i+1)*n])&(Yvar==0)])+0.01
            N_1_group[i] = len(Yvar[(Xvar>=np.unique(Xvar)[i*n])&(Xvar<np.unique(Xvar)[(i+1)*n])&(Yvar==1)])+0.01
        N_0_group[k-1] = len(Yvar[(Xvar>=np.unique(Xvar)[(k-1)*n])&(Yvar==0)])+0.01
        N_1_group[k-1] = len(Yvar[(Xvar>=np.unique(Xvar)[(k-1)*n])&(Xvar<np.unique(Xvar)[(i+1)*n])&(Yvar==1)])+0.01
        iv = np.sum((N_0_group/N_0-N_1_group/N_1)*np.log((N_0_group/N_0)/(N_1_group/N_1)))
    #if iv>=1.0:## 处理极端值
        #iv=1
    return iv

def caliv_batch(df,Yvar,k=5):
    ivlist=[]
    for col in df.columns:
        iv=CalcIV(df[col],Yvar,k)
        ivlist.append(iv)
    names=list(df.columns)
    iv_df=pd.DataFrame({'Var':names,'Iv':ivlist},columns=['Var','Iv'])

    return iv_df,ivlist
im_iv, ivl = caliv_batch(X_train,y_train)


In [45]:
np.unique(X['loans_credibility_limit'])

array([ 0., 54., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65., 66.,
       67., 68., 69., 70., 71., 72., 73., 74., 75., 76., 77., 78., 79.,
       80., 81., 82., 83., 84., 85., 89.])

In [51]:
np.ceil(2)

2.0

In [57]:
np.zeros(3)

array([0., 0., 0.])

In [60]:
ivl

[1.8573719854223125,
 1.8491513471655685,
 0.01819557200683234,
 8.8855425469641,
 0.1613062473845542,
 0.3766422239061624,
 5.981236669578806,
 1.0500553015415242,
 0.06739540725152715,
 0.10322098721952792,
 0.07622850320961784,
 0.01056377748771872,
 0.007549673872877196,
 1.141676099478533,
 0.015145563864144116,
 0.10328588613316668,
 1.990481803016517,
 1.9103718503567375,
 0.3811344798222696,
 1.2851416160481408,
 1.6586463982916086,
 2.336174353612323,
 0.800153808050931,
 0.07335448260574148,
 0.20490918599866967,
 0.08362665577416335,
 0.006903408450954423,
 0.2312172448966151,
 0.057930428445221595,
 0.2426223108574258,
 0.05766990313690747,
 0.009272465075166875,
 0.5311779908867063,
 0.23383037418030303,
 0.2225645468437458,
 0.7835583149967937,
 0.5685162211018295,
 0.03143619259156257,
 0.030495314341232652,
 0.3198125450105516,
 1.3075719440653433,
 0.010731404292435257,
 1.1244572251021796,
 0.1100141881139219,
 1.6948313343173775,
 0.20635758406720978,
 0.047083409038