# 准备阶段

## 数据集描述

* 该数据集取自现金贷用户最后一个申请通过事件。<br>

* 时间：2018年1月1号~2018年2月10号 <br>

* **数据列**需要注意的地方：

  * initialamount：授信金额，单位是分
  * 列名均为小写



## 加载依赖的模块和库

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns

from sklearn import preprocessing
from scipy.stats import skew, boxcox
import os

from pyecharts import Bar
import mpld3
from mpld3 import plugins

%matplotlib inline

# 探索性数据分析(EDA)及可视化

In [None]:
# Strings that pandas should parse as missing values in both text files.
na_values=['','NULL','NA','null','na','Na','-9999','Infinity','NaN']
# NOTE(review): the \x.. escapes look like the GBK bytes of a Chinese Windows
# directory name; in a Python 3 str literal they become Latin-1 code points
# instead -- confirm this path resolves on the machine running the notebook.
xpath='D:\\\xc8\xd5\xb3\xa3\xb7\xd6\xce\xf6\xb1\xa8\xb8\xe6\\\xb3\xac\xd0\xa1\xb6\xee\xb7\xd6\xce\xf6\\'
# Read the column-name file first, then the headerless tab-separated data file,
# using column 0 of the name file as the data frame's column names.
data_set_name=pd.read_table(xpath+'data\\data_set_name.txt',header=None,sep='\t',na_values = na_values)
data_set=pd.read_table(xpath+'data\\data_set.txt',header=None,sep='\t',na_values = na_values,names=data_set_name[0],index_col=None)

# Inspect the loaded data: first rows, schema, summary statistics.
print('---------------------------------------数据预览：---------------------------------------------')
print(data_set.head())

print('---------------------------------------数据集基本信息：---------------------------------------')
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() emitted an extra "None" line.
data_set.info()

print('---------------------------------------数据统计信息：-----------------------------------------')
print(data_set.describe())

## 数据处理

### 聚合分类变量

In [None]:
# Collapse the application city code (city_idx) into four buckets:
# 1.0 -> tier-1 city, 1.5 -> tier-2 city, 2.0 -> tier-3 city, and every row
# whose city_idx is none of those three values -> '4_city' (other).
is_tier_1 = data_set.city_idx == 1.0
is_tier_2 = data_set.city_idx == 1.5
is_tier_3 = data_set.city_idx == 2.0
city_rules = [
    ('1_city', is_tier_1),
    ('2_city', is_tier_2),
    ('3_city', is_tier_3),
    ('4_city', ~(is_tier_1 | is_tier_2 | is_tier_3)),
]
for bucket, rows in city_rules:
    data_set.loc[rows, 'city_index_cut'] = bucket

# Show the row count per bucket (missing values included), ordered by label.
bucket_counts = data_set['city_index_cut'].value_counts(dropna=False)
bucket_counts.sort_index()

### 离散化数值型变量

In [None]:
def _bin_ids(values, edges):
    """Return, for each value, the 1-based number of the bin it falls into.

    Bins are left-closed / right-open ([edge_i, edge_i+1)) because
    right=False is passed to pd.cut. The result is a plain list so the new
    column is stored as ordinary values rather than a categorical.
    """
    group_ids = range(1, len(edges))
    return list(pd.cut(list(values), edges, right=False, labels=group_ids))


# Age buckets.
age_bins = [0, 18, 22, 28, 35, 45, 55, np.inf]
data_set['age_cut'] = _bin_ids(data_set['age'], age_bins)

# Tongdun / Baidu multi-lending scores, all binned on the same 20-point grid.
# NOTE(review): with right=False the last bin is [80, 100), so a score of
# exactly 100 would get no bin (NaN) -- confirm the scores never reach 100.
duotou_vars = ['jiedaiscore', 'bidnormalscoreexp3v2', 'prcidnormalscoreexp3v2', 'phonenormalscoreexp3v2']
duotou_bins = [0, 20, 40, 60, 80, 100]
for col in duotou_vars:
    data_set['{}_cut'.format(col)] = _bin_ids(data_set[col], duotou_bins)

# Mobile network tenure (mobileOperatorRegDate).
mobile_bins = [0, 10, 20, 30, 40, np.inf]
data_set['mobileoperatorregdate_cut'] = _bin_ids(data_set['mobileoperatorregdate'], mobile_bins)

# Credit-bureau counters.
zx_vars = ['zxaccountcnt', 'zxhouseloancnt']
zx_bins = [-np.inf, 1, 5, 10, np.inf]
for col in zx_vars:
    data_set['{}_cut'.format(col)] = _bin_ids(data_set[col], zx_bins)

# preaficoscorev5 score.
prea_bins = [0, 480, 520, 550, 600, 650, np.inf]
data_set['preaficoscorev5_cut'] = _bin_ids(data_set['preaficoscorev5'], prea_bins)



In [None]:
# Preview: a few raw columns next to their binned counterparts.
preview_cols = ['preaficoscorev5','preaficoscorev5_cut','zxaccountcnt','zxaccountcnt_cut','age','age_cut']
data_set[preview_cols].head()

### 构建逻辑变量

In [None]:
# Flag whether the user drew on the credit line: due_days >= 0 means credit
# was used (1), anything else -- including a missing due_days -- is 0.
# Computed column-wise; the previous per-row loop over
# data_set['due_days'][i] was slow on a frame of this size and only worked
# while the index was the default 0..n-1 range.
data_set['action'] = (data_set['due_days'] >= 0).astype('int64')

In [None]:
# Flag overdue users: due_days > 0 -> 1, otherwise (including missing) -> 0.
# Vectorised replacement for the per-row loop over data_set['due_days'][i],
# which was slow and depended on a default 0..n-1 index.
data_set['odu_bid'] = (data_set['due_days'] > 0).astype('int64')

In [None]:
# Flag "ultra-small amount" users: granted credit (initialamount, in fen)
# equal to exactly 50000 -> 1, otherwise (including missing) -> 0.
# Vectorised replacement for the per-row loop over
# data_set['initialamount'][i], which was slow and depended on a default
# 0..n-1 index.
data_set['petty_bid'] = (data_set['initialamount'] == 50000).astype('int64')

In [None]:
# Flag ultra-small-amount users who also drew credit:
# initialamount == 50000 AND due_days >= 0 -> 1, otherwise -> 0
# (a missing value in either column fails its comparison and yields 0).
# Vectorised replacement for the per-row loop, which was slow and depended
# on a default 0..n-1 index.
is_petty_amount = data_set['initialamount'] == 50000
has_used_credit = data_set['due_days'] >= 0
data_set['action_petty_bid'] = (is_petty_amount & has_used_credit).astype('int64')

In [None]:
# Combine the whitelist flag and the ultra-small-amount flag into one segment:
#   A = whitelisted, B = not whitelisted & not ultra-small,
#   C = not whitelisted & ultra-small.
# Rows matching none of the rules keep a missing value.
not_whitelisted = data_set.is_white == 0
segment_rules = [
    ('A', data_set.is_white == 1),
    ('B', not_whitelisted & (data_set.petty_bid == 0)),
    ('C', not_whitelisted & (data_set.petty_bid == 1)),
]
for segment, rows in segment_rules:
    data_set.loc[rows, 'white_petty_bid'] = segment

### 基本数据概览

In [None]:
# Per segment: approved users (non-null bid), users who drew credit, overdue
# users, usage rate (% of approved) and overdue rate (% of users who drew).
white_petty_grouped = data_set.groupby('white_petty_bid')
summary_line = '{}群体授信人数：{}，用信人数：{}，逾期人数{} , 用信率(%)：{}，逾期率(%)：{}'
for group, frame in white_petty_grouped:
    apply_sum = frame['bid'].count()
    action_sum = frame['action'].sum()
    odu_sum = frame['odu_bid'].sum()
    ratio_action = round(action_sum * 1.0 / apply_sum * 100, 2)
    ratio_odu = round(odu_sum * 1.0 / action_sum * 100, 2)
    print(summary_line.format(group, apply_sum, action_sum, odu_sum, ratio_action, ratio_odu))

结论1：
* 整体授信群体一共1083380人，用信581076人，逾期19076人。用信率：53.6%，逾期率：3.2%
* 白名单群体用信率和逾期率跟大盘持平
* 非白群体中，超小额群体用信意愿最低，且逾期率也最高，明显高于整体非白群体

## 透视表

* 对于分类型变量，考虑不同组合下，不同群体的分布变化

### 超小额

In [None]:
# Table per ultra-small-amount flag: group size, then the summed
# petty_bid / action / odu_bid flags for each group.
grouped = data_set.groupby(['petty_bid'])
for group, frame in grouped:
    group_size = len(frame)
    print('超小额为{}的申请用户个数：{}'.format(group, group_size))

flag_totals = {'petty_bid': np.sum, 'action': np.sum, 'odu_bid': np.sum}
grouped.agg(flag_totals)

### 白名单

In [None]:
# Table per whitelist flag: group size, then the summed
# petty_bid / action / odu_bid flags for each group.
grouped = data_set.groupby(['is_white'])
for group, frame in grouped:
    group_size = len(frame)
    print('名单类型为{}的申请用户个数：{}'.format(group, group_size))

flag_totals = {'petty_bid': np.sum, 'action': np.sum, 'odu_bid': np.sum}
grouped.agg(flag_totals)

### （白名单，超小额）

In [None]:
# Table per (whitelist flag, ultra-small-amount flag) pair: group size, then
# the summed petty_bid / action / odu_bid flags for each pair.
grouped = data_set.groupby(['is_white','petty_bid'])
for group, frame in grouped:
    group_size = len(frame)
    print('（白名单，超小额）为{}的申请用户个数：{}'.format(group, group_size))

flag_totals = {'petty_bid': np.sum, 'action': np.sum, 'odu_bid': np.sum}
grouped.agg(flag_totals)

In [None]:
# Bar chart: number of rows for every (segment, drew credit, overdue) combination.
plt.figure()
combo_sizes = data_set.groupby(['white_petty_bid','action','odu_bid']).size()
ax2 = combo_sizes.plot(kind='bar')
ax2.set_title('apply_bid_cnt vs (white_petty_bid , action , odu_bid)')
ax2.set_xlabel('(white_petty_bid , action , odu_bid)')
ax2.set_ylabel('apply_bid_cnt')

# Label each bar with its thousands-separated count, placed just above the bar top.
for bar_patch in ax2.patches:
    bar_height = bar_patch.get_height()
    count_text = '{:,d}'.format(int(bar_height))
    ax2.annotate(count_text, (bar_patch.get_x(), bar_height * 1.01))

由透视表和柱状图可得结论：<br>
* 整体高逾期率主要由非白群体造成：22.2%授信人群占比，29.8%的逾期人数占比
* 虽然超小额群体在非白人群中的逾期率较高（11.8%），但是整体人数占比也较低（6.7%）。

## 数据下钻及可视化

* 分析white_petty_bid群体（细分为A，B，C）的用信和逾期指标在不同 **X特征** 下的分布情况


In [None]:
# Summed usage (action) and overdue (odu_bid) flags per segment and age
# bucket, with "All" margins.
# (The original cell also carried a disabled variant that additionally counted
# `bid` through a per-column aggfunc dict.)
age_pivot_args = dict(values=['action','odu_bid'], index='white_petty_bid',
                      columns='age_cut', aggfunc=np.sum, margins=True)
data_set.pivot_table(**age_pivot_args)

In [None]:
# Stacked bar chart of overdue counts per age bucket, one series per segment.
age_groups=data_set.pivot_table(values=['action','odu_bid'], index='white_petty_bid', columns='age_cut', aggfunc=np.sum)['odu_bid']
# Take every age-bucket column of each row. The previous hard-coded 0:6 slice
# could be shorter than the x-axis list below (which always uses all columns),
# silently dropping the last bucket; this now matches the city chart cell.
# Rows are read by position -- presumably segments A, B, C in that order.
A = age_groups.iloc[0].values
B = age_groups.iloc[1].values
C = age_groups.iloc[2].values
age_cut = age_groups.columns.tolist()

bar = Bar("age_cut VS white_petty_bid")
bar.add('白名单', age_cut, A, is_stack=True)
bar.add('非白非小额', age_cut, B, is_stack=True)
bar.add('非白超小额', age_cut, C, is_stack=True)
# Last expression of the cell: renders the chart as the cell output.
bar

结论：<br>
* 超小额群体的逾期人群主要分布在18~22岁年龄段,31.2%的用信群体贡献了36.4%的逾期人群

In [None]:
# Summed usage (action) and overdue (odu_bid) flags per segment and city
# bucket, with "All" margins.
city_pivot_args = dict(values=['action','odu_bid'], index='white_petty_bid',
                       columns='city_index_cut', aggfunc=np.sum, margins=True)
data_set.pivot_table(**city_pivot_args)

In [None]:
# Stacked bar chart of credit-usage counts per city bucket, one series per segment.
city_pivot = data_set.pivot_table(values=['action','odu_bid'], index='white_petty_bid', columns='city_index_cut', aggfunc=np.sum)
city_groups = city_pivot['action']
# First three pivot rows, read by position, across all city columns.
A, B, C = [city_groups.iloc[row_pos, :].values for row_pos in range(3)]
labels = city_groups.index.tolist()
city_idx = city_groups.columns.tolist()

bar = Bar("city_idx VS white_petty_bid")
for series_name, counts in (('白名单', A), ('非白非小额', B), ('非白超小额', C)):
    bar.add(series_name, city_idx, counts, is_stack=True)
# Last expression of the cell: renders the chart as the cell output.
bar

In [None]:
def _usage_overdue_by(feature):
    """Pivot summed action / odu_bid flags: rows = segment, columns = `feature`, plus margins."""
    return data_set.pivot_table(values=['action','odu_bid'], index='white_petty_bid',
                                columns=feature, aggfunc=np.sum, margins=True)


# Usage and overdue distribution of each segment across several features.
# Tongdun blacklist level
tongdunblacklevel = _usage_overdue_by('tongdunblacklevel')
# prcid score (binned)
prcidnormalscoreexp3v2_cut = _usage_overdue_by('prcidnormalscoreexp3v2_cut')
# education
education = _usage_overdue_by('education')
# mobile network tenure (binned)
mobileoperatorregdate_cut = _usage_overdue_by('mobileoperatorregdate_cut')
# lending score (binned)
jiedaiscore_cut = _usage_overdue_by('jiedaiscore_cut')
# preA customer segment
preacustseg = _usage_overdue_by('preacustseg')
# preA V5 score (binned)
preaficoscorev5_cut = _usage_overdue_by('preaficoscorev5_cut')


In [None]:
# Print a heading, then display the preaficoscorev5_cut pivot table as the
# cell output (last expression of the cell).
print('不同目标群体在不同X特征下的用信和逾期分布 : ')
preaficoscorev5_cut

In [None]:
# 2x2 grid of box plots for overdue users (odu_bid == 1): one panel per
# metric, split by segment on the x-axis.
# (The original cell carried disabled log-scale / y-limit tweaks; none are applied.)
fig, axs = plt.subplots(2, 2, figsize=(10, 10))

overdue_rows = data_set[data_set.odu_bid==1]
panel_metrics = ['zxaccountcnt', 'preaficoscorev5', 'prcidnormalscoreexp3v2', 'age']
# axs.flat walks the grid row by row: top-left, top-right, bottom-left, bottom-right.
for metric, panel in zip(panel_metrics, axs.flat):
    sns.boxplot(x='white_petty_bid', y=metric, data=overdue_rows, ax=panel)

plt.show()


结论：<br>

* 多数超小额群体的部分模型分为空值，如preacustseg，preaficoscorev5等
* 由箱图可得，超小额群体的逾期用户年龄和preaficoscorev5分普遍较低

In [None]:
# 4x2 grid of box plots, one row per metric, split by segment on the x-axis:
# left column = users who drew credit (action == 1),
# right column = overdue users (odu_bid == 1).
# (The original cell carried disabled log-scale / y-limit tweaks; none are applied.)
fig, axs = plt.subplots(4, 2, figsize=(10, 10))

credit_users = data_set[data_set.action==1]
overdue_users = data_set[data_set.odu_bid==1]
row_metrics = ['zxaccountcnt', 'preaficoscorev5', 'prcidnormalscoreexp3v2', 'age']

for row_pos, metric in enumerate(row_metrics):
    sns.boxplot(x='white_petty_bid', y=metric, data=credit_users, ax=axs[row_pos][0])
    # Fix: the age row previously plotted action == 1 in BOTH columns, so its
    # right-hand panel duplicated the left one instead of showing overdue users.
    sns.boxplot(x='white_petty_bid', y=metric, data=overdue_users, ax=axs[row_pos][1])

plt.show()


结论：<br>

* 虽然超小额群体的逾期用户年龄和preaficoscorev5分普遍较低，但是趋势与大盘一致

## 趋势性分析

In [None]:
# Build the application-day column used by the trend charts.
# Imports kept exactly as in the original cell so the same names stay
# available in the notebook; the vectorised conversion below does not need them.
from datetime import datetime, date, time
from time import strftime, localtime

# apply_time is parsed as a compact "%Y%m%d%H%M%S" timestamp and reduced to a
# "MM-DD" string. Done column-wise with pandas instead of calling
# datetime.strptime once per row in a Python loop, which is slow on a frame
# of this size; the resulting strings are the same for valid timestamps.
apply_time=data_set['apply_time'].astype(str)
apply_day=pd.to_datetime(apply_time,format="%Y%m%d%H%M%S").dt.strftime("%m-%d")

# Stored with object dtype, as before.
data_set['apply_dt']=apply_day.astype(object)

In [None]:
# Daily overdue counts per segment: rows = application day, columns = segment.
line_petty_df = data_set.pivot_table(
    values=['odu_bid'],
    index='apply_dt',
    columns='white_petty_bid',
    aggfunc=[np.sum],
)

# Interactive line chart (mpld3): one line per pivot column, toggled through
# the legend. The raw value matrix is plotted, so the x-axis is the row
# position rather than the date string.
# NOTE(review): the legend labels assume the pivot columns come out in the
# order A, B, C -- confirm.
fig, ax = plt.subplots(figsize=(12, 8))
labels = ['A','B','C']
daily_counts = line_petty_df.values
lines = ax.plot(daily_counts, lw=2)
interactive_legend = plugins.InteractiveLegendPlugin(lines, labels)
plugins.connect(fig, interactive_legend)

mpld3.display()

In [None]:
# Daily overdue counts per (segment, city bucket) pair, drawn as one line per pair.
city_trend_args = dict(values=['odu_bid'], index='apply_dt',
                       columns=['white_petty_bid','city_index_cut'], aggfunc=[np.sum])
city_petty_df = data_set.pivot_table(**city_trend_args)
city_petty_df.plot()

## 相关性分析

* 对于连续型变量，考虑不同群体下变量间的相关性

In [None]:
# Correlation heat map of the numeric features for ultra-small-amount users
# (petty_bid == 1).
corr_data=data_set[data_set.petty_bid==1][['white_petty_bid','jiedaiscore','bidnormalscoreexp3v2','prcidnormalscoreexp3v2','preaficoscorev5','zxhouseloancnt','zxaccountcnt','age','due_days','phonenormalscoreexp3v2']]
# white_petty_bid holds the string segment labels. Older pandas silently
# skipped non-numeric columns in corr(); pandas >= 2.0 raises on them. Keeping
# only numeric columns gives the old result on every version.
numeric_corr_data=corr_data.select_dtypes(include=[np.number])
sns.heatmap(numeric_corr_data.corr())

思考：

* 超小额群体中，某些变量存在相关性，通过这些有相关性的变量能否从非白群体中定位超小额用户呢？

In [None]:
# Pairwise scatter / KDE plots of selected features, coloured by segment.
# The rows kept are the overdue users (odu_bid == 1).
# NOTE(review): the original comment described this as selecting the
# non-whitelist group, which is not what the filter does -- confirm which
# population was intended.
overdue_mask = data_set.odu_bid == 1
pair_cols = ['white_petty_bid','jiedaiscore','bidnormalscoreexp3v2','preaficoscorev5','zxhouseloancnt','zxaccountcnt','age']
sns_data = data_set.loc[overdue_mask, pair_cols]
sns.pairplot(sns_data, hue='white_petty_bid', diag_kind='kde')

结论：

* C客群整体在变量preaficoscorev5，zxhouseloancnt分布偏低

In [None]:
# Joint distribution of age and lending score for ultra-small-amount users
# (petty_bid == 1).
petty_data=data_set[data_set.petty_bid==1][['age','jiedaiscore']]
# x / y are passed by keyword with data=: recent seaborn releases reject
# positional x, y, while the keyword form works on old and new versions.
# The former plt.figure() call was dropped: jointplot creates its own figure,
# so the extra call only left an empty figure behind.
grid = sns.jointplot(x='age', y='jiedaiscore', data=petty_data, alpha=0.7)
# (The original cell carried a disabled grid.ax_joint.set_aspect('equal') tweak.)

In [None]:
# Scatter of age vs lending score for every row whose segment is not 'A'.
# Both the colour (c) and the marker size (s) are taken from due_days.
# NOTE(review): due_days is used as a marker size as-is; non-positive or
# missing values presumably yield invisible markers -- confirm that is intended.
not_segment_a = data_set.white_petty_bid != 'A'
petty_data = data_set.loc[not_segment_a, ['due_days','age','jiedaiscore']]
marker_sizes = petty_data['due_days']
ax = petty_data.plot(x='age', y='jiedaiscore', kind='scatter',
                     c='due_days', s=marker_sizes, colormap='viridis')

结论：

* 什么样的超小额用户会用信：
* 什么样的超小额用户用信后比较容易逾期：


# 数据建模

## 准备数据

In [None]:
#加载所需库
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Build the four-class modelling target:
#   A = not whitelisted, ultra-small amount, overdue
#   B = not whitelisted, ultra-small amount, not overdue
#   C = whitelisted, overdue
#   D = whitelisted, not overdue
# Rows matching none of the rules keep a missing label.
non_white_petty_rows = (data_set.is_white == 0) & (data_set.petty_bid == 1)
white_rows = data_set.is_white == 1
overdue_rows_mask = data_set.odu_bid == 1
not_overdue_rows_mask = data_set.odu_bid == 0

label_rules = [
    ('A', non_white_petty_rows & overdue_rows_mask),
    ('B', non_white_petty_rows & not_overdue_rows_mask),
    ('C', white_rows & overdue_rows_mask),
    ('D', white_rows & not_overdue_rows_mask),
]
for tag, rows in label_rules:
    data_set.loc[rows, 'label'] = tag

In [None]:
# Rebalance the classes: repeat class A ten times and draw 7000 rows from
# each of classes B, C and D.
# NOTE(review): class A is duplicated BEFORE the train/test split made later
# in the notebook, so copies of one row can land in both sets and inflate the
# test accuracy -- consider splitting first and oversampling only the
# training part.
data_1=pd.concat([data_set[data_set.label=='A']]*10)
# random_state makes the draw reproducible from run to run (it was unseeded
# before, although the later train/test split is seeded with 0).
# sample() raises if a class has fewer than 7000 rows.
data_2=data_set[data_set.label=='B'].sample(n=7000,random_state=0)
data_3=data_set[data_set.label=='C'].sample(n=7000,random_state=0)
data_4=data_set[data_set.label=='D'].sample(n=7000,random_state=0)

# Stack the four parts into one frame with a fresh 0..n-1 index.
data_set_new=pd.concat([data_1,data_2,data_3,data_4],ignore_index=True)

In [None]:
# Keep only the model input columns and replace missing values with 0.
need_cols = ['age','preaficoscorev5','zxaccountcnt','bidnormalscoreexp3v2','jiedaiscore']
df_x = data_set_new[need_cols].fillna(0)

In [None]:
# Min-max scale every feature column: (value - column min) / (column max - column min).
col_min = df_x.min()
col_span = df_x.max() - col_min
model_data_x = (df_x - col_min) / col_span
# Target column.
model_data_y = data_set_new['label']

In [None]:
# Put the scaled features and the label side by side (column-wise concatenation).
model_parts = [model_data_x, model_data_y]
model_data = pd.concat(model_parts, axis='columns')

In [None]:
# Hold out 30% of the rows as the test set (seeded split), then report the sizes.
split_parts = train_test_split(model_data_x, model_data_y, test_size=0.3, random_state=0)
X_train, X_test, y_train, y_test = split_parts
size_report = '数据集样本数：{}，\n 训练集样本数：{}，\n 测试集样本数：{}'
print(size_report.format(len(model_data_x), len(X_train), len(X_test)))

## 模型建立及选择

In [None]:
# Pull the local script train_model.py into this cell via the IPython %load magic.
# NOTE(review): %load only inserts the script's text into the cell; presumably
# the cell has to be executed again for train_model to be defined (%run would
# execute the script directly) -- confirm.
%load train_model.py

In [None]:
# Compare models: each model name maps to the hyper-parameter values to try.
model_name_param_dict = {
    'kNN': [5, 10, 15],
    'LR': [0.01, 1, 100],
    'SVM': [0.01, 1, 100],
}

# One result row per model, filled in by the loop below.
results_df = pd.DataFrame(columns=['Accuracy (%)', 'Time (s)'],
                          index=list(model_name_param_dict))
results_df.index.name = 'Model'

# train_model is expected to come from train_model.py; presumably it returns
# (model, best accuracy, mean duration) -- only the last two values are used.
for model_name in model_name_param_dict:
    param_range = model_name_param_dict[model_name]
    _, best_acc, mean_duration = train_model(X_train, y_train, X_test, y_test,
                                             param_range, model_name)
    results_df.loc[model_name, 'Accuracy (%)'] = best_acc * 100
    results_df.loc[model_name, 'Time (s)'] = mean_duration


In [None]:
# Display the accuracy / timing table filled in by the model comparison loop.
results_df

## kNN

In [None]:
# Search for the best number of neighbours: fit one kNN model per k and
# record its accuracy on the test set.
k_range = range(1, 20)
acc_scores = []

for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    acc_scores.append(knn.score(X_test, y_test))

# Use the k that scored best (the smallest such k on ties). Previously the
# search result was never read and the final model always used k=3.
# NOTE(review): choosing k on the test set makes the accuracy reported below
# optimistic -- a validation split or cross-validation would be cleaner.
best_k = k_range[int(np.argmax(acc_scores))]
print('best_k:', best_k)

# Build and train the final model.
knn = KNeighborsClassifier(n_neighbors=best_k)
knn.fit(X_train, y_train)
# Predict the class of every test sample.
y_pred = knn.predict(X_test)

# Class membership probabilities: y_prob[i][j] is the estimated probability
# that test sample i belongs to class j.
y_prob=knn.predict_proba(X_test)

# Accuracy of the predictions on the test set.
acc = accuracy_score(y_test, y_pred)

print('acc_ratio:', acc)

In [None]:
# Build a preview table: test features next to the true and predicted labels.
# Work on a copy: `result_data = X_test` only aliased the feature matrix, so
# adding the two columns changed X_test itself and any later predict() on it
# would see the extra label columns. With the copy there is no chained
# assignment warning left to hide, so the blanket
# warnings.filterwarnings("ignore") was removed as well.
result_data=X_test.copy()
result_data['y_test']=y_test
result_data['y_pred']=y_pred
# Show the first 100 rows with a fresh 0..n-1 index.
result_data.reset_index(drop=True).head(100)

# 结论

* 超小额用户