In [7]:
# 导入科学计算包，读取数据，初步观测数据
import pandas as pd
df = pd.read_csv('loans.csv')
df.head()

Unnamed: 0,grade,sub_grade,short_emp,emp_length_num,home_ownership,dti,purpose,term,last_delinq_none,last_major_derog_none,revol_util,total_rec_late_fee,safe_loans
0,C,C4,1,1,RENT,1.0,car,60 months,1,1,9.4,0.0,-1
1,F,F2,0,5,OWN,5.55,small_business,60 months,1,1,32.6,0.0,-1
2,B,B5,1,1,RENT,18.08,other,60 months,1,1,36.5,0.0,-1
3,C,C1,1,1,RENT,10.08,debt_consolidation,36 months,1,1,91.7,0.0,-1
4,B,B2,0,4,RENT,7.06,other,36 months,1,1,55.5,0.0,-1


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46508 entries, 0 to 46507
Data columns (total 13 columns):
grade                    46508 non-null object
sub_grade                46508 non-null object
short_emp                46508 non-null int64
emp_length_num           46508 non-null int64
home_ownership           46508 non-null object
dti                      46508 non-null float64
purpose                  46508 non-null object
term                     46508 non-null object
last_delinq_none         46508 non-null int64
last_major_derog_none    46508 non-null int64
revol_util               46508 non-null float64
total_rec_late_fee       46508 non-null float64
safe_loans               46508 non-null int64
dtypes: float64(3), int64(5), object(5)
memory usage: 4.6+ MB


### 利用info()方法查看数据，可以看到有46508行，13列（total 13 columns），而且数据非常完整。用shape方法也可以得到相同的结论。

In [8]:
df.shape

(46508, 13)

In [12]:
# 查看大体的数值描述
df.describe()

Unnamed: 0,short_emp,emp_length_num,dti,last_delinq_none,last_major_derog_none,revol_util,total_rec_late_fee,safe_loans
count,46508.0,46508.0,46508.0,46508.0,46508.0,46508.0,46508.0,46508.0
mean,0.128645,6.343511,16.129507,0.586372,0.874989,55.964813,1.282769,0.004472
std,0.33481,3.750348,7.599843,0.492489,0.330735,25.526221,7.007558,1.000001
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0
25%,0.0,3.0,10.47,0.0,1.0,37.7,0.0,-1.0
50%,0.0,6.0,16.0,1.0,1.0,58.4,0.0,1.0
75%,0.0,11.0,21.6,1.0,1.0,76.5,0.0,1.0
max,1.0,11.0,38.13,1.0,1.0,150.7,208.82,1.0


### 只有关于数值型的描述，从均值可以看到平均贷款收入比例约16%，这是比较理性的；但是约87%的人有还款逾期90天以上的记录，说明拖欠还款的情况比较普遍；导致贷款安全比例只到不到0.45%，所以，用未来的钱需要谨慎呐。量入为出，理性消费。

In [14]:
# 划分特征集与目标集
X = df.drop('safe_loans', axis=1)
Y = df.safe_loans
X.shape

(46508, 12)

In [15]:
Y.shape

(46508,)

In [16]:
X.head()

Unnamed: 0,grade,sub_grade,short_emp,emp_length_num,home_ownership,dti,purpose,term,last_delinq_none,last_major_derog_none,revol_util,total_rec_late_fee
0,C,C4,1,1,RENT,1.0,car,60 months,1,1,9.4,0.0
1,F,F2,0,5,OWN,5.55,small_business,60 months,1,1,32.6,0.0
2,B,B5,1,1,RENT,18.08,other,60 months,1,1,36.5,0.0
3,C,C1,1,1,RENT,10.08,debt_consolidation,36 months,1,1,91.7,0.0
4,B,B2,0,4,RENT,7.06,other,36 months,1,1,55.5,0.0


In [21]:
pd.get_dummies(df).head()

Unnamed: 0,short_emp,emp_length_num,dti,last_delinq_none,last_major_derog_none,revol_util,total_rec_late_fee,safe_loans,grade_A,grade_B,...,purpose_house,purpose_major_purchase,purpose_medical,purpose_moving,purpose_other,purpose_small_business,purpose_vacation,purpose_wedding,term_ 36 months,term_ 60 months
0,1,1,1.0,1,1,9.4,0.0,-1,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,5,5.55,1,1,32.6,0.0,-1,0,0,...,0,0,0,0,0,1,0,0,0,1
2,1,1,18.08,1,1,36.5,0.0,-1,0,1,...,0,0,0,0,1,0,0,0,0,1
3,1,1,10.08,1,1,91.7,0.0,-1,0,0,...,0,0,0,0,0,0,0,0,1,0
4,0,4,7.06,1,1,55.5,0.0,-1,0,1,...,0,0,0,0,1,0,0,0,1,0


### 此处因为等级的原因，导致独热编码后维度太高（68列），故不采用该做法。其实，怎样编码也是需要研究的，此工作留待日后巩固。

In [26]:
# 标记编码（把非数值型转换为数值型）
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
d = defaultdict(LabelEncoder)
X_trans = X.apply(lambda x: d[x.name].fit_transform(x))
X_trans.head()

Unnamed: 0,grade,sub_grade,short_emp,emp_length_num,home_ownership,dti,purpose,term,last_delinq_none,last_major_derog_none,revol_util,total_rec_late_fee
0,2,13,1,1,3,97,0,1,1,1,107,0
1,5,26,0,5,2,552,9,1,1,1,349,0
2,1,9,1,1,3,1805,8,1,1,1,388,0
3,2,10,1,1,3,1005,2,0,1,1,952,0
4,1,6,0,4,3,703,8,0,1,1,583,0


In [29]:
# 划分训练集和测试集
from sklearn.cross_validation import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X_trans, Y, random_state=1)
X_train.shape

(34881, 12)

In [30]:
X_test.shape

(11627, 12)

In [31]:
# 调用scikit-learnl的决策树模型
from sklearn import tree
clf = tree.DecisionTreeClassifier(max_depth=3)
clf = clf.fit(X_train, Y_train)

In [34]:
# 利用决策树模型测试具体某个的测试集
test_rec = X_test.iloc[1,:]
print('测试集第一行预测值: ', clf.predict([test_rec]))
print('测试集第一行实际值: ', Y_test.iloc[1])

测试集第一行预测值:  [1]
测试集第一行实际值:  1


In [35]:
# 为决策树评分
from sklearn.metrics import accuracy_score
accuracy_score(Y_test, clf.predict(X_test))

0.61615205986066912

### 准确率约为61.6%，刚过及格线......不过这个模型也实在是太简单了，仅仅是3层的随机森林的baseline啦，而且没有进行参数设定。后续优化可从交叉验证（案例只进行一次训练集和数据集划分）、特征工程（标记编码方法应该优化）、模型参数调优和模型融合等方面考虑，留待日后巩固再回来重构。这个案例主要是为了巩固一次泰坦尼克号生还分析的步骤和方法。