---
title: 6种分类模型比较
date: 2019-08-07
categories: [人工智能, 监督学习]
mathjax: false
---

## 数据集载入

In [15]:
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

df = pd.read_csv(u"2019-08-01_金融数据描述_data1.csv",encoding = 'gbk')

In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4754 entries, 0 to 4753
Data columns (total 90 columns):
Unnamed: 0                                    4754 non-null int64
custid                                        4754 non-null int64
trade_no                                      4754 non-null object
bank_card_no                                  4754 non-null object
low_volume_percent                            4752 non-null float64
middle_volume_percent                         4752 non-null float64
take_amount_in_later_12_month_highest         4754 non-null int64
trans_amount_increase_rate_lately             4751 non-null float64
trans_activity_month                          4752 non-null float64
trans_activity_day                            4752 non-null float64
transd_mcc                                    4752 non-null float64
trans_days_interval_filter                    4746 non-null float64
trans_days_interval                           4752 non-null float64
regional_mobility

## 特征预处理

#### 删除无用

In [22]:
delete = ['Unnamed: 0', 'custid', 'trade_no', 'bank_card_no','id_name','latest_query_time','source','loans_latest_time','first_transaction_time', 'student_feature']
df = df.drop(delete,axis=1)

#### 处理分类型特征

In [23]:
from sklearn.preprocessing import LabelEncoder
df['reg_preference_for_trad'] = LabelEncoder().fit_transform(df['reg_preference_for_trad'].astype(str))

#### 使用众数填充

In [24]:
from sklearn.preprocessing import Imputer
for i in range(df.shape[1]):
    feature = df.iloc[:,i].values.reshape(-1,1)
    imp_mode = Imputer(strategy='most_frequent')
    df.iloc[:,i] = imp_mode.fit_transform(feature)

### 数据划分

In [25]:
X = df[:].drop("status",axis=1)
y = df["status"]

In [26]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

## 数据归一化

In [28]:
from sklearn.preprocessing import minmax_scale
X_train = minmax_scale(X_train)
X_test =  minmax_scale(X_test)

In [None]:
以下

In [29]:
from sklearn.model_selection import cross_val_score
clf_svc_cv = svm.SVC(kernel='linear',C=1)
scores_clf_svc_cv = cross_val_score(clf_svc_cv,X_train,y_train,cv=5)

print(scores_clf_svc_cv)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores_clf_svc_cv.mean(), scores_clf_svc_cv.std() * 2))




clf_svc = svm.SVC(kernel='linear').fit(X_train,y_train)
clf_svc.score(X_test,y_test)

[ 0.79133858  0.78449409  0.78421053  0.78289474  0.79078947]
Accuracy: 0.79 (+/- 0.01)


0.77917981072555209

## 建模与预测

In [58]:
from sklearn.metrics import *
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

LR = LogisticRegression()
LR = LR.fit(X_train, y_train)
# LR = cross_val_score(LR, X_train, y_train, cv = 5)

svc = SVC(kernel='linear', probability = True)
svc = svc.fit(X_train, y_train)
# svc = cross_val_score(svc,X_train,y_train, cv = 5)

DT = DecisionTreeClassifier(max_depth = 6)
DT = DT.fit(X_train, y_train)
# DT = cross_val_score(DT, X_train, y_train, cv = 5)

RF = RandomForestClassifier()
RF = RF.fit(X_train, y_train)
# RF = cross_val_score(RF, X_train, y_train, cv = 5)

KNN = KNeighborsClassifier()
KNN = KNN.fit(X_train, y_train)
# LR = cross_val_score(LR, X_train, y_train, cv = 5)

GBDT = GradientBoostingClassifier()
GBDT = GBDT.fit(X_train, y_train)
# LR = cross_val_score(LR, X_train, y_train, cv = 5)

names = ["LR", "SVC", 'DT', "RF", "KNN", "GBDT"]
models = [LR, svc, DT, RF, KNN, GBDT]
evaluates = ['accuracy', 'precision', 'recall', 'f1', 'auc']

In [59]:
df_list = []
for name,model in zip(names,models):
    
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    #accuracy
    train_accuracy = model.score(X_train,y_train)
    test_accuracy = model.score(X_test,y_test)
    
    #precision   
    train_precision = precision_score(y_train,y_train_pred)
    test_precision = precision_score(y_test,y_test_pred)   
    
    #recall
    train_recall = recall_score(y_train,y_train_pred)
    test_recall = recall_score(y_test,y_test_pred)   
    
    #f1
    train_f1 = f1_score(y_train,y_train_pred)
    test_f1 = f1_score(y_test,y_test_pred)   
    
    #auc
    y_train_pred = model.predict_proba(X_train)[:,1]
    y_test_pred = model.predict_proba(X_test)[:,1]
    
    train_auc = roc_auc_score(y_train,y_train_pred)
    test_auc = roc_auc_score(y_test,y_test_pred)
    print('{} 训练集： accuracy:{:.3},precision:{:.3}, recall:{:.3}, f1:{:.3}, auc:{:.3}'.format(name,train_accuracy,train_precision,train_recall,train_f1,train_auc))
    print('{} 测试集： accuracy:{:.3},precision:{:.3}, recall:{:.3}, f1:{:.3}, auc:{:.3}'.format(name,test_accuracy,test_precision,test_recall,test_f1,test_auc))
    print('\n')
    df = pd.DataFrame(np.array([train_accuracy,train_precision,train_recall,train_f1,train_auc,test_accuracy,test_precision,test_recall,test_f1,test_auc]).reshape(2,-1),
                  index = ['train','test'],
                  columns = ['Accuracy','Precision','Recall','F1-Score','AUC-Score'])
    df_list.append(df)

pd.concat(df_list,axis=0,keys=names)

LR 训练集： accuracy:0.8,precision:0.749, recall:0.3, f1:0.429, auc:0.802
LR 测试集： accuracy:0.783,precision:0.586, recall:0.494, f1:0.536, auc:0.79


SVC 训练集： accuracy:0.791,precision:0.809, recall:0.218, f1:0.344, auc:0.812
SVC 测试集： accuracy:0.779,precision:0.613, recall:0.349, f1:0.444, auc:0.786


DT 训练集： accuracy:0.824,precision:0.796, recall:0.397, f1:0.53, auc:0.826
DT 测试集： accuracy:0.696,precision:0.414, recall:0.477, f1:0.443, auc:0.69


RF 训练集： accuracy:0.983,precision:0.997, recall:0.937, f1:0.966, auc:1.0
RF 测试集： accuracy:0.752,precision:0.518, recall:0.299, f1:0.379, auc:0.697


KNN 训练集： accuracy:0.808,precision:0.726, recall:0.371, f1:0.491, auc:0.842
KNN 测试集： accuracy:0.747,precision:0.5, recall:0.224, f1:0.309, auc:0.629


GBDT 训练集： accuracy:0.849,precision:0.843, recall:0.489, f1:0.619, auc:0.909
GBDT 测试集： accuracy:0.764,precision:0.545, recall:0.423, f1:0.477, auc:0.766




Unnamed: 0,Unnamed: 1,Accuracy,Precision,Recall,F1-Score,AUC-Score
LR,train,0.799632,0.748691,0.30042,0.428786,0.802149
LR,test,0.783386,0.586207,0.493776,0.536036,0.789983
SVC,train,0.79148,0.809339,0.218487,0.344086,0.811607
SVC,test,0.77918,0.613139,0.348548,0.444444,0.786219
DT,train,0.82356,0.795789,0.397059,0.529783,0.825663
DT,test,0.696109,0.413669,0.477178,0.44316,0.689945
RF,train,0.983434,0.996648,0.936975,0.965891,0.999668
RF,test,0.75184,0.517986,0.298755,0.378947,0.696546
KNN,train,0.80752,0.726337,0.370798,0.49096,0.841861
KNN,test,0.746583,0.5,0.224066,0.309456,0.62939


In [64]:
def draw_roc_curve(train_pre_proba,test_pre_proba,train_auc,test_auc,model_name,num):
    fpr,tpr,roc_auc = train_pre_proba
    test_fpr,test_tpr,test_roc_auc = test_pre_proba
    
    plt.figure()
    lw = 2
    plt.plot(fpr, tpr, color='darkorange',
             lw=lw, label='ROC curve (area = %0.2f)' % train_auc)
    plt.plot(test_fpr, test_tpr, color='red',
         lw=lw, label='ROC curve (area = %0.2f)' %test_auc)
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    #plt.xlim([0.0, 1.0])
    #plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Roc example '+ model_name)
    plt.legend(loc="lower right")
    plt.savefig("../img/2019-08-07_5种分类模型比较_{}.png".format(num))
    plt.close()
    
for num,name,model in zip(range(1,7),names,models):
    
    y_train_pred = model.predict_proba(X_train)[:,1]
    y_test_pred = model.predict_proba(X_test)[:,1]


    train_roc = roc_curve(y_train,y_train_pred)
    test_roc = roc_curve(y_test,y_test_pred)

    train_auc = roc_auc_score(y_train,y_train_pred)
    test_auc = roc_auc_score(y_test,y_test_pred)

    draw_roc_curve(train_roc,test_roc,train_auc,test_auc,name,num)

![](/img/2019-08-07_5种分类模型比较_1.png)

![](/img/2019-08-07_5种分类模型比较_2.png)

![](/img/2019-08-07_5种分类模型比较_3.png)

![](/img/2019-08-07_5种分类模型比较_4.png)

![](/img/2019-08-07_5种分类模型比较_5.png)

![](/img/2019-08-07_5种分类模型比较_6.png)

> 参考：

1. [DataWhale数据挖掘实战营](https://github.com/datawhalechina/Datawhale_Learning/tree/master/doc/%E7%90%86%E8%AE%BA%E5%BA%94%E7%94%A8/%E6%95%B0%E6%8D%AE%E6%8C%96%E6%8E%98)
2. [吴裕雄 PYTHON 机器学习——集成学习梯度提升决策树GRADIENTBOOSTINGCLASSIFIER分类模型](https://www.cnblogs.com/tszr/p/10801558.html)
3. [使用5种分类模型进行用户贷款逾期预测](http://yezuolin.com/2018/11/TheModelofUserLoanOverdueEvaluation/)