---
title: 6种分类模型比较
date: 2019-08-07
categories: [人工智能, 监督学习]
mathjax: false
---

## 数据集载入

In [150]:
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

df = pd.read_csv(u"2019-08-01_金融数据描述_data1.csv",encoding = 'gbk')

In [151]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4754 entries, 0 to 4753
Data columns (total 90 columns):
Unnamed: 0                                    4754 non-null int64
custid                                        4754 non-null int64
trade_no                                      4754 non-null object
bank_card_no                                  4754 non-null object
low_volume_percent                            4752 non-null float64
middle_volume_percent                         4752 non-null float64
take_amount_in_later_12_month_highest         4754 non-null int64
trans_amount_increase_rate_lately             4751 non-null float64
trans_activity_month                          4752 non-null float64
trans_activity_day                            4752 non-null float64
transd_mcc                                    4752 non-null float64
trans_days_interval_filter                    4746 non-null float64
trans_days_interval                           4752 non-null float64
regional_mobility

## 特征预处理

### 删除无用

In [152]:
delete = ['Unnamed: 0', 'custid', 'trade_no', 'bank_card_no','id_name','latest_query_time','source','loans_latest_time','first_transaction_time', 'student_feature']
df = df.drop(delete,axis=1)

### 处理分类型特征

In [153]:
from sklearn.preprocessing import LabelEncoder
df['reg_preference_for_trad'] = LabelEncoder().fit_transform(df['reg_preference_for_trad'].astype(str))

### 使用众数填充

In [154]:
from sklearn.preprocessing import Imputer
for i in range(df.shape[1]):
    feature = df.iloc[:,i].values.reshape(-1,1)
    imp_mode = Imputer(strategy='most_frequent')
    df.iloc[:,i] = imp_mode.fit_transform(feature)

## 数据划分

In [155]:
X = df[:].drop("status",axis=1)
y = df["status"]

In [156]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle = False)

## 数据归一化

In [157]:
from sklearn.preprocessing import minmax_scale
X_train = minmax_scale(X_train)
X_test =  minmax_scale(X_test)

## 建模与预测

### 使用一般建模方法

In [158]:
from sklearn.metrics import *
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

LR = LogisticRegression()
LR = LR.fit(X_train, y_train)
# LR = cross_val_score(LR, X_train, y_train, cv = 5)

svc = SVC(kernel='linear', probability = True)
svc = svc.fit(X_train, y_train)
# svc = cross_val_score(svc,X_train,y_train, cv = 5)

DT = DecisionTreeClassifier(max_depth = 6)
DT = DT.fit(X_train, y_train)
# DT = cross_val_score(DT, X_train, y_train, cv = 5)

RF = RandomForestClassifier()
RF = RF.fit(X_train, y_train)
# RF = cross_val_score(RF, X_train, y_train, cv = 5)

KNN = KNeighborsClassifier()
KNN = KNN.fit(X_train, y_train)
# LR = cross_val_score(LR, X_train, y_train, cv = 5)

GBDT = GradientBoostingClassifier()
GBDT = GBDT.fit(X_train, y_train)
# LR = cross_val_score(LR, X_train, y_train, cv = 5)

names = ["LR", "SVC", 'DT', "RF", "KNN", "GBDT"]
models = [LR, svc, DT, RF, KNN, GBDT]
evaluates = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']

In [159]:
df_list = []
for name,model in zip(names,models):
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    #accuracy
    train_accuracy = model.score(X_train,y_train)
    test_accuracy = model.score(X_test,y_test)
    
    
    #precision   
    train_precision = precision_score(y_train,y_train_pred)
    test_precision = precision_score(y_test,y_test_pred)   
    
    #recall
    train_recall = recall_score(y_train,y_train_pred)
    test_recall = recall_score(y_test,y_test_pred)   
    
    #f1
    train_f1 = f1_score(y_train,y_train_pred)
    test_f1 = f1_score(y_test,y_test_pred)   
    
    #auc
    y_train_pred = model.predict_proba(X_train)[:,1]
    y_test_pred = model.predict_proba(X_test)[:,1]
    
    train_auc = roc_auc_score(y_train,y_train_pred)
    test_auc = roc_auc_score(y_test,y_test_pred)
    
    print('{} 训练集： accuracy:{:.3},precision:{:.3}, recall:{:.3}, f1:{:.3}, auc:{:.3}'.format(name,train_accuracy,train_precision,train_recall,train_f1,train_auc))
    print('{} 测试集： accuracy:{:.3},precision:{:.3}, recall:{:.3}, f1:{:.3}, auc:{:.3}'.format(name,test_accuracy,test_precision,test_recall,test_f1,test_auc))
    print('\n')
    df = pd.DataFrame(np.array([train_accuracy,train_precision,train_recall,train_f1,train_auc,test_accuracy,test_precision,test_recall,test_f1,test_auc]).reshape(2,-1),
                  index = ['train','test'],
                  columns = ['Accuracy','Precision','Recall','F1-Score','AUC-Score'])
    df_list.append(df)

pd.concat(df_list,axis=0,keys=names)

LR 训练集： accuracy:0.801,precision:0.742, recall:0.279, f1:0.406, auc:0.8
LR 测试集： accuracy:0.789,precision:0.752, recall:0.365, f1:0.491, auc:0.787


SVC 训练集： accuracy:0.793,precision:0.809, recall:0.196, f1:0.316, auc:0.809
SVC 测试集： accuracy:0.771,precision:0.843, recall:0.222, f1:0.351, auc:0.796


DT 训练集： accuracy:0.829,precision:0.723, recall:0.487, f1:0.582, auc:0.832
DT 测试集： accuracy:0.727,precision:0.516, recall:0.361, f1:0.425, auc:0.707


RF 训练集： accuracy:0.981,precision:0.999, recall:0.924, f1:0.96, auc:0.999
RF 测试集： accuracy:0.722,precision:0.509, recall:0.218, f1:0.305, auc:0.673


KNN 训练集： accuracy:0.816,precision:0.741, recall:0.379, f1:0.501, auc:0.848
KNN 测试集： accuracy:0.708,precision:0.445, recall:0.184, f1:0.261, auc:0.623


GBDT 训练集： accuracy:0.859,precision:0.867, recall:0.497, f1:0.632, auc:0.913
GBDT 测试集： accuracy:0.754,precision:0.573, recall:0.474, f1:0.519, auc:0.771




Unnamed: 0,Unnamed: 1,Accuracy,Precision,Recall,F1-Score,AUC-Score
LR,train,0.800684,0.74212,0.279396,0.405956,0.799942
LR,test,0.788644,0.751938,0.364662,0.491139,0.786801
SVC,train,0.792795,0.808889,0.196332,0.315972,0.809276
SVC,test,0.770768,0.842857,0.221805,0.35119,0.796191
DT,train,0.829345,0.722756,0.486516,0.58156,0.83199
DT,test,0.726604,0.516129,0.360902,0.424779,0.706676
RF,train,0.981331,0.998834,0.924488,0.960224,0.999103
RF,test,0.722397,0.508772,0.218045,0.305263,0.672949
KNN,train,0.816198,0.740506,0.378641,0.501071,0.847694
KNN,test,0.707676,0.445455,0.184211,0.260638,0.623415


In [160]:
def draw_roc_curve(train_pre_proba,test_pre_proba,train_auc,test_auc,model_name,num):
    fpr,tpr,roc_auc = train_pre_proba
    test_fpr,test_tpr,test_roc_auc = test_pre_proba
    
    plt.figure()
    lw = 2
    plt.plot(fpr, tpr, color='darkorange',
             lw=lw, label='ROC curve (area = %0.2f)' % train_auc)
    plt.plot(test_fpr, test_tpr, color='red',
         lw=lw, label='ROC curve (area = %0.2f)' %test_auc)
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    #plt.xlim([0.0, 1.0])
    #plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Roc example '+ model_name)
    plt.legend(loc="lower right")
    plt.savefig("../img/2019-08-07_5种分类模型比较_{}.png".format(num))
    plt.close()
    
for num,name,model in zip(range(1,7),names,models):
    
    y_train_pred = model.predict_proba(X_train)[:,1]
    y_test_pred = model.predict_proba(X_test)[:,1]


    train_roc = roc_curve(y_train,y_train_pred)
    test_roc = roc_curve(y_test,y_test_pred)

    train_auc = roc_auc_score(y_train,y_train_pred)
    test_auc = roc_auc_score(y_test,y_test_pred)

    draw_roc_curve(train_roc,test_roc,train_auc,test_auc,name,num)

![](/img/2019-08-07_5种分类模型比较_1.png)

![](/img/2019-08-07_5种分类模型比较_2.png)

![](/img/2019-08-07_5种分类模型比较_3.png)

![](/img/2019-08-07_5种分类模型比较_4.png)

![](/img/2019-08-07_5种分类模型比较_5.png)

![](/img/2019-08-07_5种分类模型比较_6.png)

### 使用k-folds交叉建模

In [168]:
from sklearn.model_selection import KFold

def run_cv(X,y,clf_class,**kwargs):
    kf = KFold(n_splits = 5, shuffle = False, random_state = 0)
    y_pred = y.copy()
    clf = clf_class(**kwargs)
    
    for train_index , test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        clf.fit(X_train,y_train)
        y_pred[test_index] = clf.predict(X_test)
    return y_pred

In [169]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.linear_model import LogisticRegression

LR_CV_result = run_cv(X_train,y_train,LogisticRegression)
RF_CV_result = run_cv(X_train,y_train,RandomForestClassifier)
KNN_CV_result = run_cv(X_train,y_train,KNeighborsClassifier)

In [170]:
def accuracy(y_true,y_pred):
    return np.mean(y_true == y_pred)

print ("Logistic Regression (L2 is default): " + str(accuracy(y_train, LR_CV_result)))
print ("Random forest: " + str(accuracy(y_train, RF_CV_result)))
print ("K-nearest-neighbors: " + str(accuracy(y_train, KNN_CV_result)))

Logistic Regression (L2 is default): 0.7933210623192216
Random forest: 0.7812253484091507
K-nearest-neighbors: 0.7520378648435446


> 参考：

1. [DataWhale数据挖掘实战营](https://github.com/datawhalechina/Datawhale_Learning/tree/master/doc/%E7%90%86%E8%AE%BA%E5%BA%94%E7%94%A8/%E6%95%B0%E6%8D%AE%E6%8C%96%E6%8E%98)
2. [吴裕雄 PYTHON 机器学习——集成学习梯度提升决策树GRADIENTBOOSTINGCLASSIFIER分类模型](https://www.cnblogs.com/tszr/p/10801558.html)
3. [使用5种分类模型进行用户贷款逾期预测](http://yezuolin.com/2018/11/TheModelofUserLoanOverdueEvaluation/)
4. [sklearn.model_selection.KFold](https://blog.csdn.net/kancy110/article/details/74910185/)