<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#数据准备" data-toc-modified-id="数据准备-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>数据准备</a></span></li><li><span><a href="#模型的好坏判断" data-toc-modified-id="模型的好坏判断-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>模型的好坏判断</a></span></li><li><span><a href="#查准率和查全率" data-toc-modified-id="查准率和查全率-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>查准率和查全率</a></span></li><li><span><a href="#ROC-绘图-&amp;-AUC" data-toc-modified-id="ROC-绘图-&amp;-AUC-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>ROC 绘图 &amp; AUC</a></span></li><li><span><a href="#特征重要性" data-toc-modified-id="特征重要性-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>特征重要性</a></span></li><li><span><a href="#输出概率" data-toc-modified-id="输出概率-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>输出概率</a></span></li><li><span><a href="#校准和分析" data-toc-modified-id="校准和分析-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>校准和分析</a></span></li></ul></div>

In [3]:
from __future__ import division
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json

from sklearn.model_selection  import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier as RF
%matplotlib inline

# 数据准备

In [6]:
# Read the churn dataset from 'churn.csv' (relative path) into a DataFrame
# and display its first five rows as the cell output.
churn_df = pd.read_csv('churn.csv')
churn_df.head()

Unnamed: 0,State,Account Length,Area Code,Phone,Int'l Plan,VMail Plan,VMail Message,Day Mins,Day Calls,Day Charge,...,Eve Calls,Eve Charge,Night Mins,Night Calls,Night Charge,Intl Mins,Intl Calls,Intl Charge,CustServ Calls,Churn?
0,KS,128,415,382-4657,no,yes,25,265.1,110,45.07,...,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False.
1,OH,107,415,371-7191,no,yes,26,161.6,123,27.47,...,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False.
2,NJ,137,415,358-1921,no,no,0,243.4,114,41.38,...,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False.
3,OH,84,408,375-9999,yes,no,0,299.4,71,50.9,...,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False.
4,OK,75,415,330-6626,yes,no,0,166.7,113,28.34,...,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False.


In [16]:
# Print column names, non-null counts and dtypes to check for missing values.
churn_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3333 entries, 0 to 3332
Data columns (total 21 columns):
State             3333 non-null object
Account Length    3333 non-null int64
Area Code         3333 non-null int64
Phone             3333 non-null object
Int'l Plan        3333 non-null object
VMail Plan        3333 non-null object
VMail Message     3333 non-null int64
Day Mins          3333 non-null float64
Day Calls         3333 non-null int64
Day Charge        3333 non-null float64
Eve Mins          3333 non-null float64
Eve Calls         3333 non-null int64
Eve Charge        3333 non-null float64
Night Mins        3333 non-null float64
Night Calls       3333 non-null int64
Night Charge      3333 non-null float64
Intl Mins         3333 non-null float64
Intl Calls        3333 non-null int64
Intl Charge       3333 non-null float64
CustServ Calls    3333 non-null int64
Churn?            3333 non-null object
dtypes: float64(8), int64(8), object(5)
memory usage: 546.9+ KB


In [7]:
# Build the binary target: 1 where the 'Churn?' column holds the literal
# string 'True.' (note the trailing dot), 0 everywhere else.
churn_result = churn_df['Churn?']
is_true_label = churn_result == 'True.'
y = np.where(is_true_label, 1, 0)
y

array([0, 0, 0, ..., 0, 0, 0])

In [10]:
# Columns removed before modelling; 'Churn?' is the label column itself.
to_drop = ['State','Area Code','Phone','Churn?']
churn_feat_space = churn_df.drop(columns=to_drop)
churn_feat_space.head()

Unnamed: 0,Account Length,Int'l Plan,VMail Plan,VMail Message,Day Mins,Day Calls,Day Charge,Eve Mins,Eve Calls,Eve Charge,Night Mins,Night Calls,Night Charge,Intl Mins,Intl Calls,Intl Charge,CustServ Calls
0,128,no,yes,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1
1,107,no,yes,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1
2,137,no,no,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0
3,84,yes,no,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2
4,75,yes,no,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3


In [11]:
# Turn the 'yes'/'no' plan columns into booleans.
yes_no_cols = ["Int'l Plan","VMail Plan"]
# Compare against the untouched raw frame rather than churn_feat_space itself:
# comparing already-converted booleans with 'yes' would yield all False, so
# the original form silently corrupted the flags when the cell was re-run.
churn_feat_space[yes_no_cols] = churn_df[yes_no_cols] == 'yes'

In [12]:
# Keep the feature column labels; they are used later to name the features
# in the importance ranking.
features = churn_feat_space.columns
features

Index(['Account Length', 'Int'l Plan', 'VMail Plan', 'VMail Message',
       'Day Mins', 'Day Calls', 'Day Charge', 'Eve Mins', 'Eve Calls',
       'Eve Charge', 'Night Mins', 'Night Calls', 'Night Charge', 'Intl Mins',
       'Intl Calls', 'Intl Charge', 'CustServ Calls'],
      dtype='object')

In [14]:
# Convert the feature table to a float matrix (boolean flags become 0.0/1.0).
# The builtin `float` is used because the `np.float` alias (which was just an
# alias for the builtin) was deprecated in NumPy 1.20 and removed in 1.24.
X = churn_feat_space.values.astype(float)
X

array([[128.  ,   0.  ,   1.  , ...,   3.  ,   2.7 ,   1.  ],
       [107.  ,   0.  ,   1.  , ...,   3.  ,   3.7 ,   1.  ],
       [137.  ,   0.  ,   0.  , ...,   5.  ,   3.29,   0.  ],
       ...,
       [ 28.  ,   0.  ,   0.  , ...,   6.  ,   3.81,   2.  ],
       [184.  ,   1.  ,   0.  , ...,  10.  ,   1.35,   2.  ],
       [ 74.  ,   0.  ,   1.  , ...,   4.  ,   3.7 ,   0.  ]])

In [15]:
# Standardise each feature column with a StandardScaler fitted on the full
# matrix, replacing X with the scaled values.
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)
X

array([[ 0.67648946, -0.32758048,  1.6170861 , ..., -0.60119509,
        -0.0856905 , -0.42793202],
       [ 0.14906505, -0.32758048,  1.6170861 , ..., -0.60119509,
         1.2411686 , -0.42793202],
       [ 0.9025285 , -0.32758048, -0.61839626, ...,  0.21153386,
         0.69715637, -1.1882185 ],
       ...,
       [-1.83505538, -0.32758048, -0.61839626, ...,  0.61789834,
         1.3871231 ,  0.33235445],
       [ 2.08295458,  3.05268496, -0.61839626, ...,  2.24335625,
        -1.87695028,  0.33235445],
       [-0.67974475, -0.32758048,  1.6170861 , ..., -0.19483061,
         1.2411686 , -1.1882185 ]])

In [19]:
# Sanity check: (number of samples, number of features).
X.shape

(3333, 17)

In [20]:
# Sanity check: list the distinct label values present in y.
np.unique(y)

array([0, 1])

# 模型的好坏判断

In [26]:
def run_cv(X,y,clf_class,**kwargs):
    """Return out-of-fold class predictions from 3-fold shuffled cross-validation.

    Parameters
    ----------
    X : array indexable by integer index arrays
        Feature matrix, one row per sample.
    y : numpy.ndarray
        Labels aligned with the rows of X.
    clf_class : callable
        Classifier factory; a fresh instance is created for every fold.
    **kwargs
        Forwarded unchanged to ``clf_class``.

    Returns
    -------
    numpy.ndarray
        Array shaped like ``y`` where each entry is the prediction made by the
        model that did not see that sample during fitting.
    """
    splitter = KFold(n_splits=3, shuffle=True)
    # Start from a copy of y so the result has the same shape and dtype; the
    # test folds together cover every index, so each slot gets overwritten.
    predictions = y.copy()
    for fit_idx, holdout_idx in splitter.split(X):
        model = clf_class(**kwargs)
        model.fit(X[fit_idx], y[fit_idx])
        predictions[holdout_idx] = model.predict(X[holdout_idx])
    return predictions

In [27]:
def accuracy(y_true,y_pred):
    """Return the fraction of positions at which y_true and y_pred agree."""
    # The element-wise comparison yields booleans; np.mean counts True as 1
    # and False as 0, so the mean is the share of matching entries.
    matches = y_true == y_pred
    return np.mean(matches)

In [28]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.linear_model import LogisticRegression as LR
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn.metrics import average_precision_score

In [29]:
# Cross-validated accuracy for every classifier with default settings.
# Each heading is printed first, followed by the accuracy to three decimals.
accuracy_runs = [
    ("Logistic Regression:", LR),
    ("Gradient Boosting Classifier", GBC),
    ("Support vector machines:", SVC),
    ("Random forest:", RF),
    ("K-nearest-neighbors:", KNN),
]
for heading, model_class in accuracy_runs:
    print(heading)
    print("%.3f" % accuracy(y, run_cv(X,y,model_class)))

Logistic Regression:


TypeError: 'KFold' object is not iterable

# 查准率和查全率

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

def draw_confusion_matrices(confusion_matricies,class_names):
    """Print and plot every labelled confusion matrix.

    Parameters
    ----------
    confusion_matricies : iterable of (str, array-like)
        Pairs of classifier name and confusion matrix. The parameter keeps its
        original spelling so existing keyword callers continue to work.
    class_names : numpy.ndarray
        Class labels used for the tick labels; ``.tolist()`` is called on it.

    Each matrix is printed and then shown in its own figure with a colour bar.
    """
    class_names = class_names.tolist()
    tick_positions = list(range(len(class_names)))
    # Iterate over the argument; the original looped over the module-level
    # `confusion_matrices` and therefore ignored whatever was passed in.
    for entry in confusion_matricies:
        classifier, cm = entry[0], entry[1]
        print(cm)

        fig = plt.figure()
        ax = fig.add_subplot(111)
        cax = ax.matshow(cm)
        plt.title('Confusion matrix for %s' % classifier)
        fig.colorbar(cax)
        # Pin one tick per class at the cell centres before labelling them,
        # instead of padding the labels with '' and relying on default ticks.
        ax.set_xticks(tick_positions)
        ax.set_yticks(tick_positions)
        ax.set_xticklabels(class_names)
        ax.set_yticklabels(class_names)
        plt.xlabel('Predicted')
        plt.ylabel('True')
        plt.show()

y = np.array(y)
class_names = np.unique(y)

confusion_matrices = [
    ( "Support Vector Machines", confusion_matrix(y,run_cv(X,y,SVC)) ),
    ( "Random Forest", confusion_matrix(y,run_cv(X,y,RF)) ),
    ( "K-Nearest-Neighbors", confusion_matrix(y,run_cv(X,y,KNN)) ),
    ( "Gradient Boosting Classifier", confusion_matrix(y,run_cv(X,y,GBC)) ),
    ( "Logisitic Regression", confusion_matrix(y,run_cv(X,y,LR)) )
]

# Pyplot code not included to reduce clutter
# from churn_display import draw_confusion_matrices
%matplotlib inline

draw_confusion_matrices(confusion_matrices,class_names)

# ROC 绘图 & AUC

In [None]:
from sklearn.metrics import roc_curve, auc
from scipy import interp

def plot_roc(X, y, clf_class, **kwargs):
    """Plot per-fold and mean ROC curves from 5-fold shuffled cross-validation.

    Parameters
    ----------
    X : numpy.ndarray
        Feature matrix, one row per sample.
    y : numpy.ndarray
        Binary labels aligned with the rows of X.
    clf_class : callable
        Classifier factory; instances must provide ``predict_proba``.
    **kwargs
        Forwarded unchanged to ``clf_class``.

    Draws one ROC curve per fold, the mean curve and the diagonal reference
    line, then shows the figure. Returns None.
    """
    n_folds = 5
    # Current scikit-learn API: the splitter takes n_splits and yields index
    # pairs from .split(X); the KFold object itself is neither iterable nor
    # sized, so the old `KFold(len(y), n_folds=5)` / `enumerate(kf)` /
    # `len(kf)` usage no longer works.
    kf = KFold(n_splits=n_folds, shuffle=True)
    y_prob = np.zeros((len(y),2))
    mean_fpr = np.linspace(0, 1, 100)
    mean_tpr = np.zeros_like(mean_fpr)
    for i, (train_index, test_index) in enumerate(kf.split(X)):
        X_train, X_test = X[train_index], X[test_index]
        y_train = y[train_index]
        clf = clf_class(**kwargs)
        clf.fit(X_train,y_train)
        # Predict per-class probabilities rather than hard class labels.
        y_prob[test_index] = clf.predict_proba(X_test)
        # Points of this fold's ROC curve, using the probability of class 1.
        fpr, tpr, _ = roc_curve(y[test_index], y_prob[test_index, 1])

        # Resample the fold's curve onto the shared FPR grid (1-D linear
        # interpolation). np.interp replaces scipy.interp, which was a
        # deprecated alias of it and has been removed from SciPy.
        mean_tpr += np.interp(mean_fpr, fpr, tpr)

        roc_auc = auc(fpr, tpr)  # area under this fold's curve
        plt.plot(fpr, tpr, lw=1, label='ROC fold %d (area = %0.2f)' % (i, roc_auc))
    # mean_tpr holds the sum over folds; divide to obtain the average curve
    # and pin its end points to (0, 0) and (1, 1).
    mean_tpr /= n_folds
    mean_tpr[0] = 0.0
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    plt.plot(mean_fpr, mean_tpr, 'k--',label='Mean ROC (area = %0.2f)' % mean_auc, lw=2)

    plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Random')
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    plt.show()

# ROC plots for every classifier that can output class probabilities.
# print() calls replace the Python 2 print statements, which are a
# SyntaxError under the Python 3 kernel the rest of the notebook targets.
print("Support vector machines:")
plot_roc(X,y,SVC,probability=True)  # SVC needs probability=True for predict_proba

print("Random forests:")
plot_roc(X,y,RF,n_estimators=18)

print("K-nearest-neighbors:")
plot_roc(X,y,KNN)

print("Gradient Boosting Classifier:")
plot_roc(X,y,GBC)

# 特征重要性
每个特征的重要性，也是帮助决策者决策的重要因素。在这里，我们将特征按照重要性排序；重要性的赋值准则来自决策树，前面的博客中已经提到，这里不再赘述。

In [None]:
# Split the row labels into a training part and a held-out part.
train_index,test_index = train_test_split(churn_df.index)

forest = RF()
forest_fit = forest.fit(X[train_index], y[train_index])
forest_predictions = forest_fit.predict(X[test_index])

# Rank ALL features by importance and only then keep the strongest ones.
# The original sliced the first 10 importances before sorting, so features
# beyond column 10 could never appear in the ranking.
importances = forest_fit.feature_importances_

# Spread of each feature's importance across the individual trees.
std = np.std([tree.feature_importances_ for tree in forest.estimators_],
             axis=0)
n_top = min(10, len(importances))
# Indices of the n_top most important features, most important first.
indices = np.argsort(importances)[::-1][:n_top]


# Print the feature ranking
print("Feature ranking:")

for f in range(n_top):
    # Look the name up through the sorted index; the original printed
    # features[f], pairing each importance with the wrong feature name.
    print("%d. %s (%f)" % (f + 1, features[indices[f]], importances[indices[f]]))

# Plot the feature importances of the forest
plt.figure()
plt.title("Feature importances")
plt.bar(range(n_top), importances[indices], yerr=std[indices], color="r", align="center")
# Label the bars with feature names instead of bare column indices.
plt.xticks(range(n_top), features[indices], rotation=90)
plt.xlim([-1, n_top])
plt.show()

# 输出概率
我们可以看到，随机森林算法预测89个人的流失概率为0.9，实际上该群体的流失率为0.97。

In [None]:
import warnings
warnings.filterwarnings('ignore')


def run_prob_cv(X, y, clf_class, **kwargs):
    """Return out-of-fold class probabilities from 5-fold shuffled CV.

    This helper is called below and in the calibration section but was not
    defined anywhere in the notebook, so those cells raised NameError.

    Parameters
    ----------
    X : numpy.ndarray
        Feature matrix, one row per sample.
    y : numpy.ndarray
        Binary labels aligned with the rows of X.
    clf_class : callable
        Classifier factory; instances must provide ``predict_proba``.
    **kwargs
        Forwarded unchanged to ``clf_class``.

    Returns
    -------
    numpy.ndarray
        Array of shape (len(y), 2); column 1 is the predicted probability of
        class 1 for each sample, produced by a model that did not see it.
    """
    kf = KFold(n_splits=5, shuffle=True)
    y_prob = np.zeros((len(y), 2))
    for train_index, test_index in kf.split(X):
        clf = clf_class(**kwargs)
        clf.fit(X[train_index], y[train_index])
        y_prob[test_index] = clf.predict_proba(X[test_index])
    return y_prob


# Use 10 estimators so predictions are all multiples of 0.1
pred_prob = run_prob_cv(X, y, RF, n_estimators=10)
pred_churn = pred_prob[:,1]
is_churn = y == 1

# Number of times a predicted probability is assigned to an observation.
# Series.value_counts replaces the deprecated top-level pd.value_counts.
counts = pd.Series(pred_churn).value_counts()
counts

In [None]:
from collections import defaultdict
# Observed churn rate among the customers that received each distinct
# predicted probability.
true_prob = pd.Series({
    prob: np.mean(is_churn[pred_churn == prob])
    for prob in counts.index
})

# Put the prediction counts and the observed rates side by side, one row per
# predicted probability.
counts = pd.concat([counts,true_prob], axis=1).reset_index()
counts.columns = ['pred_prob', 'count', 'true_prob']
counts

# 校准和分析
使用上面的DataFrame，我们可以绘制一个非常简单的图形来帮助可视化概率测量。x轴表示随机森林分配给一组个体的流失概率。 y轴是该组内的实际流失率。距离红线越远，预测的越不精准。

In [None]:
from churn_measurements import calibration, discrimination
from sklearn.metrics import roc_curve, auc
from scipy import interp
from __future__ import division 
from operator import idiv

def print_measurements(pred_prob):
    """Print the calibration error and discrimination of predicted probabilities.

    Parameters
    ----------
    pred_prob : numpy.ndarray
        2-D array whose column 1 is read as the predicted churn probability.

    The predictions are compared against the module-level labels ``y``
    (churn where ``y == 1``) using ``calibration`` and ``discrimination``
    imported from ``churn_measurements``. Returns None.
    """
    churn_prob, is_churn = pred_prob[:,1], y == 1
    # print() calls replace the Python 2 print statements, which do not parse
    # under Python 3.
    print("  %-20s %.4f" % ("Calibration Error", calibration(churn_prob, is_churn)))
    print("  %-20s %.4f" % ("Discrimination", discrimination(churn_prob,is_churn)))

    print("Note -- Lower calibration is better, higher discrimination is better")

# Calibration / discrimination report for each probabilistic classifier.
# print() calls replace the Python 2 print statements, which are a
# SyntaxError under Python 3.
print("Support vector machines:")
print_measurements(run_prob_cv(X,y,SVC,probability=True))

print("Random forests:")
print_measurements(run_prob_cv(X,y,RF,n_estimators=18))

print("K-nearest-neighbors:")
print_measurements(run_prob_cv(X,y,KNN))

print("Gradient Boosting Classifier:")
print_measurements(run_prob_cv(X,y,GBC))

# Same model as "Random forests" above, but with the default number of trees.
print("Random Forest:")
print_measurements(run_prob_cv(X,y,RF))