# 手寫數字分類問題（多元分類）

使用 Support Vector Machine Classifier 來解多元分類問題。

我們使用的資料集是 sklearn 內建的手寫數字資料集。

## Step1 載入資料集

In [1]:
# Load the handwritten digits dataset bundled with scikit-learn (sklearn.datasets)
from sklearn.datasets import load_digits
digits = load_digits()

In [2]:
# Display the loaded dataset object; the output below includes its 'data' and 'images' arrays
digits

 'data': array([[  0.,   0.,   5., ...,   0.,   0.,   0.],
        [  0.,   0.,   0., ...,  10.,   0.,   0.],
        [  0.,   0.,   0., ...,  16.,   9.,   0.],
        ..., 
        [  0.,   0.,   1., ...,   6.,   0.,   0.],
        [  0.,   0.,   2., ...,  12.,   0.,   0.],
        [  0.,   0.,  10., ...,  12.,   1.,   0.]]),
 'images': array([[[  0.,   0.,   5., ...,   1.,   0.,   0.],
         [  0.,   0.,  13., ...,  15.,   5.,   0.],
         [  0.,   3.,  15., ...,  11.,   8.,   0.],
         ..., 
         [  0.,   4.,  11., ...,  12.,   7.,   0.],
         [  0.,   2.,  14., ...,  12.,   0.,   0.],
         [  0.,   0.,   6., ...,   0.,   0.,   0.]],
 
        [[  0.,   0.,   0., ...,   5.,   0.,   0.],
         [  0.,   0.,   0., ...,   9.,   0.,   0.],
         [  0.,   0.,   3., ...,   6.,   0.,   0.],
         ..., 
         [  0.,   0.,   1., ...,   6.,   0.,   0.],
         [  0.,   0.,   1., ...,   6.,   0.,   0.],
         [  0.,   0.,   0., ...,  10.,   0.,   0.]],
 


In [3]:
# Inspect the dataset size and feature dimensionality
digits.data.shape

(1797, 64)

In [4]:
# Print sample 0 as a flat feature vector (one row of digits.data)
print(digits.data[0])

[  0.   0.   5.  13.   9.   1.   0.   0.   0.   0.  13.  15.  10.  15.   5.
   0.   0.   3.  15.   2.   0.  11.   8.   0.   0.   4.  12.   0.   0.   8.
   8.   0.   0.   5.   8.   0.   0.   9.   8.   0.   0.   4.  11.   0.   1.
  12.   7.   0.   0.   2.  14.   5.  10.  12.   0.   0.   0.   0.   6.  13.
  10.   0.   0.   0.]


In [5]:
# Print sample 0 from digits.images, where the same values are laid out as an 8x8 grid
print(digits.images[0])

[[  0.   0.   5.  13.   9.   1.   0.   0.]
 [  0.   0.  13.  15.  10.  15.   5.   0.]
 [  0.   3.  15.   2.   0.  11.   8.   0.]
 [  0.   4.  12.   0.   0.   8.   8.   0.]
 [  0.   5.   8.   0.   0.   9.   8.   0.]
 [  0.   4.  11.   0.   1.  12.   7.   0.]
 [  0.   2.  14.   5.  10.  12.   0.   0.]
 [  0.   0.   6.  13.  10.   0.   0.   0.]]


In [6]:
# Print the label of sample 0
print(digits.target[0])

0


In [7]:
# Print sample 23 as a flat feature vector
print(digits.data[23])

[  0.   1.   8.  12.  15.  14.   4.   0.   0.   3.  11.   8.   8.  12.  12.
   0.   0.   0.   0.   0.   2.  13.   7.   0.   0.   0.   0.   2.  15.  12.
   1.   0.   0.   0.   0.   0.  13.   5.   0.   0.   0.   0.   0.   0.   9.
  13.   0.   0.   0.   0.   7.   8.  14.  15.   0.   0.   0.   0.  14.  15.
  11.   2.   0.   0.]


In [8]:
# Print sample 23 from digits.images as an 8x8 grid
print(digits.images[23])

[[  0.   1.   8.  12.  15.  14.   4.   0.]
 [  0.   3.  11.   8.   8.  12.  12.   0.]
 [  0.   0.   0.   0.   2.  13.   7.   0.]
 [  0.   0.   0.   2.  15.  12.   1.   0.]
 [  0.   0.   0.   0.  13.   5.   0.   0.]
 [  0.   0.   0.   0.   9.  13.   0.   0.]
 [  0.   0.   7.   8.  14.  15.   0.   0.]
 [  0.   0.  14.  15.  11.   2.   0.   0.]]


In [9]:
# Print the label of sample 23
print(digits.target[23])

3


In [10]:
# Import train_test_split, used to split the data into training and test sets.
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in 0.20,
# so prefer sklearn.model_selection and fall back only on old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split



In [11]:
# Randomly use 75% of the samples for training and hold out the remaining 25% for testing
X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, test_size=0.25, random_state=33)

In [12]:
# Display the training feature matrix (not yet standardized at this point)
X_train

array([[  0.,   0.,   1., ...,   0.,   0.,   0.],
       [  0.,   0.,   4., ...,   0.,   0.,   0.],
       [  0.,   0.,   0., ...,   0.,   0.,   0.],
       ..., 
       [  0.,   0.,   4., ...,   0.,   0.,   0.],
       [  0.,   0.,  12., ...,   0.,   0.,   0.],
       [  0.,   1.,   7., ...,   0.,   0.,   0.]])

In [13]:
# Check the sizes of the training and test label arrays
print(y_train.shape)
print(y_test.shape)

(1347,)
(450,)


## Step2 資料前置處理（標準化）

In [14]:
# Import the standardization class from sklearn.preprocessing
from sklearn.preprocessing import StandardScaler

In [15]:
# Standardize the training and test features
# Keep a reference to the unscaled test set so raw pixel values can be inspected later
original_X_test = X_test
ss = StandardScaler()
# Fit the scaler on the training data only, then apply that same fitted transform to the test data
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

In [16]:
# Display the standardized training features
X_train

array([[ 0.        , -0.32672314, -0.87554711, ..., -1.11540424,
        -0.49709493, -0.19054741],
       [ 0.        , -0.32672314, -0.2423893 , ..., -1.11540424,
        -0.49709493, -0.19054741],
       [ 0.        , -0.32672314, -1.08659972, ..., -1.11540424,
        -0.49709493, -0.19054741],
       ..., 
       [ 0.        , -0.32672314, -0.2423893 , ..., -1.11540424,
        -0.49709493, -0.19054741],
       [ 0.        , -0.32672314,  1.44603155, ..., -1.11540424,
        -0.49709493, -0.19054741],
       [ 0.        ,  0.75459398,  0.39076852, ..., -1.11540424,
        -0.49709493, -0.19054741]])

## Step3 訓練（使用 Linear Support Vector Machine）

In [17]:
# Use the linear support vector machine classifier
from sklearn.svm import LinearSVC

In [18]:
# Initialize the classifier with its default parameters
lsvc = LinearSVC()

In [19]:
# Train the model on the standardized training data
lsvc.fit(X_train, y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

## Step4 預測與評估

In [20]:
# Use the trained model to predict labels for the test samples
y_predict = lsvc.predict(X_test)

In [21]:
# Evaluate accuracy on the test set with the model's score method
# NOTE(review): the recorded output below is a tuple, which suggests this ran with Python 2 print semantics — confirm
print('The Accuracy of Linear SVC is', lsvc.score(X_test, y_test))

('The Accuracy of Linear SVC is', 0.95333333333333337)


In [22]:
# Report per-class precision, recall and F1 with classification_report
from sklearn.metrics import classification_report
print(classification_report(y_test, y_predict, target_names=digits.target_names.astype(str)))

             precision    recall  f1-score   support

          0       0.92      1.00      0.96        35
          1       0.96      0.98      0.97        54
          2       0.98      1.00      0.99        44
          3       0.93      0.93      0.93        46
          4       0.97      1.00      0.99        35
          5       0.94      0.94      0.94        48
          6       0.96      0.98      0.97        51
          7       0.92      1.00      0.96        35
          8       0.98      0.84      0.91        58
          9       0.95      0.91      0.93        44

avg / total       0.95      0.95      0.95       450



In [23]:
# Print the predicted label for every test sample
print(y_predict)

[1 3 7 3 2 4 6 1 4 0 4 7 9 5 2 8 3 6 7 0 6 0 8 3 0 6 2 3 0 9 0 2 0 6 9 1 1
 5 8 0 6 1 5 8 9 5 1 6 2 6 6 7 6 7 7 2 7 8 0 7 3 6 3 9 6 6 5 5 4 2 9 3 7 6
 5 7 2 8 1 2 2 8 1 1 6 3 5 0 0 1 6 7 6 8 9 7 0 0 9 8 0 8 2 3 6 1 9 9 1 7 3
 9 8 8 5 9 5 1 1 7 9 3 3 2 8 1 3 8 6 4 0 0 0 7 1 5 5 1 8 5 1 8 1 6 9 9 4 5
 7 5 2 1 2 5 8 7 7 5 1 9 6 9 8 0 6 1 2 1 5 7 8 9 6 8 4 1 0 0 9 8 7 2 8 6 4
 8 9 4 2 6 1 8 5 6 7 5 1 9 2 8 3 2 9 4 3 5 5 6 2 4 3 2 6 4 8 5 8 0 8 8 6 3
 2 3 0 5 7 1 3 9 3 2 1 6 6 5 1 9 7 2 4 5 2 1 3 1 1 2 1 7 0 1 2 2 1 2 4 9 6
 6 3 9 2 8 1 5 5 1 8 6 2 5 6 0 1 4 2 1 8 9 4 3 0 6 8 3 3 2 0 2 0 6 5 6 6 4
 6 1 8 3 4 1 3 5 1 4 9 8 7 5 1 1 3 7 8 8 3 7 4 0 7 2 8 7 1 9 4 5 3 5 2 5 1
 3 0 5 8 4 7 6 9 9 3 3 4 0 6 4 7 0 6 1 2 3 3 4 5 3 3 5 2 0 9 7 1 5 5 8 4 4
 3 6 2 5 1 0 6 1 5 8 4 7 6 4 3 4 0 3 0 1 2 8 0 5 4 5 2 2 9 6 9 8 0 8 8 2 4
 6 5 6 4 3 9 8 9 7 1 7 9 4 1 9 9 5 9 8 0 8 2 5 1 4 2 6 3 7 9 3 7 4 3 7 1 8
 8 9 5 3 6 6]


In [24]:
# Print the true label for every test sample, for comparison with the predictions
print(y_test)

[1 3 7 3 2 4 6 1 4 0 4 7 8 9 2 8 3 6 7 0 6 0 8 3 0 6 2 3 0 9 0 2 0 6 9 1 1
 5 8 0 6 1 5 8 9 5 1 6 2 6 6 7 6 7 7 2 7 8 0 7 3 6 3 9 6 6 5 5 4 2 9 3 7 6
 5 7 2 8 1 2 2 8 1 1 6 3 5 0 0 1 6 7 5 8 9 7 0 0 9 8 0 8 2 3 6 1 9 9 1 7 8
 9 8 8 5 9 5 1 1 9 9 3 3 2 8 1 3 8 6 4 0 0 0 7 1 5 5 1 8 5 1 8 8 6 9 9 4 5
 7 5 2 1 2 3 8 7 7 5 1 9 1 9 8 0 6 1 2 1 3 3 8 9 6 8 4 1 0 0 9 8 7 2 8 6 4
 8 9 4 2 6 1 8 5 6 7 5 1 9 2 8 3 2 9 4 8 5 5 6 2 4 3 2 6 4 8 5 8 0 8 8 6 3
 2 3 0 5 7 1 3 9 3 2 1 6 6 5 1 9 7 2 4 5 2 1 3 1 1 2 1 7 0 1 2 2 1 2 4 9 6
 6 3 9 2 8 1 5 5 1 8 6 2 5 6 0 1 4 2 1 8 9 4 3 0 6 8 3 3 2 0 2 5 6 5 6 6 4
 6 1 8 3 4 1 3 5 1 4 9 8 7 5 1 1 3 7 8 8 3 7 4 0 7 2 9 7 1 9 4 5 3 5 2 5 1
 3 0 5 8 4 7 6 9 9 3 3 4 8 6 4 7 0 6 8 2 3 3 4 5 3 3 5 2 0 9 7 1 5 5 8 4 4
 3 6 2 5 1 0 6 1 5 8 4 9 6 4 3 8 0 3 0 1 2 8 0 5 4 5 2 8 9 6 9 8 0 8 8 2 4
 6 5 6 4 3 9 8 9 7 1 7 9 4 1 5 9 5 9 8 6 8 2 5 1 4 2 6 3 7 9 8 7 4 3 7 1 8
 8 9 5 3 6 6]


In [25]:
# List the test-set positions where the prediction disagrees with the true label
mismatched = []
for position, (predicted, actual) in enumerate(zip(y_predict, y_test)):
    if predicted != actual:
        mismatched.append(position)
print(mismatched)

[12, 13, 92, 110, 119, 142, 153, 160, 168, 169, 204, 290, 322, 345, 351, 381, 385, 397, 421, 426, 437]


In [26]:
# Compare the predicted label and the true label for test sample 12 (a misclassified sample)
print(y_predict[12])
print(y_test[12])

9
8


In [27]:
# Print the unscaled pixel values of test sample 12
print(original_X_test[12])

[  0.   0.   0.   8.  15.   4.   0.   0.   0.   0.   3.  16.  10.  11.   0.
   0.   0.   0.   6.  12.  11.  13.   0.   0.   0.   0.  10.  16.  16.   9.
   0.   0.   0.   1.  16.  12.  11.   5.   0.   0.   0.   2.  13.   0.   2.
   9.   0.   0.   0.   0.   8.   6.   2.  12.   0.   0.   0.   0.   1.   9.
  14.   9.   0.   0.]


In [28]:
import numpy as np

# View the flat sample as rows of 8 pixels; -1 lets NumPy infer the number of rows
raw_sample = original_X_test[12]
test_image = raw_sample.reshape(-1, 8)
print(test_image)

[[  0.   0.   0.   8.  15.   4.   0.   0.]
 [  0.   0.   3.  16.  10.  11.   0.   0.]
 [  0.   0.   6.  12.  11.  13.   0.   0.]
 [  0.   0.  10.  16.  16.   9.   0.   0.]
 [  0.   1.  16.  12.  11.   5.   0.   0.]
 [  0.   2.  13.   0.   2.   9.   0.   0.]
 [  0.   0.   8.   6.   2.  12.   0.   0.]
 [  0.   0.   1.   9.  14.   9.   0.   0.]]
