# 乳癌資料庫預測SVM分類
>使用scikit-learn 機器學習套件裡的SVR演算法

* (一)引入函式庫及內建乳癌資料集<br>
引入之函式庫如下<br>
sklearn.datasets: 用來匯入內建之乳癌資料集`datasets.load_breast_cancer()`<br>
sklearn.SVR: 支持向量機回歸分析之演算法<br>
matplotlib.pyplot: 用來繪製影像

In [1]:
from sklearn import svm
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

## Step1. 下載資料

In [2]:
breast_cancer=datasets.load_breast_cancer()

In [3]:
breast_cancer.keys()

dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names', 'filename'])

In [4]:
#import pandas as pd
#x = pd.DataFrame(breast_cancer['data'],columns=breast_cancer['feature_names'])
#y = pd.DataFrame(breast_cancer['target'], columns=['target'])

In [5]:
x = breast_cancer.data
x

array([[1.799e+01, 1.038e+01, 1.228e+02, ..., 2.654e-01, 4.601e-01,
        1.189e-01],
       [2.057e+01, 1.777e+01, 1.329e+02, ..., 1.860e-01, 2.750e-01,
        8.902e-02],
       [1.969e+01, 2.125e+01, 1.300e+02, ..., 2.430e-01, 3.613e-01,
        8.758e-02],
       ...,
       [1.660e+01, 2.808e+01, 1.083e+02, ..., 1.418e-01, 2.218e-01,
        7.820e-02],
       [2.060e+01, 2.933e+01, 1.401e+02, ..., 2.650e-01, 4.087e-01,
        1.240e-01],
       [7.760e+00, 2.454e+01, 4.792e+01, ..., 0.000e+00, 2.871e-01,
        7.039e-02]])

In [6]:
y = breast_cancer.target
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
       1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0,
       0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0,

## Step2. 區分訓練集與測試集

In [7]:
x_train, x_test, y_train, y_test = train_test_split(x, y,test_size=0.3)

In [8]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler().fit(x_train)  ##注意StandardScaler要()
x_train_std = sc.transform(x_train)
sc = StandardScaler().fit(x_test)
x_test_std = sc.transform(x_test)

## Step3. 建模

In [9]:
clf = svm.SVR(kernel='rbf')  ##用SVC比較好
clf.fit(x_train_std,y_train)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
    gamma='auto_deprecated', kernel='rbf', max_iter=-1, shrinking=True,
    tol=0.001, verbose=False)

#     Step4. 預測

```

```


In [10]:
z = clf.predict(x_test_std)
z

array([-0.12215891,  0.84259084,  0.01895324,  0.43486697,  0.0412532 ,
        1.17917916,  0.98757649,  0.06487461, -0.0922226 ,  0.9937608 ,
        0.47019496,  0.11436334,  0.05272854,  0.08088235, -0.00680543,
       -0.02776479,  0.37367068, -0.10548349,  0.81966947,  0.72816267,
        0.42516743,  1.02949774,  0.21901623,  0.95554547,  0.74241076,
        0.79286348,  0.85357616,  0.84770233,  0.20044652,  0.72186472,
        1.02721673,  1.0357762 ,  0.97330306, -0.02582613,  0.12609224,
        0.21915449,  0.94658898,  1.07744018,  0.23390762,  0.02248895,
       -0.11800235,  1.04958691,  0.93550993,  0.8407842 ,  0.66788731,
        0.80874127,  0.05464382, -0.13549951,  0.12531253,  0.97779389,
        1.03785159,  0.96009034,  0.72797359,  0.95796014,  1.02900504,
        0.21428917, -0.08407123,  1.08547372,  1.07789585,  0.90112822,
       -0.06644495,  0.11444683,  0.98505617,  1.18362984,  0.98264892,
       -0.03375012,  1.03364447,  0.91158905,  0.80411284,  0.99

In [None]:
zm = 

In [11]:
#import numpy as np
#def sigmoid(z):
 #   return 1 / ( 1  + np.exp(-z) )
    
#prob=sigmoid(z)
#predictions = np.round(prob).astype('int32')
#predictions

array([0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1])

In [12]:
y_test

array([0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1,
       1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1])

In [13]:
#print(predictions.shape)
print(y_test.shape)

(171,)
(171,)


## Step5. 準確度分析

In [18]:
#accuracy = np.mean(predictions== y_test)  ##直接算出比例
#print('accuracy(standarization):',accuracy)  ##偏低 應該要0.98多 sigmoid不適合

accuracy(standarization): 0.7719298245614035


In [15]:
clf.score(x_train_std,y_train) ##偏低 應該要0.98多

0.9212076537413262

In [16]:
clf.score(x_test_std,y_test)

0.8384319098399241