In [42]:
import numpy
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin

# 製作一個簡單的高斯分類器

In [43]:
class myGaussianClassifier(ClassifierMixin, BaseEstimator):
    """Gaussian classifier with one full covariance matrix per class.

    Each class c is modelled by a multivariate normal N(mean_c, cov_c) and a
    sample x is assigned to the class with the largest discriminant

        g_c(x) = -0.5 * log|cov_c|
                 - 0.5 * (x - mean_c)^T cov_c^{-1} (x - mean_c)
                 + log P(c)

    Parameters
    ----------
    alpha : float, default=1e-5
        Ridge term added to the diagonal of every class covariance matrix so
        that it stays invertible.

    Attributes (set by ``fit``)
    ---------------------------
    classes_ : ndarray of shape (c_,)
        Sorted unique class labels seen during ``fit``.
    c_ : int
        Number of classes.
    d_ : int
        Number of features.
    prior_ : ndarray of shape (c_,)
        Class priors P(c), estimated as relative class frequencies.
    mean_ : ndarray of shape (c_, d_)
        Per-class feature means.
    cov_ : ndarray of shape (c_, d_, d_)
        Per-class regularised covariance matrices.

    Notes
    -----
    The mixin is listed before ``BaseEstimator`` because that is the base
    class order recommended by the scikit-learn developer guide.
    """

    def __init__(self, alpha=1.e-5):
        # scikit-learn convention: __init__ only stores its keyword arguments,
        # unmodified, so that get_params / set_params / clone keep working.
        self.alpha = alpha

    def fit(self, train, target):
        """Estimate the prior, mean and covariance of every class.

        Parameters
        ----------
        train : array-like of shape (N, d)
            Training samples, one per row.
        target : array-like with N elements
            Class label of every training sample.

        Returns
        -------
        self : myGaussianClassifier
            The fitted estimator.

        Raises
        ------
        ValueError
            If ``train`` is not a non-empty 2-D array or if ``target`` does
            not contain exactly one label per row of ``train``.
        """
        train = np.asarray(train, dtype=float)
        target = np.asarray(target).ravel()
        if train.ndim != 2 or train.shape[0] == 0:
            raise ValueError("train must be a non-empty 2-D array of shape (N, d)")
        N, d = train.shape  # N: number of samples, d: number of features
        if target.shape[0] != N:
            raise ValueError("target must hold exactly one label per training sample")

        # np.unique returns the distinct labels already sorted.
        self.classes_ = np.unique(target)
        self.c_ = self.classes_.size
        self.d_ = d
        self.prior_ = np.zeros((self.c_,))
        self.mean_ = np.zeros((self.c_, self.d_))
        self.cov_ = np.zeros((self.c_, self.d_, self.d_))

        ridge = self.alpha * np.eye(d)
        for cid, label in enumerate(self.classes_):
            members = train[target == label]  # rows belonging to this class
            n_members = members.shape[0]
            self.mean_[cid] = members.mean(axis=0)
            if n_members > 1:
                # reshape keeps the d == 1 case (np.cov returns a 0-d array) 2-D
                scatter = np.cov(members, rowvar=False).reshape(d, d)
            else:
                # A single sample has no defined covariance (np.cov would give
                # NaN); fall back to the ridge term alone.
                scatter = np.zeros((d, d))
            self.cov_[cid] = scatter + ridge
            self.prior_[cid] = float(n_members) / N
        return self

    def discriminant_score(self, X, class_id):
        """Evaluate the Gaussian discriminant of one class.

        Parameters
        ----------
        X : array-like of shape (d,) or (n, d)
            A single sample or a batch of samples.
        class_id : int
            Index of the class in ``classes_`` (not the label itself).

        Returns
        -------
        float or ndarray of shape (n,)
            A scalar for a single sample, one score per row for a batch.
        """
        X = np.asarray(X, dtype=float)
        centered = np.atleast_2d(X) - self.mean_[class_id]
        cov = self.cov_[class_id]
        # slogdet avoids the under/overflow of log(det(cov)) when d is large.
        _, log_det = np.linalg.slogdet(cov)
        # Solve cov @ Z = centered^T instead of forming the explicit inverse.
        solved = np.linalg.solve(cov, centered.T)
        mahalanobis = np.einsum('ij,ji->i', centered, solved)
        scores = -0.5 * log_det - 0.5 * mahalanobis + np.log(self.prior_[class_id])
        return scores[0] if X.ndim == 1 else scores

    def _as_batch(self, X):
        """Return ``X`` as a validated 2-D float array plus a single-sample flag."""
        X = np.asarray(X, dtype=float)
        batch = np.atleast_2d(X)
        if batch.ndim != 2 or batch.shape[1] != self.d_:
            raise ValueError(
                "X must have shape (%d,) or (n, %d), got %s" % (self.d_, self.d_, X.shape)
            )
        return batch, X.ndim == 1

    def _discriminant_matrix(self, batch):
        """Return the (n, c_) matrix of discriminant scores for a 2-D batch."""
        scores = np.empty((batch.shape[0], self.c_))
        for cid in range(self.c_):
            scores[:, cid] = self.discriminant_score(batch, cid)
        return scores

    def predict(self, X, y=None):
        """Predict the class label of every sample.

        Parameters
        ----------
        X : array-like of shape (d,) or (n, d)
            A single sample or a batch of samples.
        y : ignored
            Accepted only for signature compatibility.

        Returns
        -------
        label or ndarray of shape (n,)
            The predicted label(s), taken from ``classes_``. A single sample
            yields a single label. Ties go to the first class in sorted order.
        """
        batch, single = self._as_batch(X)
        winners = np.argmax(self._discriminant_matrix(batch), axis=1)
        # Map the winning class index back to the original label.
        labels = self.classes_[winners]
        return labels[0] if single else labels

    def predict_proba(self, X, y=None):
        """Predict the posterior probability of every class.

        Parameters
        ----------
        X : array-like of shape (d,) or (n, d)
            A single sample or a batch of samples.
        y : ignored
            Accepted only for signature compatibility.

        Returns
        -------
        ndarray of shape (c_,) or (n, c_)
            Probabilities ordered like ``classes_``; every row sums to 1.
        """
        batch, single = self._as_batch(X)
        scores = self._discriminant_matrix(batch)
        # Shift by the row maximum before exponentiating so that very negative
        # log scores do not all underflow to 0 (which would produce 0/0).
        scores -= scores.max(axis=1, keepdims=True)
        proba = np.exp(scores)
        proba /= proba.sum(axis=1, keepdims=True)
        return proba[0] if single else proba

    def score(self, X, y):
        """Return the classification accuracy on ``(X, y)``.

        Parameters
        ----------
        X : array-like of shape (n, d)
            Samples to classify.
        y : array-like with n elements
            True labels.

        Returns
        -------
        float
            Fraction of samples whose predicted label equals the true label.

        Raises
        ------
        ValueError
            If ``y`` is empty or its length differs from the number of samples.
        """
        y = np.asarray(y).ravel()
        if y.size == 0:
            raise ValueError("score() needs at least one labelled sample")
        predicted = np.atleast_1d(self.predict(X))
        if predicted.shape[0] != y.shape[0]:
            raise ValueError("X and y must contain the same number of samples")
        return float(np.mean(predicted == y))


# 比較分類器

In [44]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score
from sklearn import neighbors, svm, naive_bayes
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
import sklearn.datasets as ds
import numpy as np

# Load the breast-cancer dataset from sklearn.datasets; return_X_y=True is requested so the
# call result is unpacked directly into the feature array and the label array.
data,target    = ds.load_breast_cancer(return_X_y=True)

In [45]:

# Instantiate the four classifiers that will be compared
gauss_clf = myGaussianClassifier()
knn_clf = neighbors.KNeighborsClassifier(
    n_neighbors=3,
    weights='uniform',
    algorithm='kd_tree',
    leaf_size=30,
)
svm_clf = svm.SVC(kernel='linear', C=1, probability=True)
gaussnb_clf = naive_bayes.GaussianNB()

In [46]:
# Fit the Gaussian classifier (default alpha) on the entire dataset
gauss_clf.fit(data,target)


myGaussianClassifier()

In [47]:
# Accuracy on the same data the model was fitted on (resubstitution accuracy),
# so this is an optimistic sanity check rather than a generalisation estimate
gauss_clf.score(data,target)

0.9648506151142355

In [48]:
# Tune alpha for the Gaussian classifier with a 5-fold grid search on the full
# dataset, then report the accuracy of the refitted search on that same data
gauss_clf_param = {'alpha': [0.001, 0.01, 0.1, 1, 10, 100]}
gauss_gs = GridSearchCV(
    estimator=gauss_clf,
    param_grid=gauss_clf_param,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
gauss_gs.fit(data, target)
gauss_gs.score(data, target)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


0.9595782073813708

In [49]:
# Hyper-parameters to tune and their candidate values
knn_clf_param = {'n_neighbors': [1, 3, 5, 7]}
svm_clf_param = {'C': [0.01, 0.1, 1, 10]}
gauss_clf_param = {'alpha': [0.001, 0.01, 0.1, 1, 10, 100]}
gaussnb_clf_param = {'var_smoothing': np.logspace(-5, 2, 6)}

# Inner cross-validation for hyper-parameter tuning; every grid search shares
# the same settings.
# n_jobs=-1 may be buggy on Windows; if so, change it to n_jobs=1.
inner_cv_options = dict(scoring='accuracy', cv=5, n_jobs=-1, verbose=1)
gauss_gs = GridSearchCV(estimator=gauss_clf, param_grid=gauss_clf_param, **inner_cv_options)
knn_gs = GridSearchCV(estimator=knn_clf, param_grid=knn_clf_param, **inner_cv_options)
svm_gs = GridSearchCV(estimator=svm_clf, param_grid=svm_clf_param, **inner_cv_options)
# Only the SVM gets min-max scaling; the scaler runs before the grid search.
svm_pipeline = Pipeline([('scaler', MinMaxScaler()), ('svm_gs', svm_gs)])
gaussnb_gs = GridSearchCV(estimator=gaussnb_clf, param_grid=gaussnb_clf_param, **inner_cv_options)

# Outer cross-validation for estimating the accuracy of each classifier.
# The classifiers being compared must be evaluated on the same k-fold split,
# hence the single shared splitter with a fixed random_state.
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=3)

# cross_val_score is called without n_jobs here; if n_jobs=-1 is added and
# misbehaves on Windows, use n_jobs=1.
outer_cv_options = dict(scoring='accuracy', cv=kfold, verbose=10)
gauss_scores = cross_val_score(gauss_gs, data, target, **outer_cv_options)
knn_scores = cross_val_score(knn_gs, data, target, **outer_cv_options)
svm_scores = cross_val_score(svm_pipeline, data, target, **outer_cv_options)
gaussnb_scores = cross_val_score(gaussnb_gs, data, target, **outer_cv_options)

# Students: continue from here and finish the comparison.
# Apply the paired t-test (refer to the slides for Chapter 20, "Design and
# Analysis of Machine Learning Experiments").

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s


[CV] START .....................................................................
Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] END ................................ score: (test=0.912) total time=   0.1s
[CV] START .....................................................................
Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] END ................................ score: (test=0.947) total time=   0.1s
[CV] START .....................................................................
Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] END ................................ score: (test=0.965) total time=   0.1s
[CV] START .....................................................................
Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.3s remaining:    0.0s


[CV] END ................................ score: (test=0.947) total time=   0.1s
[CV] START .....................................................................
Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] END ................................ score: (test=0.982) total time=   0.1s
[CV] START .....................................................................
Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.5s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.6s remaining:    0.0s


[CV] END ................................ score: (test=0.930) total time=   0.1s
[CV] START .....................................................................
Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] END ................................ score: (test=0.982) total time=   0.1s
[CV] START .....................................................................
Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.7s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.8s remaining:    0.0s


[CV] END ................................ score: (test=0.965) total time=   0.1s
[CV] START .....................................................................
Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] END ................................ score: (test=0.982) total time=   0.1s
[CV] START .....................................................................
Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    0.9s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    1.1s remaining:    0.0s


[CV] END ................................ score: (test=0.946) total time=   0.1s
[CV] START .....................................................................
Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV] END ................................ score: (test=0.895) total time=   0.1s
[CV] START .....................................................................
Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV] END ................................ score: (test=0.860) total time=   0.0s
[CV] START .....................................................................
Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV] END ................................ score: (test=0.965) total time=   0.0s
[CV] START .....................................................................
Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV] END ................................ score: (test=0.947) total time=   0.0s
[CV] START ....................

[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    1.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    0.2s remaining:    0.0s


[CV] END ................................ score: (test=0.982) total time=   0.0s
[CV] START .....................................................................
Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV] END ................................ score: (test=0.893) total time=   0.0s
[CV] START .....................................................................
Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV] END ................................ score: (test=0.982) total time=   0.0s
[CV] START .....................................................................
Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV] END ................................ score: (test=1.000) total time=   0.0s
[CV] START .....................................................................
Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV] END ................................ score: (test=0.982) total time=   0.0s
[CV] START ....................

[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.1s remaining:    0.0s


[CV] END ................................ score: (test=0.965) total time=   0.0s
[CV] START .....................................................................
Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV] END ................................ score: (test=1.000) total time=   0.0s
[CV] START .....................................................................
Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV] END ................................ score: (test=0.965) total time=   0.0s
[CV] START .....................................................................
Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV] END ................................ score: (test=1.000) total time=   0.0s
[CV] START .....................................................................
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.3s remaining:    0.0s


[CV] END ................................ score: (test=0.947) total time=   0.1s
[CV] START .....................................................................
Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV] END ................................ score: (test=0.982) total time=   0.0s
[CV] START .....................................................................
Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV] END ................................ score: (test=1.000) total time=   0.0s
[CV] START .....................................................................
Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] END ................................ score: (test=0.842) total time=   0.0s
[CV] START .....................................................................
Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] END ................................ score: (test=0.912) total time=   0.0s
[CV] START ....................

[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    0.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    0.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.5s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    0.1s remaining:    0.0s


[CV] END ................................ score: (test=0.912) total time=   0.0s
[CV] START .....................................................................
Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] END ................................ score: (test=0.911) total time=   0.0s


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.1s finished


In [50]:
# Report mean ± standard deviation of the outer-fold accuracies per classifier
for clf_label, fold_scores in (
    ("myGaussianClassifier: ", gauss_scores),
    ("SVC with linear kernel: ", svm_scores),
    ("kNN: ", knn_scores),
    ("GaussianNB: ", gaussnb_scores),
):
    print(clf_label, fold_scores.mean(), "±", fold_scores.std())

myGaussianClassifier:  0.9560463659147869 ± 0.02257381062954153
SVC with linear kernel:  0.982456140350877 ± 0.017543859649122816
kNN:  0.9331453634085213 ± 0.038469392087079425
GaussianNB:  0.9244047619047618 ± 0.03430359573514195


# p value

In [52]:
from scipy import stats
# Paired t-test of the SVM against each other classifier: a one-sample t-test
# on the per-fold accuracy differences against a mean difference of 0
for question, rival_scores in (
    ("svm = gauss? pValue = ", gauss_scores),
    ("svm = knn?   pValue = ", knn_scores),
    ("svm = gaussnb? pValue = ", gaussnb_scores),
):
    print(question, stats.ttest_1samp(svm_scores - rival_scores, 0).pvalue)

svm = gauss? pValue =  0.01189593702700636
svm = knn?   pValue =  0.009607662177057304
svm = gaussnb? pValue =  0.0018882146755463925


## ans
1.  程式
2.
    myGaussianClassifier: 0.9560463659147869
    SVC with linear kernel: 0.982456140350877
    kNN: 0.9331453634085213
    GaussianNB: 0.9244047619047618
3.
    準確率最高的是 SVM（linear kernel），10-fold cross-validation 的 accuracy 是 0.982456140350877 ± 0.017543859649122816（平均 ± 標準差）

    svm和myGaussianClassifier比， pValue =  0.01189593702700636
    svm和kNN比， pValue =  0.009607662177057304
    svm 和GaussianNB比， pValue =  0.0018882146755463925

    SVM和其他三者比較，pValue都小於顯著水準0.05，
    代表我們有足夠的信心能推翻null hypothesis（兩個分類器的平均準確率相同），
    也代表SVM和其他三個分類器相比準確度有顯著差異；由於SVM的平均準確率最高，可視為顯著優於其他三者。



