linear: u'*v

polynomial: (gamma*u'*v + coef0)^degree

radial basis function: exp(-gamma*|u-v|^2)

sigmoid: tanh(gamma*u'*v + coef0)

In [1]:
# Nạp các gói thư viện cần thiết
import numpy as np
import pandas as pd
from sklearn import svm
# Đọc dữ liệu iris từ UCI (https://archive.ics.uci.edu/ml/datasets/Iris) 
# hoặc từ thư viện scikit-learn
# Tham khảo https://scikit-learn.org/stable/auto_examples/datasets/plot_iris_dataset.html
from sklearn import datasets
from sklearn.model_selection import train_test_split, cross_val_score
iris = datasets.load_iris()
columns=["Petal length","Petal Width","Sepal Length","Sepal Width"]
df = pd.DataFrame(iris.data, columns=columns)
y = iris.target
print(df.describe())
print("\n")
print("Kiem tra xem du lieu co bi thieu (NULL) khong?") 
print(df.isnull().sum())
# Sử dụng nghi thức kiểm tra hold-out
# Chia dữ liệu ngẫu nhiên thành 2 tập dữ liệu con:
# training set và test set theo tỷ lệ 70/30
X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.3)
# Xây dựng mô hình svm sử dụng hàm nhân (kernel) là RBF 
# SVC là viết tắt của từ Support Vector Classification 
model = svm.SVC(kernel='rbf')
model.fit(X_train, y_train)
# Dự đoán nhãn tập kiểm tra 
prediction = model.predict(X_test) 
# print(prediction)
# Tính độ chính xác
print("Do chinh xác cua mo hinh voi nghi thuc kiem tra hold-out: %.3f" % model.score(X_test, y_test))

       Petal length  Petal Width  Sepal Length  Sepal Width
count    150.000000   150.000000    150.000000   150.000000
mean       5.843333     3.057333      3.758000     1.199333
std        0.828066     0.435866      1.765298     0.762238
min        4.300000     2.000000      1.000000     0.100000
25%        5.100000     2.800000      1.600000     0.300000
50%        5.800000     3.000000      4.350000     1.300000
75%        6.400000     3.300000      5.100000     1.800000
max        7.900000     4.400000      6.900000     2.500000


Kiem tra xem du lieu co bi thieu (NULL) khong?
Petal length    0
Petal Width     0
Sepal Length    0
Sepal Width     0
dtype: int64
Do chinh xác cua mo hinh voi nghi thuc kiem tra hold-out: 0.956


In [2]:
def load_datasets(name):
    if name == 'Iris':
        data = datasets.load_iris()
    elif name == 'Breast Cancer':
        data = datasets.load_breast_cancer()
    elif name == 'Wine':
        data = datasets.load_wine()
    elif name == 'Handwritten Digits':
        data = datasets.load_digits()
    df = pd.DataFrame(data.data)
    y = data.target
    return df, y

In [3]:
def score_dataset(dataset_name, model, X, y, cv=None):
    if cv:
        scores = cross_val_score(model, df, y, cv=cv)
        print("%s %2d-folds = %.3f" % (dataset_name, cv, np.mean(scores)))
    else:
        for nFold in range(2, 10+1):
            scores = cross_val_score(model, df, y, cv=nFold)
            print("%s %2d-folds = %.3f" % (dataset_name, nFold, np.mean(scores)))

In [4]:
score_dataset('Iris', model, df, y, cv=5)

Iris  5-folds = 0.967


In [5]:
# Breast Cancer with 'Linear' kernel
dataset_name = 'Breast Cancer'
df, y = load_datasets(dataset_name)
model = svm.SVC(kernel='linear')
score_dataset(dataset_name, model, df, y)

Breast Cancer  2-folds = 0.938
Breast Cancer  3-folds = 0.951
Breast Cancer  4-folds = 0.944
Breast Cancer  5-folds = 0.946
Breast Cancer  6-folds = 0.949
Breast Cancer  7-folds = 0.953
Breast Cancer  8-folds = 0.951
Breast Cancer  9-folds = 0.946
Breast Cancer 10-folds = 0.954


In [6]:
# Breast Cancer with 'Polynomial' kernel
dataset_name = 'Breast Cancer'
df, y = load_datasets(dataset_name)
model = svm.SVC(kernel='poly', gamma=0.001, degree=2, coef0=2)
score_dataset(dataset_name, model, df, y)

Breast Cancer  2-folds = 0.935
Breast Cancer  3-folds = 0.945
Breast Cancer  4-folds = 0.944
Breast Cancer  5-folds = 0.944
Breast Cancer  6-folds = 0.949
Breast Cancer  7-folds = 0.951
Breast Cancer  8-folds = 0.951
Breast Cancer  9-folds = 0.949
Breast Cancer 10-folds = 0.947


In [16]:
# Breast Cancer with 'RBF' kernel
dataset_name = 'Breast Cancer'
df, y = load_datasets(dataset_name)
model = svm.SVC(kernel='rbf', gamma=0.0001)
score_dataset(dataset_name, model, df, y)

Breast Cancer  2-folds = 0.926
Breast Cancer  3-folds = 0.931
Breast Cancer  4-folds = 0.935
Breast Cancer  5-folds = 0.935
Breast Cancer  6-folds = 0.937
Breast Cancer  7-folds = 0.933
Breast Cancer  8-folds = 0.937
Breast Cancer  9-folds = 0.933
Breast Cancer 10-folds = 0.939


In [21]:
# Wine with 'Linear' kernel
dataset_name = 'Wine'
df, y = load_datasets(dataset_name)
model = svm.SVC(kernel='linear', C=1000)
score_dataset(dataset_name, model, df, y)

Wine  2-folds = 0.961
Wine  3-folds = 0.933
Wine  4-folds = 0.967
Wine  5-folds = 0.961
Wine  6-folds = 0.956
Wine  7-folds = 0.950
Wine  8-folds = 0.967
Wine  9-folds = 0.967
Wine 10-folds = 0.961


In [48]:
# Wine with 'Polynomial' kernel
dataset_name = 'Wine'
df, y = load_datasets(dataset_name)
model = svm.SVC(kernel='poly', gamma=0.001, C=1000)
score_dataset(dataset_name, model, df, y)

Wine  2-folds = 0.955
Wine  3-folds = 0.922
Wine  4-folds = 0.983
Wine  5-folds = 0.967
Wine  6-folds = 0.967
Wine  7-folds = 0.961
Wine  8-folds = 0.961
Wine  9-folds = 0.961
Wine 10-folds = 0.961


In [39]:
# Wine with 'RBF' kernel
dataset_name = 'Wine'
df, y = load_datasets(dataset_name)
model = svm.SVC(kernel='rbf', gamma=0.0001)
score_dataset(dataset_name, model, df, y)

Wine  2-folds = 0.657
Wine  3-folds = 0.658
Wine  4-folds = 0.669
Wine  5-folds = 0.708
Wine  6-folds = 0.698
Wine  7-folds = 0.714
Wine  8-folds = 0.707
Wine  9-folds = 0.725
Wine 10-folds = 0.709


In [43]:
# Handwritten Digits with 'Linear' kernel
dataset_name = 'Handwritten Digits'
df, y = load_datasets(dataset_name)
model = svm.SVC(kernel='linear', C=1000)
score_dataset(dataset_name, model, df, y)

Handwritten Digits  2-folds = 0.940
Handwritten Digits  3-folds = 0.944
Handwritten Digits  4-folds = 0.951
Handwritten Digits  5-folds = 0.948
Handwritten Digits  6-folds = 0.953
Handwritten Digits  7-folds = 0.950
Handwritten Digits  8-folds = 0.956
Handwritten Digits  9-folds = 0.955
Handwritten Digits 10-folds = 0.960


In [47]:
# Handwritten Digits with 'Polynomial' kernel
dataset_name = 'Handwritten Digits'
df, y = load_datasets(dataset_name)
model = svm.SVC(kernel='poly', gamma=0.001, C=1000)
score_dataset(dataset_name, model, df, y)

Handwritten Digits  2-folds = 0.949
Handwritten Digits  3-folds = 0.960
Handwritten Digits  4-folds = 0.970
Handwritten Digits  5-folds = 0.969
Handwritten Digits  6-folds = 0.972
Handwritten Digits  7-folds = 0.971
Handwritten Digits  8-folds = 0.973
Handwritten Digits  9-folds = 0.979
Handwritten Digits 10-folds = 0.977


In [50]:
# Handwritten Digits with 'RBF' kernel
dataset_name = 'Handwritten Digits'
df, y = load_datasets(dataset_name)
model = svm.SVC(kernel='rbf')
score_dataset(dataset_name, model, df, y)

Handwritten Digits  2-folds = 0.957
Handwritten Digits  3-folds = 0.970
Handwritten Digits  4-folds = 0.965
Handwritten Digits  5-folds = 0.963
Handwritten Digits  6-folds = 0.970
Handwritten Digits  7-folds = 0.967
Handwritten Digits  8-folds = 0.968
Handwritten Digits  9-folds = 0.976
Handwritten Digits 10-folds = 0.970
