In [1]:
import numpy as np
import pandas as pd
from libsvm.svmutil import *
from scipy.spatial.distance import cdist
import matplotlib.pyplot as plt

In [2]:
# Load the pre-split dataset from headerless CSV files.
# Feature matrices are kept as read (2-D arrays).
X_train = pd.read_csv("data/X_train.csv", header=None).to_numpy()
X_test = pd.read_csv("data/X_test.csv", header=None).to_numpy()

# Label files are flattened to 1-D vectors.
y_train = pd.read_csv("data/y_train.csv", header=None).to_numpy().ravel()
y_test = pd.read_csv("data/y_test.csv", header=None).to_numpy().ravel()

# Part1: 
Use different kernel functions (linear, polynomial, and RBF kernels) and compare their performance.
![svm_param](img/svm_param.PNG)
reference from https://github.com/cjlin1/libsvm

In [3]:
# Readable kernel name -> libsvm option string.
# "-t" selects the kernel type; "-q" runs libsvm in quiet mode.
kernel_types = {
    'linear': '-q -t 0',
    'polynomial': '-q -t 1',
    'radial basis function': '-q -t 2',
}

# Train one model per kernel with otherwise default settings and score it on the test split.
for kernel_type, options in kernel_types.items():
    model = svm_train(y_train, X_train, arg3=options)
    p_labels, p_acc, p_vals = svm_predict(y_test, X_test, model, '-q')

    # p_acc is a tuple: (accuracy, mean-squared error, squared correlation coefficient).
    # Only the accuracy entry is relevant for classification.
    print("kernel_type:{}, accuracy: {:.2f}".format(kernel_type, p_acc[0]))

kernel_type:linear, accuracy: 95.08
kernel_type:polynomial, accuracy: 34.68
kernel_type:radial basis function, accuracy: 95.32


# Part2: 
Use C-SVC and perform a grid search to find the parameters of the best-performing model. For instance, C-SVC has a parameter C, and the RBF kernel adds another parameter 𝛾; search over pairs (C, 𝛾) for the one that gives the best cross-validation performance.

In [4]:
def grid_search(log2c, log2g, X_train, y_train, X_test, y_test):
    """Grid-search (C, gamma) for libsvm with the RBF kernel using 3-fold cross-validation.

    Every combination of exponents is evaluated with C = 2**lc and
    gamma = 2**lg, iterating over ``log2c`` in the outer position and
    ``log2g`` in the inner position.

    Parameters
    ----------
    log2c, log2g : sequence of numbers
        Base-2 exponents to try for C and gamma. Both must be non-empty
        (the first element of each seeds the result).
    X_train, y_train
        Training features and labels handed to ``svm_train``.
    X_test, y_test
        Accepted for interface compatibility; not used by the search.

    Returns
    -------
    tuple
        ``(best_lc, best_lg, best_acc)``. On ties the earliest pair wins.
        If no pair scores above 0, the first exponents and 0 are returned.
    """
    # Seed the running winner with the first grid point and a zero score.
    winner = (log2c[0], log2g[0], 0)

    candidates = [(c_exp, g_exp) for c_exp in log2c for g_exp in log2g]
    for c_exp, g_exp in candidates:
        options = '-q -t 2 -v 3 -c {} -g {}'.format(2.0**c_exp, 2.0**g_exp)
        # With "-v", svm_train yields the cross-validation accuracy instead of a model.
        cv_acc = svm_train(y_train, X_train, arg3=options)

        # Strict comparison: a later pair must beat, not merely match, the current best.
        if cv_acc > winner[2]:
            winner = (c_exp, g_exp, cv_acc)

    return winner

In [5]:
# Exponent grids: C and gamma are each swept over 2^-5, 2^-3, ..., 2^5.
log2c = list(range(-5, 6, 2))
log2g = list(range(-5, 6, 2))

best_lc, best_lg, best_acc = grid_search(
    log2c, log2g, X_train, y_train, X_test, y_test
)
print("Best set (C, gamma)=(2^{}, 2^{}), accuracy:{}%".format(best_lc, best_lg, best_acc))

Cross Validation Accuracy = 94.18%
Cross Validation Accuracy = 41.56%
Cross Validation Accuracy = 21.86%
Cross Validation Accuracy = 20.32%
Cross Validation Accuracy = 78.9%
Cross Validation Accuracy = 54%
Cross Validation Accuracy = 96.96%
Cross Validation Accuracy = 47.66%
Cross Validation Accuracy = 21.68%
Cross Validation Accuracy = 20.22%
Cross Validation Accuracy = 78.78%
Cross Validation Accuracy = 54.06%
Cross Validation Accuracy = 97.92%
Cross Validation Accuracy = 54.34%
Cross Validation Accuracy = 25.32%
Cross Validation Accuracy = 20.22%
Cross Validation Accuracy = 79.04%
Cross Validation Accuracy = 40.98%
Cross Validation Accuracy = 98.4%
Cross Validation Accuracy = 85%
Cross Validation Accuracy = 45.7%
Cross Validation Accuracy = 25.08%
Cross Validation Accuracy = 20.52%
Cross Validation Accuracy = 35%
Cross Validation Accuracy = 98.54%
Cross Validation Accuracy = 85.24%
Cross Validation Accuracy = 44.92%
Cross Validation Accuracy = 25.24%
Cross Validation Accuracy = 20.9

# Part3: 
Use linear kernel + RBF kernel together (therefore a new kernel function) and compare its performance with respect to others. You would need to find out how to use a user-defined kernel in libsvm.

reference from https://stackoverflow.com/questions/7715138/using-precomputed-kernels-with-libsvm

In [6]:
def userDefined_kernel(X, X_, gamma):
    """Build a "linear + RBF" kernel matrix with a leading serial-number column.

    Parameters
    ----------
    X : array of shape (n, d)
        Samples that index the rows of the kernel matrix.
    X_ : array of shape (m, d)
        Samples that index the columns of the kernel matrix.
    gamma : float
        Width parameter of the RBF term.

    Returns
    -------
    ndarray of shape (n, m + 1)
        Column 0 holds 1..n; the remaining columns hold
        ``X @ X_.T + exp(-gamma * ||x - x_||^2)``.
    """
    # 'sqeuclidean' is the *squared* Euclidean distance (not the standardized 'seuclidean').
    sq_dists = cdist(X, X_, 'sqeuclidean')
    combined = X @ X_.T + np.exp(-gamma * sq_dists)

    # Prepend the 1-based row index as the first column
    # (the layout this notebook feeds to libsvm's precomputed-kernel mode).
    serial_numbers = np.arange(1, len(X) + 1).reshape(-1, 1)
    return np.hstack((serial_numbers, combined))

In [7]:
# Precompute the combined kernel: train-vs-train for fitting, test-vs-train for prediction.
# gamma = 2**best_lg reuses the gamma exponent selected by the Part 2 grid search.
K  = userDefined_kernel(X_train, X_train, 2**best_lg)    # rows/cols: training samples
KK = userDefined_kernel(X_test, X_train, 2**best_lg)     # rows: test samples, cols: training samples

# isKernel=True marks K as a precomputed kernel matrix; "-t 4" is the matching libsvm kernel type.
# No "-c" option is passed, so best_lc from the grid search is not applied here.
prob  = svm_problem(y_train, K, isKernel=True)
param = svm_parameter('-q -t 4')
model = svm_train(prob, param)
p_label, p_acc, p_vals = svm_predict(y_test, KK, model, '-q')
# p_acc[0] is the classification accuracy.
print('linear kernel + RBF kernel accuracy: {:.2f}%'.format(p_acc[0]))

linear kernel + RBF kernel accuracy: 95.64%


# Observation
C越大，懲罰越大，越少support vectors，越接近hard-margin SVM的概念，卻容易overfitting

C越小，懲罰越小，越多support vectors，可以追求更大的margin

gamma大，資料點的影響力範圍比較近，對超平面來說，近點的影響力權重較大，容易勾勒出擬合近點的超平面，也容易造成overfitting。

gamma小，資料點的影響力範圍比較遠，對超平面來說，較遠的資料點也有影響力，因此能勾勒出平滑、近似直線的超平面。

reference from https://rpubs.com/skydome20/R-Note14-SVM-SVR

这里面大家需要注意的就是gamma的物理意义，大家提到很多的RBF的幅宽，它会影响每个支持向量对应的高斯的作用范围，从而影响泛化性能。我的理解：如果gamma设的太大，標準差会很小，很小的標準差高斯分布长得又高又瘦， 会造成只会作用于支持向量样本附近，对于未知样本分类效果很差，存在训练准确率可以很高，而测试准确率不高的可能，就是通常说的过训练；而如果设的过小，则会造成平滑效应太大，无法在训练集上得到特别高的准确率，也会影响测试集的准确率。

reference from https://blog.csdn.net/lujiandong1/article/details/46386201

# Observation
嘗試加入polynomial kernel

In [26]:
def userDefined_kernel(X, X_, gamma, degree=5):
    """Build a "linear + polynomial + RBF" kernel matrix with a leading serial-number column.

    NOTE: this redefinition shadows the earlier two-term ``userDefined_kernel``
    in this notebook; cells executed after it use this three-term version.

    Parameters
    ----------
    X : array of shape (n, d)
        Samples that index the rows of the kernel matrix.
    X_ : array of shape (m, d)
        Samples that index the columns of the kernel matrix.
    gamma : float
        Shared scale parameter of the polynomial and RBF terms.
    degree : int, optional
        Exponent of the polynomial term ``(1 + gamma * <x, x_>) ** degree``.
        Defaults to 5, the value previously hard-coded.

    Returns
    -------
    ndarray of shape (n, m + 1)
        Column 0 holds 1..n; the remaining columns hold the sum of the
        linear, RBF and polynomial kernels.
    """
    # Inner products are needed by both the linear and polynomial terms: compute them once.
    kernel_linear = X @ X_.T
    kernel_poly = (1 + gamma * kernel_linear) ** degree

    # 'sqeuclidean' is the *squared* Euclidean distance (not the standardized 'seuclidean').
    kernel_RBF = np.exp(-gamma * cdist(X, X_, 'sqeuclidean'))

    kernel = kernel_linear + kernel_RBF + kernel_poly

    # Prepend the 1-based row index as the first column
    # (the layout this notebook feeds to libsvm's precomputed-kernel mode).
    serial_numbers = np.arange(1, len(X) + 1).reshape(-1, 1)
    return np.hstack((serial_numbers, kernel))

In [28]:
# Same pipeline as the linear + RBF experiment, but userDefined_kernel now refers to the
# redefined three-term version (linear + polynomial + RBF) from the preceding cell.
# gamma = 2**best_lg reuses the gamma exponent selected by the Part 2 grid search.
K  = userDefined_kernel(X_train, X_train, 2**best_lg)    # rows/cols: training samples
KK = userDefined_kernel(X_test, X_train, 2**best_lg)     # rows: test samples, cols: training samples

# isKernel=True marks K as a precomputed kernel matrix; "-t 4" is the matching libsvm kernel type.
# No "-c" option is passed, so best_lc from the grid search is not applied here.
prob  = svm_problem(y_train, K, isKernel=True)
param = svm_parameter('-q -t 4')
model = svm_train(prob, param)
p_label, p_acc, p_vals = svm_predict(y_test, KK, model, '-q')
# p_acc[0] is the classification accuracy.
print('linear kernel + polynomial kernel +RBF kernel accuracy: {:.2f}%'.format(p_acc[0]))

linear kernel + polynomial kernel +RBF kernel accuracy: 97.88%
