In [27]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import time

import sklearn
from sklearn import svm
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder # one-hot encoding
from sklearn.decomposition import PCA # PCA
from sklearn.metrics import confusion_matrix

from sklearn import metrics
from sklearn.model_selection import train_test_split # for splitting the data into train and test samples
from sklearn.model_selection import KFold # K-fold validation
from sklearn.metrics import classification_report # for model evaluation metrics
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

import random
from matplotlib.colors import ListedColormap
import seaborn as sns

In [29]:
""" Data input """
col_names = [i for i in range(35)]
data = pd.read_csv('ionosphere.data', sep=",", names = col_names)

""" Data preprocessing """

# shuffle
data = data.sample(frac=1)

# feature/label split
X = data.iloc[:,0:-1]
y = data.iloc[:,-1]

In [81]:
""" Model Construciton """

# Train-test-split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Linear
clf_lin = svm.SVC(kernel='linear')
clf_lin.fit(X_train, y_train)

# Grid search (poly)
best_score = 0
best_degree = -1
best_gamma = 'auto'
best_coef0 = -1

scores_poly = np.zeros((5,5,12))
for i, gamma in enumerate(['scale', 'auto'] + [(i+1)*0.5 for i in range(3)]):
    for j, degree in enumerate([i+1 for i in range(5)]):
        for k, coef0 in enumerate([i*0.1 for i in range(12)]):
            clf_poly = svm.SVC(kernel='poly', degree=degree, gamma=gamma, coef0=coef0)
            scores = cross_val_score(clf_poly,X_train,y_train,cv=5,scoring='accuracy')
            scores_poly[i][j][k] = scores.mean()
            if scores.mean() > best_score:
                best_score = scores.mean()
                best_degree = degree
                best_gamma = gamma
                best_coef0 = coef0
print('Grid search finish!')
print(best_degree, best_gamma, best_coef0)
print('Best score = ', best_score)

clf_poly = svm.SVC(kernel='poly', degree=best_degree, gamma=best_gamma, coef0=best_coef0)
clf_poly.fit(X_train, y_train)

# Grid search (rbf)
best_score = 0
best_gamma = 'auto'

scores_rbf = np.zeros((5))
for i, gamma in enumerate(['scale', 'auto'] + [(i+1)*0.5 for i in range(3)]):
    clf_rbf = svm.SVC(kernel='rbf', gamma=gamma)
    scores = cross_val_score(clf_rbf,X_train,y_train,cv=5,scoring='accuracy')
    scores_rbf[i] = scores.mean()
    if scores.mean() > best_score:
        best_score = scores.mean()
        best_degree = degree
        best_gamma = gamma
        best_coef0 = coef0
print('Grid search finish!')
print(best_gamma)
print('Best score = ', best_score)

clf_rbf = svm.SVC(kernel='rbf', gamma=best_gamma)
clf_rbf.fit(X_train, y_train)

Grid search finish!
3 scale 0.9
Best score =  0.9020408163265307
Grid search finish!
0.5
Best score =  0.9387755102040816


SVC(gamma=0.5)

In [82]:
""" Question """

# poly
for i, gamma in enumerate(['scale', 'auto'] + [(i+1)*0.5 for i in range(3)]):
    print('------------- gamma = {} -------------'.format(gamma))
    print('||', end='')
    for z in range(12):
        print('**%.1f**|' % (z*0.1), end='')
    print()
    for j in range(5):
        print('|**%d**|' % (j+1), end='')
        for k in range(12):
            print('%.2f|' % (scores_poly[i][j][k] * 100), end='')
        print()
        
# poly
for i, gamma in enumerate(['scale', 'auto'] + [(i+1)*0.5 for i in range(3)]):
    print('------------- gamma = {} -------------'.format(gamma))
    print(scores_rbf[i]*100)

------------- gamma = scale -------------
||**0.0**|**0.1**|**0.2**|**0.3**|**0.4**|**0.5**|**0.6**|**0.7**|**0.8**|**0.9**|**1.0**|**1.1**|
|**1**|86.94|86.94|86.94|86.94|86.94|86.94|86.94|86.94|86.94|86.94|86.94|86.94|
|**2**|89.80|89.80|90.20|90.20|89.39|88.98|88.98|88.98|88.98|88.57|88.57|87.76|
|**3**|88.57|88.98|89.39|89.80|89.80|89.80|89.80|89.80|89.80|90.20|89.80|89.39|
|**4**|88.16|89.39|88.98|89.39|88.98|89.39|88.98|89.39|89.39|88.98|89.80|90.20|
|**5**|86.94|87.35|86.94|87.76|89.39|88.98|88.98|90.20|90.20|88.98|88.16|85.31|
------------- gamma = auto -------------
||**0.0**|**0.1**|**0.2**|**0.3**|**0.4**|**0.5**|**0.6**|**0.7**|**0.8**|**0.9**|**1.0**|**1.1**|
|**1**|85.31|85.31|85.31|85.31|85.31|85.31|85.31|85.31|85.31|85.31|85.31|85.31|
|**2**|80.41|86.12|86.94|86.94|87.76|86.94|86.94|87.76|87.35|87.35|88.16|88.16|
|**3**|63.67|65.71|86.12|87.76|88.98|88.98|89.39|89.39|89.39|89.39|89.39|89.39|
|**4**|63.67|63.67|64.49|86.12|88.98|89.39|89.80|90.20|88.98|88.98|88.98|89.39|

In [83]:
""" Result """

# Linear
y_pred = clf_lin.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Poly
y_pred = clf_poly.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# RBF
y_pred = clf_rbf.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[23 14]
 [ 3 66]]
              precision    recall  f1-score   support

           b       0.88      0.62      0.73        37
           g       0.82      0.96      0.89        69

    accuracy                           0.84       106
   macro avg       0.85      0.79      0.81       106
weighted avg       0.85      0.84      0.83       106

[[30  7]
 [ 2 67]]
              precision    recall  f1-score   support

           b       0.94      0.81      0.87        37
           g       0.91      0.97      0.94        69

    accuracy                           0.92       106
   macro avg       0.92      0.89      0.90       106
weighted avg       0.92      0.92      0.91       106

[[36  1]
 [ 6 63]]
              precision    recall  f1-score   support

           b       0.86      0.97      0.91        37
           g       0.98      0.91      0.95        69

    accuracy                           0.93       106
   macro avg       0.92      0.94      0.93       106
weighted avg     