# Multiclass SVM 구현

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [2]:
# Load the iris dataset through seaborn
iris = sns.load_dataset(name='iris')
X = iris[iris.columns[:4]]  # first four columns: the features to train on
y = iris[iris.columns[-1]]  # last column: the target

In [3]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=48)

def standardization(train, test):
    """Standardize both splits with a scaler fitted on ``train`` only.

    A ``StandardScaler`` is fitted on ``train`` and the same fitted scaler is
    then applied to ``train`` and ``test``, so the test split never
    contributes to the fitted statistics.

    Args:
        train: Training features.
        test: Test features.

    Returns:
        tuple: ``(train_scaled, test_scaled)``.
    """
    fitted = StandardScaler().fit(train)
    return fitted.transform(train), fitted.transform(test)

# Replace both feature splits with their standardized versions (scaler fitted on the training split).
X_train, X_test = standardization(X_train, X_test)

In [4]:
# Inspect the standardized training features.
X_train

array([[ 0.78522493,  0.32015325,  0.77221097,  1.04726529],
       [-0.26563371, -1.29989934,  0.0982814 , -0.11996537],
       [ 0.43493872,  0.78302542,  0.94069336,  1.43634218],
       [-0.84944407,  0.78302542, -1.24957775, -1.28719604],
       [-0.38239578, -1.7627715 ,  0.15444219,  0.13941922],
       [ 0.55170079, -0.374155  ,  1.05301496,  0.7878807 ],
       [ 0.31817664, -0.14271892,  0.65988937,  0.7878807 ],
       [ 0.20141457, -0.374155  ,  0.43524618,  0.39880381],
       [-1.66677857, -0.14271892, -1.36189934, -1.28719604],
       [-0.14887164, -0.60559109,  0.21060299,  0.13941922],
       [-0.14887164, -1.06846325, -0.12636179, -0.24965767],
       [ 0.31817664, -0.60559109,  0.15444219,  0.13941922],
       [ 0.66846286, -0.83702717,  0.88453256,  0.91757299],
       [ 0.0846525 , -0.14271892,  0.77221097,  0.7878807 ],
       [-0.49915786, -0.14271892,  0.43524618,  0.39880381],
       [-0.26563371, -0.60559109,  0.65988937,  1.04726529],
       [ 2.18636979,  1.

In [5]:
# Inspect the standardized test features.
X_test

array([[-0.14887164, -0.374155  ,  0.26676379,  0.13941922],
       [ 0.31817664, -0.60559109,  0.54756778,  0.00972692],
       [ 0.31817664, -1.06846325,  1.05301496,  0.26911151],
       [-1.5500165 , -1.7627715 , -1.36189934, -1.15750374],
       [ 0.0846525 ,  0.32015325,  0.60372857,  0.7878807 ],
       [ 0.78522493, -0.14271892,  0.99685416,  0.7878807 ],
       [-0.84944407,  1.70876975, -1.24957775, -1.15750374],
       [ 0.20141457, -0.14271892,  0.60372857,  0.7878807 ],
       [-0.38239578,  2.63451409, -1.30573855, -1.28719604],
       [-0.38239578, -1.29989934,  0.15444219,  0.13941922],
       [ 0.66846286,  0.08871717,  0.99685416,  0.7878807 ],
       [-0.38239578,  1.0144615 , -1.36189934, -1.28719604],
       [-0.49915786,  0.78302542, -1.13725615, -1.28719604],
       [ 0.43493872, -0.60559109,  0.60372857,  0.7878807 ],
       [ 0.55170079, -1.7627715 ,  0.37908538,  0.13941922],
       [ 0.55170079,  0.55158933,  0.54756778,  0.52849611],
       [-1.19973028,  0.

In [6]:
# One-hot encode the training labels: one 0/1 indicator column per class.
# Note that this rebinds y_train from the label Series to the indicator frame.
y_train = pd.get_dummies(y_train)
y_train

Unnamed: 0,setosa,versicolor,virginica
110,0,0,1
69,0,1,0
148,0,0,1
39,1,0,0
53,0,1,0
...,...,...,...
64,0,1,0
91,0,1,0
81,0,1,0
51,0,1,0


### Setosa vs (Versicolor, Virginica)

In [7]:
# Binary SVM for setosa vs. the rest: the target is the "setosa" indicator
# column of the one-hot encoded labels.
svm1 = SVC()
svm1.fit(X_train, y_train["setosa"])

In [8]:
# 0/1 predictions of the setosa-vs-rest classifier on the test set.
svm1.predict(X_test)

array([0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 1, 1, 0, 0, 0], dtype=uint8)

In [9]:
# Signed decision_function scores of the setosa-vs-rest SVM, one per test sample.
# These are raw scores, not probabilities; they are stored for the later argmax
# across the three classifiers.
lst1 = svm1.decision_function(X_test)
lst1

array([-1.12800812, -1.16251843, -1.1422978 , -0.08568098, -1.17027451,
       -1.23267582,  1.09183692, -1.2512859 ,  1.02069728, -1.2643363 ,
       -1.23088294,  1.10969518,  1.09329371, -1.25623518, -1.04702622,
       -1.12673257,  1.1923107 , -1.02360898, -1.15249844, -1.1523064 ,
        0.88066583, -1.22401144, -1.21240853, -1.14948774, -1.13066337,
        1.17327079,  1.04484236, -1.03358459, -1.13006558, -1.02256327])

### Versicolor vs (Setosa, Virginica)

In [10]:
# Binary SVM for versicolor vs. the rest: the target is the "versicolor"
# indicator column of the one-hot encoded labels.
svm2 = SVC()
svm2.fit(X_train, y_train["versicolor"])

In [11]:
# 0/1 predictions of the versicolor-vs-rest classifier on the test set.
svm2.predict(X_test)

array([1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 1], dtype=uint8)

In [12]:
# Signed decision_function scores of the versicolor-vs-rest SVM, one per test sample.
# These are raw scores, not probabilities; they are stored for the later argmax
# across the three classifiers.
lst2 = svm2.decision_function(X_test)
lst2

array([ 1.22902861,  1.18021327, -0.17897727, -0.18270157,  0.11157081,
       -0.62302972, -1.30208112, -0.1161839 , -1.11408288,  0.98146982,
       -0.50352989, -1.22949294, -1.09831885, -0.35890882,  0.58081027,
        0.69274083, -1.43031588,  1.07405999,  0.047718  , -1.42480972,
       -1.11371034, -0.89665337,  0.04194481,  0.8041226 , -0.32973119,
       -1.43978879, -1.02086281, -1.26618241, -1.86832122,  1.23359629])

### Virginica vs (Setosa, Versicolor)

In [13]:
# Binary SVM for virginica vs. the rest: the target is the "virginica"
# indicator column of the one-hot encoded labels.
svm3 = SVC()
svm3.fit(X_train, y_train["virginica"])

In [14]:
# 0/1 predictions of the virginica-vs-rest classifier on the test set.
svm3.predict(X_test)

array([0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1,
       1, 0, 1, 0, 0, 1, 1, 0], dtype=uint8)

In [15]:
# Signed decision_function scores of the virginica-vs-rest SVM, one per test sample.
# These are raw scores, not probabilities; they are stored for the later argmax
# across the three classifiers.
lst3 = svm3.decision_function(X_test)
lst3

array([-1.42708779, -1.21748862,  0.15658044, -0.9341312 , -0.13715909,
        0.77015558, -1.1852678 ,  0.18563113, -1.0405367 , -0.99876073,
        0.64733162, -1.33283373, -1.49025247,  0.45890847, -0.65925536,
       -0.69718272, -1.26836718, -1.17659977, -0.03926931,  1.53339216,
       -1.12083008,  1.03585352,  0.11634206, -0.71111991,  0.41357359,
       -1.2021444 , -1.55142381,  1.2333595 ,  1.84275195, -1.57732042])

### 각 class별로 decision function 점수(확률이 아님)가 구해졌다면 argmax로 가장 높은 점수를 갖는 class를 찾기

In [16]:
# For every test sample, take the position (0, 1 or 2) of the largest of the
# three one-vs-rest decision scores.
# NOTE(review): this rebinds `iris`, which earlier held the dataset DataFrame.
iris = [np.argmax([s1, s2, s3]) for s1, s2, s3 in zip(lst1, lst2, lst3)]
print(iris)

[1, 1, 2, 0, 1, 2, 0, 2, 0, 1, 2, 0, 0, 2, 1, 1, 0, 1, 1, 2, 0, 2, 2, 1, 2, 0, 0, 2, 2, 1]


0이면 setosa, 1이면 versicolor, 2면 virginica

In [17]:
# Put the argmax positions in a frame aligned with the test rows' index,
# then swap each position for its species name.
y_pred = pd.DataFrame(data=iris, index=y_test.index)
y_pred = y_pred.replace(to_replace={0:"setosa", 1:"versicolor", 2:"virginica"})
y_pred

Unnamed: 0,0
96,versicolor
73,versicolor
134,virginica
41,setosa
70,versicolor
116,virginica
19,setosa
138,virginica
33,setosa
89,versicolor


In [18]:
# Accuracy of the hand-built one-vs-rest predictions against the true test labels.
# accuracy_score is already imported in the notebook's import cell, so it is not
# re-imported here.
accuracy_score(y_test, y_pred)

0.9666666666666667

## Scikit-learn

In [19]:
# y_train was overwritten with its one-hot encoding earlier in the notebook, so
# split again with the same test_size and random_state to get the label Series back.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=48)

# Reuse the standardization() helper defined earlier in the notebook rather than
# redefining an identical copy here.
X_train, X_test = standardization(X_train, X_test)

In [20]:
# Let scikit-learn build the one-vs-rest scheme: OneVsRestClassifier wraps the
# base SVC and is fitted directly on the class labels in y_train.
# (OneVsRestClassifier is imported in the notebook's import cell.)
svc = SVC()
o_vs_r = OneVsRestClassifier(svc)

o_vs_r.fit(X_train, y_train)
yhat = o_vs_r.predict(X_test)

In [21]:
# Accuracy of the OneVsRestClassifier predictions on the same test split.
accuracy_score(y_test, yhat)

0.9666666666666667

직접 구현한 One-vs-Rest 결과와 scikit-learn `OneVsRestClassifier`의 정확도가 동일하게 나왔다.