# Multiclass SVM 구현 - 19기 김은지

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# Load the iris dataset bundled with seaborn.
iris = sns.load_dataset('iris')
# First four columns are the training features; the last column is the target.
X, y = iris.iloc[:, :4], iris.iloc[:, -1]
print(y)

0         setosa
1         setosa
2         setosa
3         setosa
4         setosa
         ...    
145    virginica
146    virginica
147    virginica
148    virginica
149    virginica
Name: species, Length: 150, dtype: object


In [2]:
# Hold out 20% of the samples as the test set; random_state is fixed so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=48)

In [3]:
def standardization(train, test):
    """Standardize both feature sets using statistics from `train` only.

    The scaler is fitted on `train` and then applied to both inputs, so no
    information from `test` is used when computing the scaling parameters.

    Args:
        train: Training features used to fit the scaler.
        test: Held-out features, transformed with the fitted scaler.

    Returns:
        A `(train, test)` tuple holding the scaled versions of both inputs.
    """
    fitted = StandardScaler().fit(train)
    return fitted.transform(train), fitted.transform(test)

# Replace the raw feature sets with their standardized versions.
X_train, X_test = standardization(X_train, X_test)

In [4]:
X_train

array([[ 0.78522493,  0.32015325,  0.77221097,  1.04726529],
       [-0.26563371, -1.29989934,  0.0982814 , -0.11996537],
       [ 0.43493872,  0.78302542,  0.94069336,  1.43634218],
       [-0.84944407,  0.78302542, -1.24957775, -1.28719604],
       [-0.38239578, -1.7627715 ,  0.15444219,  0.13941922],
       [ 0.55170079, -0.374155  ,  1.05301496,  0.7878807 ],
       [ 0.31817664, -0.14271892,  0.65988937,  0.7878807 ],
       [ 0.20141457, -0.374155  ,  0.43524618,  0.39880381],
       [-1.66677857, -0.14271892, -1.36189934, -1.28719604],
       [-0.14887164, -0.60559109,  0.21060299,  0.13941922],
       [-0.14887164, -1.06846325, -0.12636179, -0.24965767],
       [ 0.31817664, -0.60559109,  0.15444219,  0.13941922],
       [ 0.66846286, -0.83702717,  0.88453256,  0.91757299],
       [ 0.0846525 , -0.14271892,  0.77221097,  0.7878807 ],
       [-0.49915786, -0.14271892,  0.43524618,  0.39880381],
       [-0.26563371, -0.60559109,  0.65988937,  1.04726529],
       [ 2.18636979,  1.

In [9]:
X_test

array([[-0.14887164, -0.374155  ,  0.26676379,  0.13941922],
       [ 0.31817664, -0.60559109,  0.54756778,  0.00972692],
       [ 0.31817664, -1.06846325,  1.05301496,  0.26911151],
       [-1.5500165 , -1.7627715 , -1.36189934, -1.15750374],
       [ 0.0846525 ,  0.32015325,  0.60372857,  0.7878807 ],
       [ 0.78522493, -0.14271892,  0.99685416,  0.7878807 ],
       [-0.84944407,  1.70876975, -1.24957775, -1.15750374],
       [ 0.20141457, -0.14271892,  0.60372857,  0.7878807 ],
       [-0.38239578,  2.63451409, -1.30573855, -1.28719604],
       [-0.38239578, -1.29989934,  0.15444219,  0.13941922],
       [ 0.66846286,  0.08871717,  0.99685416,  0.7878807 ],
       [-0.38239578,  1.0144615 , -1.36189934, -1.28719604],
       [-0.49915786,  0.78302542, -1.13725615, -1.28719604],
       [ 0.43493872, -0.60559109,  0.60372857,  0.7878807 ],
       [ 0.55170079, -1.7627715 ,  0.37908538,  0.13941922],
       [ 0.55170079,  0.55158933,  0.54756778,  0.52849611],
       [-1.19973028,  0.

# OVR 방법

### 클래스 one-hot 인코딩

In [6]:
y_train

110     virginica
69     versicolor
148     virginica
39         setosa
53     versicolor
          ...    
64     versicolor
91     versicolor
81     versicolor
51     versicolor
0          setosa
Name: species, Length: 120, dtype: object

In [7]:
# One-hot encode the species so that each column is a binary
# "this class vs. the rest" target for one SVM.
# dtype=int pins the indicators to 0/1 integers: pandas >= 2.0 returns
# booleans by default, which would change the outputs shown in this notebook.
y_train = pd.get_dummies(y_train, dtype=int)

In [8]:
y_train

Unnamed: 0,setosa,versicolor,virginica
110,0,0,1
69,0,1,0
148,0,0,1
39,1,0,0
53,0,1,0
...,...,...,...
64,0,1,0
91,0,1,0
81,0,1,0
51,0,1,0


### 각각 binary SVM으로 트레이닝

In [10]:
# One RBF-kernel binary classifier per class, all with identical hyperparameters.
svm1, svm2, svm3 = (SVC(kernel='rbf', C=5, gamma=5) for _ in range(3))

In [11]:
# Each SVM learns whether a sample belongs to its own class or not, using the
# matching one-hot column of y_train as the binary target.
svm1.fit(X_train, y_train.loc[:, 'setosa'])
svm2.fit(X_train, y_train.loc[:, 'versicolor'])
svm3.fit(X_train, y_train.loc[:, 'virginica'])

SVC(C=5, gamma=5)

In [12]:
print(svm1.predict(X_test))

[0 0 0 0 0 0 1 0 1 0 0 1 1 0 0 0 1 0 0 0 1 0 0 0 0 1 1 0 0 0]


In [13]:
print(svm1.decision_function(X_test))

[-1.12359969 -0.86782512 -0.65599247 -0.50194294 -0.76541147 -0.8819188
  1.07735938 -0.99156769  0.50201986 -0.9984315  -0.84532712  0.17062549
  0.34917127 -0.9813287  -0.72783399 -0.93313988  1.28153212 -0.56827872
 -0.73092732 -0.99670034  0.43553308 -0.96967771 -0.83939495 -1.03305682
 -0.75566609  1.13888006  0.42965012 -1.04268452 -0.93608147 -1.06090982]


In [14]:
print(svm2.predict(X_test))

[1 1 1 0 0 0 0 0 0 1 0 0 0 0 1 1 0 0 0 0 0 0 1 1 0 0 0 0 0 1]


In [15]:
print(svm3.predict(X_test))

[0 0 0 0 1 1 0 1 0 0 1 0 0 1 0 0 0 0 0 1 0 1 0 0 1 0 0 1 1 0]


하나의 클래스에서만 맞다고 인식된 데이터도 있었지만, 어느 클래스에서도 맞다고 인식되지 않은 데이터도 있었다(여러 클래스가 동시에 맞다고 인식되는 경우도 생길 수 있다). 이런 경우에는 decision function 값이 가장 큰 클래스를 정답으로 판별한다.

### 1. 하나의 클래스에서만 맞다고 인식된 경우

In [20]:
# When exactly one classifier claims a sample, that class becomes the final
# prediction. Collect the three binary predictions side by side first:
# one row per test sample, one column per classifier (svm1, svm2, svm3).
svm_preds = pd.DataFrame([clf.predict(X_test) for clf in (svm1, svm2, svm3)]).T
svm_preds.head()

Unnamed: 0,0,1,2
0,0,1,0
1,0,1,0
2,0,1,0
3,0,0,0
4,0,0,1


In [34]:
# Remove a 'label' column left over from an earlier run of the cells below.
# errors='ignore' makes this a no-op on a fresh top-to-bottom run, where the
# column does not exist yet and a plain drop would raise KeyError.
svm_preds.drop('label', axis=1, inplace=True, errors='ignore')
svm_preds

Unnamed: 0,0,1,2
0,0,1,0
1,0,1,0
2,0,1,0
3,0,0,0
4,0,0,1
5,0,0,1
6,1,0,0
7,0,0,1
8,1,0,0
9,0,1,0


In [37]:
# Add the column that will hold the final class name. Every row starts with
# the placeholder string 'NaN' (a literal string, not a missing value).
svm_preds['label'] = 'NaN'
svm_preds

Unnamed: 0,0,1,2,label
0,0,1,0,
1,0,1,0,
2,0,1,0,
3,0,0,0,
4,0,0,1,
5,0,0,1,
6,1,0,0,
7,0,0,1,
8,1,0,0,
9,0,1,0,


In [43]:
sum(svm_preds.iloc[0,:3])

1

In [52]:
# Class names, taken from the column order of the one-hot encoded y_train.
labels = y_train.columns
labels

Index(['setosa', 'versicolor', 'virginica'], dtype='object')

In [53]:
# Case 1: exactly one binary SVM claimed the sample, so that class is the
# final prediction.
for i in range(len(svm_preds)):
    # Only the first three columns hold the classifier votes.
    votes = svm_preds.iloc[i, :3].to_numpy(dtype=int)
    if votes.sum() == 1:
        # argmax yields the position of the single 1 as a scalar, so a plain
        # class name is stored rather than a one-element Index.
        svm_preds.loc[i, 'label'] = labels[int(votes.argmax())]

svm_preds

Unnamed: 0,0,1,2,label
0,0,1,0,versicolor
1,0,1,0,versicolor
2,0,1,0,versicolor
3,0,0,0,2
4,0,0,1,virginica
5,0,0,1,virginica
6,1,0,0,setosa
7,0,0,1,virginica
8,1,0,0,setosa
9,0,1,0,versicolor


### 2. 여러 클래스가 맞다고 인식된 경우 & 모든 클래스가 맞지 않다고 인식된 경우
각 SVM의 decision function 결과값 중 최댓값에 해당하는 클래스의 label을 최종 label로 결정한다.

In [47]:
svm1.decision_function(X_test)

array([-1.12359969, -0.86782512, -0.65599247, -0.50194294, -0.76541147,
       -0.8819188 ,  1.07735938, -0.99156769,  0.50201986, -0.9984315 ,
       -0.84532712,  0.17062549,  0.34917127, -0.9813287 , -0.72783399,
       -0.93313988,  1.28153212, -0.56827872, -0.73092732, -0.99670034,
        0.43553308, -0.96967771, -0.83939495, -1.03305682, -0.75566609,
        1.13888006,  0.42965012, -1.04268452, -0.93608147, -1.06090982])

In [46]:
# Decision-function scores for every test sample: one row per sample,
# one column per binary classifier (svm1, svm2, svm3).
svm_decisions = pd.DataFrame(
    [clf.decision_function(X_test) for clf in (svm1, svm2, svm3)]
).T
svm_decisions.head()

Unnamed: 0,0,1,2
0,-1.1236,1.379466,-1.274571
1,-0.867825,0.641587,-0.78527
2,-0.655992,0.083346,-0.428302
3,-0.501943,-0.371038,-0.130155
4,-0.765411,-0.248726,0.02101


In [48]:
np.argmax(svm_decisions.iloc[0])

1

In [54]:
# Case 2: no classifier, or more than one, claimed the sample. Fall back to
# the class whose SVM gives the largest decision-function score.
for i in range(len(svm_preds)):
    if sum(svm_preds.iloc[i, :3]) != 1:
        # Take argmax on the raw values so the result is always a column
        # position, never an index label.
        best = int(svm_decisions.iloc[i].to_numpy().argmax())
        svm_preds.loc[i, 'label'] = labels[best]

svm_preds

Unnamed: 0,0,1,2,label
0,0,1,0,versicolor
1,0,1,0,versicolor
2,0,1,0,versicolor
3,0,0,0,virginica
4,0,0,1,virginica
5,0,0,1,virginica
6,1,0,0,setosa
7,0,0,1,virginica
8,1,0,0,setosa
9,0,1,0,versicolor


## 예측 결과 확인

In [51]:
y_test

96     versicolor
73     versicolor
134     virginica
41         setosa
70     versicolor
116     virginica
19         setosa
138     virginica
33         setosa
89     versicolor
137     virginica
36         setosa
20         setosa
126     virginica
87     versicolor
56     versicolor
11         setosa
62     versicolor
72     versicolor
120     virginica
8          setosa
147     virginica
77     versicolor
86     versicolor
129     virginica
4          setosa
31         setosa
136     virginica
132     virginica
88     versicolor
Name: species, dtype: object

In [55]:
# Compare the combined one-vs-rest labels against the true test species.
accuracy_score(y_test, svm_preds['label'])

0.8666666666666667