# Multiclass SVM 구현

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# Load the iris dataset through seaborn
iris =  sns.load_dataset('iris') 
X= iris.iloc[:,:4] # first four columns: the features used for training
y = iris.iloc[:,-1] # last column: the target (species labels)
print(y)

0         setosa
1         setosa
2         setosa
3         setosa
4         setosa
         ...    
145    virginica
146    virginica
147    virginica
148    virginica
149    virginica
Name: species, Length: 150, dtype: object


In [2]:
# Hold out 20% of the rows as the test set; random_state pins the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=48)

In [3]:
def standardization(train, test):
    """Standardize both splits using statistics learned from ``train`` only.

    The scaler is fitted on ``train`` and then applied to ``train`` and
    ``test``, so no information from the test split enters the fit.

    Returns:
        A ``(train_scaled, test_scaled)`` tuple as produced by
        ``StandardScaler.transform``.
    """
    fitted_scaler = StandardScaler().fit(train)
    return fitted_scaler.transform(train), fitted_scaler.transform(test)

X_train, X_test = standardization(X_train, X_test)

In [4]:
X_train

array([[ 0.78522493,  0.32015325,  0.77221097,  1.04726529],
       [-0.26563371, -1.29989934,  0.0982814 , -0.11996537],
       [ 0.43493872,  0.78302542,  0.94069336,  1.43634218],
       [-0.84944407,  0.78302542, -1.24957775, -1.28719604],
       [-0.38239578, -1.7627715 ,  0.15444219,  0.13941922],
       [ 0.55170079, -0.374155  ,  1.05301496,  0.7878807 ],
       [ 0.31817664, -0.14271892,  0.65988937,  0.7878807 ],
       [ 0.20141457, -0.374155  ,  0.43524618,  0.39880381],
       [-1.66677857, -0.14271892, -1.36189934, -1.28719604],
       [-0.14887164, -0.60559109,  0.21060299,  0.13941922],
       [-0.14887164, -1.06846325, -0.12636179, -0.24965767],
       [ 0.31817664, -0.60559109,  0.15444219,  0.13941922],
       [ 0.66846286, -0.83702717,  0.88453256,  0.91757299],
       [ 0.0846525 , -0.14271892,  0.77221097,  0.7878807 ],
       [-0.49915786, -0.14271892,  0.43524618,  0.39880381],
       [-0.26563371, -0.60559109,  0.65988937,  1.04726529],
       [ 2.18636979,  1.

In [5]:
X_test

array([[-0.14887164, -0.374155  ,  0.26676379,  0.13941922],
       [ 0.31817664, -0.60559109,  0.54756778,  0.00972692],
       [ 0.31817664, -1.06846325,  1.05301496,  0.26911151],
       [-1.5500165 , -1.7627715 , -1.36189934, -1.15750374],
       [ 0.0846525 ,  0.32015325,  0.60372857,  0.7878807 ],
       [ 0.78522493, -0.14271892,  0.99685416,  0.7878807 ],
       [-0.84944407,  1.70876975, -1.24957775, -1.15750374],
       [ 0.20141457, -0.14271892,  0.60372857,  0.7878807 ],
       [-0.38239578,  2.63451409, -1.30573855, -1.28719604],
       [-0.38239578, -1.29989934,  0.15444219,  0.13941922],
       [ 0.66846286,  0.08871717,  0.99685416,  0.7878807 ],
       [-0.38239578,  1.0144615 , -1.36189934, -1.28719604],
       [-0.49915786,  0.78302542, -1.13725615, -1.28719604],
       [ 0.43493872, -0.60559109,  0.60372857,  0.7878807 ],
       [ 0.55170079, -1.7627715 ,  0.37908538,  0.13941922],
       [ 0.55170079,  0.55158933,  0.54756778,  0.52849611],
       [-1.19973028,  0.

## One vs One

In [6]:
n_class=3

In [7]:
y_train.unique()

array(['virginica', 'versicolor', 'setosa'], dtype=object)

In [8]:
# One binary problem per pair of classes: mask out the rows of the third class,
# then one-hot encode the two remaining labels and keep a single indicator
# column (drop_first=True drops the first of the two dummy columns).
not_setosa = y_train != "setosa"
not_versicolor = y_train != "versicolor"
not_virginica = y_train != "virginica"

y_ve_vi = pd.get_dummies(y_train[not_setosa], drop_first=True)
y_se_vi = pd.get_dummies(y_train[not_versicolor], drop_first=True)
y_se_ve = pd.get_dummies(y_train[not_virginica], drop_first=True)

# The same masks select the matching feature rows.
x_ve_vi = X_train[not_setosa]
x_se_vi = X_train[not_versicolor]
x_se_ve = X_train[not_virginica]


In [9]:
# Show which class the remaining indicator column stands for in each pair
# (the column name is the class encoded as 1; the other class of the pair is 0).
print(y_ve_vi.head(1)) # 1:virginica    0: versicolor
print(y_se_vi.head(1)) # 1:virginica    0: setosa
print(y_se_ve.head(1)) # 1:versicolor   0: setosa

     virginica
110          1
     virginica
110          1
    versicolor
69           1


In [10]:
# One binary RBF-kernel SVC per pair of classes.
svm1 = SVC(kernel ='rbf', C = 1, gamma = 1, random_state=1)
svm2 = SVC(kernel ='rbf', C = 1, gamma = 1, random_state=1)
svm3 = SVC(kernel ='rbf', C = 1, gamma = 1, random_state=1)

# The targets are single-column DataFrames (from get_dummies); SVC.fit expects
# a 1-D target and emits a DataConversionWarning for a column vector, so
# flatten them first. astype(int) keeps the labels as 0/1 integers even when
# get_dummies produces boolean columns.
svm1.fit(x_ve_vi, y_ve_vi.to_numpy().ravel().astype(int))
svm2.fit(x_se_vi, y_se_vi.to_numpy().ravel().astype(int))
svm3.fit(x_se_ve, y_se_ve.to_numpy().ravel().astype(int))

y_pred1=svm1.predict(X_test)
y_pred2=svm2.predict(X_test)
y_pred3=svm3.predict(X_test)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [11]:
clf = [svm1, svm2, svm3]  # the (n_class)(n_class-1)/2 pairwise classifiers
pred = [model.predict(X_test) for model in clf]

# Vote tally, one row per test sample; columns are setosa, versicolor, virginica.
votes = np.zeros((np.shape(pred)[1], 3), dtype=int)

# For each classifier: (class that gets the vote when it outputs 1,
#                       class that gets the vote otherwise).
pairings = [
    (2, 1),  # svm1: virginica vs versicolor
    (2, 0),  # svm2: virginica vs setosa
    (1, 0),  # svm3: versicolor vs setosa
]
for outputs, (positive, negative) in zip(pred, pairings):
    is_positive = outputs == 1
    votes[is_positive, positive] += 1
    votes[~is_positive, negative] += 1

# argmax keeps the first maximum, so a three-way tie resolves to setosa.
result = pd.DataFrame(votes.argmax(axis=1), columns=['pred']).replace({0:'setosa', 1:'versicolor', 2:'virginica'})

In [12]:
result.head()

Unnamed: 0,pred
0,versicolor
1,versicolor
2,virginica
3,virginica
4,virginica


In [13]:
accuracy_score(y_test,result)

0.9

---

하단의 one vs rest는 클래스의 수에 상관없이 적용할 수 있도록 직접 구현하였으나 <br>작성한 one vs one 코드는 n_class=3인 경우에만 적용되도록 구현하여<br>
참고를 위해 투빅스의 블로그 관련 코드 첨부 <br>
https://tobigs.gitbook.io/tobigs/data-analysis/svm/python-svm-2

In [14]:
# Rebuild the split from the raw X / y (same random_state, so the same rows) and
# re-standardize, so this section starts from freshly created train/test data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=48)
X_train, X_test = standardization(X_train, X_test)

In [15]:
class OneVsOneSVM:
    """One-vs-one multiclass classifier built from pairwise binary RBF SVCs.

    One binary ``SVC`` is trained for every unordered pair of classes
    ``(c1, c2)`` with ``c1 < c2``; inside each pair ``c1`` is labelled ``1``
    and ``c2`` is labelled ``-1``. Prediction is by majority vote over all
    pairwise classifiers, with ties broken by the summed decision-function
    margins of the tied classes.

    Attributes:
        n_classes: number of classes.
        class_names: label for each class index, in index order.
        clfs: list of ``[classifier, c1, c2]`` entries filled in by ``fit``.
        y_pred: result of the most recent ``predict`` call (a one-column
            DataFrame of label names), or an empty list before any call.
    """

    # Label names used when none are supplied (iris species, in index order).
    DEFAULT_CLASS_NAMES = ('setosa', 'versicolor', 'virginica')

    def __init__(self, n_classes=3, class_names=None):
        """Create an unfitted model.

        Args:
            n_classes: number of classes.
            class_names: optional sequence of ``n_classes`` labels, one per
                class index. Defaults to the three iris species.

        Raises:
            ValueError: if ``class_names`` does not have ``n_classes`` entries.
        """
        names = list(self.DEFAULT_CLASS_NAMES if class_names is None else class_names)
        if len(names) != n_classes:
            raise ValueError(
                'class_names has {} entries but n_classes is {}'.format(len(names), n_classes)
            )
        self.n_classes = n_classes
        self.class_names = names
        self.clfs = []
        self.y_pred = []

    def one_vs_one_labels(self, c1, c2, y_train):
        """Return binary targets for the pair: ``1`` where the label equals ``c1``, else ``-1``.

        ``c2`` is accepted for symmetry with the other methods; every label
        that is not ``c1`` is mapped to ``-1``.
        """
        return np.where(np.asarray(y_train) == c1, 1.0, -1.0)

    def one_vs_one_data(self, c1, c2, X_train, y_train):
        """Select the rows belonging to classes ``c1`` and ``c2``.

        Labels may be given as class names (entries of ``class_names``) or
        already as integer class indices; names are converted to indices.

        Returns:
            ``(y_pair, X_pair)``: the ``1`` / ``-1`` targets and the matching
            feature rows, with all ``c1`` rows first and all ``c2`` rows after.
        """
        name_to_index = {name: idx for idx, name in enumerate(self.class_names)}
        # object dtype keeps the element-wise comparison below well defined
        # even if a label is neither a known name nor an integer.
        y_codes = np.array(
            [name_to_index.get(label, label) for label in np.asarray(y_train).ravel()],
            dtype=object,
        )
        X_arr = np.asarray(X_train)

        index_c1 = (y_codes == c1)
        index_c2 = (y_codes == c2)

        y_pair = self.one_vs_one_labels(
            c1, c2, np.concatenate((y_codes[index_c1], y_codes[index_c2]))
        )
        X_pair = np.vstack((X_arr[index_c1], X_arr[index_c2]))
        return y_pair, X_pair

    def fit(self, X_train, y_train, C=5, gamma=5):
        """Train one RBF ``SVC`` per pair of classes (m * (m - 1) / 2 in total).

        Any classifiers from an earlier ``fit`` call are discarded first.
        """
        self.clfs = []
        for c1 in range(self.n_classes):
            for c2 in range(c1 + 1, self.n_classes):
                # The targets stay 1-D: passing a column vector to SVC.fit
                # triggers a DataConversionWarning.
                y_c, X_c = self.one_vs_one_data(c1, c2, X_train, y_train)

                clf = SVC(kernel='rbf', C=C, gamma=gamma)
                clf.fit(X_c, y_c)
                self.clfs.append([clf, c1, c2])

    def predict(self, X_test):
        """Predict a class name for every row of ``X_test`` by pairwise voting.

        Each classifier casts one vote per sample: for ``c1`` when it outputs
        ``1``, otherwise for ``c2``. When several classes share the highest
        vote count, the tied class with the largest summed margin wins. A
        classifier's decision value is added to ``c1`` and subtracted from
        ``c2``, since a positive value corresponds to its ``1`` label.

        Returns:
            A one-column DataFrame (column ``0``) of predicted class names,
            also stored on ``self.y_pred``.
        """
        X_arr = np.asarray(X_test)
        n_samples = X_arr.shape[0]
        votes = np.zeros((n_samples, self.n_classes), dtype=int)
        margins = np.zeros((n_samples, self.n_classes), dtype=float)

        # Each classifier is evaluated once on the whole test set.
        for clf, c1, c2 in self.clfs:
            chose_c1 = np.asarray(clf.predict(X_arr)).ravel() == 1
            votes[chose_c1, c1] += 1
            votes[~chose_c1, c2] += 1

            decision = np.asarray(clf.decision_function(X_arr), dtype=float).ravel()
            margins[:, c1] += decision
            margins[:, c2] -= decision

        # Only classes holding the top vote count compete; among them the
        # summed margin decides. With a unique top class this is plain argmax.
        is_top = votes == votes.max(axis=1, keepdims=True)
        winners = np.where(is_top, margins, -np.inf).argmax(axis=1)

        # Convert class indices back to label names for evaluation.
        self.y_pred = pd.DataFrame([self.class_names[idx] for idx in winners])
        return self.y_pred

    def evaluate(self, y_test):
        """Print the accuracy of the most recent ``predict`` result against ``y_test``."""
        print('Accuacy : {: .5f}'.format(accuracy_score(y_test, self.y_pred)))

In [16]:
# Train and score the class-based one-vs-one model.
# NOTE: fit() is called with its defaults (C=5, gamma=5), not the C=1, gamma=1
# used by the other models in this notebook, so this accuracy is not directly
# comparable with theirs.
onevsone = OneVsOneSVM()
onevsone.fit(X_train, y_train)
y_pred_one = onevsone.predict(X_test)
onevsone.evaluate(y_test)

Accuacy :  0.86667


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


## One vs Rest

In [17]:
# Rebuild the split from the raw X / y (same random_state, so the same rows) and
# re-standardize before the one-vs-rest section replaces y_train with dummies.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=48)
X_train, X_test = standardization(X_train, X_test)

In [18]:
# One indicator column per class; column k is the binary target for the
# "class k vs rest" classifier trained below.
# NOTE: this overwrites y_train, so later cells see the one-hot frame, not the labels.
y_train = pd.get_dummies(y_train)
y_train

Unnamed: 0,setosa,versicolor,virginica
110,0,0,1
69,0,1,0
148,0,0,1
39,1,0,0
53,0,1,0
...,...,...,...
64,0,1,0
91,0,1,0
81,0,1,0
51,0,1,0


In [19]:
n_class = 3

# One classifier per class (n_class in total). Each is trained on that class's
# indicator column, i.e. "this class" vs "everything else".
# A finite C makes these soft-margin SVMs.
clf = [
    SVC(kernel='rbf', C=1, gamma=1).fit(X_train, y_train.iloc[:, class_idx])
    for class_idx in range(n_class)
]

In [20]:
size = X_test.shape[0] # number of test rows

# Evaluate every one-vs-rest classifier once on the whole test set (one column
# per classifier) instead of recomputing all decision values for every sample.
scores = np.column_stack([model.decision_function(X_test) for model in clf])

# The predicted class index (0, 1, 2) is the classifier with the largest
# decision value; argmax keeps the first maximum on ties.
y_pred = scores.argmax(axis=1)

# 0, 1, 2 -> class name strings
y_pred = pd.DataFrame(y_pred).replace({0:'setosa', 1:'versicolor', 2:'virginica'})


In [21]:
y_pred.head()

Unnamed: 0,0
0,versicolor
1,versicolor
2,virginica
3,virginica
4,virginica


In [22]:
 print('Accuacy : {: .5f}'.format(accuracy_score(y_test,y_pred)))

Accuacy :  0.90000


## Compare Result

In [23]:
# 라이브러리 사용, 결과 비교
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X, y, test_size=0.2, random_state=48)

scaler = StandardScaler() #scaling
X_train_2 = scaler.fit_transform(X_train_2)
X_test_2 = scaler.transform(X_test_2)

svm = SVC(kernel ='rbf', C = 1, gamma = 1)
svm.fit(X_train_2, y_train_2)
y_pred = svm.predict(X_test_2)

accuracy_score(y_test_2,y_pred)

0.9

- one vs one
- one vs rest 
- library <br>
모두 C=1, gamma=1 에서 0.9로 동일한 성능을 보인다.