# Multiclass Classification - Iris data

- 이번에는 0,1분류가 아닌 multiclass에 대한 분류를 해보겠습니다.
- 먼저 간단한 iris data를 이용해보겠습니다.

In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import numpy as np
import random

random.seed(777)
torch.manual_seed(777)

ModuleNotFoundError: No module named 'torch'

In [2]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import normalize

from torch.utils.data import Dataset
from torch.utils.data import DataLoader

ModuleNotFoundError: No module named 'torch'

In [3]:
from sklearn.datasets import load_iris

## Load Data & Preprocessing

In [6]:
iris = load_iris()
iris.target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [7]:
# train과 test set을 분리합니다.

train_X, test_X, train_y, test_y = train_test_split(iris.data, iris.target, test_size=0.3)

In [8]:
train_y

array([1, 1, 0, 1, 0, 0, 0, 0, 2, 0, 1, 2, 0, 1, 2, 0, 0, 0, 1, 2, 1, 1,
       0, 2, 2, 0, 2, 0, 1, 1, 2, 0, 0, 2, 1, 0, 1, 1, 2, 2, 1, 0, 2, 1,
       2, 1, 2, 0, 2, 0, 1, 1, 1, 2, 0, 0, 2, 1, 1, 1, 1, 2, 0, 1, 0, 0,
       2, 0, 0, 1, 0, 2, 0, 0, 2, 2, 2, 1, 2, 2, 2, 2, 2, 1, 2, 1, 1, 0,
       0, 2, 2, 2, 1, 1, 1, 0, 0, 0, 0, 0, 2, 0, 2, 2, 2])

In [9]:
# 이제 익숙해지고 계신가요?

class makeData(Dataset):
    def __init__(self, X_data, y_data):
        self.X_data = X_data
        self.y_data = y_data
        
    def __getitem__(self, index):
        return self.X_data[index], self.y_data[index]
        
    def __len__ (self):
        return len(self.X_data)

In [10]:
# normalize은 기본적으로 열(column)기준입니다.

train_data = makeData(np.array(normalize(train_X)), np.array(train_y))
test_data = makeData(np.array(normalize(test_X)), np.array(test_y))

In [11]:
train_data.__getitem__(0)

(array([0.74714194, 0.33960997, 0.54337595, 0.17659719]), 1)

In [12]:
train_data.__len__()

105

## Modeling

In [13]:
class Multiclass_Classification(nn.Module):
    
    def __init__(self, num_feature, num_class):
        super(Multiclass_Classification, self).__init__()
        
        # 이전 예제에서는 __init__에 self만 있었습니다.
        # 이제는 넣고 싶은 feature의 개수와 class를 자유롭게 하기 위해 변수로 지정합니다.
        # Layer 역시 활성화 함수를 ReLU로 변경하고 Layer 안에 넣어 좀 더 짧은 코드로 신경망을 구성했습니다.
        
        self.Layer_1 = nn.Sequential(
                                nn.Linear(num_feature, 100),
                                nn.ReLU()
                            )
        self.Layer_2 = nn.Sequential(
                                nn.Linear(100, 50),
                                nn.ReLU()
                            )
        
        self.Layer_3 = nn.Sequential(
                                nn.Linear(50, 30),
                                nn.ReLU()
                            )
        
        self.dropout = nn.Dropout(0.5)
        
        self.out = nn.Linear(30, num_class)
        
    def forward(self, inputs):
        
        x = self.Layer_1(inputs)
        x = self.Layer_2(x)
        x = self.Layer_3(x)
        
        x = self.out(x)
        
        return x
    
    def predict(self, test_inputs):
        
        x = self.Layer_1(inputs)
        x = self.Layer_2(x)
        x = self.Layer_3(x)
        
        x = self.out(x)
        
        # 1차원의 최대값 위치들을 출력합니다.
        
        return torch.max(x, 1)[1]

## Training

In [15]:
# 30개씩 1000번 학습시켜보겠습니다.

EPOCHS = 1000
BATCH_SIZE = 30
NUM_FEATURE = 4
NUM_CLASS = 3

model = Multiclass_Classification(NUM_FEATURE, NUM_CLASS)

# pytorch에서는 NLL loss + log softmax가 이미 적용되어 있습니다.
criterion = nn.CrossEntropyLoss()

train_loader = DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)

optimizer = optim.Adam(model.parameters(), lr=0.001)

for epoch in range(EPOCHS):
    
    for X_batch, y_batch in train_loader:
        
        #torch.tensor는 기본적으로 float로 들어갑니다.
        
        inputs = torch.Tensor(X_batch.float())
        targets = torch.LongTensor(y_batch.long())
        model.zero_grad()
        y_pred = model(inputs)
        loss = criterion(y_pred, targets)
        loss.backward()
        optimizer.step()
    
    if epoch % 100 == 0:
        print(loss)

tensor(1.0754, grad_fn=<NllLossBackward>)
tensor(0.1792, grad_fn=<NllLossBackward>)
tensor(0.0336, grad_fn=<NllLossBackward>)
tensor(0.0219, grad_fn=<NllLossBackward>)
tensor(0.0289, grad_fn=<NllLossBackward>)
tensor(0.0196, grad_fn=<NllLossBackward>)
tensor(0.0973, grad_fn=<NllLossBackward>)
tensor(0.1993, grad_fn=<NllLossBackward>)
tensor(0.0204, grad_fn=<NllLossBackward>)
tensor(0.0810, grad_fn=<NllLossBackward>)


## Test Accuracy

In [16]:
test_loader = DataLoader(dataset=test_data, batch_size=1)

test_y_true = []
test_y_pred = []

for X_batch, y_batch in test_loader:
    
    inputs = torch.Tensor(X_batch.float())
    test_y_true.append(y_batch.item())
    y_pred = model.predict(inputs)
    test_y_pred.append(int(y_pred.item()))

In [17]:
print(test_y_pred)

[2, 1, 1, 0, 2, 0, 1, 1, 0, 2, 1, 1, 1, 1, 1, 2, 2, 1, 1, 0, 2, 0, 1, 0, 0, 1, 2, 2, 0, 0, 1, 2, 1, 0, 0, 2, 1, 1, 1, 0, 1, 2, 0, 2, 2]


In [18]:
print(test_y_true)

[2, 1, 1, 0, 2, 0, 2, 1, 0, 2, 1, 1, 1, 1, 1, 2, 2, 1, 1, 0, 1, 0, 2, 0, 0, 1, 2, 2, 0, 0, 1, 2, 1, 0, 0, 2, 1, 1, 1, 0, 1, 2, 0, 2, 2]


In [19]:
# 약 93.3%의 정확도를 보입니다.

accuracy_score(test_y_pred, test_y_true)

0.9333333333333333

# Multi-Classification_wine_quality

- feature가 좀 더 많은 wine quality 데이터를 이용해 wine quality 분류 문제를 풀어보겠습니다.

In [20]:
import pandas as pd

In [21]:
wine = pd.read_csv('./2_winequality-red.csv', sep=';')

In [22]:
wine.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [23]:
train_X, test_X, train_y, test_y = train_test_split(wine.iloc[:,range(11)], wine.iloc[:,-1], test_size=0.3)

In [24]:
class makeData(Dataset):
    def __init__(self, X_data, y_data):
        self.X_data = X_data
        self.y_data = y_data
        
    def __getitem__(self, index):
        return self.X_data[index], self.y_data[index]
        
    def __len__ (self):
        return len(self.X_data)

In [25]:
# normalize까지만 해보겠습니다. 열 기준이기 때문에 바로 넣어도 괜찮습니다.

train_data = makeData(np.array(normalize(train_X)), np.array(train_y))
test_data = makeData(np.array(normalize(test_X)), np.array(test_y))

In [26]:
train_data.__getitem__(0)

(array([1.00918407e-01, 8.48299649e-03, 1.46258560e-04, 2.77891264e-02,
        1.17006848e-03, 5.85034241e-01, 7.89796225e-01, 1.45794921e-02,
        4.97279105e-02, 1.06768749e-02, 1.41870803e-01]), 5)

In [27]:
train_data.__len__()

1119

In [37]:
class Multiclass_Classification(nn.Module):
    def __init__(self, num_feature, num_class):
        
        super(Multiclass_Classification, self).__init__()
        
        self.Layer_1 = nn.Sequential(
                                nn.Linear(num_feature, 10),
                                nn.ReLU()
                            )
        self.Layer_2 = nn.Sequential(
                                nn.Linear(10, 10),
                                nn.ReLU()
                            )
        
        self.Layer_3 = nn.Sequential(
                                nn.Linear(10, 10),
                                nn.ReLU()
                            )
        self.out = nn.Linear(10, num_class)
        
    def forward(self, inputs):
        
        x = self.Layer_1(inputs)
        x = self.Layer_2(x)
        x = self.Layer_3(x)
        x = self.out(x)
        
        return x
    
    def predict(self, test_inputs):
        
        x = self.Layer_1(inputs)
        x = self.Layer_2(x)
        x = self.Layer_3(x)
        x = self.out(x)
        
        return torch.max(x, 1)[1]

In [38]:
EPOCHS = 200
BATCH_SIZE = 20
NUM_FEATURE = 11
NUM_CLASS = 11

model = Multiclass_Classification(NUM_FEATURE, NUM_CLASS)

criterion = nn.CrossEntropyLoss()

train_loader = DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)

optimizer = optim.Adam(model.parameters(), lr=0.001)

for epoch in range(EPOCHS):
    for X_batch, y_batch in train_loader:
        
        inputs = Variable(torch.Tensor(X_batch.float()))
        targets = Variable(torch.LongTensor(y_batch.long()))
        
        model.zero_grad()
        y_pred = model(inputs)
        
        loss = criterion(y_pred, targets)
        loss.backward()
        optimizer.step()
    
    if epoch % 100 == 0:
        print(loss)

tensor(2.2257, grad_fn=<NllLossBackward>)


In [39]:
test_loader = DataLoader(dataset=test_data, batch_size=1)
test_y_true = []
test_y_pred = []

for X_batch, y_batch in test_loader:
    
    inputs = Variable(torch.Tensor(X_batch.float()))
    test_y_true.append(y_batch.item())
    y_pred = model.predict(inputs)
    test_y_pred.append(int(y_pred.item()))

In [40]:
# 36.8%의 accuracy가 나왔습니다. 

accuracy_score(test_y_pred, test_y_true)

0.36875