## Softmax Classification

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

$$P(class=i) = \frac{e^{z_i}}{\sum_j e^{z_j}}$$

In [2]:
# Example logits for three classes, stored as a float32 vector.
z = torch.tensor([1, 2, 3], dtype=torch.float32)

- argmax(z) as a one-hot vector = [0,0,1]
- softmax(z): a soft (differentiable) version of the arg max

In [3]:
# Softmax over the only axis of z; the entries are positive and sum to 1.
z.softmax(dim=0)

tensor([0.0900, 0.2447, 0.6652])

### Cross-entropy loss

 $$ L = -\frac{1}{N}\sum_{n=1}^{N}\sum_{c=1}^{C} y_{n,c} \log(\hat{y}_{n,c})$$
 where $\hat{y}_{n,c}$ is the predicted probability of class $c$ for sample $n$ and $y_{n,c}$ is the one-hot true label (0 or 1)

In [4]:
# Random scores in [0, 1) for 3 samples x 5 classes, tracked by autograd.
logit = torch.rand(3, 5).requires_grad_()
print(logit)

tensor([[0.1836, 0.6275, 0.8017, 0.8295, 0.0911],
        [0.3492, 0.1369, 0.0062, 0.1066, 0.6846],
        [0.7011, 0.0085, 0.4691, 0.2907, 0.5425]], requires_grad=True)


In [5]:
# Normalise every row (dim=1 runs over the classes) into a probability distribution.
pred = logit.softmax(dim=1)
print(pred)

tensor([[0.1382, 0.2155, 0.2565, 0.2637, 0.1260],
        [0.2127, 0.1720, 0.1509, 0.1669, 0.2975],
        [0.2625, 0.1313, 0.2081, 0.1741, 0.2240]], grad_fn=<SoftmaxBackward>)


In [6]:
# One-hot targets for the three samples: true classes 2, 0 and 4 out of 5.
one_hot_label = F.one_hot(torch.tensor([2, 0, 4]), num_classes=5).float()
print(one_hot_label)

tensor([[0., 0., 1., 0., 0.],
        [1., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1.]])


In [7]:
# Element-wise cross-entropy terms: the one-hot mask keeps only the
# negative log-probability of the true class in each row.
loss = torch.mul(-one_hot_label, pred.log())
print(loss)

tensor([[0.0000, 0.0000, 1.3606, 0.0000, 0.0000],
        [1.5479, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 1.4962]], grad_fn=<MulBackward0>)


In [8]:
# Sum over the class axis to get one loss per sample, then average over samples.
loss.sum(dim=1).mean()

tensor(1.4682, grad_fn=<MeanBackward0>)

In [9]:
# F.cross_entropy takes the raw logits and the class-index labels
# (not one-hot vectors) as input.
F.cross_entropy(logit, torch.tensor([2, 0, 4]))

tensor(1.4682, grad_fn=<NllLossBackward>)

### Basic training

In [10]:
# Toy dataset: 8 samples with 4 features each, labelled with one of 3 classes.
x_train = torch.tensor(
    [
        [1, 2, 1, 1],
        [2, 1, 3, 2],
        [3, 1, 3, 4],
        [4, 1, 5, 5],
        [1, 7, 5, 5],
        [1, 2, 5, 6],
        [1, 6, 6, 6],
        [1, 7, 7, 7],
    ],
    dtype=torch.float32,
)
# Class indices are stored as int64.
y_train = torch.tensor([2, 2, 2, 1, 1, 1, 0, 0], dtype=torch.int64)

In [11]:
# Trainable parameters, both initialised to zero:
# W has shape (4, 3) and b has shape (3,).
W = torch.zeros(4, 3, requires_grad=True)
b = torch.zeros((3,), requires_grad=True)

In [12]:
# Plain SGD over the two hand-made parameters.
optimizer = optim.SGD(params=[W, b], lr=0.1)

In [13]:
for step in range(1000):
    # Clear the gradients left over from the previous iteration.
    optimizer.zero_grad()

    # Forward pass: raw class scores. F.cross_entropy is given the logits
    # directly, so no explicit softmax is applied here. The manual
    # equivalent, computing the cross-entropy loss from the softmax
    # probabilities (y_one_hot being the one-hot encoding of y_train), is:
    #     prob = F.softmax(x_train.matmul(W) + b, dim=1)
    #     cost = (y_one_hot * -torch.log(prob)).sum(dim=1).mean()
    logit = x_train @ W + b
    loss = F.cross_entropy(logit, y_train)

    # Backward pass followed by the SGD parameter update.
    loss.backward()
    optimizer.step()

    # Report the loss every 100 steps.
    if step % 100 == 0:
        print('step:',step, 'loss:', loss.item())

step: 0 loss: 1.0986123085021973
step: 100 loss: 0.7041994333267212
step: 200 loss: 0.6229996085166931
step: 300 loss: 0.5657167434692383
step: 400 loss: 0.5152913331985474
step: 500 loss: 0.467661589384079
step: 600 loss: 0.4212777018547058
step: 700 loss: 0.37540170550346375
step: 800 loss: 0.3297656178474426
step: 900 loss: 0.28507253527641296


### Implementation with nn.Module

In [14]:
class SoftmaxClassifier(nn.Module):
    """Single linear layer that produces unnormalised class scores (logits).

    No softmax is applied inside the module; pass the output to
    ``F.cross_entropy``, which expects raw logits.
    """

    def __init__(self, in_features=4, num_classes=3):
        """Create the linear layer.

        Args:
            in_features: Number of input features per sample (default 4).
            num_classes: Number of output classes (default 3).
        """
        super().__init__()
        self.linear = nn.Linear(in_features, num_classes)

    def forward(self, x):
        """Return the logits for ``x``.

        Args:
            x: Tensor whose last dimension has size ``in_features``.

        Returns:
            Tensor with the last dimension replaced by ``num_classes``.
        """
        return self.linear(x)

In [15]:
# Build the classifier with its default constructor arguments.
model = SoftmaxClassifier()

In [16]:
# Rebind `optimizer` to a plain SGD optimiser over the module's parameters.
optimizer = optim.SGD(params=model.parameters(), lr=0.1)

In [17]:
for step in range(1000):
    # Clear the gradients left over from the previous iteration.
    optimizer.zero_grad()

    # Forward pass: despite its name, `pred` holds the model's raw output
    # here, which is handed straight to F.cross_entropy.
    pred = model(x_train)
    loss = F.cross_entropy(pred, y_train)

    # Backward pass followed by the SGD parameter update.
    loss.backward()
    optimizer.step()

    # Report the loss every 100 steps.
    if step % 100 == 0:
        print('step:',step, 'loss:', loss.item())

step: 0 loss: 2.206914186477661
step: 100 loss: 0.7373358607292175
step: 200 loss: 0.6502688527107239
step: 300 loss: 0.5924007296562195
step: 400 loss: 0.5422427654266357
step: 500 loss: 0.4951184093952179
step: 600 loss: 0.44932663440704346
step: 700 loss: 0.4040595293045044
step: 800 loss: 0.35889995098114014
step: 900 loss: 0.313834547996521
