In [1]:
#cross entropy를 최소화하는 것이 중요

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [2]:
torch.manual_seed(1)


<torch._C.Generator at 0x1df460d05f0>

In [6]:
z = torch.FloatTensor([1,2,3])
print(z)
hypothesis = F.softmax(z, dim = 0)
print(hypothesis)
hypothesis.sum()

tensor([1., 2., 3.])
tensor([0.0900, 0.2447, 0.6652])


tensor(1.)

### low level

In [8]:
z = torch.rand(3, 5, requires_grad = True) #아무거나
#requires_grad = True: gradient를 배우도록 되어 있다.
hypothesis = F.softmax(z, dim = 1)
#dimension 1에 대해 softmax를 씌운다. 두번째 디맨젼에 대해 softmax를 수행해라
#결과값은 이미 softmax가 수행되고 난 결과임
print(hypothesis) #이건 prediction, y hat임.

y = torch.randint(5, (3,)).long() #이건 정답, 정답을 랜덤으로 생성
print(y) #정답 index
#클래스의 개수: 5개, sample수: 3개

tensor([[0.1664, 0.1871, 0.1737, 0.2695, 0.2033],
        [0.2002, 0.1783, 0.2218, 0.1944, 0.2054],
        [0.1809, 0.2380, 0.2318, 0.1084, 0.2409]], grad_fn=<SoftmaxBackward>)
tensor([3, 1, 2])


In [13]:
y_one_hot = torch.zeros_like(hypothesis) #y_one_hot: (3, 5)
y_one_hot.scatter_(1, y.unsqueeze(1), 1) #dimension 1에 대해 y.unsqueeze해서 1을 뿌려
#y의 size -> (3,), y.unsqueeze -> (3, 1)
#_을 적어줘서 y_one_hot에 값을 저장, 새로 할당하는 게 아니라.

cost = (y_one_hot * -torch.log(hypothesis)).sum(dim = 1).mean()
#원래가 (3,5) -> dimension 1에 대해 sum을 하면 (3,1) -> mean을 하면 scalar값이 나옴
print(cost) #이렇게 해서 평균을 구함.

tensor(1.4992, grad_fn=<MeanBackward0>)


### cross-entropy loss with torch.nn.functional

In [16]:
#low level
print(torch.log(F.softmax(z, dim = 1)))

#high level
print(F.log_softmax(z, dim = 1))

#똑같음

tensor([[-1.7935, -1.6760, -1.7504, -1.3114, -1.5929],
        [-1.6086, -1.7244, -1.5062, -1.6381, -1.5826],
        [-1.7096, -1.4354, -1.4617, -2.2223, -1.4236]], grad_fn=<LogBackward>)
tensor([[-1.7935, -1.6760, -1.7504, -1.3114, -1.5929],
        [-1.6086, -1.7244, -1.5062, -1.6381, -1.5826],
        [-1.7096, -1.4354, -1.4617, -2.2223, -1.4236]],
       grad_fn=<LogSoftmaxBackward>)


In [18]:
#low level
print((y_one_hot * -torch.log(F.softmax(z, dim = 1))).sum(dim = 1).mean())

#high level
print(F.nll_loss(F.log_softmax(z, dim=1), y))
#nll = negative log likelihood-> sum(dim=1).mean()을 생략할 수 있음.

print(F.cross_entropy(z, y))

tensor(1.4992, grad_fn=<MeanBackward0>)
tensor(1.4992, grad_fn=<NllLossBackward>)
tensor(1.4992, grad_fn=<NllLossBackward>)


### Training with low-level cross entropy loss

In [23]:
x_train = [[1,2,1,1],
          [2,1,3,2],
          [3,1,3,4],
          [4,1,5,5],
          [1,7,5,5],
          [1,2,5,6],
          [1,6,6,6],
          [1,7,7,7]]
y_train = [2,2,2,1,1,1,0,0]
x_train = torch.FloatTensor(x_train) #x_train = (m,4)
y_train = torch.LongTensor(y_train) #y_train = (m,)

#지금은 하나의 레이어만, 나중에 뉴럴넷은 이 레이어를 많이!
W = torch.zeros((4,3), requires_grad = True) 
#4개 중에서 웨이트를 구해서 결국 클래스 세 개 중 하나의 값으로 나와야 하니까 아웃풋이 3
b = torch.zeros(1, requires_grad = True)
# #samples = m, #classes = 3, dim = 4

optimizer = optim.SGD([W, b], lr = 0.1)

nb_epochs = 1000
for epoch in range(nb_epochs + 1):
    #cost 계산
    hypothesis = F.softmax(x_train.matmul(W) + b, dim = 1)
    y_one_hot = torch.zeros_like(hypothesis)
    y_one_hot.scatter_(1, y_train.unsqueeze(1), 1)
    cost = (y_one_hot * -torch.log(F.softmax(hypothesis, dim=1))).sum(dim=1).mean()
    
    #cost로 h(x) 개선
    optimizer.zero_grad() #gradient초기화
    cost.backward() #backpropagation 수행
    optimizer.step()
    
    if epoch % 100 == 0:
        print('Epoch {:4d}/{} Cost: {:.6f}'.format(
        epoch, nb_epochs, cost.item()))

Epoch    0/1000 Cost: 1.098612
Epoch  100/1000 Cost: 0.901535
Epoch  200/1000 Cost: 0.839114
Epoch  300/1000 Cost: 0.807826
Epoch  400/1000 Cost: 0.788472
Epoch  500/1000 Cost: 0.774822
Epoch  600/1000 Cost: 0.764449
Epoch  700/1000 Cost: 0.756191
Epoch  800/1000 Cost: 0.749398
Epoch  900/1000 Cost: 0.743671
Epoch 1000/1000 Cost: 0.738749


### training with F.cross_entropy

In [24]:
W = torch.zeros((4,3), requires_grad = True)
b = torch.zeros(1, requires_grad = True)

optimizer = optim.SGD([W, b], lr = 0.1)

nb_epochs = 1000
for epoch in range(nb_epochs + 1):
    z = x_train.matmul(W) + b
    cost = F.cross_entropy(z, y_train)
    
    optimizer.zero_grad()
    cost.backward()
    optimizer.step()
    
    if epoch % 100  == 0:
        print('Epoch : {:4d}/{} Cost: {:.6f}'.format(
        epoch, nb_epochs, cost.item()))
    

Epoch :    0/1000 Cost: 1.098612
Epoch :  100/1000 Cost: 0.761050
Epoch :  200/1000 Cost: 0.689991
Epoch :  300/1000 Cost: 0.643229
Epoch :  400/1000 Cost: 0.604117
Epoch :  500/1000 Cost: 0.568255
Epoch :  600/1000 Cost: 0.533922
Epoch :  700/1000 Cost: 0.500291
Epoch :  800/1000 Cost: 0.466908
Epoch :  900/1000 Cost: 0.433507
Epoch : 1000/1000 Cost: 0.399962


### high-level implementation with nn.Module

In [26]:
class SoftmaxClassifierModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(4, 3)
        #리니어 레이어가 있고 4개의 인풋을 받아 3개의 클래스값을 아웃풋으로 내줌
        
    def forward(self, x):
        return self.linear(x) #x는 sample개수(4개)
    #리니어 레이어를 통과하면 x = (m, 4) ->리니어 모델 통과하면(m, 3)이 될 것이다.
    
model = SoftmaxClassifierModel()

optimizer = optim.SGD(model.parameters(), lr = 0.1)

nb_epochs = 1000
for epoch in range(nb_epochs + 1):
    prediction = model(x_train)
    cost = F.cross_entropy(prediction, y_train)
    
    optimizer.zero_grad()
    cost.backward()
    optimizer.step()
    
    if epoch % 100 == 0:
        print('Epoch {:4d}/{} Cost {:.6f}'.format(
        epoch, nb_epochs, cost.item()))

Epoch    0/1000 Cost 3.586499
Epoch  100/1000 Cost 0.650367
Epoch  200/1000 Cost 0.566842
Epoch  300/1000 Cost 0.512495
Epoch  400/1000 Cost 0.468165
Epoch  500/1000 Cost 0.428759
Epoch  600/1000 Cost 0.391982
Epoch  700/1000 Cost 0.356357
Epoch  800/1000 Cost 0.320727
Epoch  900/1000 Cost 0.284398
Epoch 1000/1000 Cost 0.250610
