 # Review: XOR

XOR는 하나의 층(layer)만 가지는 perceptron으로는 해결할 수 없는 문제임을 배웠다. 따라서 이 문제는 Multilayer Perceptron으로 해결해야 한다.

# Multilayer Perceptron

하나의 층을 더 쌓아서 선을 하나 더 긋는 원리이다.

![텍스트](https://www.researchgate.net/profile/Hamidreza_Modares/publication/220283506/figure/fig1/AS:669990353133590@1536749556344/Solving-XOR-problem-using-3-conventional-neurons-as-a-2-2-1-MLP-network.png)

# Backpropagation

output과 원래 정답인 G(t)간의 차이, 즉 loss(=cost)에 대해서 weight들에 대한 미분 값을 계산하게 되고 이 gradient를 가지고 뒷단에 있는 weight부터 loss값을 최소화시킬 수 있도록 weight를 업데이트 하는 방식이다.

In [36]:
import torch

In [37]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# for reproducibility
torch.manual_seed(777)
if device == 'cuda':
    torch.cuda.manual_seed_all(777)

In [38]:
X = torch.FloatTensor([[0, 0], [0, 1], [1, 0], [1, 1]]).to(device)
Y = torch.FloatTensor([[0], [1], [1], [0]]).to(device)

linear1 = torch.nn.Linear(2, 2, bias=True)<br>
linear2 = torch.nn.Linear(2, 1, bias=True)<br>
sigmoid = torch.nn.Sigmoid()

평상시에는 이렇게 torch.nn 패키지에 있는 linear layer를 주로 사용하지만 이번에는 backpropagation으로 직접 업데이트 하기 위해 weight와 bias를 직접 다 선언해보도록 하겠다.

In [39]:
# nn layers
w1 = torch.Tensor(2, 2).to(device)
b1 = torch.Tensor(2).to(device)
w2 = torch.Tensor(2, 1).to(device)
b2 = torch.Tensor(1).to(device)

In [40]:
# sigmoid도 pytorch가아닌 직접 함수로 적용해보겠다,
def sigmoid(x):
    return 1.0 / (1.0 + torch.exp(-x))

# sigmoid를 미분한 함수 (backpropagation결과임)
def sigmoid_prime(x):
    return sigmoid(x) * (1-sigmoid(x))

In [41]:
learning_rate = 1
for step in range(10001):
    # forward
    l1 = torch.add(torch.matmul(X, w1), b1)
    a1 = sigmoid(l1)
    l2 = torch.add(torch.matmul(a1, w2), b2)
    Y_pred = sigmoid(l2)
    
    # binary_cross_entropy loss
    cost = -torch.mean(Y * torch.log(Y_pred) + (1 - Y) * torch.log(1 - Y_pred))
    
    # Back prop (chain rule)
    # binary_cross_entropy loss를 미분한 식
    d_Y_pred = (Y_pred - Y) / (Y_pred * (1.0 - Y_pred) + 1e-7)
    # 마지막 항 1e-7은 0으로 나누어지는 경우를 막아주기 위한 term
    
    # Layer 2
    d_l2 = d_Y_pred * sigmoid_prime(l2) 
    d_b2 = d_l2
    d_w2 = torch.matmul(torch.transpose(a1, 0, 1), d_b2) # transpose(x, y, z) : y 와 z 차원을 서로 swap시켜라
    # 여기서 a1은 (4, 2)였는데 (2, 4)로 바껴서 d_b2(= d_l2)즉 (4, 1)과 행렬곱이 가능해짐
    
    # Layer 1
    d_a1 = torch.matmul(d_b2, torch.transpose(w2, 0, 1))
    d_l1 = d_a1 * sigmoid_prime(l1)
    d_b1 = d_l1
    d_w1 = torch.matmul(torch.transpose(X, 0, 1), d_b1) 
    
    # Weight update
    # gradient descent를 minimize 시키는 원리
    w1 = w1 - learning_rate * d_w1
    b1 = b1 - learning_rate * torch.mean(d_b1, 0)
    w2 = w2 - learning_rate * d_w2
    b2 = b2 - learning_rate * torch.mean(d_b2, 0)
    
    if step % 100 == 0 :
        print(step, cost.item())

0 0.7240769863128662
100 0.6931483745574951
200 0.6931473016738892
300 0.6931471824645996
400 0.6931471824645996
500 0.6931471824645996
600 0.6931471824645996
700 0.6931471824645996
800 0.6931471824645996
900 0.6931471228599548
1000 0.6931471824645996
1100 0.6931471824645996
1200 0.6931471228599548
1300 0.6931471824645996
1400 0.6931471824645996
1500 0.6931471824645996
1600 0.6931471824645996
1700 0.6931471824645996
1800 0.6931471824645996
1900 0.6931471824645996
2000 0.6931471824645996
2100 0.6931471228599548
2200 0.6931471824645996
2300 0.6931471824645996
2400 0.6931471824645996
2500 0.6931471824645996
2600 0.6931471824645996
2700 0.6931470632553101
2800 0.6931471824645996
2900 0.6931471824645996
3000 0.6931471824645996
3100 0.6931470632553101
3200 0.6931470036506653
3300 0.6931470632553101
3400 0.6931470632553101
3500 0.6931470632553101
3600 0.6931469440460205
3700 0.693146824836731
3800 0.693146824836731
3900 0.6931465268135071
4000 0.6931461095809937
4100 0.6931454539299011
4200 0

In [42]:
# Accuracy computation
# True if hypothesis>0.5 else False
with torch.no_grad():
    hypothesis = model(X)
    predicted = (hypothesis > 0.5).float()
    accuracy = (predicted == Y).float().mean()
    print('\nHypothesis: ', hypothesis.detach().cpu().numpy(), '\nCorrect: ', predicted.detach().cpu().numpy(), '\nAccuracy: ', accuracy.item())


Hypothesis:  [[0.00151766]
 [0.9988372 ]
 [0.9988397 ]
 [0.00134214]] 
Correct:  [[0.]
 [1.]
 [1.]
 [0.]] 
Accuracy:  1.0


# Code: xor-nn

In [43]:
import torch

In [44]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# for reproducibility
torch.manual_seed(777)
if device == 'cuda':
    torch.cuda.manual_seed_all(777)

In [45]:
X = torch.FloatTensor([[0, 0], [0, 1], [1, 0], [1, 1]]).to(device)
Y = torch.FloatTensor([[0], [1], [1], [0]]).to(device)

In [46]:
# nn layers
linear1 = torch.nn.Linear(2, 2, bias=True)
linear2 = torch.nn.Linear(2, 1, bias=True)
sigmoid = torch.nn.Sigmoid()

In [47]:
# model
model = torch.nn.Sequential(linear1, sigmoid, linear2, sigmoid).to(device)

In [48]:
# define cost/loss & optimizer
criterion = torch.nn.BCELoss().to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=1)  # modified learning rate from 0.1 to 1

In [49]:
for step in range(10001):
    optimizer.zero_grad()
    hypothesis = model(X)

    # cost/loss function
    cost = criterion(hypothesis, Y)
    cost.backward()
    optimizer.step()

    if step % 100 == 0:
        print(step, cost.item())

0 0.7434073090553284
100 0.693165123462677
200 0.6931577920913696
300 0.6931517124176025
400 0.6931463479995728
500 0.6931411027908325
600 0.6931357383728027
700 0.6931294798851013
800 0.6931220293045044
900 0.6931126117706299
1000 0.6930999755859375
1100 0.6930822730064392
1200 0.6930569410324097
1300 0.6930190324783325
1400 0.6929606199264526
1500 0.6928660273551941
1600 0.6927032470703125
1700 0.6923959255218506
1800 0.6917300820350647
1900 0.6899652481079102
2000 0.6838312149047852
2100 0.6561650037765503
2200 0.4310865104198456
2300 0.13488933444023132
2400 0.06630323827266693
2500 0.04216768220067024
2600 0.03045358881354332
2700 0.023665759712457657
2800 0.019277628511190414
2900 0.016223931685090065
3000 0.013983718119561672
3100 0.012273887172341347
3200 0.010928073897957802
3300 0.009842442348599434
3400 0.008948973380029202
3500 0.008201291784644127
3600 0.0075667379423975945
3700 0.007021641358733177
3800 0.006548580713570118
3900 0.006134208757430315
4000 0.005768344737589

In [35]:
# Accuracy computation
# True if hypothesis>0.5 else False
with torch.no_grad():
    hypothesis = model(X)
    predicted = (hypothesis > 0.5).float()
    accuracy = (predicted == Y).float().mean()
    print('\nHypothesis: ', hypothesis.detach().cpu().numpy(), '\nCorrect: ', predicted.detach().cpu().numpy(), '\nAccuracy: ', accuracy.item())


Hypothesis:  [[0.00151766]
 [0.9988372 ]
 [0.9988397 ]
 [0.00134214]] 
Correct:  [[0.]
 [1.]
 [1.]
 [0.]] 
Accuracy:  1.0


지난번과 다르게 loss가 계속 감소하는 양상을 확인할 수 있고, 입력에 따라 XOR 게이트에 맞게 잘 출력된다.

# Code: xor-nn-wide-deep

이전 2개의 레이어에 2개를 더 쌓아서 총 4개짜리 Multilayer Perception을 만들어서 학습을 진행한다.

In [57]:
import torch

In [58]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# for reproducibility
torch.manual_seed(777)
if device == 'cuda':
    torch.cuda.manual_seed_all(777)

In [59]:
X = torch.FloatTensor([[0, 0], [0, 1], [1, 0], [1, 1]]).to(device)
Y = torch.FloatTensor([[0], [1], [1], [0]]).to(device)

In [60]:
# nn layers
linear1 = torch.nn.Linear(2, 10, bias=True)
linear2 = torch.nn.Linear(10, 10, bias=True)
linear3 = torch.nn.Linear(10, 10, bias=True)
linear4 = torch.nn.Linear(10, 1, bias=True)
sigmoid = torch.nn.Sigmoid()

In [61]:
# model
model = torch.nn.Sequential(linear1, sigmoid, linear2, sigmoid, linear3, sigmoid, linear4, sigmoid).to(device)

In [62]:
# define cost/loss & optimizer
criterion = torch.nn.BCELoss().to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=1)  # modified learning rate from 0.1 to 1

In [63]:
for step in range(10001):
    optimizer.zero_grad()
    hypothesis = model(X)

    # cost/loss function
    cost = criterion(hypothesis, Y)
    cost.backward()
    optimizer.step()

    if step % 100 == 0:
        print(step, cost.item())

0 0.6948983669281006
100 0.693155825138092
200 0.6931535601615906
300 0.6931513547897339
400 0.6931493282318115
500 0.6931473016738892
600 0.6931453943252563
700 0.6931435465812683
800 0.6931416988372803
900 0.6931397914886475
1000 0.6931380033493042
1100 0.6931361556053162
1200 0.6931343078613281
1300 0.6931324005126953
1400 0.6931304931640625
1500 0.6931284666061401
1600 0.6931264400482178
1700 0.6931242942810059
1800 0.6931220293045044
1900 0.6931197047233582
2000 0.6931172013282776
2100 0.6931144595146179
2200 0.6931116580963135
2300 0.6931084990501404
2400 0.6931051015853882
2500 0.6931014657020569
2600 0.6930974721908569
2700 0.6930930018424988
2800 0.6930880546569824
2900 0.6930825114250183
3000 0.6930763721466064
3100 0.6930692791938782
3200 0.6930612325668335
3300 0.6930518746376038
3400 0.6930411458015442
3500 0.6930283904075623
3600 0.6930133104324341
3700 0.6929951310157776
3800 0.6929728984832764
3900 0.6929453015327454
4000 0.6929103136062622
4100 0.6928648948669434
4200 

In [64]:
# Accuracy computation
# True if hypothesis>0.5 else False
with torch.no_grad():
    hypothesis = model(X)
    predicted = (hypothesis > 0.5).float()
    accuracy = (predicted == Y).float().mean()
    print('\nHypothesis: ', hypothesis.detach().cpu().numpy(), '\nCorrect: ', predicted.detach().cpu().numpy(), '\nAccuracy: ', accuracy.item())


Hypothesis:  [[1.11760564e-04]
 [9.99828696e-01]
 [9.99842167e-01]
 [1.85418889e-04]] 
Correct:  [[0.]
 [1.]
 [1.]
 [0.]] 
Accuracy:  1.0


2개짜리 layer보다 loss가 더 많이 줄어든 것을 확인할 수 있고, 이는 그만큼 더 많은 학습을 했다는 것을 의미한다.