# Multy Layer Perceptron (MLP)

In [5]:
import torch
import random

In [6]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# for reproducibility
random.seed(777)
torch.manual_seed(777)
if device == 'cuda':
    torch.cuda.manual_seed_all(777)

단층으로는 xor 해결 불가 => MLP

하지만 MLP는 W와 b를 학습할 수 없다고 결론지어졌지만 backpropagation을 통해서 학습 가능함이 밝혀짐

### backpropagation

loss에 대해서 neural network에 있는 weight들에 대한 미분값 계산.  
gradient를 가지고 뒷단에 있는 weight부터 loss 값을 최소화 시킬 수 있도록 weight 업데이트

밑에서는 전부 구현을 하지만 실제로는 .backward 라는 funtion을 이용하면 한 줄로 해결됨.

In [8]:
X = torch.FloatTensor([[0,0],[0,1],[1,0],[1,1]]).to(device)
Y = torch.FloatTensor([[0],[1],[1],[0]]).to(device)

# nn Layers
## torch.nn 패키지의 linear layer 사용 안하고 weight와 bias 직접 선언. nn.Linear 2개를 사용했다고 생각
w1 = torch.Tensor(2,2).to(device)
b1 = torch.Tensor(2).to(device)
w2 = torch.Tensor(2,1).to(device)  # 2 -> 1
b2 = torch.Tensor(1).to(device)


#sigmoid 구현
def sigmoid(x):
    #sigmoid function
    return 1.0/(1.0 + torch.exp(-x))
    #return torch.div(torch.tensor(1), torch.add(torch.tensor(1.0), torch.exp(-x)))
    

##sigmoid 미분
def sigmoid_prime(x):
    #derivative of the sigmoid funtion
    return sigmoid(x) * (1 - sigmoid(x))

In [22]:
for step in range(10001):
    # forward
    l1 = torch.add(torch.matmul(X, w1), b1)  # X * W + Bb
    a1 = sigmoid(l1)
    l2 = torch.add(torch.matmul(a1, w2), b2)
    Y_pred = sigmoid(l2)
    
    cost = -torch.mean(Y * torch.log(Y_pred) + (1 - Y) * torch.log(1 - Y_pred))  # binary_cross_entropy
    
    
    # Back prop (chain rule) 
    ## Loss derivative (미분)
    d_Y_pred = (Y_pred - Y) / (Y_pred * (1.0 - Y_pred) + 1e-7)  # binary_cross_entropy -> 1e-7은 나누기 0 막으려고
    
    ## Layer2
    d_l2 = d_Y_pred * sigmoid_prime(l2)
    d_b2 = d_l2  # bias 미분
    d_w2 = torch.matmul(torch.transpose(a1,0,1),d_b2) # weight 미분  #transpose => 2,3 인자 swap 
    
    ## Layer1
    d_a1 = torch.matmul(d_b2, torch.transpose(w2,0,1))
    d_l1 = d_a1 * sigmoid_prime(l1)
    d_b1 = d_l1
    d_w1 = torch.matmul(torch.transpose(X,0,1),d_b1)
    
    ## weight update  => step 함수 이용하면 간단하게 한 줄로 해결
    learning_rate = 0.001
    w1 = w1 - learning_rate * d_w1  # gradient descent minimize
    b1 = b1 - learning_rate * torch.mean(d_b1, 0)
    w2 = w2 - learning_rate * d_w2
    b2 = b2 - learning_rate * torch.mean(d_b2, 0)
    
    if step % 100 == 0:
        print(step, cost.item())

0 nan
100 nan
200 nan
300 nan
400 nan
500 nan
600 nan
700 nan
800 nan
900 nan
1000 nan
1100 nan
1200 nan
1300 nan
1400 nan
1500 nan
1600 nan
1700 nan
1800 nan
1900 nan
2000 nan
2100 nan
2200 nan
2300 nan
2400 nan
2500 nan
2600 nan
2700 nan
2800 nan
2900 nan
3000 nan
3100 nan
3200 nan
3300 nan
3400 nan
3500 nan
3600 nan
3700 nan
3800 nan
3900 nan
4000 nan
4100 nan
4200 nan
4300 nan
4400 nan
4500 nan
4600 nan
4700 nan
4800 nan
4900 nan
5000 nan
5100 nan
5200 nan
5300 nan
5400 nan
5500 nan
5600 nan
5700 nan
5800 nan
5900 nan
6000 nan
6100 nan
6200 nan
6300 nan
6400 nan
6500 nan
6600 nan
6700 nan
6800 nan
6900 nan
7000 nan
7100 nan
7200 nan
7300 nan
7400 nan
7500 nan
7600 nan
7700 nan
7800 nan
7900 nan
8000 nan
8100 nan
8200 nan
8300 nan
8400 nan
8500 nan
8600 nan
8700 nan
8800 nan
8900 nan
9000 nan
9100 nan
9200 nan
9300 nan
9400 nan
9500 nan
9600 nan
9700 nan
9800 nan
9900 nan
10000 nan


###### Code : xor-nn

In [17]:
X = torch.FloatTensor([[0,0],[0,1],[1,0],[1,1]]).to(device)
Y = torch.FloatTensor([[0],[1],[1],[0]]).to(device)

# nn Layers
linear1 = torch.nn.Linear(2,2, bias = True)
linear2 = torch.nn.Linear(2,1, bias = True)

sigmoid = torch.nn.Sigmoid()
model = torch.nn.Sequential(linear1, sigmoid, linear2, sigmoid).to(device)

#define cost/loss & optimizer
criterion = torch.nn.BCELoss().to(device)
optimizer = torch.optim.SGD(model.parameters(),lr=1)

for step in range(10001):
    optimizer.zero_grad()
    hypothesis = model(X)
    # cost/loss function
    cost = criterion(hypothesis, Y)
    cost.backward()
    optimizer.step()
    if step % 100 == 0 :
        print(step, cost.item())

0 0.7126196026802063
100 0.6933620572090149
200 0.6932636499404907
300 0.6932241916656494
400 0.6932039260864258
500 0.6931912899017334
600 0.6931823492050171
700 0.6931754350662231
800 0.6931697130203247
900 0.6931648254394531
1000 0.6931605339050293
1100 0.6931564807891846
1200 0.6931525468826294
1300 0.6931486129760742
1400 0.6931445002555847
1500 0.693139910697937
1600 0.6931346654891968
1700 0.6931277513504028
1800 0.6931186318397522
1900 0.6931052207946777
2000 0.6930840611457825
2100 0.6930474042892456
2200 0.6929752826690674
2300 0.6928061842918396
2400 0.6922802925109863
2500 0.6895955204963684
2600 0.6631379127502441
2700 0.5434404611587524
2800 0.2109796702861786
2900 0.07923407107591629
3000 0.04585752636194229
3100 0.031775638461112976
3200 0.024158718064427376
3300 0.01942525804042816
3400 0.016212500631809235
3500 0.013895191252231598
3600 0.012147676199674606
3700 0.0107844527810812
3800 0.00969223864376545
3900 0.008798046037554741
4000 0.008052884601056576
4100 0.0074

In [19]:
# Accuracy computation
# True if hypothesis>0.5 else False
with torch.no_grad(): ## gradient 계산 x => 테스트 사용 시 실수 방지 습관
    hypothesis = model(X)
    predicted = (hypothesis > 0.5).float()
    accuracy = (predicted == Y).float().mean()
    print('\nHypothesis: ', hypothesis.detach().cpu().numpy(), '\nCorrect: ', predicted.detach().cpu().numpy(), '\nAccuracy: ', accuracy.item())


Hypothesis:  [[0.00151766]
 [0.9988372 ]
 [0.9988397 ]
 [0.00134214]] 
Correct:  [[0.]
 [1.]
 [1.]
 [0.]] 
Accuracy:  1.0


###### Code : xor-nn-wide-deep

In [21]:
X = torch.FloatTensor([[0,0],[0,1],[1,0],[1,1]]).to(device)
Y = torch.FloatTensor([[0],[1],[1],[0]]).to(device)

# nn Layers
linear1 = torch.nn.Linear(2,2, bias = True)
linear2 = torch.nn.Linear(10,10, bias = True)
linear3 = torch.nn.Linear(10,10, bias = True)
linear4 = torch.nn.Linear(10,1, bias = True)

sigmoid = torch.nn.Sigmoid()
model = torch.nn.Sequential(linear1, sigmoid, linear2, sigmoid, linear3, sigmoid, linear4, sigmoid).to(device)

#define cost/loss & optimizer
criterion = torch.nn.BCELoss().to(device)
optimizer = torch.optim.SGD(model.parameters(),lr=1)

for step in range(10001):
    optimizer.zero_grad()
    hypothesis = model(X)
    # cost/loss function
    cost = criterion(hypothesis, Y)
    cost.backward()
    optimizer.step()
    if step % 100 == 0 :
        print(step, cost.item())

RuntimeError: mat1 dim 1 must match mat2 dim 0