# MultiLayer Perceptron

## Backpropagation

In [2]:
import torch

In [11]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

X = torch.FloatTensor([[0,0],[0,1],[1,0],[1,1]]).to(device)
Y = torch.FloatTensor([[0],[1],[1],[0]]).to(device)

#nn Layers
w1 = torch.Tensor(2,2).to(device)
b1 = torch.Tensor(2).to(device)
w2 = torch.Tensor(2,1).to(device)
b2 = torch.Tensor(1).to(device)

def sigmoid(x):
    # sigmoid function
    return 1.0 /  (1.0 + torch.exp(-x))
    # return torch.div(torch.tensor(1), torch.add(torch.tensor(1.0), torch.exp(-x)))
    
def sigmoid_prime(x):
    #derivative of the sigmoid function
    return sigmoid(x) * (1 - sigmoid(x))

In [12]:
lr = 1
epoch = 10000
for step in range(epoch + 1):
    #forward
    l1 = torch.add(torch.matmul(X,w1),b1)
    a1 = sigmoid(l1)
    l2 = torch.add(torch.matmul(a1,w2), b2)
    Y_pred = sigmoid(l2)
    
    cost = -torch.mean(Y * torch.log(Y_pred) + (1 - Y) * torch.log(1 - Y_pred))
    
    #Backpropagation (Chain Rule)
    #Loss derivative
    d_Y_pred = (Y_pred - Y) / (Y_pred * (1.0 - Y_pred) + 1e-7)
    
    # Layer 2
    d_l2 = d_Y_pred * sigmoid_prime(l2)
    d_b2 = d_l2
    d_w2 = torch.matmul(torch.transpose(a1, 0, 1), d_b2)
    
    # Layer 1
    d_a1 = torch.matmul(d_b2, torch.transpose(w2, 0, 1))
    d_l1 = d_a1 * sigmoid_prime(l1)
    d_b1 = d_l1
    d_w1 = torch.matmul(torch.transpose(X, 0, 1), d_b1)
    
    #weigth update
    w1 = w1 - lr * d_w1
    b1 = b1 - lr * torch.mean(d_b1, 0)
    w2 = w2 - lr * d_w2
    b2 = b2 - lr * torch.mean(d_b2, 0)
    
    if step % 100 == 0:
        print(step, cost.item())

0 0.6931471824645996
100 0.6931471824645996
200 0.6931471824645996
300 0.6931471824645996
400 0.6931471824645996
500 0.6931471824645996
600 0.6931471824645996
700 0.6931471824645996
800 0.6931471824645996
900 0.6931471824645996
1000 0.6931471824645996
1100 0.6931471824645996
1200 0.6931471824645996
1300 0.6931471824645996
1400 0.6931471824645996
1500 0.6931471824645996
1600 0.6931471824645996
1700 0.6931471824645996
1800 0.6931471824645996
1900 0.6931471824645996
2000 0.6931471824645996
2100 0.6931471824645996
2200 0.6931471824645996
2300 0.6931471824645996
2400 0.6931471824645996
2500 0.6931471824645996
2600 0.6931471824645996
2700 0.6931471824645996
2800 0.6931471824645996
2900 0.6931471824645996
3000 0.6931471824645996
3100 0.6931471824645996
3200 0.6931471824645996
3300 0.6931471824645996
3400 0.6931471824645996
3500 0.6931471824645996
3600 0.6931471824645996
3700 0.6931471824645996
3800 0.6931471824645996
3900 0.6931471824645996
4000 0.6931471824645996
4100 0.6931471824645996
4200

## xor-nn

In [7]:
#nn Layers
linear1 = torch.nn.Linear(2,2, bias = True)
linear2 = torch.nn.Linear(2,1, bias = True)
sigmoid = torch.nn.Sigmoid()
model = torch.nn.Sequential(linear1, sigmoid, linear2, sigmoid).to(device)

#define cost/loss & optimizer
criterion = torch.nn.BCELoss().to(device)
optimizer = torch.optim.SGD(model.parameters(), lr = 1)
epoch = 10000
for step in range(epoch + 1):
    optimizer.zero_grad()
    hypothesis = model(X)
    
    #cos/loss function
    cost = criterion(hypothesis, Y)
    cost.backward()
    optimizer.step()
    if step % 100 == 0:
        print(step, cost.item())

0 0.7346739768981934
100 0.6932229399681091
200 0.6931405067443848
300 0.6930267810821533
400 0.6926414966583252
500 0.6901740431785583
600 0.6634379625320435
700 0.5491836071014404
800 0.4039493203163147
900 0.15778645873069763
1000 0.07213657349348068
1100 0.04408682882785797
1200 0.0312125775963068
1300 0.023987866938114166
1400 0.019408047199249268
1500 0.016261592507362366
1600 0.013973670080304146
1700 0.012238546274602413
1800 0.010879326611757278
1900 0.00978686474263668
2000 0.008890369907021523
2100 0.008141839876770973
2200 0.007507735397666693
2300 0.006963868159800768
2400 0.006492419168353081
2500 0.006079902872443199
2600 0.005715972278267145
2700 0.005392673425376415
2800 0.005103521514683962
2900 0.004843408707529306
3000 0.004608238115906715
3100 0.004394622519612312
3200 0.004199673887342215
3300 0.004021090921014547
3400 0.003856964409351349
3500 0.0037055518478155136
3600 0.0035654702223837376
3700 0.0034354592207819223
3800 0.0033144974149763584
3900 0.00320172938

## xor-nn-wide-deep

In [8]:
#nn Layers
linear1 = torch.nn.Linear(2,10, bias = True)
linear2 = torch.nn.Linear(10,10, bias = True)
linear3 = torch.nn.Linear(10,10, bias = True)
linear4 = torch.nn.Linear(10,1, bias = True)
sigmoid = torch.nn.Sigmoid()
model = torch.nn.Sequential(linear1, sigmoid, linear2, sigmoid, linear3, sigmoid, linear4, sigmoid).to(device)

#define cost/loss & optimizer
criterion = torch.nn.BCELoss().to(device)
optimizer = torch.optim.SGD(model.parameters(), lr = 1)
epoch = 10000
for step in range(epoch + 1):
    optimizer.zero_grad()
    hypothesis = model(X)
    
    #cos/loss function
    cost = criterion(hypothesis, Y)
    cost.backward()
    optimizer.step()
    if step % 100 == 0:
        print(step, cost.item())

0 0.7151408195495605
100 0.6931414604187012
200 0.6931411027908325
300 0.6931408643722534
400 0.6931405067443848
500 0.6931401491165161
600 0.693139910697937
700 0.6931394934654236
800 0.6931390762329102
900 0.6931387782096863
1000 0.6931383609771729
1100 0.6931379437446594
1200 0.6931375861167908
1300 0.6931371092796326
1400 0.6931366920471191
1500 0.6931362152099609
1600 0.6931357383728027
1700 0.693135142326355
1800 0.6931346654891968
1900 0.693134069442749
2000 0.693133533000946
2100 0.6931329965591431
2200 0.6931322813034058
2300 0.693131685256958
2400 0.6931308507919312
2500 0.6931301355361938
2600 0.693129301071167
2700 0.6931284666061401
2800 0.6931275725364685
2900 0.6931266784667969
3000 0.6931256055831909
3100 0.6931245923042297
3200 0.6931234002113342
3300 0.6931222081184387
3400 0.6931208372116089
3500 0.6931194067001343
3600 0.6931179165840149
3700 0.6931162476539612
3800 0.6931144595146179
3900 0.6931124925613403
4000 0.6931103467941284
4100 0.6931080222129822
4200 0.693