# Numpy

In [34]:
import numpy as np
import math

# Create random input and output data
x = np.linspace(-math.pi, math.pi, 2000)
y = np.sin(x)

# Randomly initialize weights
a = np.random.randn()
b = np.random.randn()
c = np.random.randn()
d = np.random.randn()


learning_rate = 1e-6
for t in range(20000):
    # Forward pass: compute predicted y
    # y = a + b x + c x^2 + d x^3
    y_pred = a + b * x + c * x ** 2 + d * x ** 3

    # Compute and print loss
    loss = np.square(y_pred - y).sum()
    if t % 100 == 99:
        print(t, loss)

    # Backprop to compute gradients of a, b, c, d with respect to loss
    grad_y_pred = 2.0 * (y_pred - y)
    grad_a = grad_y_pred.sum()
    grad_b = (grad_y_pred * x).sum()
    grad_c = (grad_y_pred * x ** 2).sum()
    grad_d = (grad_y_pred * x ** 3).sum()

    # Update weights
    a -= learning_rate * grad_a
    b -= learning_rate * grad_b
    c -= learning_rate * grad_c
    d -= learning_rate * grad_d

print(f'Result: y = {a} + {b} x + {c} x^2 + {d} x^3')

99 2182.1868440027283
199 1474.212676126563
299 997.804581368072
399 676.9343661643926
499 460.6224466702057
599 314.65898918946215
699 216.06920931056862
799 149.41085300381232
899 104.2956907038782
999 73.7292233541745
1099 52.99768206451142
1199 38.92134917013577
1299 29.353239616315303
1399 22.842242021444104
1499 18.406576268326248
1599 15.38130719174017
1699 13.315612853237198
1799 11.90350853972258
1899 10.937086462746322
1999 10.274922145714013
2099 9.820706639047994
2199 9.50877986957942
2299 9.294325944965623
2399 9.146720936363682
2499 9.045014603617183
2599 8.974858124150922
2699 8.926412713149645
2799 8.892924352244023
2899 8.869751362247214
2999 8.853700179634256
3099 8.842571132230326
3199 8.834847463965724
3299 8.829482187618405
3399 8.825751823255343
3499 8.82315591951765
3599 8.821347948983568
3699 8.820087728182298
3799 8.819208621721641
3899 8.81859491228484
3999 8.818166168704494
4099 8.817866436753906
4199 8.817656757122148
4299 8.817509981260113
4399 8.8174071758

# Pytorch tensor

In [42]:
import torch
import math


dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0") # Uncomment this to run on GPU

# Create random input and output data
x = torch.linspace(-math.pi, math.pi, 2000, device=device, dtype=dtype)
y = torch.sin(x)

# Randomly initialize weights
a = torch.randn((1,), device=device, dtype=dtype)
b = torch.randn((1,), device=device, dtype=dtype)
c = torch.randn((1), device=device, dtype=dtype)
d = torch.randn((), device=device, dtype=dtype)

learning_rate = 1e-6
for t in range(20000):
    # Forward pass: compute predicted y
    y_pred = a + b * x + c * x ** 2 + d * x ** 3
    
    # Compute and print loss
    loss = ((y_pred - y) ** 2).sum().item()
    if t % 100 == 99:
        print(t, loss)

    # Backprop to compute gradients of a, b, c, d with respect to loss
    # dLoss/dW = dLoss/dY_pred * dY_pred/dW
    grad_y_pred = 2.0 * (y_pred - y) # dLoss/dY_pred
    grad_a = grad_y_pred.sum() # dLoss/dY_pred * dY_pred/dW
    grad_b = (grad_y_pred * x).sum() # dLoss/dY_pred * dY_pred/dW
    grad_c = (grad_y_pred * x ** 2).sum() # dLoss/dY_pred * dY_pred/dW
    grad_d = (grad_y_pred * x ** 3).sum() # dLoss/dY_pred * dY_pred/dW

    # Update weights using gradient descent
    # w - n.dLoss/dw
    a -= learning_rate * grad_a 
    b -= learning_rate * grad_b
    c -= learning_rate * grad_c
    d -= learning_rate * grad_d

print(loss)
print(f'Result: y = {a.item()} + {b.item()} x + {c.item()} x^2 + {d.item()} x^3')


99 559.0468139648438
199 397.3835754394531
299 283.2564697265625
399 202.67410278320312
499 145.7685546875
599 105.57772064208984
699 77.18856811523438
799 57.13334274291992
899 42.96391677856445
999 32.951866149902344
1099 25.87676429748535
1199 20.87662696838379
1299 17.342607498168945
1399 14.844618797302246
1499 13.078810691833496
1599 11.830480575561523
1699 10.947933197021484
1799 10.32393741607666
1899 9.882730484008789
1999 9.570749282836914
2099 9.35013198852539
2199 9.194117546081543
2299 9.083781242370605
2399 9.005748748779297
2499 8.950557708740234
2599 8.911523818969727
2699 8.883912086486816
2799 8.86438274383545
2899 8.850567817687988
2999 8.84079647064209
3099 8.833883285522461
3199 8.82899284362793
3299 8.825533866882324
3399 8.82308578491211
3499 8.821353912353516
3599 8.820130348205566
3699 8.819263458251953
3799 8.81865119934082
3899 8.818216323852539
3999 8.817910194396973
4099 8.817691802978516
4199 8.81753921508789
4299 8.817428588867188
4399 8.817352294921875
4

# Pytorch autograd

In [52]:
import torch
import math

dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0")  # Uncomment this to run on GPU

# Create Tensors to hold input and outputs.
# By default, requires_grad=False, which indicates that we do not need to
# compute gradients with respect to these Tensors during the backward pass.
x = torch.linspace(-math.pi, math.pi, 2000, device=device, dtype=dtype)
y = torch.sin(x)

# Create random Tensors for weights. For a third order polynomial, we need
# 4 weights: y = a + b x + c x^2 + d x^3
# Setting requires_grad=True indicates that we want to compute gradients with
# respect to these Tensors during the backward pass.
a = torch.randn((), device=device, dtype=dtype, requires_grad=True)
b = torch.randn((), device=device, dtype=dtype, requires_grad=True)
c = torch.randn((), device=device, dtype=dtype, requires_grad=True)
d = torch.randn((), device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6
for t in range(20000):
    # Forward pass: compute predicted y using operations on Tensors.
    y_pred = a + b * x + c * x ** 2 + d * x ** 3

    # Compute and print loss using operations on Tensors.
    # Now loss is a Tensor of shape (1,)
    # loss.item() gets the scalar value held in the loss.
    # MSE
    loss = ((y_pred - y)**2).sum()
    # RMSE
    loss = ((y_pred - y)**2).sum().sqrt()
    if t % 100 == 99:
        print(t, loss.item())

    # Use autograd to compute the backward pass. This call will compute the
    # gradient of loss with respect to all Tensors with requires_grad=True.
    # After this call a.grad, b.grad. c.grad and d.grad will be Tensors holding
    # the gradient of the loss with respect to a, b, c, d respectively.
    loss.backward()

    # Manually update weights using gradient descent. Wrap in torch.no_grad()
    # because weights have requires_grad=True, but we don't need to track this
    # in autograd.
    with torch.no_grad():
        a -= learning_rate * a.grad
        b -= learning_rate * b.grad
        c -= learning_rate * c.grad
        d -= learning_rate * d.grad
        if (t == 20000-1):
            print(f' the gradient in last itteration are {a.grad}, {b.grad}, {c.grad}, {d.grad}')
        # Manually zero the gradients after updating weights
        a.grad = None
        b.grad = None
        c.grad = None
        d.grad = None
        
print(f'Result: y = {a.item()} + {b.item()} x + {c.item()} x^2 + {d.item()} x^3')

99 347.4620361328125
199 332.49444580078125
299 318.3998107910156
399 305.1962890625
499 292.8902893066406
599 281.475341796875
699 270.9311828613281
799 261.2241516113281
899 252.30857849121094
999 244.12811279296875
1099 236.61936950683594
1199 229.71417236328125
1299 223.34262084960938
1399 217.4364471435547
1499 211.9303436279297
1599 206.76425170898438
1699 201.8837890625
1799 197.2410888671875
1899 192.79507446289062
1999 188.51040649414062
2099 184.35813903808594
2199 180.31324768066406
2299 176.35665893554688
2399 172.4730682373047
2499 168.64955139160156
2599 164.87632751464844
2699 161.1459503173828
2799 157.45303344726562
2899 153.79342651367188
2999 150.16444396972656
3099 146.5642547607422
3199 142.991943359375
3299 139.447265625
3399 135.9304656982422
3499 132.4423065185547
3599 128.98394775390625
3699 125.55694580078125
3799 122.16304016113281
3899 118.80435943603516
3999 115.48324584960938
4099 112.20234680175781
4199 108.96453857421875
4299 105.77301025390625
4399 102.

# Custom Fordward and Backward (Pytorch autograd)

In [77]:
import torch
import math


class LegendrePolynomial3(torch.autograd.Function):
    """
    We can implement our own custom autograd Functions by subclassing
    torch.autograd.Function and implementing the forward and backward passes
    which operate on Tensors.
    """

    @staticmethod
    def forward(ctx, input):
        """
        In the forward pass we receive a Tensor containing the input and return
        a Tensor containing the output. ctx is a context object that can be used
        to stash information for backward computation. You can cache arbitrary
        objects for use in the backward pass using the ctx.save_for_backward method.
        """
        ctx.save_for_backward(input)
        return 0.5 * (5 * input ** 3 - 3 * input)

    @staticmethod
    def backward(ctx, grad_output):
        """
        In the backward pass we receive a Tensor containing the gradient of the loss
        with respect to the output, and we need to compute the gradient of the loss
        with respect to the input.
        """
        input, = ctx.saved_tensors
        return grad_output * 1.5 * (5 * input ** 2 - 1)


dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0")  # Uncomment this to run on GPU

# Create Tensors to hold input and outputs.
# By default, requires_grad=False, which indicates that we do not need to
# compute gradients with respect to these Tensors during the backward pass.
x = torch.linspace(-math.pi, math.pi, 2000, device=device, dtype=dtype)
y = torch.sin(x)

# Create random Tensors for weights. For this example, we need
# 4 weights: y = a + b * P3(c + d * x), these weights need to be initialized
# not too far from the correct result to ensure convergence.
# Setting requires_grad=True indicates that we want to compute gradients with
# respect to these Tensors during the backward pass.
a = torch.full((), 0.0, device=device, dtype=dtype, requires_grad=True)
b = torch.full((), -1.0, device=device, dtype=dtype, requires_grad=True)
c = torch.full((), 0.0, device=device, dtype=dtype, requires_grad=True)
d = torch.full((), 0.3, device=device, dtype=dtype, requires_grad=True)

learning_rate = 5e-6
for t in range(2000):
    # To apply our Function, we use Function.apply method. We alias this as 'P3'.
    P3 = LegendrePolynomial3.apply

    # Forward pass: compute predicted y using operations; we compute
    # P3 using our custom autograd operation.
    y_pred = a + b * P3(c + d * x)

    # Compute and print loss
    loss = (y_pred - y).pow(2).sum()

    if t % 100 == 99:
        print(t, loss.item())

    # Use autograd to compute the backward pass.
    loss.backward()

    # Update weights using gradient descent
    with torch.no_grad():
        a -= learning_rate * a.grad
        b -= learning_rate * b.grad
        c -= learning_rate * c.grad
        d -= learning_rate * d.grad

        # Manually zero the gradients after updating weights
        a.grad = None
        b.grad = None
        c.grad = None
        d.grad = None

print(f'Result: y = {a.item()} + {b.item()} * P3({c.item()} + {d.item()} x)')

99 209.95834350585938
199 144.66018676757812
299 100.70249938964844
399 71.03519439697266
499 50.97850799560547
599 37.403133392333984
699 28.206867218017578
799 21.97318458557129
899 17.7457275390625
999 14.877889633178711
1099 12.93176555633545
1199 11.610918045043945
1299 10.714258193969727
1399 10.10548210144043
1499 9.692106246948242
1599 9.411375999450684
1699 9.220744132995605
1799 9.091285705566406
1899 9.003361701965332
1999 8.943639755249023
Result: y = -7.290119619085544e-09 + -2.208526849746704 * P3(1.3728043146699065e-09 + 0.2554861009120941 x)


# nn Module

In [134]:
import torch
import math


# Create Tensors to hold input and outputs.
x = torch.linspace(-math.pi, math.pi, 2000)
y = torch.sin(x)

# For this example, the output y is a linear function of (x, x^2, x^3), so
# we can consider it as a linear layer neural network. Let's prepare the
# tensor (x, x^2, x^3).
p = torch.tensor([1, 2, 3])
xx = x.unsqueeze(-1).pow(p)

# In the above code, x.unsqueeze(-1) has shape (2000, 1), and p has shape
# (3,), for this case, broadcasting semantics will apply to obtain a tensor
# of shape (2000, 3) 

# Use the nn package to define our model as a sequence of layers. nn.Sequential
# is a Module which contains other Modules, and applies them in sequence to
# produce its output. The Linear Module computes output from input using a
# linear function, and holds internal Tensors for its weight and bias.
# The Flatten layer flatens the output of the linear layer to a 1D tensor,
# to match the shape of `y`.
model = torch.nn.Sequential(
    torch.nn.Linear(3, 1),
    torch.nn.Flatten(0,1)
    
)

# The nn package also contains definitions of popular loss functions; in this
# case we will use Mean Squared Error (MSE) as our loss function.
loss_fn = torch.nn.MSELoss(reduction='sum')

learning_rate = 1e-6
for t in range(2000):

    # Forward pass: compute predicted y by passing x to the model. Module objects
    # override the __call__ operator so you can call them like functions. When
    # doing so you pass a Tensor of input data to the Module and it produces
    # a Tensor of output data.
    y_pred = model(xx)
    #print(f'y_pred is: {y_pred}')


    # Compute and print loss. We pass Tensors containing the predicted and true
    # values of y, and the loss function returns a Tensor containing the
    # loss.
    
    loss = loss_fn(y_pred, y)
    if t == 0:
        print(f'y_pred is: {y_pred}')
        print(t, loss.item())
    
    if t % 100 == 99:
        print(t, loss.item())
        

    # Zero the gradients before running the backward pass.
    model.zero_grad()

    # Backward pass: compute gradient of the loss with respect to all the learnable
    # parameters of the model. Internally, the parameters of each Module are stored
    # in Tensors with requires_grad=True, so this call will compute gradients for
    # all learnable parameters in the model.
    loss.backward()

    # Update the weights using gradient descent. Each parameter is a Tensor, so
    # we can access its gradients like we did before.
    with torch.no_grad():
        for param in model.parameters():
            param -= learning_rate * param.grad

# You can access the first layer of `model` like accessing the first item of a list
linear_layer = model[0]

# For linear layer, its parameters are stored as `weight` and `bias`.
print(f'Result: y = {linear_layer.bias.item()} + {linear_layer.weight[:, 0].item()} x + {linear_layer.weight[:, 1].item()} x^2 + {linear_layer.weight[:, 2].item()} x^3')

y_pred is: tensor([3.4552, 3.4435, 3.4318,  ..., 1.3952, 1.3953, 1.3953],
       grad_fn=<ViewBackward>)
0 2688.932373046875
99 318.91845703125
199 222.7721710205078
299 156.593505859375
399 110.99147033691406
499 79.53413391113281
599 57.810855865478516
699 42.793678283691406
799 32.40189743041992
899 25.20355987548828
999 20.21243667602539
1099 16.748422622680664
1199 14.34203052520752
1299 12.66884994506836
1399 11.504463195800781
1499 10.693482398986816
1599 10.128175735473633
1699 9.73380184173584
1799 9.458494186401367
1899 9.266154289245605
1999 9.131688117980957
Result: y = -0.017354760318994522 + 0.8501694202423096 x + 0.002993984380736947 x^2 + -0.09239566326141357 x^3


# Pytorch Optim

In [139]:
import torch
import math


# Create Tensors to hold input and outputs.
x = torch.linspace(-math.pi, math.pi, 2000)
y = torch.sin(x)

# Prepare the input tensor (x, x^2, x^3).
p = torch.tensor([1, 2, 3])
xx = x.unsqueeze(-1).pow(p)

# Use the nn package to define our model and loss function.
model = torch.nn.Sequential(
    torch.nn.Linear(3, 1),
    torch.nn.Flatten(0, 1)
)
loss_fn = torch.nn.MSELoss(reduction='sum')

# Use the optim package to define an Optimizer that will update the weights of
# the model for us. Here we will use RMSprop; the optim package contains many other
# optimization algorithms. The first argument to the RMSprop constructor tells the
# optimizer which Tensors it should update.
learning_rate = 1e-3
optimizer = torch.optim.RMSprop(model.parameters(), lr=learning_rate)
for t in range(2000):
    # Forward pass: compute predicted y by passing x to the model.
    y_pred = model(xx)

    # Compute and print loss.
    loss = loss_fn(y_pred, y)
    if t % 100 == 99:
        print(t, loss.item())

    # Before the backward pass, use the optimizer object to zero all of the
    # gradients for the variables it will update (which are the learnable
    # weights of the model). This is because by default, gradients are
    # accumulated in buffers( i.e, not overwritten) whenever .backward()
    # is called. Checkout docs of torch.autograd.backward for more details.
    optimizer.zero_grad()

    # Backward pass: compute gradient of the loss with respect to model
    # parameters
    loss.backward()

    # Calling the step function on an Optimizer makes an update to its
    # parameters
    optimizer.step()


linear_layer = model[0]
print(f'Result: y = {linear_layer.bias.item()} + {linear_layer.weight[:, 0].item()} x + {linear_layer.weight[:, 1].item()} x^2 + {linear_layer.weight[:, 2].item()} x^3')

99 647.8887939453125
199 339.8917236328125
299 238.18356323242188
399 164.631103515625
499 106.37096405029297
599 63.659305572509766
699 35.217002868652344
799 18.762378692626953
899 11.353139877319336
999 9.155719757080078
1099 8.863226890563965
1199 8.942862510681152
1299 8.949155807495117
1399 8.893209457397461
1499 8.892653465270996
1599 8.911840438842773
1699 8.912635803222656
1799 8.905150413513184
1899 8.90535831451416
1999 8.90806770324707
Result: y = 8.990648781015409e-10 + 0.8562381267547607 x + -1.1081590756134574e-08 x^2 + -0.09383315593004227 x^3


### PyTorch: Custom nn Modules

In [140]:
import torch
import math


class Polynomial3(torch.nn.Module):
    def __init__(self):
        """
        In the constructor we instantiate four parameters and assign them as
        member parameters.
        """
        super().__init__()
        self.a = torch.nn.Parameter(torch.randn(()))
        self.b = torch.nn.Parameter(torch.randn(()))
        self.c = torch.nn.Parameter(torch.randn(()))
        self.d = torch.nn.Parameter(torch.randn(()))

    def forward(self, x):
        """
        In the forward function we accept a Tensor of input data and we must return
        a Tensor of output data. We can use Modules defined in the constructor as
        well as arbitrary operators on Tensors.
        """
        return self.a + self.b * x + self.c * x ** 2 + self.d * x ** 3

    def string(self):
        """
        Just like any class in Python, you can also define custom method on PyTorch modules
        """
        return f'y = {self.a.item()} + {self.b.item()} x + {self.c.item()} x^2 + {self.d.item()} x^3'


# Create Tensors to hold input and outputs.
x = torch.linspace(-math.pi, math.pi, 2000)
y = torch.sin(x)

# Construct our model by instantiating the class defined above
model = Polynomial3()

# Construct our loss function and an Optimizer. The call to model.parameters()
# in the SGD constructor will contain the learnable parameters (defined 
# with torch.nn.Parameter) which are members of the model.
criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=1e-6)
for t in range(2000):
    # Forward pass: Compute predicted y by passing x to the model
    y_pred = model(x)

    # Compute and print loss
    loss = criterion(y_pred, y)
    if t % 100 == 99:
        print(t, loss.item())

    # Zero gradients, perform a backward pass, and update the weights.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

print(f'Result: {model.string()}')

99 2211.896240234375
199 1479.482666015625
299 991.1165771484375
399 665.31201171875
499 447.8426818847656
599 302.6036682128906
699 205.54678344726562
799 140.64759826660156
899 97.22325134277344
999 68.14839172363281
1099 48.66747283935547
1199 35.60517120361328
1299 26.840051651000977
1399 20.953657150268555
1499 16.99726676940918
1599 14.335859298706055
1699 12.543975830078125
1799 11.336421966552734
1899 10.521862983703613
1999 9.971896171569824
Result: y = 0.02167087234556675 + 0.8831051588058472 x + -0.00373858492821455 x^2 + -0.09708049148321152 x^3


### PyTorch: Control Flow + Weight Sharing

In [141]:
import random
import torch
import math


class DynamicNet(torch.nn.Module):
    def __init__(self):
        """
        In the constructor we instantiate five parameters and assign them as members.
        """
        super().__init__()
        self.a = torch.nn.Parameter(torch.randn(()))
        self.b = torch.nn.Parameter(torch.randn(()))
        self.c = torch.nn.Parameter(torch.randn(()))
        self.d = torch.nn.Parameter(torch.randn(()))
        self.e = torch.nn.Parameter(torch.randn(()))

    def forward(self, x):
        """
        For the forward pass of the model, we randomly choose either 4, 5
        and reuse the e parameter to compute the contribution of these orders.

        Since each forward pass builds a dynamic computation graph, we can use normal
        Python control-flow operators like loops or conditional statements when
        defining the forward pass of the model.

        Here we also see that it is perfectly safe to reuse the same parameter many
        times when defining a computational graph.
        """
        y = self.a + self.b * x + self.c * x ** 2 + self.d * x ** 3
        for exp in range(4, random.randint(4, 6)):
            y = y + self.e * x ** exp
        return y

    def string(self):
        """
        Just like any class in Python, you can also define custom method on PyTorch modules
        """
        return f'y = {self.a.item()} + {self.b.item()} x + {self.c.item()} x^2 + {self.d.item()} x^3 + {self.e.item()} x^4 ? + {self.e.item()} x^5 ?'


# Create Tensors to hold input and outputs.
x = torch.linspace(-math.pi, math.pi, 2000)
y = torch.sin(x)

# Construct our model by instantiating the class defined above
model = DynamicNet()

# Construct our loss function and an Optimizer. Training this strange model with
# vanilla stochastic gradient descent is tough, so we use momentum
criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=1e-8, momentum=0.9)
for t in range(30000):
    # Forward pass: Compute predicted y by passing x to the model
    y_pred = model(x)

    # Compute and print loss
    loss = criterion(y_pred, y)
    if t % 2000 == 1999:
        print(t, loss.item())

    # Zero gradients, perform a backward pass, and update the weights.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

print(f'Result: {model.string()}')

1999 1729.00732421875
3999 772.6729125976562
5999 347.7442626953125
7999 154.42994689941406
9999 75.41864013671875
11999 38.35637283325195
13999 21.888696670532227
15999 14.619317054748535
17999 11.372334480285645
19999 9.963314056396484
21999 9.189109802246094
23999 9.046416282653809
25999 8.750997543334961
27999 8.629594802856445
29999 8.859413146972656
Result: y = 0.0028848357032984495 + 0.8536442518234253 x + -0.0009894638787955046 x^2 + -0.09312067925930023 x^3 + 0.00011581122089410201 x^4 ? + 0.00011581122089410201 x^5 ?


In [154]:
import random
import torch
import math


class Model(torch.nn.Module):
    
    # constructor
    def __init__(self, inputs, hidden, outputs):
        
        # Call constructor of parent class
        super(Model, self).__init__()
        
        self.inputs = inputs
        self.hidden = hidden
        self.outputs = outputs
        
        # Layers: https://pytorch.org/docs/stable/nn.html
        self.fc1 = torch.nn.Linear(inputs, hidden)
        self.relu = torch.nn.ReLU()
        self.fc2 = torch.nn.Linear(hidden, outputs)
        
    # Logic for forward pass
    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x


# Dataset
# Create Tensors to hold input and outputs.
x = torch.linspace(-math.pi, math.pi, 2000)
y = torch.sin(x)

p = torch.tensor([1, 2, 3])
xx = x.unsqueeze(-1).pow(p)

model = Model(3, 60, 1)
epoch = 2000

# Loss functions: 
#criterion = torch.nn.MSELoss(reduction='sum')
criterion = torch.nn.MSELoss()

# Optimizers: https://pytorch.org/docs/stable/optim.html
#optimizer = torch.optim.SGD(model.parameters(), lr=1e-6)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

for t in range(2000):
    # Forward pass: Compute predicted y by passing x to the model
    y_pred = model(xx)

    #y_pred = torch.flatten(y_pred, start_dim=0) # if I use reduction
    
    # Compute and print loss
    loss = criterion(y_pred, y)
    if t % 100 == 99:
        print(t, loss.item())

    # Zero gradients, perform a backward pass, and update the weights.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()


99 3.8104710578918457
199 3.3886544704437256
299 3.012815475463867
399 2.6789960861206055
499 2.3834547996520996
599 2.1226449012756348
699 1.893211007118225
799 1.6920043230056763
899 1.516097068786621
999 1.3628109693527222
1099 1.229657769203186
1199 1.1143780946731567
1299 1.0149446725845337
1399 0.9295467138290405
1499 0.8565689921379089
1599 0.7945709824562073
1699 0.742257833480835
1799 0.6984593868255615
1899 0.6621087789535522
1999 0.6322301626205444
