In [62]:
import torch

First we define the variable/parameter $x$. We enable gradient computations
for this variable.

In [63]:
# Scalar (0-dim) parameter to be optimized. requires_grad=True makes
# autograd track operations on it so that backward() can fill in x.grad.
x = torch.tensor(-42.0, requires_grad=True)

Next, we create an optimizer that we will use to minimize the loss
function.

In [64]:
# Plain SGD over the single parameter x, with a learning rate of 0.25.
opt = torch.optim.SGD(params=[x], lr=0.25)

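Then we minimize the loss function $x^2$ by gradient descent until the
absolute value of the gradient is less than $10^{-5}$ (or until 100
iterations have been performed, whichever comes first).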

In [65]:
# Gradient descent on loss(x) = x ** 2, printing one line per iteration.
MAX_STEPS = 100  # hard cap on the number of update steps
GRAD_TOL = 1e-5  # stop once |gradient| drops below this value

for _ in range(MAX_STEPS):
    # Remember the value before the update as a plain Python float.
    # .item() is the documented accessor for one-element tensors; calling
    # float() directly on a tensor that requires grad may emit a
    # UserWarning on newer PyTorch releases.
    x_before = x.item()

    # Reset gradients left over from the previous iteration; otherwise
    # backward() would accumulate into them.
    opt.zero_grad()

    # Compute our loss function.
    loss = x ** 2

    # Compute gradients for the parameter(s) for which we
    # enabled gradient computation.
    loss.backward()

    # Let the optimizer update the parameter(s).
    opt.step()

    print(f"before: {x_before:g}, gradient: {x.grad:g}, after: {x:#7.2g}")

    # Stop when the absolute value of the gradient for $x$ is less than
    # GRAD_TOL. Note: x.grad was computed at x_before, i.e. before the
    # update performed by opt.step() above.
    if x.grad.abs() < GRAD_TOL:
        break
else:
    # Only reached when the loop used up all MAX_STEPS iterations
    # without the gradient falling below the tolerance.
    print(f"no convergence after {MAX_STEPS} steps (|gradient| >= {GRAD_TOL:g})")

before: -42, gradient: -84, after:     -21
before: -21, gradient: -42, after:     -10
before: -10.5, gradient: -21, after:    -5.2
before: -5.25, gradient: -10.5, after:    -2.6
before: -2.625, gradient: -5.25, after:    -1.3
before: -1.3125, gradient: -2.625, after:   -0.66
before: -0.65625, gradient: -1.3125, after:   -0.33
before: -0.328125, gradient: -0.65625, after:   -0.16
before: -0.164062, gradient: -0.328125, after:  -0.082
before: -0.0820312, gradient: -0.164062, after:  -0.041
before: -0.0410156, gradient: -0.0820312, after:  -0.021
before: -0.0205078, gradient: -0.0410156, after:   -0.01
before: -0.0102539, gradient: -0.0205078, after: -0.0051
before: -0.00512695, gradient: -0.0102539, after: -0.0026
before: -0.00256348, gradient: -0.00512695, after: -0.0013
before: -0.00128174, gradient: -0.00256348, after: -0.00064
before: -0.000640869, gradient: -0.00128174, after: -0.00032
before: -0.000320435, gradient: -0.000640869, after: -0.00016
before: -0.000160217, gradient: -0.0