In [2]:
from sklearn.datasets import load_iris 

In [1]:
import torch

In [3]:
data = load_iris()

In [9]:
data.data[0], data.target[0]

(array([5.1, 3.5, 1.4, 0.2]), 0)

In [10]:
data.target_names

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

In [11]:
list(data.target_names)

['setosa', 'versicolor', 'virginica']

In [126]:
labels = list(data.target_names)

## Normalization

In [17]:
mean = data.data.mean(axis=0)
std = data.data.std(axis=0)
mean, std

(array([5.84333333, 3.05733333, 3.758     , 1.19933333]),
 array([0.82530129, 0.43441097, 1.75940407, 0.75969263]))

In [23]:
x = data.data

In [24]:
x_normed = (x - x.mean(axis=0)) / x.std(axis=0)

In [25]:
x_normed.shape

(150, 4)

In [28]:
x_normed[:10]

array([[-0.90068117,  1.01900435, -1.34022653, -1.3154443 ],
       [-1.14301691, -0.13197948, -1.34022653, -1.3154443 ],
       [-1.38535265,  0.32841405, -1.39706395, -1.3154443 ],
       [-1.50652052,  0.09821729, -1.2833891 , -1.3154443 ],
       [-1.02184904,  1.24920112, -1.34022653, -1.3154443 ],
       [-0.53717756,  1.93979142, -1.16971425, -1.05217993],
       [-1.50652052,  0.78880759, -1.34022653, -1.18381211],
       [-1.02184904,  0.78880759, -1.2833891 , -1.3154443 ],
       [-1.74885626, -0.36217625, -1.34022653, -1.3154443 ],
       [-1.14301691,  0.09821729, -1.2833891 , -1.44707648]])

## One Hot Y

In [None]:
# b = np.zeros((3, 4))
# b[np.arange(3), a] = 1

In [86]:
data.target.shape[0]

150

In [83]:
data.target[0]

0

In [85]:
data.target_names.shape[0]

3

In [91]:
y_ = np.zeros((data.target.shape[0], data.target_names.shape[0]))

In [97]:
y_.shape, y_.shape[0]

((150, 3), 150)

In [98]:
y_[np.arange(y_.shape[0]), data.target] = 1

## Numpy

In [32]:
import numpy as np

In [100]:
# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 150, 4, 20, 3

In [101]:
x = x_normed
y = y_

In [102]:
# Randomly initialize weights
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

In [103]:
w1[0], w2[0]

(array([-2.13252634, -0.29943278, -0.59257012, -2.4283629 ,  1.28388229,
         0.96079767,  0.34573516, -0.70260458, -1.51762955,  1.58655232,
         0.91763169, -0.63976917,  0.15227276,  1.50495579,  0.71933933,
         0.77143137, -0.26957057, -0.06085943, -1.36407398,  0.01172359]),
 array([-1.90462284, -0.94971942, -0.25210993]))

In [104]:
learning_rate = 1e-6
for t in range(20):
    # Forward pass: compute predicted y
    h = x.dot(w1)
    h_relu = np.maximum(h, 0)
    y_pred = h_relu.dot(w2)

    # Compute and print loss
    loss = np.square(y_pred - y).sum()
    print(t, loss)

    # Backprop to compute gradients of w1 and w2 with respect to loss
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.T.dot(grad_y_pred)
    grad_h_relu = grad_y_pred.dot(w2.T)
    grad_h = grad_h_relu.copy()
    grad_h[h < 0] = 0
    grad_w1 = x.T.dot(grad_h)

    # Update weights
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 11849.804775206961
1 11708.559354374092
2 11569.67978142541
3 11433.184622420751
4 11298.954711594703
5 11166.914477771106
6 11037.02008375627
7 10909.242156511742
8 10783.573745845511
9 10659.924953815822
10 10538.256025019848
11 10418.540149106135
12 10300.770688416254
13 10184.892476405235
14 10070.83790391095
15 9958.482306196194
16 9847.906765344664
17 9739.06103608396
18 9631.853147946484
19 9526.32981031091


## PyTorch

In [59]:
dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0") # Uncomment this to run on GPU

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 150, 4, 20, 3

In [105]:
x = x_normed
y = y_

In [106]:
# Create random input and output data
x = torch.from_numpy(x).to(dtype)
y = torch.from_numpy(y).to(dtype)

In [107]:
# Randomly initialize weights
w1 = torch.randn(D_in, H, device=device, dtype=dtype)
w2 = torch.randn(H, D_out, device=device, dtype=dtype)

In [108]:
learning_rate = 1e-6
for t in range(20):
    # Forward pass: compute predicted y
    h = x.mm(w1)
    h_relu = h.clamp(min=0)
    y_pred = h_relu.mm(w2)

    # Compute and print loss
    loss = (y_pred - y).pow(2).sum().item()
    print(t, loss)

    # Backprop to compute gradients of w1 and w2 with respect to loss
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.t().mm(grad_y_pred)
    grad_h_relu = grad_y_pred.mm(w2.t())
    grad_h = grad_h_relu.clone()
    grad_h[h < 0] = 0
    grad_w1 = x.t().mm(grad_h)

    # Update weights using gradient descent
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 17763.7890625
1 17547.123046875
2 17335.14453125
3 17126.84375
4 16922.083984375
5 16720.8046875
6 16522.921875
7 16328.3408203125
8 16136.921875
9 15948.7021484375
10 15763.875
11 15582.064453125
12 15403.0419921875
13 15226.9306640625
14 15053.345703125
15 14882.5439453125
16 14714.384765625
17 14548.8017578125
18 14385.8349609375
19 14225.43359375


## Autograd

In [109]:
# Create random Tensors for weights.
# Setting requires_grad=True indicates that we want to compute gradients with
# respect to these Tensors during the backward pass.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

In [110]:
learning_rate = 1e-6
for t in range(20):
    # Forward pass: compute predicted y using operations on Tensors; these
    # are exactly the same operations we used to compute the forward pass using
    # Tensors, but we do not need to keep references to intermediate values since
    # we are not implementing the backward pass by hand.
    y_pred = x.mm(w1).clamp(min=0).mm(w2)

    # Compute and print loss using operations on Tensors.
    # Now loss is a Tensor of shape (1,)
    # loss.item() gets the a scalar value held in the loss.
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())

    # Use autograd to compute the backward pass. This call will compute the
    # gradient of loss with respect to all Tensors with requires_grad=True.
    # After this call w1.grad and w2.grad will be Tensors holding the gradient
    # of the loss with respect to w1 and w2 respectively.
    loss.backward()

    # Manually update weights using gradient descent. Wrap in torch.no_grad()
    # because weights have requires_grad=True, but we don't need to track this
    # in autograd.
    # An alternative way is to operate on weight.data and weight.grad.data.
    # Recall that tensor.data gives a tensor that shares the storage with
    # tensor, but doesn't track history.
    # You can also use torch.optim.SGD to achieve this.
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad

        # Manually zero the gradients after updating weights
        w1.grad.zero_()
        w2.grad.zero_()

0 9809.3154296875
1 9678.162109375
2 9549.2197265625
3 9422.5029296875
4 9297.3427734375
5 9173.8193359375
6 9052.34375
7 8933.2021484375
8 8816.173828125
9 8700.759765625
10 8587.677734375
11 8476.7490234375
12 8367.9130859375
13 8260.7890625
14 8154.763671875
15 8050.7294921875
16 7948.638671875
17 7848.42919921875
18 7749.9775390625
19 7653.318359375


## PyTorch: nn

In [113]:
# Use the nn package to define our model and loss function.
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
)
loss_fn = torch.nn.MSELoss(reduction='sum')

# Use the optim package to define an Optimizer that will update the weights of
# the model for us. Here we will use Adam; the optim package contains many other
# optimization algoriths. The first argument to the Adam constructor tells the
# optimizer which Tensors it should update.
learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [120]:
for t in range(20):
    # Forward pass: compute predicted y by passing x to the model.
    y_pred = model(x)

    # Compute and print loss.
    loss = loss_fn(y_pred, y)
    print(t, loss.item())

    # Before the backward pass, use the optimizer object to zero all of the
    # gradients for the variables it will update (which are the learnable
    # weights of the model). This is because by default, gradients are
    # accumulated in buffers( i.e, not overwritten) whenever .backward()
    # is called. Checkout docs of torch.autograd.backward for more details.
    optimizer.zero_grad()

    # Backward pass: compute gradient of the loss with respect to model
    # parameters
    loss.backward()

    # Calling the step function on an Optimizer makes an update to its
    # parameters
    optimizer.step()

0 34.96329879760742
1 34.9467887878418
2 34.930294036865234
3 34.913822174072266
4 34.89736557006836
5 34.88092803955078
6 34.864505767822266
7 34.848106384277344
8 34.83172607421875
9 34.81536102294922
10 34.799015045166016
11 34.782684326171875
12 34.76637268066406
13 34.75008010864258
14 34.73379898071289
15 34.7175407409668
16 34.7012939453125
17 34.6850700378418
18 34.66885757446289
19 34.65266418457031


In [129]:
idx = 120
y_pred = model(x[idx])
y_pred, y[0]

(tensor([-0.0299,  0.1622,  0.8312], grad_fn=<AddBackward0>),
 tensor([1., 0., 0.]))

In [130]:
values, indices = y_pred.max(0)

In [131]:
values, indices

(tensor(0.8312, grad_fn=<MaxBackward0>), tensor(2))

In [132]:
labels[data.target[indices]], labels[indices]

('setosa', 'virginica')

## Softmax

Softmax only ever occurs in the final layer. It outputs numbers between 0 and 1, and they add up to 1. In theory, this is not strictly necessary — we could ask out neural net to learn a set of kernels which give probabilities that line up as closely as possible with what we want.

We use softmax because the nice thing about softmax is that we want to ensure all of the weights that we are using add up to 1 and we also expect that one of those weights should probably be higher than the other ones. Softmax gives us the guarantee that they add up to 1 and because it has e^ in it, it tends to encourage one of the weights to be higher than the other ones.