In [3]:
import numpy as np


import matplotlib.pyplot as plt
import matplotlib.text as text

## Linear regression

$$ y_i = x_{ij} w_j + b$$

$$ y_i = x_{ij} w_j, \quad x_{i,-1}=1,\quad b=w_{-1} $$

In [4]:
def linear(x,w):
    return x @ w

Generate a random feature vector $\mathbf{x}$ witch 10000 samples and three feature 
such that first feature is drawn from N(0,1), second feature from  U(,1) and third from N(1,2).

In [5]:
x = np.stack((np.random.normal(0, 1, (1000)), 
              np.random.uniform(0, 1, (1000)), 
              np.random.normal(1,2, (1000))), axis=1)
x.shape

(1000, 3)

N(mu,sigma) denotes normal distribution with mean mu and standard deviation sigma. You can use ``numpy.random.normal`` and ``numpy.random.uniform`` functions.

Using $\mathbf{x}$ and weights w = [0.2, 0.5,-0.25,1.0] generate output $\mathbf{y}$ assuming a $N(0,0.1)$ noise $\mathbf{\epsilon}$. 

In [6]:
w = np.array((0.2, 0.5, -0.25, 1.))
ones = np.ones((x.shape[0], 1))
x = np.concatenate((x,ones), axis = 1)
noise = np.random.normal(0, 0.1)
y = linear(x,w)
y = y + noise

$$ y_i = x_{ij} w_j+\epsilon_i, \quad x_{i,-1}=1,\quad b=w_{-1} $$

#### Loss

$$ \frac{1}{2}\frac{1}{N}\sum_{i=0}^{N-1} (y_i -  x_{ij} w_j  )^2$$

In [7]:
def getLoss(y, x, w):
    loss = np.square(y - linear(x,w))
    loss = np.sum(loss) / (2*y.shape[0])
    return loss


## Gradient descent 

### Problem 1 

Find the gradient of the loss function with respect to weights.

Write gradient function ``grad(y,x,w)``.

In [8]:
def grad(y, x, w):
    diff = (x @ w - y)
    return np.dot(x.T, diff) / x.shape[0]
#     return (w - (alpha/x.shape[0]) * tmp)
gradient = grad(y, x, w)
print(gradient)

[-0.00015544 -0.00385066 -0.00709402 -0.00766448]


### Problem 2

Implement gradient descent for linear regression.

In [9]:
alpha = 0.1

def gradientDescent(y, x, w, maxIterations = 500, tolerance=0.0000001):
    for i in range(maxIterations):
        loss = getLoss(y, x, w)
        if loss < tolerance:
            maxIterations = i
            break
        gradient = grad(y, x, w)
        w = w - alpha*gradient

    print("finished gradient descent on iteration " + str(maxIterations))
    print("with loss equal " + str(loss))
    return w
        
gradientDescent(y, x, w)

finished gradient descent on iteration 113
with loss equal 9.966835550193213e-08


array([ 0.19998102,  0.50151607, -0.24998489,  1.00683376])

### Problem 3

Implement stochastic gradient descent (SGD).

In [10]:
def getBatches(x, y, batchSize):
    randomIndices = np.random.randint(1000, size=(batchSize))
    xResult = []
    yResult = []
    for i in randomIndices:
        xResult.append(x[i])
        yResult.append(y[i])
    return (np.asanyarray(xResult), np.asanyarray(yResult))

def sgd(y, x, w, maxIterations = 500, tolerance=0.0000001, batchSize = 10):
    for i in range(maxIterations):
        loss = getLoss(y, x, w)
        if loss < tolerance:
            maxIterations = i
            break
        randomIndices = np.random.randint(1000, size=(batchSize))
        selectedX = x[randomIndices]
        selectedY = y[randomIndices]
#         (selectedX, selectedY) = getBatches(x, y, batchSize)
        gradient = grad(selectedY, selectedX, w)
        w = w - alpha*gradient

    print("finished gradient descent on iteration " + str(maxIterations))
    print("with loss equal " + str(loss))
    return w
sgd(y, x, w)

finished gradient descent on iteration 111
with loss equal 9.968142672278636e-08


array([ 0.20002494,  0.50152032, -0.24999132,  1.00686708])

In [11]:
print("SGD takes: ")
%time tSGD = sgd(y, x, w)
print("gradient descent takes: ")
%time tGD = gradientDescent(y, x, w)

SGD takes: 
finished gradient descent on iteration 124
with loss equal 9.865369386331472e-08
CPU times: user 9.46 ms, sys: 450 µs, total: 9.91 ms
Wall time: 9.02 ms
gradient descent takes: 
finished gradient descent on iteration 113
with loss equal 9.966835550193213e-08
CPU times: user 6.34 ms, sys: 0 ns, total: 6.34 ms
Wall time: 6.17 ms


### Problem 4

Implement SGD using pytorch. Start by just rewritting Problem 3 to use torch Tensors instead of numpy arrays. 

To convert frrom numpy arrays to torch tensors you can use ``torch.from_numpy()`` function. 

In [16]:
import torch
import torch.optim as optim
import torch.nn as nn
from torchviz import make_dot

device = 'cuda' if torch.cuda.is_available() else 'cpu'

x_train_tensor = torch.from_numpy(x).float().to(device)
y_train_tensor = torch.from_numpy(y).float().to(device)

torch.manual_seed(42)

print(x_train_tensor)


tensor([[ 0.2903,  0.0101,  1.7262,  1.0000],
        [ 0.5182,  0.8843,  1.4876,  1.0000],
        [ 0.0453,  0.6313,  1.0153,  1.0000],
        ...,
        [-1.4175,  0.0063,  0.1106,  1.0000],
        [ 0.7033,  0.4105,  0.7287,  1.0000],
        [ 1.2382,  0.8392,  4.9640,  1.0000]])


In [108]:
def modelTorch(x):
    return x @ w.t() + b

In [72]:
def gradientDescentTorch(y, x, maxIterations = 500, tolerance=0.0000001):
    alpha = 0.00000001
    w = torch.randn(x.shape[1], 1, device=device, requires_grad=True)
    loss = float('Inf')
    for i in range(maxIterations):
        y_pred = linear(x, w)
        lastLoss = loss
        loss = (y_pred - y).pow(2).sum()
        if loss < tolerance:
            maxIterations = i
            break
        lossDelta = lastLoss - loss
        print(lossDelta)
#         if lossDelta < 500 and lossDelta > 0:
#             alpha = alpha * 10
        w.retain_grad()
        loss.backward()
#         if lossDelta < 0:
#             w = w - (alpha*0.001)*w.grad
#             print("Loss smaller than 0")
#         else:
        w = w - alpha*w.grad
        print(loss)
        print("--------")

    
    print("finished gradient descent on iteration " + str(maxIterations))
    print("with loss equal " + str(loss))
    return w

%time tGD = gradientDescentTorch(y_train_tensor, x_train_tensor)

tensor(inf, grad_fn=<RsubBackward1>)
tensor(34612736., grad_fn=<SumBackward0>)
--------
tensor(5133116., grad_fn=<SubBackward0>)
tensor(29479620., grad_fn=<SumBackward0>)
--------
tensor(4219604., grad_fn=<SubBackward0>)
tensor(25260016., grad_fn=<SumBackward0>)
--------
tensor(3475624., grad_fn=<SubBackward0>)
tensor(21784392., grad_fn=<SumBackward0>)
--------
tensor(2869400., grad_fn=<SubBackward0>)
tensor(18914992., grad_fn=<SumBackward0>)
--------
tensor(2375178., grad_fn=<SubBackward0>)
tensor(16539814., grad_fn=<SumBackward0>)
--------
tensor(1972004., grad_fn=<SubBackward0>)
tensor(14567810., grad_fn=<SumBackward0>)
--------
tensor(1642852., grad_fn=<SubBackward0>)
tensor(12924958., grad_fn=<SumBackward0>)
--------
tensor(1373906., grad_fn=<SubBackward0>)
tensor(11551052., grad_fn=<SumBackward0>)
--------
tensor(1153926., grad_fn=<SubBackward0>)
tensor(10397126., grad_fn=<SumBackward0>)
--------
tensor(973774., grad_fn=<SubBackward0>)
tensor(9423352., grad_fn=<SumBackward0>)
---

tensor(484382.5000, grad_fn=<SumBackward0>)
--------
tensor(6883.3125, grad_fn=<SubBackward0>)
tensor(477499.1875, grad_fn=<SumBackward0>)
--------
tensor(6621.9062, grad_fn=<SubBackward0>)
tensor(470877.2812, grad_fn=<SumBackward0>)
--------
tensor(6370.5312, grad_fn=<SubBackward0>)
tensor(464506.7500, grad_fn=<SumBackward0>)
--------
tensor(6128.5625, grad_fn=<SubBackward0>)
tensor(458378.1875, grad_fn=<SumBackward0>)
--------
tensor(5896., grad_fn=<SubBackward0>)
tensor(452482.1875, grad_fn=<SumBackward0>)
--------
tensor(5672., grad_fn=<SubBackward0>)
tensor(446810.1875, grad_fn=<SumBackward0>)
--------
tensor(5457.0312, grad_fn=<SubBackward0>)
tensor(441353.1562, grad_fn=<SumBackward0>)
--------
tensor(5249.6562, grad_fn=<SubBackward0>)
tensor(436103.5000, grad_fn=<SumBackward0>)
--------
tensor(5050.6250, grad_fn=<SubBackward0>)
tensor(431052.8750, grad_fn=<SumBackward0>)
--------
tensor(4858.9688, grad_fn=<SubBackward0>)
tensor(426193.9062, grad_fn=<SumBackward0>)
--------
tenso

tensor(307644.8750, grad_fn=<SumBackward0>)
--------
tensor(184.4375, grad_fn=<SubBackward0>)
tensor(307460.4375, grad_fn=<SumBackward0>)
--------
tensor(177.5000, grad_fn=<SubBackward0>)
tensor(307282.9375, grad_fn=<SumBackward0>)
--------
tensor(171.0625, grad_fn=<SubBackward0>)
tensor(307111.8750, grad_fn=<SumBackward0>)
--------
tensor(164.5000, grad_fn=<SubBackward0>)
tensor(306947.3750, grad_fn=<SumBackward0>)
--------
tensor(158.5000, grad_fn=<SubBackward0>)
tensor(306788.8750, grad_fn=<SumBackward0>)
--------
tensor(152.5000, grad_fn=<SubBackward0>)
tensor(306636.3750, grad_fn=<SumBackward0>)
--------
tensor(146.9688, grad_fn=<SubBackward0>)
tensor(306489.4062, grad_fn=<SumBackward0>)
--------
tensor(141.4688, grad_fn=<SubBackward0>)
tensor(306347.9375, grad_fn=<SumBackward0>)
--------
tensor(136.2188, grad_fn=<SubBackward0>)
tensor(306211.7188, grad_fn=<SumBackward0>)
--------
tensor(131.1562, grad_fn=<SubBackward0>)
tensor(306080.5625, grad_fn=<SumBackward0>)
--------
tensor(

tensor(7.7188, grad_fn=<SubBackward0>)
tensor(302693.6875, grad_fn=<SumBackward0>)
--------
tensor(7.3125, grad_fn=<SubBackward0>)
tensor(302686.3750, grad_fn=<SumBackward0>)
--------
tensor(7.2500, grad_fn=<SubBackward0>)
tensor(302679.1250, grad_fn=<SumBackward0>)
--------
tensor(7.1250, grad_fn=<SubBackward0>)
tensor(302672., grad_fn=<SumBackward0>)
--------
tensor(6.9688, grad_fn=<SubBackward0>)
tensor(302665.0312, grad_fn=<SumBackward0>)
--------
tensor(6.7188, grad_fn=<SubBackward0>)
tensor(302658.3125, grad_fn=<SumBackward0>)
--------
tensor(6.6562, grad_fn=<SubBackward0>)
tensor(302651.6562, grad_fn=<SumBackward0>)
--------
tensor(6.4688, grad_fn=<SubBackward0>)
tensor(302645.1875, grad_fn=<SumBackward0>)
--------
tensor(6.3125, grad_fn=<SubBackward0>)
tensor(302638.8750, grad_fn=<SumBackward0>)
--------
tensor(6.0625, grad_fn=<SubBackward0>)
tensor(302632.8125, grad_fn=<SumBackward0>)
--------
tensor(6.0625, grad_fn=<SubBackward0>)
tensor(302626.7500, grad_fn=<SumBackward0>)
-

tensor(2.1562, grad_fn=<SubBackward0>)
tensor(302312.8125, grad_fn=<SumBackward0>)
--------
tensor(2.1875, grad_fn=<SubBackward0>)
tensor(302310.6250, grad_fn=<SumBackward0>)
--------
tensor(2.3125, grad_fn=<SubBackward0>)
tensor(302308.3125, grad_fn=<SumBackward0>)
--------
tensor(2.3750, grad_fn=<SubBackward0>)
tensor(302305.9375, grad_fn=<SumBackward0>)
--------
tensor(2.1250, grad_fn=<SubBackward0>)
tensor(302303.8125, grad_fn=<SumBackward0>)
--------
tensor(2.3125, grad_fn=<SubBackward0>)
tensor(302301.5000, grad_fn=<SumBackward0>)
--------
tensor(2.1875, grad_fn=<SubBackward0>)
tensor(302299.3125, grad_fn=<SumBackward0>)
--------
tensor(2.1250, grad_fn=<SubBackward0>)
tensor(302297.1875, grad_fn=<SumBackward0>)
--------
tensor(2.1875, grad_fn=<SubBackward0>)
tensor(302295., grad_fn=<SumBackward0>)
--------
tensor(2.0938, grad_fn=<SubBackward0>)
tensor(302292.9062, grad_fn=<SumBackward0>)
--------
tensor(2.2188, grad_fn=<SubBackward0>)
tensor(302290.6875, grad_fn=<SumBackward0>)
-

In [117]:
from torch.utils.data import TensorDataset, DataLoader

inputs = np.stack((np.random.normal(0, 1, (1000)), 
              np.random.uniform(0, 1, (1000)), 
              np.random.normal(1,2, (1000))), axis=1)
inputs = torch.from_numpy(inputs).float().to(device)
targets = model(inputs)
targets = np.asanyarray([[float(x + noise)] for x in targets])
targets = torch.from_numpy(targets).float()

w = torch.randn(1, 3, requires_grad=True)
b = torch.randn(1, requires_grad=True)

train_ds = TensorDataset(inputs, targets)
train_ds[0:3]
# Define data loader
batch_size = 5
train_dl = DataLoader(train_ds, batch_size, shuffle=True)
next(iter(train_dl))

[tensor([[ 1.2562,  0.9243,  0.4724],
         [ 0.7644,  0.1015, -1.4955],
         [ 0.8127,  0.3179, -0.2380],
         [-1.1483,  0.3693,  3.2355],
         [ 0.9877,  0.8425,  1.5595]]), tensor([[-1.2986],
         [-0.2923],
         [-0.7623],
         [-0.8785],
         [-1.5118]])]

In [127]:
import torch.nn.functional as F

model = nn.Linear(3, 1)
# Define optimizer
opt = torch.optim.SGD(model.parameters(), lr=1e-5)
loss_fn = F.mse_loss
loss = loss_fn(model(inputs), targets)
print(loss)

tensor(0.3132, grad_fn=<MseLossBackward>)


In [150]:
w = torch.randn(1, 3, requires_grad=True)
b = torch.randn(1, requires_grad=True)

In [151]:
num_epochs = 100
tolerance = 0.01
for epoch in range(num_epochs):
        loss_total = 0
        for xb,yb in train_dl:
            # Generate predictions
            pred = model(xb)
            loss = loss_fn(pred, yb)
            loss_total += loss
            # Perform gradient descent
            loss.backward()
            opt.step()
            opt.zero_grad()
        loss_avg = loss_total/(len(train_dl))
        print(loss_avg)
        if loss_avg < tolerance:
            break
print('Training loss: ', loss_fn(model(inputs), targets))

tensor(0.0010, grad_fn=<DivBackward0>)
Training loss:  tensor(0.0010, grad_fn=<MseLossBackward>)


### Problem 5 

Implement GD using pytorch automatic differentiation.

To this end the variable with respect to which the gradient will be calculated, ``t_w`` in this case, must have attribute
``requires_grad`` set to ``True`` (``t_w.require_grad=True``).

The torch will automatically track any expression containing ``t_w`` and store its computational graph. The method ``backward()`` can be run on the final expression to back propagate the gradient e.g. ``loss.backward()``. Then the gradient is accesible as ``t_w.grad``.