## Outline
* PyTorch
* What are tensors
* Initialising, slicing, reshaping tensors
* Numpy and PyTorch interfacing
* GPU support for PyTorch + Enabling GPUs on Google Colab
* Speed comparisons, Numpy -- PyTorch -- PyTorch on GPU
* Autodiff concepts and application
* Writing a basic learning loop using autograd
* Exercises

In [1]:
import torch
import numpy as np
import matplotlib.pyplot as plt

## Initialise tensors

In [2]:
# Build a 3x2 tensor with each basic factory in turn: all ones, all zeros,
# then random samples. `x` ends up bound to the random tensor.
for make_tensor in (torch.ones, torch.zeros, torch.rand):
    x = make_tensor(3, 2)
    print(x)

tensor([[1., 1.],
        [1., 1.],
        [1., 1.]])
tensor([[0., 0.],
        [0., 0.],
        [0., 0.]])
tensor([[0.8289, 0.7393],
        [0.7940, 0.8718],
        [0.3350, 0.7439]])


In [3]:
x.shape

torch.Size([3, 2])

In [4]:
# torch.empty only allocates storage; the values printed are arbitrary leftovers.
x = torch.empty(3, 2) 
print(x)
# zeros_like builds a zero tensor shaped like its argument.
y = torch.zeros_like(x)
print(y)

tensor([[-1.0365e+23,  4.5729e-41],
        [-6.7615e-38,  3.0883e-41],
        [ 6.4420e+17,  4.5729e-41]])
tensor([[0., 0.],
        [0., 0.],
        [0., 0.]])


In [5]:
# 5 evenly spaced values from 0 to 1, endpoints included.
x = torch.linspace(0, 1, steps=5)
print(x)

tensor([0.0000, 0.2500, 0.5000, 0.7500, 1.0000])


In [6]:
x.dtype

torch.float32

In [7]:
# Build a tensor from a nested Python list: each inner list becomes one row.
x = torch.tensor([[1, 2], 
                 [3, 4], 
                 [5, 6]])
print(x)

tensor([[1, 2],
        [3, 4],
        [5, 6]])


## Slicing tensors

In [8]:
print(x.size())  # same information as x.shape
print(x[:, 1])   # all rows, column index 1
print(x[0, :])   # row index 0, all columns

torch.Size([3, 2])
tensor([2, 4, 6])
tensor([1, 2])


In [9]:
x[:, 1].shape

torch.Size([3])

In [10]:
x[0, :].shape

torch.Size([2])

In [11]:
y = x[1, 1]        # a single element is still returned as a tensor
print(y)
print(y.item())    # .item() converts it to a plain Python number

tensor(4)
4


## Reshaping tensors

In [12]:
print(x)
# view reinterprets the same six elements, read row by row, as a 2x3 tensor.
y = x.view(2, 3)
print(y)

tensor([[1, 2],
        [3, 4],
        [5, 6]])
tensor([[1, 2, 3],
        [4, 5, 6]])


In [19]:
# -1 asks view to infer that dimension from the element count (6 / 6 = 1).
# y shares its data with x, so later in-place writes to x are visible through y.
y = x.view(6,-1) 
print(y, y.shape)

tensor([[1],
        [2],
        [3],
        [4],
        [5],
        [6]]) torch.Size([6, 1])


In [14]:
x

tensor([[1, 2],
        [3, 4],
        [5, 6]])

In [15]:
y

tensor([[1],
        [2],
        [3],
        [4],
        [5],
        [6]])

In [16]:
x[0, 1] = 100  # in-place write; y is a view of x, so y changes as well

In [17]:
x

tensor([[  1, 100],
        [  3,   4],
        [  5,   6]])

In [18]:
y

tensor([[  1],
        [100],
        [  3],
        [  4],
        [  5],
        [  6]])

## Simple Tensor Operations

In [20]:
# Element-wise arithmetic between two 3x2 tensors of ones: the sum, the
# difference and the product are printed in that order, and `z` is left
# holding the product.
x = torch.ones([3, 2])
y = torch.ones([3, 2])
for z in (x + y, x - y, x * y):
    print(z)

tensor([[2., 2.],
        [2., 2.],
        [2., 2.]])
tensor([[0., 0.],
        [0., 0.],
        [0., 0.]])
tensor([[1., 1.],
        [1., 1.],
        [1., 1.]])


In [21]:
z = y.add(x)   # out-of-place: the sum goes to a new tensor
print(z)
print(y)       # y is unchanged

tensor([[2., 2.],
        [2., 2.],
        [2., 2.]])
tensor([[1., 1.],
        [1., 1.],
        [1., 1.]])


In [22]:
z = y.add_(x)  # trailing underscore marks the in-place variant
print(z)
print(y)       # y itself now holds the sum

tensor([[2., 2.],
        [2., 2.],
        [2., 2.]])
tensor([[2., 2.],
        [2., 2.],
        [2., 2.]])


## Numpy <> PyTorch

In [23]:
x_np = x.numpy()  # Tensor -> NumPy array
print(type(x), type(x_np))
print(x_np)

<class 'torch.Tensor'> <class 'numpy.ndarray'>
[[1. 1.]
 [1. 1.]
 [1. 1.]]


In [33]:
a = np.random.randn(5)
print(a)
a_pt = torch.from_numpy(a)  # NumPy array -> Tensor; the float64 dtype carries over
print(type(a), type(a_pt))
print(a_pt)

[ 0.75432691  0.02224987 -0.32031461  0.17330092 -0.78493466]
<class 'numpy.ndarray'> <class 'torch.Tensor'>
tensor([ 0.7543,  0.0222, -0.3203,  0.1733, -0.7849], dtype=torch.float64)


In [34]:
np.add(a, 1, out=a)   # in-place update: a += 1
print(a)
print(a_pt)         # a_pt was built with from_numpy and shares a's memory, so it changes too

[1.75432691 1.02224987 0.67968539 1.17330092 0.21506534]
tensor([1.7543, 1.0222, 0.6797, 1.1733, 0.2151], dtype=torch.float64)


In [35]:
%%time
# NumPy baseline: 100 rounds of drawing two 100x100 matrices and multiplying them.
# The random draws are inside the timed loop, so they are part of the measurement.
for i in range(100):
    a = np.random.randn(100,100)
    b = np.random.randn(100,100)
    c = np.matmul(a, b)

CPU times: user 186 ms, sys: 0 ns, total: 186 ms
Wall time: 240 ms


In [36]:
%%time
# Same workload with PyTorch on the CPU; tensor creation is timed as well.
for i in range(100):
    a = torch.randn([100, 100])
    b = torch.randn([100, 100])
    c = torch.matmul(a, b)

CPU times: user 59.3 ms, sys: 0 ns, total: 59.3 ms
Wall time: 110 ms


In [37]:
%%time
# Element-wise add of two 10000x10000 NumPy arrays, 10 rounds.
# Two of the three statements in the loop are random draws, which are timed too.
for i in range(10):
    a = np.random.randn(10000,10000)
    b = np.random.randn(10000,10000)
    c = a + b

CPU times: user 1min 27s, sys: 6.92 s, total: 1min 34s
Wall time: 1min 34s


In [39]:
%%time
# PyTorch CPU counterpart of the NumPy add above.
# NOTE(review): np.random.randn yields float64 while torch.randn presumably uses
# the float32 default, so the two loops do not move the same number of bytes —
# confirm before quoting the speed-up.
for i in range(10):
    a = torch.randn([10000, 10000])
    b = torch.randn([10000, 10000])
    c = a + b

CPU times: user 24.5 s, sys: 5.97 s, total: 30.5 s
Wall time: 28.8 s


## CUDA support

In [40]:
print(torch.cuda.device_count())

1


In [41]:
print(torch.cuda.device(0))            # prints only the object's default repr
print(torch.cuda.get_device_name(0))   # human-readable name of GPU 0

<torch.cuda.device object at 0x7f795cedbeb8>
Tesla K80


In [42]:
cuda0 = torch.device('cuda:0')  # device handle for GPU 0, passed as device=... in the cells below

In [46]:
# device=cuda0 creates the tensors on the GPU; the result of a + b
# lives there too (see device='cuda:0' in the printout).
a = torch.ones(3, 2, device=cuda0)
b = torch.ones(3, 2, device=cuda0)
c = a + b
print(c)

tensor([[2., 2.],
        [2., 2.],
        [2., 2.]], device='cuda:0')


In [47]:
c.dtype

torch.float32

In [48]:
print(c)

tensor([[2., 2.],
        [2., 2.],
        [2., 2.]], device='cuda:0')


In [49]:
print(a)

tensor([[1., 1.],
        [1., 1.],
        [1., 1.]], device='cuda:0')


In [50]:
%%time
# NumPy add on 10000x10000 arrays. np.add without out= returns a new array
# (discarded here), unlike the in-place add_ used in the PyTorch cells.
for i in range(10):
    a = np.random.randn(10000,10000)
    b = np.random.randn(10000,10000)
    np.add(b, a)

CPU times: user 1min 28s, sys: 6.91 s, total: 1min 34s
Wall time: 1min 34s


In [51]:
%%time
# PyTorch CPU version: add_ accumulates a_cpu into b_cpu in place.
for i in range(10):
    a_cpu = torch.randn([10000, 10000])
    b_cpu = torch.randn([10000, 10000])
    b_cpu.add_(a_cpu)

CPU times: user 24.4 s, sys: 3.92 s, total: 28.3 s
Wall time: 27.8 s


In [52]:
%%time
for i in range(10):
    a = torch.randn([10000, 10000], device=cuda0)
    b = torch.randn([10000, 10000], device=cuda0)
    b.add_(a)

CPU times: user 4.46 ms, sys: 4.54 ms, total: 8.99 ms
Wall time: 45.8 ms


In [53]:
%%time
# NumPy matmul of two 10000x10000 matrices, 10 rounds; the product is discarded.
# Random generation is inside the timed loop.
for i in range(10):
    a = np.random.randn(10000,10000)
    b = np.random.randn(10000,10000)
    np.matmul(b, a)

CPU times: user 10min 23s, sys: 7.85 s, total: 10min 31s
Wall time: 6min 1s


In [54]:
%%time
# PyTorch CPU matmul of two 10000x10000 tensors, 10 rounds; the product is discarded.
for i in range(10):
    a_cpu = torch.randn([10000, 10000])
    b_cpu = torch.randn([10000, 10000])
    torch.matmul(a_cpu, b_cpu)

CPU times: user 4min 46s, sys: 6.3 s, total: 4min 52s
Wall time: 2min 39s


In [55]:
%%time
for i in range(10):
    a = torch.randn([10000, 10000], device=cuda0)
    b = torch.randn([10000, 10000], device=cuda0)
    torch.matmul(a, b)

CPU times: user 166 ms, sys: 49.7 ms, total: 215 ms
Wall time: 4.19 s


## Autodiff

In [60]:
torch.ones([3, 2])

tensor([[1., 1.],
        [1., 1.],
        [1., 1.]])

In [61]:
# requires_grad=True makes autograd record operations on x so gradients can be computed later.
x = torch.ones([3, 2], requires_grad=True)
print(x)

tensor([[1., 1.],
        [1., 1.],
        [1., 1.]], requires_grad=True)


In [62]:
# y is computed from x, so it carries a grad_fn (shown in the printout).
y = x + 5
print(y)

tensor([[6., 6.],
        [6., 6.],
        [6., 6.]], grad_fn=<AddBackward0>)


In [63]:
# z = y^2 + 1, element-wise.
z = y*y + 1
print(z)

tensor([[37., 37.],
        [37., 37.],
        [37., 37.]], grad_fn=<AddBackward0>)


In [66]:
a = torch.ones(2,3)
print(a)

tensor([[1., 1., 1.],
        [1., 1., 1.]])


In [67]:
a.sum()

tensor(6.)

In [68]:
torch.sum(a)

tensor(6.)

In [69]:
# Reduce z to a scalar so that backward() can be called without arguments.
t = torch.sum(z)
print(t)

tensor(222., grad_fn=<SumBackward0>)


In [70]:
t.backward()               # Backpropagates from t and stores dt/dx in x.grad; no values are updated

In [71]:
print(x.grad)

tensor([[12., 12.],
        [12., 12.],
        [12., 12.]])


$t = \sum_i z_i, z_i = y_i^2 + 1, y_i = x_i + 5$

$\frac{\partial t}{\partial x_i} = \frac{\partial z_i}{\partial x_i} = \frac{\partial z_i}{\partial y_i} \frac{\partial y_i}{\partial x_i} = 2y_i \times 1$


At x = 1, y = 6, $\frac{\partial t}{\partial x_i} = 12$

In [75]:
x = torch.ones([3, 2], requires_grad=True)
y = x + 5
r = 1/(1 + torch.exp(-y))   # logistic sigmoid of y, written out by hand
print(r)
s = torch.sum(r)            # scalar to differentiate
print(s.item())
s.backward()
print(x.grad)               # ds/dx = r * (1 - r), about 0.0025 at y = 6

tensor([[0.9975, 0.9975],
        [0.9975, 0.9975],
        [0.9975, 0.9975]], grad_fn=<MulBackward0>)
5.985164642333984
tensor([[0.0025, 0.0025],
        [0.0025, 0.0025],
        [0.0025, 0.0025]])


In [80]:
x = torch.ones([3, 2], requires_grad=True)
y = x + 5
r = 1/(1 + torch.exp(-y))
a = torch.ones_like(x)               # Same as torch.ones(3, 2)
r.backward(a)                        # Same as 1) s = torch.sum(r) 2) s.backward()
print(x.grad)

tensor([[0.0025, 0.0025],
        [0.0025, 0.0025],
        [0.0025, 0.0025]])


$\frac{\partial{s}}{\partial{x}} = \frac{\partial{s}}{\partial{r}} \cdot \frac{\partial{r}}{\partial{x}}$

In the code above, $a$ plays the role of $\frac{\partial{s}}{\partial{r}}$, so after `r.backward(a)` the attribute `x.grad` directly holds $\frac{\partial{s}}{\partial{x}}$.



## Autodiff example that looks like what we have been doing

In [82]:
# Synthetic data for the line y = 3x - 2. x is input data, not a parameter,
# so it does not need requires_grad; only w and b are learned.
x = torch.randn([20, 1])
y = 3*x - 2

In [83]:
# Start both parameters at 1 and let autograd track them.
w = torch.tensor([1.], requires_grad=True)
b = torch.tensor([1.], requires_grad=True)

# Forward pass of the linear model, then the summed squared error against y.
y_hat = w * x + b
residual = y_hat - y
loss = residual.pow(2).sum()

In [84]:
print(loss)

tensor(417.6251, grad_fn=<SumBackward0>)


In [87]:
loss.backward()              # Computes d(loss)/dw and d(loss)/db into w.grad and b.grad; w and b themselves are not changed

In [88]:
print(w.grad, b.grad)

tensor([-190.6491]) tensor([151.3173])


## Do it in a loop

In [95]:
# Plain gradient descent on the model y_hat = w*x + b, fitted to fresh
# samples of the line y = 3x - 2 at every step.
learning_rate = 0.01

w = torch.tensor([1.], requires_grad=True)
b = torch.tensor([1.], requires_grad=True)

print(w.item(), b.item())
print("******************")
for i in range(50):
    # New mini-batch of 20 points on the target line.
    x = torch.randn([20, 1])
    y = 3 * x - 2

    # Forward pass and summed squared error.
    y_hat = w * x + b
    residual = y_hat - y
    loss = residual.pow(2).sum()

    loss.backward()

    # Update outside of autograd tracking, then clear each gradient so the
    # next backward() starts from zero.
    with torch.no_grad():
        for param in (w, b):
            param -= learning_rate * param.grad
            param.grad.zero_()

    print(w.item(), b.item())
  

1.0 1.0
******************
1.4538754224777222 -0.15935003757476807
1.8653771877288818 -0.7478501796722412
2.3057668209075928 -1.366659164428711
2.61867618560791 -1.7204840183258057
2.7921957969665527 -1.818084955215454
2.8656489849090576 -1.8938246965408325
2.922668933868408 -1.9322118759155273
2.9452641010284424 -1.9581767320632935
2.9694762229919434 -1.9755834341049194
2.9881255626678467 -1.9876528978347778
2.9927940368652344 -1.9930614233016968
2.996001720428467 -1.9956861734390259
2.998696804046631 -1.9976470470428467
2.9989092350006104 -1.9984732866287231
2.9990663528442383 -1.9989922046661377
2.9993064403533936 -1.9993153810501099
2.9996612071990967 -1.9995900392532349
2.9997284412384033 -1.9997296333312988
2.9998116493225098 -1.9997941255569458
2.999943971633911 -1.999913215637207
2.9999868869781494 -1.9999524354934692
2.999985694885254 -1.9999690055847168
2.999990940093994 -1.9999816417694092
2.999997854232788 -1.9999899864196777
3.000000476837158 -1.9999942779541016
2.99999952

## Do it for a large problem

In [96]:
%%time
# CPU timing run: 200 gradient steps on a linear model with N = 10 million weights.
# NOTE(review): the loss and weights are never printed here; with ||x|| ~ sqrt(N)
# the step learning_rate * w.grad looks too large to converge — treat this as a
# speed benchmark and confirm before reusing it as training code.
learning_rate = 0.001
N = 10000000
epochs = 200

w = torch.rand([N], requires_grad=True)                    # On CPU
b = torch.ones([1], requires_grad=True)

# print(torch.mean(w).item(), b.item())

for i in range(epochs):
  
    x = torch.randn([N])
    # Target value: dot product of x with an all-3 weight vector, minus 2.
    y = torch.dot(3*torch.ones([N]), x) - 2

    y_hat = torch.dot(w, x) + b
    loss = torch.sum((y_hat - y)**2)

    loss.backward()
  
    # Apply the update without recording it in the autograd graph.
    with torch.no_grad():
        w -= learning_rate * w.grad
        b -= learning_rate * b.grad

        # Reset the gradients before the next backward() call.
        w.grad.zero_()
        b.grad.zero_()

#   print(torch.mean(w).item(), b.item())
  

CPU times: user 42.3 s, sys: 20.3 s, total: 1min 2s
Wall time: 44.9 s


In [97]:
%%time
learning_rate = 0.001
N = 10000000
epochs = 200

w = torch.rand([N], requires_grad=True, device=cuda0)            # On GPU 
b = torch.ones([1], requires_grad=True, device=cuda0)

# print(torch.mean(w).item(), b.item())

for i in range(epochs):
  
    x = torch.randn([N], device=cuda0)
    y = torch.dot(3*torch.ones([N], device=cuda0), x) - 2

    y_hat = torch.dot(w, x) + b
    loss = torch.sum((y_hat - y)**2)

    loss.backward()
  
    with torch.no_grad():
        w -= learning_rate * w.grad
        b -= learning_rate * b.grad

        w.grad.zero_()
        b.grad.zero_()

  #print(torch.mean(w).item(), b.item())
  

CPU times: user 950 ms, sys: 277 ms, total: 1.23 s
Wall time: 1.34 s
