In [1]:
import torch
import matplotlib.pyplot as plt
import pickle
import gzip

## Dataset download

In [8]:
!wget http://deeplearning.net/data/mnist/mnist.pkl.gz
!mkdir datasets
!mv mnist.pkl.gz datasets/

--2020-10-03 10:14:22--  http://deeplearning.net/data/mnist/mnist.pkl.gz
Resolving deeplearning.net (deeplearning.net)... 132.204.26.28
Connecting to deeplearning.net (deeplearning.net)|132.204.26.28|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 16168813 (15M) [application/x-gzip]
Saving to: ‘mnist.pkl.gz’


2020-10-03 10:14:28 (2,81 MB/s) - ‘mnist.pkl.gz’ saved [16168813/16168813]

zsh:1: command not found: tgzip


In [2]:
# Path of the gzipped MNIST pickle (the download cell moves the file into datasets/).
PATH = 'datasets/mnist.pkl.gz'
with gzip.open(PATH, 'rb') as f:
    # NOTE(review): pickle.load can execute arbitrary code embedded in the file;
    # only load this archive if the download source is trusted.
    # encoding='latin-1' is presumably needed because the pickle was written by
    # Python 2 -- TODO confirm.
    # The third element of the unpickled tuple is discarded here.
    ((x_train, y_train), (x_valid, y_valid), _) = pickle.load(f, encoding='latin-1')

In [3]:
# Convert the four loaded objects to torch tensors, rebinding the same names.
# Note: this cell is not idempotent in spirit -- re-running it passes tensors to torch.tensor.
x_train, y_train, x_valid, y_valid = map(torch.tensor, (x_train, y_train, x_valid, y_valid))

In [4]:
x_train.shape, y_train.shape, x_valid.shape, y_valid.shape

(torch.Size([50000, 784]),
 torch.Size([50000]),
 torch.Size([10000, 784]),
 torch.Size([10000]))

In [5]:
x_train.min(), x_train.max(), x_valid.min(), x_valid.max()

(tensor(0.), tensor(0.9961), tensor(0.), tensor(0.9961))

In [6]:
y_train.min(), y_train.max(), y_valid.min(), y_valid.max()

(tensor(0), tensor(9), tensor(0), tensor(9))

## Initial model

In [7]:
# Randomly initialised parameters of a single linear layer:
# 784 matches the per-sample feature count shown above, 10 matches the label range 0-9.
# No random seed is set, so the values differ on every run.
weights = torch.randn(784, 10)
bias = torch.randn(10)

### Matrix multiplication

In [8]:
def matmul(a, b):
    """Multiply two 2-D tensors using plain Python loops (reference version).

    Args:
        a: tensor of shape (n, m).
        b: tensor of shape (m, p).

    Returns:
        A new tensor of shape (n, p) created with ``torch.zeros`` and filled
        with ``sum_k a[i, k] * b[k, j]``.

    Raises:
        AssertionError: if the inner dimensions of ``a`` and ``b`` differ.
    """
    n_rows, inner = a.shape
    inner_b, n_cols = b.shape
    assert inner == inner_b
    out = torch.zeros(n_rows, n_cols)
    for row in range(n_rows):
        for col in range(n_cols):
            # Accumulate the products over the shared dimension in ascending order.
            for k in range(inner):
                out[row, col] += a[row, k] * b[k, col]
    return out

In [9]:
m1 = x_valid[:5]
m2 = weights
m1.shape, m2.shape

(torch.Size([5, 784]), torch.Size([784, 10]))

In [12]:
%time t1 = matmul(m1, m2); t1.shape

CPU times: user 711 ms, sys: 3.89 ms, total: 715 ms
Wall time: 717 ms


torch.Size([5, 10])

About 0.7 s for only 5 rows, i.e. roughly 145 ms per row. For the entire 50000 training rows that is *(145 ms * 50000 rows)*, which is **approximately 2 hours**.


The way to make Python faster is to remove Python.

In [10]:
len(x_train)

50000

### Pytorch elementwise operations

Operations (+, -, *, /, >, <, ==)

In [11]:
a = torch.randn(10)
b = torch.randn(10)

In [12]:
(a < b)

tensor([False, False,  True, False,  True, False,  True, False,  True, False])

In [13]:
(a < b).float().mean()

tensor(0.4000)

40% of the elements of **a** are less than the corresponding elements of **b**.

**Frobenius Norm (Matrix Normalization)**

The Frobenius norm, sometimes also called the Euclidean norm (a term unfortunately also used for the vector L^2-norm), is the matrix norm of an m×n matrix A defined as the square root of the sum of the absolute squares of its elements. [Wolfram](https://mathworld.wolfram.com/FrobeniusNorm.html)

$$\|A\|_\text{F} = \sqrt{\sum_{i=1}^m \sum_{j=1}^n |a_{ij}|^2} $$

The Frobenius norm can also be considered as a vector norm. 

In [14]:
m = torch.tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=torch.float32)

In [15]:
def frobeniusNorm(x):
    """Compute the Frobenius norm of a matrix with explicit Python loops.

    The norm is the square root of the sum of the squares of all elements.

    Args:
        x: 2-D tensor of shape (rows, cols); it does not have to be square.

    Returns:
        The square root of the accumulated sum of squares (a 0-dim tensor
        whenever ``x`` has at least one element).
    """
    a = 0.
    for i in range(x.shape[0]):
        # Iterate over the column count (shape[1]); using shape[0] here would
        # skip columns of wide matrices and index out of range on tall ones.
        for j in range(x.shape[1]):
            a += x[i, j] * x[i, j] #sum
    return a ** (1/2) #sqrt
%time frobeniusNorm(m)

CPU times: user 555 µs, sys: 343 µs, total: 898 µs
Wall time: 643 µs


tensor(16.8819)

or 

In [16]:
%time (m*m).sum().sqrt()

CPU times: user 166 µs, sys: 96 µs, total: 262 µs
Wall time: 206 µs


tensor(16.8819)

### Matrix multiplication optimization

In [17]:
m1 = torch.randn((2,4))
m2 = torch.randn((4,6))

In [18]:
def matmulv2(a, b):
    """Multiply two 2-D tensors, vectorising the innermost loop.

    Each output element is the sum of the elementwise product of one row of
    ``a`` with one column of ``b``.

    Args:
        a: tensor of shape (n, m).
        b: tensor of shape (m, p).

    Returns:
        A new tensor of shape (n, p) created with ``torch.zeros``.

    Raises:
        AssertionError: if the inner dimensions of ``a`` and ``b`` differ.
    """
    n_rows, inner = a.shape
    inner_b, n_cols = b.shape
    assert inner == inner_b
    result = torch.zeros(n_rows, n_cols)
    for r in range(n_rows):
        row_vec = a[r, :]
        for col in range(n_cols):
            col_vec = b[:, col]
            # Elementwise product followed by a reduction replaces the k-loop.
            result[r, col] += (row_vec * col_vec).sum()
    return result

In [19]:
%timeit -n 10 matmulv2(m1, m2)

359 µs ± 40.5 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [21]:
%timeit -n 10 matmul(m1, m2)

999 µs ± 134 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


359 µs (`matmulv2`) vs 999 µs (`matmul`)

### Broadcasting Rules

Broadcasting is when tensor arguments can be automatically expanded to be of equal sizes, without making copies of the data.

Rules:
- each tensor has at least 1 dim
- starting from the trailing dimension, the dimension sizes must *be equal*, one of them must be 1, or one of them must not exist.

In [22]:
(torch.empty(5,3,4,1) * torch.empty(  3,1,1)).shape

torch.Size([5, 3, 4, 1])

Broadcastable: both rules hold for every dimension.

In [23]:
(torch.empty(5,2,4,1) * torch.empty(  3,1,1)).shape

RuntimeError: The size of tensor a (2) must match the size of tensor b (3) at non-singleton dimension 1

Not broadcastable: in the third dimension from the end (dimension 1) the sizes are neither equal nor 1 (2 vs. 3).

In [24]:
a = torch.tensor([1,2,3,4])
a, a.shape

(tensor([1, 2, 3, 4]), torch.Size([4]))

In [25]:
b = torch.tensor([4,5,6,7])
b, b.shape

(tensor([4, 5, 6, 7]), torch.Size([4]))

How to add a new axis

In [26]:
a[:, None], a[None, :]

(tensor([[1],
         [2],
         [3],
         [4]]),
 tensor([[1, 2, 3, 4]]))

In [27]:
a[:, None] * b[None, :]

tensor([[ 4,  5,  6,  7],
        [ 8, 10, 12, 14],
        [12, 15, 18, 21],
        [16, 20, 24, 28]])

In [28]:
a[:, None] + b[None, :]

tensor([[ 5,  6,  7,  8],
        [ 6,  7,  8,  9],
        [ 7,  8,  9, 10],
        [ 8,  9, 10, 11]])

In [29]:
a = torch.randn(2, 4)
b = torch.randn(4, 2)

In [30]:
a[0, None]

tensor([[ 1.7881,  1.0319, -0.1971, -0.6233]])

In [31]:
b[:, None, 0]

tensor([[ 0.2307],
        [-1.7750],
        [-0.0385],
        [-0.7283]])

In [32]:
(a[0, None] * b[:, None, 0])

tensor([[ 0.4125,  0.2381, -0.0455, -0.1438],
        [-3.1738, -1.8316,  0.3498,  1.1064],
        [-0.0689, -0.0398,  0.0076,  0.0240],
        [-1.3023, -0.7515,  0.1435,  0.4540]])

### Matmul with broadcasting

In [33]:
def matmulv3(a, b):
    """Multiply two 2-D tensors one output row at a time using broadcasting.

    For each row of ``a``, the row is turned into a column of shape (m, 1),
    broadcast against ``b`` of shape (m, p), and summed over the first axis.

    Args:
        a: tensor of shape (n, m).
        b: tensor of shape (m, p).

    Returns:
        A new tensor of shape (n, p) created with ``torch.zeros``.

    Raises:
        AssertionError: if the inner dimensions of ``a`` and ``b`` differ.
    """
    n_rows, inner = a.shape
    inner_b, n_cols = b.shape
    assert inner == inner_b
    result = torch.zeros(n_rows, n_cols)
    for r in range(n_rows):
        # (m, 1) * (m, p) -> (m, p): every row k of b is scaled by a[r, k].
        scaled = a[r][:, None] * b
        result[r] = scaled.sum(dim=0)
    return result

In [34]:
m1.shape, m2.shape

(torch.Size([2, 4]), torch.Size([4, 6]))

In [35]:
x = matmulv3(m1, m2)
x.shape

torch.Size([2, 6])

In [41]:
%timeit matmulv3(m1, m2)

36.7 µs ± 629 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [37]:
a[0, :, None].shape

torch.Size([4, 1])

`a[0, :, None]` has shape (4, 1) and **b** has shape (4, 2), so the two are now broadcastable.

In [38]:
a[0, :, None] * b

tensor([[ 0.4125,  0.7097],
        [-1.8316, -0.9520],
        [ 0.0076, -0.4340],
        [ 0.4540, -0.0903]])

### Einstein summation

Einstein summation convention is a notational convention that implies summation over a set of indexed terms in a formula, thus achieving notational brevity. [Wikipedia](https://en.wikipedia.org/wiki/Einstein_notation)

There are essentially [three rules](https://mathworld.wolfram.com/EinsteinSummation.html):

1. Repeated indices are implicitly summed over.

2. Each index can appear at most twice in any term.

3. Each term must contain identical non-repeated indices. 

Example:

$$y = \sum_{i = 1}^3 c_i x^i = c_1 x^1 + c_2 x^2 + c_3 x^3$$

is simplified by the convention to:

$$y = c_i x^i$$

In [39]:
# c[i, j] += a[i, k] * b[k, j]
def matmulv4(a, b):
    """Multiply two 2-D tensors via Einstein summation.

    The subscript string mirrors the loop body above: the repeated index
    ``k`` is summed over, leaving the output indexed by ``i`` and ``j``.

    Args:
        a: tensor indexed as (i, k).
        b: tensor indexed as (k, j).

    Returns:
        The tensor produced by ``torch.einsum`` for ``'ik,kj->ij'``.
    """
    product = torch.einsum('ik,kj->ij', a, b)
    return product

In [45]:
%timeit -n 10 matmulv4(m1, m2)

49.1 µs ± 19.1 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


### Pytorch Matmul

In [46]:
%timeit -n 10 m1.matmul(m2)

The slowest run took 16.24 times longer than the fastest. This could mean that an intermediate result is being cached.
11.2 µs ± 17.5 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
