In [1]:
# Reading files
import io
import numpy as np
import pandas as pd
import PIL.Image as Image
# Training model
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
torch.set_default_dtype(torch.double)

## 1. Data

### 1.1 Load from HF
[USPS](https://huggingface.co/datasets/flwrlabs/usps) is a digit dataset scanned from envelopes by the U.S. Postal Service containing a total of $9,298$ samples of handwritten digits:
* $7,291$ digits are used for training
* $2,007$ digits are used for testing
* each image is $16$ x $16$ pixels grayscale (not binary)
* the images are within $[0,255]$ range, but we will normalize it to $[-1,1]$
* the images are centered
* they show a broad range of font styles

One important feature of these images is that both the training set and the testing set contain numerous examples that are ambiguous, unclassifiable, or even misclassified.

In [2]:
splits = {'train': 'data/train-00000-of-00001.parquet', 'test': 'data/test-00000-of-00001.parquet'}
train_df = pd.read_parquet("hf://datasets/flwrlabs/usps/" + splits["train"])
test_df = pd.read_parquet("hf://datasets/flwrlabs/usps/" + splits["test"])
train_df.head()

Unnamed: 0,image,label
0,{'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...,6
1,{'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...,5
2,"{'bytes': b""\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...",4
3,{'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...,7
4,{'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...,3


### 1.2 Convert bytes to integers

In [3]:
def convert_image_bytes_to_int(img: dict) -> np.ndarray:
    # Convert the image bytes to numpy arrays
    image = Image.open(io.BytesIO(img['bytes']))
    return np.array(image)

def convert_image_df_to_numpy(image_df: pd.DataFrame) -> np.ndarray:
    images = image_df['image'].apply(convert_image_bytes_to_int)
    # Convert to numpy array (consider the shape of all images is the same - 16x16)
    train_images = np.zeros((len(images), 1, 16, 16))
    for i, image in enumerate(images):
        train_images[i][0] = image
    return train_images

train_images = convert_image_df_to_numpy(train_df)

print(f"=== TRAIN SET ===")
print(f"train_images shape: {train_images.shape}")
print(f"train_image_0 shape: {train_images[0][0].shape}")

=== TRAIN SET ===
train_images shape: (7291, 1, 16, 16)
train_image_0 shape: (16, 16)


In [4]:
test_images = convert_image_df_to_numpy(test_df)
print(f"=== TEST SET ===")
print(f"test_images shape: {test_images.shape}")
print(f"test_image_0 shape: {test_images[0][0].shape}")

=== TEST SET ===
test_images shape: (2007, 1, 16, 16)
test_image_0 shape: (16, 16)


## 2. Preprocess

### 2.1 Normalize data [-1, 1]

In order to prevent exploding gradients and slower computation we will scale the data to smaller numbers. 

In the original paper the data was scaled to the range of $[-1,1]$. We will do the same.

In [5]:
# Scale data [-1, 1]. We first scale the data to the range of [0, 2] and then shift it to the range of [-1, 1].
train_images = train_images / 127.5 - 1
test_images = test_images / 127.5 - 1

### 2.2 One-hot encode the labels

In [6]:
### Convert labels to one-hot encoding
def convert_labels_to_one_hot(df: pd.DataFrame) -> np.ndarray:
    labels = df['label'].values
    labels = np.eye(10)[labels]
    return labels

train_labels = convert_labels_to_one_hot(train_df)
# Transform each label from horizontal to vertical
train_labels = train_labels.reshape(train_labels.shape[0], 10)
print(f"=== TRAIN SET ===")
print(f"train_labels.shape: {train_labels.shape}")
print(f"train_labels[0]: {train_labels[0]}")

=== TRAIN SET ===
train_labels.shape: (7291, 10)
train_labels[0]: [0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]


In [7]:
test_labels = convert_labels_to_one_hot(test_df)
print(f"=== TEST SET ===")
print(f"test_labels.shape: {test_labels.shape}")
print(f"test_labels[0]: {test_labels[0]}")

=== TEST SET ===
test_labels.shape: (2007, 10)
test_labels[0]: [0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]


## 3. Neural Net Design

### 3.1 Random weights
Weights are initialized with random values within $U[-2.4/F, 2.4/F]$ range, where $F$ is the fan-in (number of inputs, connected to a neuron or a layer). 

**Reasoning**: *"tends to keep total inputs in operating range of sigmoid"*


### 3.2 Activation function - LiSHT
Basically, this function computes linearly scaled hyperbolic tangent:

$$\text{lisht}(x) = x * \text{tanh}(x)$$

#### Why Tanh

In this paper LeCun's team applied **scaled hyperbolic tangent** (i.e. tanh) function to the output of each layer in the neural net. So the question is why this function? Why not simple `sigmoid` which is basically the same function, but within the $[0,1]$ range (`tanh`'s range is $[-1, 1]$).

The only reason I can come up with is that `tanh` has steeper gradients than sigmoid (due to the bigger range), which means faster learning. Of course, we can achieve faster learning with higher `learning rate`. However, latter would increase the risk of divergence.

Read more [HERE >>>](https://stats.stackexchange.com/questions/330559/why-is-tanh-almost-always-better-than-sigmoid-as-an-activation-function) .

LeCun himself wrote following text in another paper:
* *Symmetric functions of that kind are believed to yield **faster convergence**, although the learning can be extremely slow if some weights are too small (LeCun 1987).*

### 3.3 Cost function - MSE
The output cost function was the mean squared error. 

### 3.4 Stochastic gradient descent
In this paper SGD is chosen over Batch GD with following argument:
* *The weights were updated according to the so-called stochastic gradient or "on-line" procedure (updating after each presentation of a single pattern) as opposed to the "true" gradient procedure (averaging over the whole training set before updating the weights). From empirical study (supported by theoretical arguments), the stochastic gradient was found to converge much faster than the true gradient, especially on large, redundant data bases. It also finds solutions that are
more robust.*

<center><img src="img/lecun_zip_code_nn.png" alt="Neural Network Architecture" width="921" height="468" /></center>
<p style="text-align: center; font-size: small;"><i><b>Figure 1.</b> 1989 LeCun ConvNet per description in the paper</i></p>

In [8]:
class Net(nn.Module):
    """ 1989 LeCun ConvNet per description in the paper """

    def __init__(self):
        super().__init__()

        # initialization as described in the paper to my best ability, but it doesn't look right...
        winit = lambda fan_in, *shape: (torch.rand(*shape) - 0.5) * 2 * 2.4 / fan_in**0.5
        macs = 0 # keep track of MACs (multiply accumulates)
        acts = 0 # keep track of number of activations

        # H1 layer parameters and their initialization
        self.H1w = nn.Parameter(winit(5*5*1, 12, 1, 5, 5))
        self.H1b = nn.Parameter(torch.zeros(12, 8, 8)) # presumably init to zero for biases
        assert self.H1w.nelement() + self.H1b.nelement() == 1068
        macs += (5*5*1) * (8*8) * 12
        acts += (8*8) * 12

        # H2 layer parameters and their initialization
        """
        H2 neurons all connect to only 8 of the 12 input planes, with an unspecified pattern
        I am going to assume the most sensible block pattern where 4 planes at a time connect
        to differently overlapping groups of 8/12 input planes. We will implement this with 3
        separate convolutions that we concatenate the results of.
        """
        self.H2w = nn.Parameter(winit(5*5*8, 12, 8, 5, 5))
        self.H2b = nn.Parameter(torch.zeros(12, 4, 4)) # presumably init to zero for biases
        assert self.H2w.nelement() + self.H2b.nelement() == 2592
        macs += (5*5*8) * (4*4) * 12
        acts += (4*4) * 12

        # H3 is a fully connected layer
        self.H3w = nn.Parameter(winit(4*4*12, 4*4*12, 30))
        self.H3b = nn.Parameter(torch.zeros(30))
        assert self.H3w.nelement() + self.H3b.nelement() == 5790
        macs += (4*4*12) * 30
        acts += 30

        # output layer is also fully connected layer
        self.outw = nn.Parameter(winit(30, 30, 10))
        self.outb = nn.Parameter(-torch.ones(10)) # 9/10 targets are -1, so makes sense to init slightly towards it
        assert self.outw.nelement() + self.outb.nelement() == 310
        macs += 30 * 10
        acts += 10

        self.macs = macs
        self.acts = acts

    def forward(self, x):
        # x has shape (1, 1, 16, 16)
        x = F.pad(x, (2, 2, 2, 2), 'constant', -1.0) # pad by two using constant -1 for background
        x = F.conv2d(x, self.H1w, stride=2) + self.H1b
        x = torch.tanh(x)

        # x is now shape (1, 12, 8, 8)
        x = F.pad(x, (2, 2, 2, 2), 'constant', -1.0) # pad by two using constant -1 for background
        slice1 = F.conv2d(x[:, 0:8], self.H2w[0:4], stride=2) # first 4 planes look at first 8 input planes
        slice2 = F.conv2d(x[:, 4:12], self.H2w[4:8], stride=2) # next 4 planes look at last 8 input planes
        slice3 = F.conv2d(torch.cat((x[:, 0:4], x[:, 8:12]), dim=1), self.H2w[8:12], stride=2) # last 4 planes are cross
        x = torch.cat((slice1, slice2, slice3), dim=1) + self.H2b
        x = torch.tanh(x)

        # x is now shape (1, 12, 4, 4)
        x = x.flatten(start_dim=1) # (1, 12*4*4)
        x = x @ self.H3w + self.H3b
        x = torch.tanh(x)

        # x is now shape (1, 30)
        x = x @ self.outw + self.outb
        x = torch.tanh(x)

         # x is finally shape (1, 10)
        return x

## 4. Train

* **NB**: In order to run following code you need CUDA enabled GPU. 
* **Running time**: Few minutes
* **Results**: 95% accuracy on the testing set

In [9]:
# init rng
torch.manual_seed(1337)
np.random.seed(1337)
torch.use_deterministic_algorithms(False)

# init a model
model = Net()
model.to('cuda')
print("model stats:")
print("# params:      ", sum(p.numel() for p in model.parameters())) # in paper total is 9,760
print("# MACs:        ", model.macs)
print("# activations: ", model.acts)

# init data
Xtr, Ytr = torch.from_numpy(train_images).to('cuda'), torch.from_numpy(train_labels).to('cuda')
Xte, Yte = torch.from_numpy(test_images).to('cuda'), torch.from_numpy(test_labels).to('cuda')

# init optimizer
optimizer = optim.SGD(model.parameters(), lr=0.03)

def eval_split(split):
    # eval the full train/test set, batched implementation for efficiency
    model.eval()
    X, Y = (Xtr, Ytr) if split == 'train' else (Xte, Yte)
    Yhat = model(X)
    loss = torch.mean((Y - Yhat)**2)
    err = torch.mean((Y.argmax(dim=1) != Yhat.argmax(dim=1)).float())
    print(f"eval: split {split:5s}. loss {loss.item():e}. error {err.item()*100:.2f}%. misses: {int(err.item()*Y.size(0))}")

# train (23 iterations were used in the paper as well)
for pass_num in range(23):

    # perform one epoch of training
    model.train()
    for step_num in range(Xtr.size(0)):

        # fetch a single example into a batch of 1
        x, y = Xtr[[step_num]], Ytr[[step_num]]

        # forward the model and the loss
        yhat = model(x)
        loss = torch.mean((y - yhat)**2)

        # calculate the gradient and update the parameters
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

    # after epoch epoch evaluate the train and test error / metrics
    print(pass_num + 1)
    eval_split('train')
    eval_split('test')

model stats:
# params:       9760
# MACs:         63660
# activations:  1000
1
eval: split train. loss 3.133231e-02. error 9.97%. misses: 726
eval: split test . loss 3.674993e-02. error 14.50%. misses: 291
2
eval: split train. loss 2.061267e-02. error 5.95%. misses: 433
eval: split test . loss 2.630060e-02. error 10.06%. misses: 202
3
eval: split train. loss 1.508845e-02. error 4.53%. misses: 329
eval: split test . loss 2.083232e-02. error 8.87%. misses: 178
4
eval: split train. loss 1.185159e-02. error 3.81%. misses: 277
eval: split test . loss 1.776443e-02. error 8.62%. misses: 173
5
eval: split train. loss 9.951142e-03. error 3.42%. misses: 248
eval: split test . loss 1.597637e-02. error 8.27%. misses: 166
6
eval: split train. loss 8.715297e-03. error 3.11%. misses: 226
eval: split test . loss 1.480246e-02. error 7.87%. misses: 158
7
eval: split train. loss 7.805854e-03. error 2.89%. misses: 211
eval: split test . loss 1.393636e-02. error 7.32%. misses: 147
8
eval: split train. loss