# Recognizing Numbers in MNIST Dataset

Today's exercise focuses on recognizing handwritten digits from the [MNIST dataset](https://en.wikipedia.org/wiki/MNIST_database).

In [1]:
import torch
import torchvision
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torch.nn.functional as F

import matplotlib.pyplot as plt
import numpy as np

In [2]:
def imshow(img):
    """Display an image tensor with matplotlib.

    The tensor is converted to a NumPy array and its axes are reordered
    from (0, 1, 2) to (1, 2, 0) before plotting, i.e. the leading axis is
    moved to the end (presumably channel-first input such as the output of
    ``make_grid`` -- confirm against callers).
    """
    # matplotlib wants the leading axis last, so permute before drawing.
    reordered = np.transpose(img.numpy(), axes=(1, 2, 0))
    plt.imshow(reordered)
    plt.show()

### Convolutional Network

For detection and recognition, we'll use a [LeNet model](https://en.wikipedia.org/wiki/LeNet).

In [3]:
class LeNet(torch.nn.Module):
    """LeNet-style convolutional classifier that outputs 10 scores per image.

    Layout: conv(1->6, 5x5, pad 2) -> ReLU -> 2x2 max-pool ->
    conv(6->16, 5x5) -> ReLU -> 2x2 max-pool -> FC 400->120 -> ReLU ->
    FC 120->84 -> ReLU -> FC 84->10.

    For a 1x28x28 input the spatial size goes 28 -> 28 -> 14 -> 10 -> 5, so
    the flattened feature vector has 16*5*5 = 400 entries.
    """

    def __init__(self):
        super().__init__()
        nn = torch.nn

        # First convolution stage. LeNet-5 was designed for 32x32 inputs, so
        # a padding of 2 is applied here.
        self.conv1 = nn.Conv2d(1, 6, kernel_size=5, stride=1, padding=2, bias=True)
        self.max_pool_1 = nn.MaxPool2d(kernel_size=2)

        # Second convolution stage (no padding).
        self.conv2 = nn.Conv2d(6, 16, kernel_size=5, stride=1, padding=0, bias=True)
        self.max_pool_2 = nn.MaxPool2d(kernel_size=2)

        # Fully connected head: 400 -> 120 -> 84 -> 10 features.
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        """Run the network on ``x`` and return the raw (un-normalised) scores."""
        # Each convolution is followed by ReLU and a 2x2 max-pool.
        x = self.max_pool_1(F.relu(self.conv1(x)))
        x = self.max_pool_2(F.relu(self.conv2(x)))

        # Flatten to rows of 16*5*5 columns; -1 lets view() infer the row count.
        x = x.view(-1, self.fc1.in_features)

        # Two hidden FC layers with ReLU, then the linear output layer.
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.fc3(x)

### Model Training

We train the model. The running loss is saved to a text file (`loss.txt`), which you can use for charting.

In [4]:
def train(data, model, num_epochs=5, learning_rate=0.01, log_every=500,
          loss_path="loss.txt", checkpoint_path="./model.pth"):
    """Train ``model`` with SGD and cross-entropy loss, then save its weights.

    Every ``log_every`` mini-batches the mean loss over those batches is
    printed and appended to ``loss_path`` as ``"<report index> <mean loss>"``
    (the report index counts up from 1 across all epochs). Losses from a
    trailing partial window at the end of an epoch are not reported. After
    the last epoch the model's ``state_dict`` is stored in ``checkpoint_path``
    inside a dictionary under the key ``'state_dict'``.

    Args:
        data: iterable of samples where ``sample[0]`` is the input batch and
            ``sample[1]`` the target labels (e.g. a ``DataLoader``).
        model: the ``torch.nn.Module`` to optimise in place.
        num_epochs: number of passes over ``data``.
        learning_rate: SGD learning rate.
        log_every: number of mini-batches per loss report; must be >= 1.
        loss_path: text file the loss reports are written to (overwritten).
        checkpoint_path: file the trained weights are saved to.

    Raises:
        ValueError: if ``log_every`` is smaller than 1.
    """
    if log_every < 1:
        raise ValueError("log_every must be a positive integer")

    model.train()

    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

    report_index = 1
    with open(loss_path, "wt") as loss_file:
        for epoch in range(num_epochs):
            running_loss = 0.0
            for step, sample in enumerate(data, start=1):
                inputs, labels = sample[0], sample[1]

                # Gradients accumulate by default, so clear them per batch.
                optimizer.zero_grad()
                loss = criterion(model(inputs), labels)
                loss.backward()
                optimizer.step()

                running_loss += loss.item()
                if step % log_every == 0:
                    mean_loss = running_loss / log_every
                    print('[%d, %d] loss: %.3f' % (epoch + 1, step, mean_loss))
                    loss_file.write("{0} {1}\n".format(report_index, mean_loss))
                    report_index += 1
                    running_loss = 0.0

    torch.save({'state_dict': model.state_dict()}, checkpoint_path)

### Model Validation

We validate our trained model on the validation set.

You can enable or disable the display of each misclassified image by changing the value of `show_image`.

In [5]:
def validation(data, model, show_image=True):
    """Evaluate ``model`` on ``data`` and report the classification error.

    Every sample of every batch is compared against its label, so the result
    is correct for any batch size (with batch size 1 it matches a per-batch
    count). The forward passes run under ``torch.no_grad()`` because no
    gradients are needed for evaluation.

    Args:
        data: iterable yielding ``(images, labels)`` batches (e.g. a
            ``DataLoader``).
        model: the ``torch.nn.Module`` to evaluate; it is switched to eval
            mode.
        show_image: when true, print the real and predicted label of each
            misclassified sample and display its image via ``imshow``.

    Returns:
        The error rate in percent (0.0 - 100.0), which is also printed.

    Raises:
        ValueError: if ``data`` yields no samples.
    """
    model.eval()
    print("Validating...")

    num_samples = 0
    num_incorrect = 0
    with torch.no_grad():
        for images, labels in data:
            predicted = model(images).argmax(dim=1)
            mismatches = predicted != labels

            num_samples += labels.size(0)
            num_incorrect += int(mismatches.sum().item())

            if show_image:
                for idx in mismatches.nonzero().flatten().tolist():
                    s = "Real: {0}\t Predicted: {1}".format(labels[idx].item(), predicted[idx].item())
                    print(s)
                    # Slice (not index) to keep the batch axis for make_grid.
                    imshow(torchvision.utils.make_grid(images[idx:idx + 1]))

    if num_samples == 0:
        raise ValueError("validation data is empty")

    error_percent = 100.0 * num_incorrect / num_samples
    print("Validation Error: {0} %".format(error_percent))
    return error_percent

### Your Task

Implement a sliding window to recognize numbers in any location in a given image. We do not expect numbers to be rotated, so this is much simplified.

In [6]:
def sliding_window(model, image, size):
    """
    Implement a sliding window to recognize numbers in any location in a given image.
    We do not expect numbers to be rotated, so this is much simplified.

    This is an exercise placeholder: the body is not implemented yet, so the
    function currently does nothing and returns None.

    Args:
        model: presumably the trained classifier to apply to each window
            (the commented-out call in main() passes the LeNet model) --
            confirm when implementing.
        image: the image to scan; the commented-out call in main() passes the
            result of cv2.imread('numbers.png', 0) -- TODO confirm the
            expected type/format.
        size: the window size; the commented-out call in main() passes
            (28, 28).
    """
    # TODO: implement the sliding-window scan (left as the exercise task).
    pass

### Run the Whole Thing

On the first run, `DataLoader` will download MNIST dataset using `torchvision`'s class.

Also, once the model is trained, you don't have to train it again on the next run: just uncomment the `torch.load` line and comment out `model = LeNet()` and the call to the `train` function.

Uncomment the `sliding_window` call to test your implementation.

In [7]:
def main():
    """Train a fresh LeNet on MNIST and report its error on the test split.

    The MNIST data is downloaded into ``./data`` if needed. Training uses
    shuffled mini-batches of 16; the test loader uses the DataLoader default
    batch size.
    """
    # Preprocessing applied to every image: grayscale, resize to 28, tensor.
    preprocess = transforms.Compose([
        transforms.Grayscale(),
        transforms.Resize(28),
        transforms.ToTensor(),
    ])

    batch_size_train = 16

    train_set = datasets.MNIST('./data', train=True, download=True, transform=preprocess)
    train_loader = torch.utils.data.DataLoader(train_set, batch_size=batch_size_train, shuffle=True)
    test_set = datasets.MNIST('./data', train=False, download=True, transform=preprocess)
    test_loader = torch.utils.data.DataLoader(test_set)

    #trainfolder = datasets.ImageFolder("train", transform)
    #train_loader = torch.utils.data.DataLoader(trainfolder, batch_size=batch_size_train, shuffle=True)

    # NOTE(review): train() saves a dict with a 'state_dict' entry to
    # ./model.pth, so the torch.load line below would return that dict rather
    # than a model; it presumably needs load_state_dict on a LeNet instance --
    # confirm before enabling.
    model = LeNet()
    #model = torch.load("./model.pth") 

    train(train_loader, model)
    validation(test_loader, model)

    # NOTE(review): the lines below need cv2, which the visible import cell
    # does not import -- confirm before enabling.
    #img = cv2.imread('numbers.png', 0)
    #sliding_window(model, img, (28, 28))

In [None]:
# Entry point: run the whole pipeline (data loading, training, validation)
# only when this module is executed directly, not when it is imported.
if __name__ == '__main__':
    main()

[1, 500] loss: 2.293
[1, 1000] loss: 1.781
[1, 1500] loss: 0.620
[1, 2000] loss: 0.401
[1, 2500] loss: 0.320
[1, 3000] loss: 0.244
[1, 3500] loss: 0.202
[2, 500] loss: 0.160
[2, 1000] loss: 0.150
[2, 1500] loss: 0.129
[2, 2000] loss: 0.127
[2, 2500] loss: 0.119
[2, 3000] loss: 0.099
[2, 3500] loss: 0.107
[3, 500] loss: 0.090
[3, 1000] loss: 0.092
[3, 1500] loss: 0.086
[3, 2000] loss: 0.084
[3, 2500] loss: 0.081
