In [1]:
import torch
import torchvision
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import matplotlib.pyplot as plt
import numpy as np
import torch.nn as nn
import torch.optim as optim
import random

In [3]:
# Load the MNIST train and test splits, storing the files under ./data
# (download=True asks torchvision to fetch them when needed).
# NOTE: later cells read the raw `.data` / `.targets` tensors directly and rescale
# the pixels by hand instead of going through the ToTensor() transform passed here.
trainset = datasets.MNIST(root='./data', train=True, download=True, transform=transforms.ToTensor())
testset = datasets.MNIST(root='./data', train=False, download=True, transform=transforms.ToTensor())

# Note: I originally planned to do this with the CIFAR data set, but training was too slow,
# so we are sticking with MNIST, which is what we have been working with so far.

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to ./data\MNIST\raw\train-images-idx3-ubyte.gz


100.0%


Extracting ./data\MNIST\raw\train-images-idx3-ubyte.gz to ./data\MNIST\raw


100.0%


Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to ./data\MNIST\raw\train-labels-idx1-ubyte.gz
Extracting ./data\MNIST\raw\train-labels-idx1-ubyte.gz to ./data\MNIST\raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to ./data\MNIST\raw\t10k-images-idx3-ubyte.gz



100.0%
100.0%


Extracting ./data\MNIST\raw\t10k-images-idx3-ubyte.gz to ./data\MNIST\raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to ./data\MNIST\raw\t10k-labels-idx1-ubyte.gz
Extracting ./data\MNIST\raw\t10k-labels-idx1-ubyte.gz to ./data\MNIST\raw



In [181]:
trainset.data.shape

torch.Size([60000, 28, 28])

In [182]:
testset.data.shape

torch.Size([10000, 28, 28])

In [184]:
testset.data[0]

tensor([[  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
        [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
        [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
        [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
        [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
        [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
        [  0,   0,   0,   0,   0,   0,   0,   

In [185]:
def imshow(img):
    """Display one image with matplotlib.

    The pixel values are divided by 256 before plotting, so raw 0-255
    intensities end up in the [0, 1) range.
    """
    scaled = img / 256
    plt.imshow(scaled)
    plt.show()

imshow( trainset.data[0] )

In [4]:
class LinearSoftmaxRegression(nn.Module):
    """Multinomial (softmax) regression: a single linear layer producing class logits.

    The model deliberately returns raw logits, not probabilities. The built-in
    `nn.CrossEntropyLoss` expects these linear values and applies log-softmax
    internally when calculating the loss (Shiwei correctly pointed this out,
    correcting an earlier mistake of mine), so no softmax is applied here.
    If class probabilities are wanted, apply `torch.softmax(logits, dim=1)`
    to the output.

    Args:
        in_features: Number of values per flattened input sample. Defaults to
            28*28*1, i.e. one single-channel 28x28 image.
        num_classes: Number of output logits per sample. Defaults to 10.
    """

    def __init__(self, in_features = 28*28*1, num_classes = 10):
        super().__init__()

        self.layer_1 = nn.Linear( in_features = in_features, out_features = num_classes, bias=True )

    def forward(self, input_tensor):
        """Return logits of shape (N, num_classes) for a batch of N samples.

        Every dimension after the first (batch) dimension is flattened, so the
        input may be N x 28 x 28, N x 1 x 28 x 28, or already N x in_features.
        """
        # torch.flatten avoids constructing a new nn.Flatten module on every call.
        flattened = torch.flatten( input_tensor, start_dim = 1 )

        return self.layer_1( flattened )

In [5]:
def confusion_matrix( model, x, y, num_classes = 10 ):
    """Count (actual class, predicted class) pairs for a classifier.

    Args:
        model: Callable module mapping `x` to logits with one row per sample
            and one column per class.
        x: Input tensor; its first dimension indexes the n samples.
        y: Tensor of true class labels, one per sample (only the first n are used).
        num_classes: Number of classes; the result is num_classes x num_classes.
            Defaults to 10.

    Returns:
        np.ndarray of dtype int32 in which entry [i, j] is the number of samples
        whose true class is i and whose predicted class (argmax of the logits) is j.

    The model's train/eval mode is left unchanged.
    """
    identification_counts = np.zeros( shape = (num_classes, num_classes), dtype = np.int32 )

    # Evaluation only: do not build an autograd graph for the whole data set.
    with torch.no_grad():
        logits = model( x )
    predicted_classes = torch.argmax( logits, dim = 1 )

    n = x.shape[0]

    actual = y[:n].reshape(-1).to( torch.int64 ).cpu().numpy()
    predicted = predicted_classes.reshape(-1).cpu().numpy()

    # np.add.at accumulates correctly when the same (actual, predicted) cell
    # occurs many times, unlike plain fancy-index `+=`.
    np.add.at( identification_counts, (actual, predicted), 1 )

    return identification_counts

In [6]:
# Build plain tensors for training/evaluation from the raw dataset attributes.
# Pixels are converted to float and mapped from integer intensities to roughly
# [-0.5, 0.5) via /256 - 0.5; labels are converted to int64, the dtype used for
# class-index targets by CrossEntropyLoss.
# Explicit dtype conversions replace the legacy torch.Tensor(...) constructor.
test_x = torch.as_tensor( testset.data ).float() / 256.0 - 0.5
test_y = torch.as_tensor( testset.targets ).long()
train_x = torch.as_tensor( trainset.data ).float() / 256.0 - 0.5
train_y = torch.as_tensor( trainset.targets ).long()

In [172]:
test_y

tensor([3, 8, 8,  ..., 5, 1, 7])

In [7]:
# Fresh, untrained linear softmax model.
model = LinearSoftmaxRegression()

# Cross-entropy loss; it is fed the raw logits returned by the model.
loss_function = torch.nn.CrossEntropyLoss()

# Baseline before any training: confusion matrix of the untrained model on the test set.
print("Initial Confusion Matrix")
print( confusion_matrix( model, test_x, test_y ) )

Initial Confusion Matrix
[[  1   3  15 954   0   4   0   1   2   0]
 [  0   3   1 890   0 222   0   0  19   0]
 [  1  33  31 744   3  30   0   1 189   0]
 [  0  18  22 850   0  83   0  14  23   0]
 [  0  79 273 620   0   4   0   0   6   0]
 [  0  37  56 749   0  37   0   6   7   0]
 [  0  22  39 880   0  11   0   0   6   0]
 [  0  13  30 965   0  12   0   0   7   1]
 [  0  83  29 790   0  37   0  11  24   0]
 [  0 185  13 802   0   9   0   0   0   0]]


In [196]:
batch_size = 128

def get_batch(x, y, batch_size):
    """Draw one random mini-batch of matching rows from x and y.

    Args:
        x: Tensor of inputs; the first dimension indexes the samples.
        y: Tensor of labels aligned with x along the first dimension.
        batch_size: Number of distinct samples to draw (sampling is without
            replacement within a batch).

    Returns:
        (x_batch, y_batch): the selected rows of x and y, in the same order.

    Raises:
        ValueError: if batch_size is negative or larger than the number of
            samples in x (raised by random.sample).
    """
    n = x.shape[0]

    # random.sample accepts any sequence, so sample from range(n) directly
    # instead of materialising an n-element list on every call. For a given
    # RNG state this yields the same indices as sampling from the full list.
    batch_indices = random.sample( range(n), k = batch_size )

    x_batch = x[ batch_indices ]
    y_batch = y[ batch_indices ]

    return x_batch, y_batch

In [197]:
# Adam optimiser over the linear model's parameters, learning rate 0.01.
optimizer = optim.Adam(model.parameters(), lr = 0.01 )

In [205]:
# Train the linear model for 10 "epochs".
# Each epoch performs train_x.shape[0] // batch_size optimisation steps, but every
# batch is drawn independently by get_batch, so an epoch is not a strict single
# pass over the training set (samples can repeat across batches or be missed).
for epochs in range(10):
    # Sum of the per-batch mean losses for this epoch (not averaged over batches).
    total_loss = 0
    for batch in range( train_x.shape[0] // batch_size ):
        x_batch, y_batch = get_batch(train_x, train_y, batch_size)

        # Clear gradients left over from the previous step before backprop.
        optimizer.zero_grad()

        logits = model( x_batch )
        loss = loss_function( logits, y_batch )

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print( "Total Loss over Batches:",total_loss )
# Evaluate once on the held-out test set after all epochs have finished.
print("Current Confusion Matrix")
print( confusion_matrix( model, test_x, test_y ) )
print()

Total Loss over Batches: 142.96164847165346
Total Loss over Batches: 146.85888404399157
Total Loss over Batches: 142.08966858685017
Total Loss over Batches: 142.24016642570496
Total Loss over Batches: 143.28396253287792
Total Loss over Batches: 144.7485088557005
Total Loss over Batches: 135.97335290163755
Total Loss over Batches: 137.23502687364817
Total Loss over Batches: 141.3939715474844
Total Loss over Batches: 141.48023819178343
Current Confusion Matrix
[[ 886    0    8    8    0   26   35    9    4    4]
 [   0 1116    2    3    0    1    7    1    5    0]
 [   1    9  916   34    6    5   22   12   24    3]
 [   1    0    8  944    2   19    4   12   12    8]
 [   1    1    7    4  886    0   20   10    3   50]
 [   3    2    5   49    9  752   36    8   17   11]
 [   2    3    5    3    3    6  934    2    0    0]
 [   0    8   17   15    8    1    0  955    0   24]
 [   5   12   11   51    8   34   27   15  788   23]
 [   4    7    2   15   24    6    0   28    3  920]]



In [229]:
class SimpleCNNModel(nn.Module):
    """Minimal CNN for 28x28 single-channel images: Conv2d -> ReLU -> Flatten -> Linear.

    Shapes, for N input images and the default arguments:
        input             N x 28 x 28
        after reshape     N x 1 x 28 x 28    (dim 1 is the channel dimension; for
                                              colour data such as CIFAR it would be
                                              3, one channel each for red/green/blue)
        after convolution N x 5 x 13 x 13    (5 filters evaluated at 13 x 13 positions)
        after flatten     N x 845
        output logits     N x 10

    How the convolution walks over the image: with a 3x3 kernel and stride 2,
    given the input pixels

        a b c d q
        e f g h r
        i j k l s
        m n o p t

    the kernel is first applied to (a,b,c / e,f,g / i,j,k) and then, two columns
    to the right, to (c,d,q / g,h,r / k,l,s), and so on. Striding across 28
    pixels this way gives floor((28 - 3) / 2) + 1 = 13 positions per axis.

    Padding means adding zeros around the image border to adjust the output
    shape; "valid" means no padding at all.

    It is worth experimenting with different numbers and sizes of kernels and
    watching how the printed shapes change (use `verbose=True` in `forward`).
    The size of the linear layer is derived from the arguments, so it stays
    consistent with the convolution output.

    Args:
        num_filters: Number of convolution kernels (output channels). Default 5.
        kernel_size: Side length of the square kernel (int). Default 3.
        stride: Step between kernel positions along each axis (int). Default 2.
        num_classes: Number of output logits per image. Default 10.
    """

    IMAGE_SIZE = 28    # input images are IMAGE_SIZE x IMAGE_SIZE pixels
    IN_CHANNELS = 1    # single (grey-scale) channel

    def __init__(self, num_filters = 5, kernel_size = 3, stride = 2, num_classes = 10):
        super().__init__()

        self.conv_layer_1 = nn.Conv2d(in_channels = self.IN_CHANNELS, out_channels = num_filters, kernel_size = kernel_size, stride = stride, padding = "valid", bias=True)

        # Output side length of an unpadded convolution: floor((size - kernel) / stride) + 1
        conv_output_size = (self.IMAGE_SIZE - kernel_size) // stride + 1

        self.linear_layer = nn.Linear( in_features = conv_output_size * conv_output_size * num_filters, out_features = num_classes, bias=True )

    def forward(self, input_tensor, verbose = False):
        """Return logits of shape (N, num_classes) for N images.

        Args:
            input_tensor: Tensor holding N images of 28 x 28 pixels; it is
                reshaped to N x 1 x 28 x 28 before the convolution.
            verbose: If True, print the tensor shape after the convolution,
                after flattening and after the linear layer.
        """
        output = torch.reshape( input_tensor, (-1, self.IN_CHANNELS, self.IMAGE_SIZE, self.IMAGE_SIZE) )
        output = self.conv_layer_1( output )
        if verbose:
            print( output.shape )
        # Functional forms avoid constructing new nn.ReLU / nn.Flatten modules per call.
        output = torch.relu( output )
        output = torch.flatten( output, start_dim = 1 )
        if verbose:
            print( output.shape )
        output = self.linear_layer( output )
        if verbose:
            print( output.shape )
        return output

In [244]:
# Fresh, untrained instance of the convolutional model.
cnn_model = SimpleCNNModel()

In [245]:
print( cnn_model )

SimpleCNNModel(
  (conv_layer_1): Conv2d(1, 5, kernel_size=(3, 3), stride=(2, 2), padding=valid)
  (linear_layer): Linear(in_features=845, out_features=10, bias=True)
)


In [246]:
cnn_model( test_x, verbose = True )

torch.Size([10000, 5, 13, 13])
torch.Size([10000, 845])
torch.Size([10000, 10])


tensor([[-0.0294, -0.1354, -0.0121,  ..., -0.0560,  0.1276,  0.0249],
        [ 0.0557, -0.1151,  0.0037,  ..., -0.1101,  0.1200,  0.0382],
        [-0.0017, -0.2336, -0.0023,  ..., -0.0286,  0.1588,  0.0508],
        ...,
        [ 0.0440, -0.1301, -0.0213,  ..., -0.0756,  0.1521,  0.0294],
        [-0.0364, -0.1387, -0.0494,  ..., -0.0485,  0.2238,  0.0363],
        [-0.0875, -0.0741, -0.0480,  ..., -0.1541,  0.1741,  0.0697]],
       grad_fn=<AddmmBackward0>)

In [247]:
# Separate Adam optimiser for the CNN's parameters, same learning rate (0.01) as the linear model.
cnn_optimizer = optim.Adam(cnn_model.parameters(), lr = 0.01 )

In [248]:
# Train the CNN for 5 "epochs", using the same procedure as for the linear model.
# Each epoch performs train_x.shape[0] // batch_size optimisation steps on batches
# drawn independently by get_batch, so it is not a strict single pass over the data.
for epochs in range(5):
    # Sum of the per-batch mean losses for this epoch (not averaged over batches).
    total_loss = 0
    for batch in range( train_x.shape[0] // batch_size ):
        x_batch, y_batch = get_batch(train_x, train_y, batch_size)

        # Clear gradients left over from the previous step before backprop.
        cnn_optimizer.zero_grad()

        logits = cnn_model( x_batch )
        loss = loss_function( logits, y_batch )

        loss.backward()
        cnn_optimizer.step()

        total_loss += loss.item()

    print( "Total Loss over Batches:",total_loss )
# Evaluate once on the held-out test set after all epochs have finished.
print("Current Confusion Matrix")
print( confusion_matrix( cnn_model, test_x, test_y ) )
print()

Total Loss over Batches: 119.1621857099235
Total Loss over Batches: 48.404143596068025
Total Loss over Batches: 37.85023327637464
Total Loss over Batches: 32.679613939486444
Total Loss over Batches: 31.642268613446504
Current Confusion Matrix
[[ 970    1    1    0    0    1    3    0    2    2]
 [   0 1130    2    0    0    1    0    0    2    0]
 [   4   11  988    3    1    0    2    8   10    5]
 [   3    1    6  987    0    3    0    4    6    0]
 [   2    2    1    0  951    0    5    2    0   19]
 [   2    0    1   10    0  866    4    1    5    3]
 [   9    4    1    0    1    3  934    0    6    0]
 [   2    8   18    3    0    2    0  967    6   22]
 [   8    2    3    3    0    6    1    2  944    5]
 [   4    5    0    4    9    9    1    6    4  967]]



And of course the payoff here is noticeably higher accuracy and lower loss, even with fewer training epochs.