This code is adapted from the PyTorch examples repository.
It is licensed under BSD 3-Clause "New" or "Revised" License.
Source: https://github.com/pytorch/examples/
LICENSE: https://github.com/pytorch/examples/blob/master/LICENSE

![](https://github.com/rpi-techfundamentals/fall2018-materials/blob/master/10-deep-learning/mnist-comparison.png?raw=1)
Table from [Wikipedia](https://en.wikipedia.org/wiki/MNIST_database)

In [None]:
!pip install torch torchvision



In [None]:
#Import Libraries


from __future__ import print_function
import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.autograd import Variable
import numpy as np


In [None]:
# Run configuration: every tunable value used by the cells below lives in `args`.
args = {}
kwargs = {}  # extra keyword arguments forwarded to both DataLoaders
args['batch_size'] = 1000
args['test_batch_size'] = 1000
args['epochs'] = 10  # Number of epochs = number of full passes over the training set.
args['lr'] = 0.01  # Learning rate: how large each descent step is.
# Momentum is a moving average of the gradients (helps to keep direction).
# Kept from the original SGD example; the Adam optimizer used below does not read it.
args['momentum'] = 0.5

args['seed'] = 1  # random seed
args['log_interval'] = 10  # print the training loss every N batches

# Apply the seed so weight initialisation and shuffling are repeatable.
torch.manual_seed(args['seed'])

# Only use the GPU when one is actually available; a hard-coded True makes
# every .cuda() call fail on CPU-only machines.
args['cuda'] = torch.cuda.is_available()


In [None]:
#load the data
# Both splits share one preprocessing pipeline: convert the image to a tensor,
# then standardise it (0.1307 / 0.3081 are presumably the MNIST mean / std).
mnist_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,)),
])

# Training batches are reshuffled every epoch.
train_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../data', train=True, download=True,
                   transform=mnist_transform),
    batch_size=args['batch_size'], shuffle=True, **kwargs)

# Evaluation sums over the whole split, so the order is irrelevant:
# no shuffling keeps the pass deterministic and avoids needless work.
test_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../data', train=False, transform=mnist_transform),
    batch_size=args['test_batch_size'], shuffle=False, **kwargs)


Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to ../data/MNIST/raw/train-images-idx3-ubyte.gz


  0%|          | 0/9912422 [00:00<?, ?it/s]

Extracting ../data/MNIST/raw/train-images-idx3-ubyte.gz to ../data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to ../data/MNIST/raw/train-labels-idx1-ubyte.gz


  0%|          | 0/28881 [00:00<?, ?it/s]

Extracting ../data/MNIST/raw/train-labels-idx1-ubyte.gz to ../data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to ../data/MNIST/raw/t10k-images-idx3-ubyte.gz


  0%|          | 0/1648877 [00:00<?, ?it/s]

Extracting ../data/MNIST/raw/t10k-images-idx3-ubyte.gz to ../data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to ../data/MNIST/raw/t10k-labels-idx1-ubyte.gz


  0%|          | 0/4542 [00:00<?, ?it/s]

Extracting ../data/MNIST/raw/t10k-labels-idx1-ubyte.gz to ../data/MNIST/raw



In [None]:
# Baseline classifier: 784 -> 100 -> 100 -> 10 with ReLU activations.
# The layer positions are part of the contract: later cells read the Linear
# layers at indices 1, 3 and 5 and strip the final activation with model[:-1].
model = nn.Sequential(
    nn.Flatten(),
    nn.Linear(784, 100), nn.ReLU(),
    nn.Linear(100, 100), nn.ReLU(),
    nn.Linear(100, 10),
    # Explicit class dimension: same result as before for (batch, 10) input,
    # but without the "Implicit dimension choice for softmax" warning.
    nn.Softmax(dim=1),
)

# Wider variant that ends in a Sigmoid instead of a Softmax.
# NOTE(review): train()/test() pass these outputs to F.cross_entropy, which
# applies log-softmax itself and expects raw scores - confirm that training on
# already-squashed outputs is intended.
model_sigm = nn.Sequential(
    nn.Flatten(),  # torch.nn.BatchNorm1d(784),
    nn.Linear(784, 300), nn.ReLU(),  # torch.nn.BatchNorm1d(300),
    nn.Linear(300, 100), nn.ReLU(),  # torch.nn.BatchNorm1d(100),
    nn.Linear(100, 10),
    nn.Sigmoid(),
)

In [None]:

def train(epoch, model):
    """Run one training epoch of `model` over the notebook-level `train_loader`.

    Reads the notebook globals `train_loader`, `optimizer` and `args`
    (`args['cuda']` selects the device, `args['log_interval']` the print rate).

    Args:
        epoch: epoch number, used only in the progress message.
        model: the network to optimise; `optimizer` must have been built
            from this model's parameters.
    """
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        if args['cuda']:
            data, target = data.cuda(), target.cuda()
        # Tensors take part in autograd directly; no Variable wrapper is needed.
        # Zero out the gradients left over from the previous batch.
        optimizer.zero_grad()
        output = model(data)
        # Cross-entropy loss between the model output and the class labels.
        loss = F.cross_entropy(output, target)
        # Back-propagate: compute d(loss)/d(parameter) for every parameter.
        loss.backward()
        # One-step update of the parameters.
        optimizer.step()
        # Print out the loss periodically.
        if batch_idx % args['log_interval'] == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))

def test(model):
    """Evaluate `model` on the notebook-level `test_loader` and print a summary.

    Reads the notebook globals `test_loader` and `args['cuda']`. Prints the
    average cross-entropy loss per sample and the number / percentage of
    correctly classified samples. Returns nothing.

    Args:
        model: the network to evaluate; it is switched to eval mode.
    """
    model.eval()
    test_loss = 0
    correct = 0
    # Gradients are not needed for evaluation. (The old `volatile=True` flag
    # no longer has any effect, so autograd has to be disabled explicitly.)
    with torch.no_grad():
        for data, target in test_loader:
            if args['cuda']:
                data, target = data.cuda(), target.cuda()
            output = model(data)
            # Sum up the batch loss; the division by the dataset size happens below.
            test_loss += F.cross_entropy(output, target, reduction='sum').item()
            # Index of the highest score = predicted class.
            pred = output.max(1, keepdim=True)[1]
            # .item() keeps `correct` a plain Python int.
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))




In [None]:
# model = Net()


# Choose which network to train. NOTE: this rebinds `model`, so every later
# cell works on whichever network `model` refers to after this cell has run.
run_sigm = True
if run_sigm:
  model = model_sigm
  args['epochs'] = 4

# Move the weights to the target device first ...
if args['cuda']:
    model.cuda()

# ... then build the optimizer exactly once, for the network that is actually
# trained, taking the learning rate from the shared configuration.
optimizer = optim.Adam(model.parameters(), lr=args['lr'])

for epoch in range(1, args['epochs'] + 1):
    train(epoch, model)
    test(model)


  input = module(input)







Test set: Average loss: 1.6146, Accuracy: 8466/10000 (85%)


Test set: Average loss: 1.5997, Accuracy: 8608/10000 (86%)


Test set: Average loss: 1.5508, Accuracy: 9121/10000 (91%)


Test set: Average loss: 1.5090, Accuracy: 9522/10000 (95%)


Test set: Average loss: 1.5040, Accuracy: 9572/10000 (96%)


Test set: Average loss: 1.4980, Accuracy: 9628/10000 (96%)


Test set: Average loss: 1.5029, Accuracy: 9585/10000 (96%)


Test set: Average loss: 1.4987, Accuracy: 9624/10000 (96%)


Test set: Average loss: 1.5048, Accuracy: 9563/10000 (96%)


Test set: Average loss: 1.4999, Accuracy: 9610/10000 (96%)



In [None]:
# Evaluate the current `model` once more on the test loader.
test(model)

  input = module(input)



Test set: Average loss: 1.4999, Accuracy: 9610/10000 (96%)



In [None]:
# torch.onnx.export(model, 
#                   torch.randn(1, 28, 28),
#                   "model.onnx",
#                   verbose=False,
#                   input_names=["my_input"],
#                   output_names=["my_output"],
#                   export_params=True,
#                   )

In [None]:
# Export the trained classifier to ONNX, traced with a random (1, 28, 28) input.
# model.cpu() moves the module itself to the CPU, so `model` stays on the CPU
# for all following cells.
# NOTE(review): the file name is always "model_sigm.onnx", whichever network
# `model` currently refers to - confirm.
torch.onnx.export(model.cpu(), 
                  torch.randn(1, 28, 28),
                  "model_sigm.onnx",
                  verbose=False,
                  input_names=["my_input"],
                  output_names=["my_output"],
                  export_params=True,
                  )

  return forward_call(*input, **kwargs)


# Create Sigmoid network

In [None]:
class l1Net(nn.Module):
  """Hand-built network that evaluates an L1 distance and a classifier in one pass.

  The input is the concatenation ``[x, y]`` of two flat vectors with
  ``neurons`` entries each. The output has ``1 + n_classes`` entries:

  * index 0: ``relu(||x - y||_1 - bound)``
  * indices 1..: the output of ``standard_net``'s last Linear layer for ``y``
    (i.e. the scores before any final activation).

  Args:
    standard_net: indexable module whose entries 1, 3 and 5 are ``nn.Linear``
      layers; the construction assumes a ReLU follows the first two of them.
    bound: value subtracted from the L1 distance before the ReLU.
    neurons: number of entries in each half of the input.
  """

  def __init__(self, standard_net, bound=0, neurons=28*28):
    super().__init__()
    self.model = standard_net
    self.neurons = neurons
    self.bound = bound
    hidden1 = self.model[1].bias.shape[0]
    hidden2 = self.model[3].bias.shape[0]
    # Taken from the classifier instead of assuming ten classes.
    n_classes = self.model[5].bias.shape[0]
    d1 = neurons*2 + hidden1
    d2 = 1 + hidden2
    self.fc1 = nn.Linear(neurons*2, d1)
    self.fc2 = nn.Linear(d1, d2)
    self.fc3 = nn.Linear(d2, 1 + n_classes)  # 1 for l1 norm + class scores
    self.make_net()

  def make_net(self):
    """Fill fc1..fc3 with the constructed weights and drop ``self.model``.

    Called once from ``__init__``; it reads ``self.model`` and removes that
    attribute at the end, so it cannot be run a second time.
    """
    neurons = self.neurons
    lin1, lin2, lin3 = self.model[1], self.model[3], self.model[5]

    # Plain value copies: the source weights must not be tracked by autograd.
    with torch.no_grad():
      # fc1: rows [0, n) compute x - y, rows [n, 2n) compute y - x (after the
      # ReLU their sum is |x - y|); the remaining rows apply the classifier's
      # first layer to y (the second half of the input).
      fc1w = torch.zeros(self.fc1.weight.shape)
      i = torch.arange(neurons)
      fc1w[i, i] = 1
      fc1w[i, neurons+i] = -1
      fc1w[neurons+i, i] = -1
      fc1w[neurons+i, neurons+i] = 1
      fc1w[2*neurons:, neurons:] = lin1.weight
      fc1b = torch.zeros(self.fc1.bias.shape)
      fc1b[neurons*2:] = lin1.bias

      # fc2: row 0 sums the 2n difference units and subtracts the bound;
      # the other rows are the classifier's second layer.
      fc2w = torch.zeros(self.fc2.weight.shape)
      fc2w[1:, neurons*2:] = lin2.weight
      fc2w[0, :neurons*2] = 1.0
      fc2b = torch.zeros(self.fc2.bias.shape)
      fc2b[1:] = lin2.bias
      fc2b[0] = -self.bound

      # fc3: pass the distance unit through unchanged and apply the
      # classifier's last layer to the rest.
      fc3w = torch.zeros(self.fc3.weight.shape)
      fc3w[0, 0] = 1.0
      fc3w[1:, 1:] = lin3.weight
      fc3b = torch.zeros(self.fc3.bias.shape)
      fc3b[1:] = lin3.bias

    self.fc1.weight = torch.nn.Parameter(fc1w)
    self.fc1.bias = torch.nn.Parameter(fc1b)
    self.fc2.weight = torch.nn.Parameter(fc2w)
    self.fc2.bias = torch.nn.Parameter(fc2b)
    self.fc3.weight = torch.nn.Parameter(fc3w)
    self.fc3.bias = torch.nn.Parameter(fc3b)

    print("fc1: ", self.fc1.weight.shape, self.fc1.bias.shape, " vs ", self.fc1)
    print("fc2: ", self.fc2.weight.shape, self.fc2.bias.shape, " vs ", self.fc2)
    print("fc3: ", self.fc3.weight.shape, self.fc3.bias.shape, " vs ", self.fc3)

    # The source network is no longer needed; removing it keeps it out of
    # this module's sub-modules.
    if hasattr(self, "model"):
        delattr(self, "model")

  def forward(self, x):
    """Apply fc1 -> ReLU -> fc2 -> ReLU -> fc3 to the concatenated input."""
    x = self.fc1(x)
    x = F.relu(x)
    x = self.fc2(x)
    x = F.relu(x)
    x = self.fc3(x)
    return x


In [None]:
# Widths of the two hidden Linear layers (length of each bias vector).
model[1].bias.shape[0], model[3].bias.shape[0]

(100, 100)

In [None]:
# Scratch check of paired index arrays: picks the elements (k, k+5) for
# k = 0..4 of a 10x10 identity matrix, which all lie off the diagonal.
import numpy as np
a = np.eye((10))
a[np.arange(5), np.arange(5)+5]

array([0., 0., 0., 0., 0.])

In [None]:
# Scratch check of block assignment: rows 5.. and columns 0..2 are set to 2
# (this modifies `a` in place).
a[5:, :3] = 2
a

array([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [2., 2., 2., 0., 0., 1., 0., 0., 0., 0.],
       [2., 2., 2., 0., 0., 0., 1., 0., 0., 0.],
       [2., 2., 2., 0., 0., 0., 0., 1., 0., 0.],
       [2., 2., 2., 0., 0., 0., 0., 0., 1., 0.],
       [2., 2., 2., 0., 0., 0., 0., 0., 0., 1.]])

In [None]:
# Build the combined network from the current `model`, using l1Net's default
# bound (0) and input size (28*28 entries per half).
l1model=l1Net(model)

fc1:  torch.Size([1668, 1568]) torch.Size([1668])  vs  Linear(in_features=1568, out_features=1668, bias=True)
fc2:  torch.Size([101, 1668]) torch.Size([101])  vs  Linear(in_features=1668, out_features=101, bias=True)
fc3:  torch.Size([11, 101]) torch.Size([11])  vs  Linear(in_features=101, out_features=11, bias=True)


In [None]:
# Show the layer structure of the combined network.
l1model

l1Net(
  (fc1): Linear(in_features=1568, out_features=1668, bias=True)
  (fc2): Linear(in_features=1668, out_features=101, bias=True)
  (fc3): Linear(in_features=101, out_features=11, bias=True)
)

In [None]:
# Export the combined network to ONNX, traced with a random input of
# 2 * 28 * 28 entries (two flattened images side by side).
torch.onnx.export(l1model, 
                  torch.randn(1, 28*28*2),
                  "l1model_combined.onnx",
                  verbose=False,
                  input_names=["my_input"],
                  output_names=["my_output"],
                  export_params=True,
                  )

In [None]:
# Training split prepared for l1Net: each image is normalised, flattened to
# 784 values and concatenated with itself, giving a 1568-entry input.
dset1 = datasets.MNIST('../data', train=True, download=True,
                   transform=transforms.Compose([
                       transforms.ToTensor(),
                       transforms.Normalize((0.1307,), (0.3081,)),
                       transforms.Lambda(lambda x: torch.flatten(x)),
                       transforms.Lambda(lambda x: torch.concat([x, x]))
                   ]))
# Shape and label of the first sample (the original read the undefined name `dset`).
dset1[0][0].shape, dset1[0][1]

(torch.Size([1568]), 5)

In [None]:
# Same training split without the flatten / duplicate steps: samples keep
# their (1, 28, 28) image shape, as expected by the plain classifier.
dset2 = datasets.MNIST('../data', train=True, download=True,
                   transform=transforms.Compose([
                       transforms.ToTensor(),
                       transforms.Normalize((0.1307,), (0.3081,)),
                      #  transforms.Lambda(lambda x: torch.flatten(x)),
                      #  transforms.Lambda(lambda x: torch.concat([x, x]))
                   ]))
dset2[0][0].shape, dset2[0][1]

(torch.Size([1, 28, 28]), 5)

In [None]:
# Combined network on the first sample: both halves of the input are the same
# image, so the first output (the distance unit) is expected to be 0.
(l1model(dset1[0][0]))

tensor([  0.0000,  -5.0693, -38.3540, -38.9342,  22.8229, -51.9634,  78.2324,
        -35.9210, -16.7899, -50.0841, -26.5146], grad_fn=<AddBackward0>)

In [None]:
# Drop the final activation layer so model1 returns the output of the last
# Linear layer, for comparison with outputs 1.. of the combined network.
model1 = model[:-1]
model1(dset2[0][0])


tensor([[ -5.0693, -38.3540, -38.9342,  22.8228, -51.9634,  78.2324, -35.9210,
         -16.7899, -50.0841, -26.5146]], grad_fn=<AddmmBackward0>)

In [None]:
# Label of the first training sample.
dset1[0][1]

5

In [None]:
# The first training image as a flat vector (a) and as a 2-D grid (b),
# used below to check the element order produced by flatten().
a = dset2[0][0][0].flatten()
b = dset2[0][0][0]

In [None]:
# Check that flatten() is row-major: flat position 28*i + j must hold the
# pixel at row i, column j. Every mismatch is reported; no output means the
# two layouts agree.
for i in range(28):
  for j in range(28):
    if a[28 * i + j] != b[i][j]:
      print("ouch @", i, j)

In [None]:
# Spot check: flat index 1 next to grid positions (0, 1) and (1, 0).
a[1], b[0][1], b[1][0]

In [None]:
# Run the classifier (without its final activation) on an image read from
# the local file "1.png".
# NOTE(review): unlike the MNIST samples, this image is not passed through
# the Normalize transform used for training - confirm that this is intended.
import matplotlib.pyplot as plt
img = plt.imread("1.png")
# Add a leading dimension so the tensor has the same (1, 28, 28) shape as a dataset sample.
t = torch.from_numpy(img).unsqueeze(0)
print(t.shape, dset2[0][0].shape)
model1(t)

torch.Size([1, 28, 28]) torch.Size([1, 28, 28])


tensor([[-12.4669,  -1.8514,  30.9831,   8.0560, -26.8024, -18.6206, -19.4712,
         -13.1445,  -8.1659, -33.9650]], grad_fn=<AddmmBackward0>)

In [None]:
# Device that currently holds the first Linear layer's weights.
model1[1].weight.device

device(type='cpu')