In [1]:
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms

In [16]:
# Check Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Define Hyper-parameters 
input_size = 784
hidden_size = 100
num_classes = 10
num_epochs = 2
batch_size = 100
learning_rate = 0.006

In [17]:
# MNIST dataset
train_dataset = torchvision.datasets.MNIST(root='../../data', 
                                           train=True, 
                                           transform=transforms.ToTensor(),  
                                           download=True)

test_dataset = torchvision.datasets.MNIST(root='../../data', 
                                          train=False, 
                                          transform=transforms.ToTensor())

# Data loader
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, 
                                           batch_size=batch_size, 
                                           shuffle=True)

test_loader = torch.utils.data.DataLoader(dataset=test_dataset, 
                                          batch_size=batch_size, 
                                          shuffle=False)


In [18]:
#Fully connected neural network
#using nn.Batchnorm directly provided by pytorch
class NeuralNet(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(NeuralNet, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.bn1 = nn.BatchNorm1d(hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.bn2  = nn.BatchNorm1d(hidden_size)
        self.fc3 =  nn.Linear(hidden_size, num_classes)

    
    def forward(self, x):
        out = self.fc1(x)
        out = self.bn1(out)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.bn2(out)
        out = self.relu(out)
        out = self.fc3(out)
        return out

model = NeuralNet(input_size, hidden_size, num_classes).to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)  


In [19]:
# Train the model
total_step = len(train_loader)
for epoch in range(num_epochs):
    for i, (images, labels) in enumerate(train_loader):  
        # Move tensors to the configured device
        images = images.reshape(-1, 28*28).to(device)
        labels = labels.to(device)
        
        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, labels)
        
        # Backprpagation and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if (i+1) % 100 == 0:
            print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}' 
                   .format(epoch+1, num_epochs, i+1, total_step, loss.item()))

            with torch.no_grad():
                correct = 0
                total = 0
                for images, labels in test_loader:
                    images = images.reshape(-1, 28*28).to(device)
                    labels = labels.to(device)
                    outputs = model(images)
                    _, predicted = torch.max(outputs.data, 1)
                    total += labels.size(0)
                    correct += (predicted == labels).sum().item()

                print('Accuracy of the network on the 10000 test images: {} %'.format(100 * correct / total))

            with torch.no_grad():
                correct = 0
                total = 0
                for images, labels in train_loader:
                    images = images.reshape(-1, 28*28).to(device)
                    labels = labels.to(device)
                    outputs = model(images)
                    _, predicted = torch.max(outputs.data, 1)
                    total += labels.size(0)
                    correct += (predicted == labels).sum().item()

                print('Accuracy of the network on the 60000 train images: {} %'.format(100 * correct / total))


Epoch [1/2], Step [100/600], Loss: 0.2421
Accuracy of the network on the 10000 test images: 93.53 %
Accuracy of the network on the 60000 train images: 93.45833333333333 %
Epoch [1/2], Step [200/600], Loss: 0.2424
Accuracy of the network on the 10000 test images: 94.36 %
Accuracy of the network on the 60000 train images: 94.68666666666667 %
Epoch [1/2], Step [300/600], Loss: 0.1210
Accuracy of the network on the 10000 test images: 95.07 %
Accuracy of the network on the 60000 train images: 95.59 %
Epoch [1/2], Step [400/600], Loss: 0.2766
Accuracy of the network on the 10000 test images: 95.85 %
Accuracy of the network on the 60000 train images: 96.22666666666667 %
Epoch [1/2], Step [500/600], Loss: 0.0599
Accuracy of the network on the 10000 test images: 96.31 %
Accuracy of the network on the 60000 train images: 97.02 %
Epoch [1/2], Step [600/600], Loss: 0.1392
Accuracy of the network on the 10000 test images: 96.09 %
Accuracy of the network on the 60000 train images: 96.89 %
Epoch [2/2

In [9]:
#now doing the same but using custom Batch norm class
class CustomBatchNorm(nn.Module):

    def __init__(self, in_size, momentum=0.9, eps = 1e-5):
        super(CustomBatchNorm, self).__init__()
        
        self.momentum = momentum
        self.insize = in_size
        self.eps = eps
        
        self.gamma = nn.Parameter(torch.FloatTensor(self.insize).uniform_())
        self.beta = nn.Parameter(torch.zeros(self.insize))
            
        self.register_buffer('running_mean', torch.zeros(self.insize))
        self.register_buffer('running_var', torch.ones(self.insize))
        
        self.running_mean.zero_()
        self.running_var.fill_(1)

    def forward(self, input):
        
        X = input

        if len(X.shape) not in (2, 4):
            raise ValueError("only support dense or 2dconv")
        
        # dense layer
        elif len(X.shape) == 2:
            if self.training:
                mean = torch.mean(X, axis=0)
                variance = torch.mean((X-mean)**2, axis=0)
                
                self.running_mean = (self.momentum * self.running_mean) + (1.0-self.momentum) * mean
                self.running_var = (self.momentum * self.running_var) + (1.0-self.momentum) * (input.shape[0]/(input.shape[0]-1)*variance)
            
            else:
                mean = self.running_mean
                variance = self.running_var
                
            X_hat = (X-mean) * 1.0 /torch.sqrt(variance + self.eps)
            out = self.gamma * X_hat + self.beta
  
				# convolutional layer
        elif len(X.shape) == 4:
            if self.training:
                N, C, H, W = X.shape
                mean = torch.mean(X, axis = (0, 2, 3))
                variance = torch.mean((X - mean.reshape((1, C, 1, 1))) ** 2, axis=(0, 2, 3))
                
                self.running_mean = (self.momentum * self.running_mean) + (1.0-self.momentum) * mean
                self.running_var = (self.momentum * self.running_var) + (1.0-self.momentum) * (input.shape[0]/(input.shape[0]-1)*variance)
            else:
                mean = self.running_mean
                var = self.running_var
                
            X_hat = (X - mean.reshape((1, C, 1, 1))) * 1.0 / torch.sqrt(variance.reshape((1, C, 1, 1)) + self.eps)
            out = self.gamma.reshape((1, C, 1, 1)) * X_hat + self.beta.reshape((1, C, 1, 1))
        
        return out



In [26]:
class BnNet(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(BnNet, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.bn1 = CustomBatchNorm(hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.bn2  = CustomBatchNorm(hidden_size)
        self.fc3 =  nn.Linear(hidden_size, num_classes)

    
    def forward(self, x):
        out = self.fc1(x)
        out = self.bn1(out)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.bn2(out)
        out = self.relu(out)
        out = self.fc3(out)
        return out

model2 = BnNet(input_size, hidden_size, num_classes).to(device)

# Loss and optimizer
criterion2 = nn.CrossEntropyLoss()
optimizer2 = torch.optim.Adam(model2.parameters(), lr=learning_rate)  


In [27]:
# Train the model
total_step = len(train_loader)
for epoch in range(num_epochs):
    for i, (images, labels) in enumerate(train_loader):  
        # Move tensors to the configured device
        images = images.reshape(-1, 28*28).to(device)
        labels = labels.to(device)
        
        # Forward pass
        outputs = model2(images)
        loss = criterion2(outputs, labels)
        
        # Backprpagation and optimization
        optimizer2.zero_grad()
        loss.backward()
        optimizer2.step()
        
        if (i+1) % 100 == 0:
            print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}' 
                   .format(epoch+1, num_epochs, i+1, total_step, loss.item()))

            with torch.no_grad():
                correct = 0
                total = 0
                for images, labels in test_loader:
                    images = images.reshape(-1, 28*28).to(device)
                    labels = labels.to(device)
                    outputs = model2(images)
                    _, predicted = torch.max(outputs.data, 1)
                    total += labels.size(0)
                    correct += (predicted == labels).sum().item()

                print('Accuracy of the network on the 10000 test images: {} %'.format(100 * correct / total))

            with torch.no_grad():
                correct = 0
                total = 0
                for images, labels in train_loader:
                    images = images.reshape(-1, 28*28).to(device)
                    labels = labels.to(device)
                    outputs = model2(images)
                    _, predicted = torch.max(outputs.data, 1)
                    total += labels.size(0)
                    correct += (predicted == labels).sum().item()

                print('Accuracy of the network on the 60000 train images: {} %'.format(100 * correct / total))


Epoch [1/2], Step [100/600], Loss: 0.2948
Accuracy of the network on the 10000 test images: 92.73 %
Accuracy of the network on the 60000 train images: 92.195 %
Epoch [1/2], Step [200/600], Loss: 0.1432
Accuracy of the network on the 10000 test images: 94.27 %
Accuracy of the network on the 60000 train images: 94.34166666666667 %
Epoch [1/2], Step [300/600], Loss: 0.2694
Accuracy of the network on the 10000 test images: 95.37 %
Accuracy of the network on the 60000 train images: 95.76 %
Epoch [1/2], Step [400/600], Loss: 0.1553
Accuracy of the network on the 10000 test images: 95.75 %
Accuracy of the network on the 60000 train images: 96.35 %
Epoch [1/2], Step [500/600], Loss: 0.1210
Accuracy of the network on the 10000 test images: 96.29 %
Accuracy of the network on the 60000 train images: 96.92333333333333 %
Epoch [1/2], Step [600/600], Loss: 0.1328
Accuracy of the network on the 10000 test images: 96.31 %
Accuracy of the network on the 60000 train images: 97.10333333333334 %
Epoch [2/