# PyTorch - Fully-Connected Neural Network (MNIST)

In [1]:
import os
import sys

import numpy as np
import torch, torchvision
from torch import nn
from torchvision import transforms

## Fully-Connected Neural Network

Let's load the MNIST dataset.  Our architecture is simple:

1. Input layer receiving 784 features
2. Hidden layer with size of 89 neurons
3. Output layer with size of 10 neurons

We will be using ReLU activation after the hidden layer.

In [2]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

cpu


In [3]:
# Hyper-parameters
# -- network shape
input_size = 28 * 28   # one flattened 28x28 image -> 784 features
hidden_size = 89       # neurons in the single hidden layer
num_classes = 10       # one output score per digit class
# -- optimisation settings
num_epochs = 1
batch_size = 100
learning_rate = 1e-3

## 1. Defining dataset

In [4]:
# MNIST dataset
# ToTensor converts the 0-255 pixel values to floats in the 0-1 range
to_tensor = transforms.ToTensor()

train_dataset = torchvision.datasets.MNIST(
    root='data', train=True, transform=to_tensor, download=True)
test_dataset = torchvision.datasets.MNIST(
    root='data', train=False, transform=to_tensor)

# Data loaders: the training batches are reshuffled, the test batches keep their order
train_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(
    test_dataset, batch_size=batch_size, shuffle=False)

# Pull a single training batch to sanity-check shapes, value range and labels
train_batches = iter(train_loader)
x_sample, y_sample = next(train_batches)
print("X: ", x_sample.shape)
print("X min: ", x_sample.min())
print("X max: ", x_sample.max())
print("y: ", y_sample.shape)
print("y unique: ", y_sample.unique())

X:  torch.Size([100, 1, 28, 28])
X min:  tensor(0.)
X max:  tensor(1.)
y:  torch.Size([100])
y unique:  tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])


## 2. Defining the model

In [5]:
# Fully connected neural network with one hidden layer
class NeuralNet(nn.Module):
    """Two-layer perceptron: Linear -> ReLU -> Linear.

    The forward pass returns the raw output of the last linear layer;
    no softmax is applied inside the model.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        """Create the two linear layers and the activation between them.

        Args:
            input_size: number of features in each input vector.
            hidden_size: number of units in the hidden layer.
            num_classes: number of output scores.
        """
        super().__init__()
        # input -> hidden projection
        self.fc1 = nn.Linear(in_features=input_size, out_features=hidden_size)
        # non-linearity: ReLU(x) = max(x, 0)
        # alternatives worth studying: LeakyReLU, max(x, a * x), and Swish, x * sigmoid(x)
        self.relu = nn.ReLU()
        # hidden -> output-score projection
        self.fc2 = nn.Linear(in_features=hidden_size, out_features=num_classes)

    def forward(self, x):
        """Return the output scores for ``x`` (last dimension must be ``input_size``)."""
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)

Let's now instantiate the model from the class.  Every <code>nn.Module</code> provides <code>.to(device)</code>, which moves the module's parameters to the chosen device so that it can run on the GPU when one is available.

In [6]:
# Build the network from the hyper-parameters, then move it to the selected device
model = NeuralNet(
    input_size=input_size,
    hidden_size=hidden_size,
    num_classes=num_classes,
)
model = model.to(device)

Let's define the Loss and optimizer.

Here we will be using Adam, an optimizer with adaptive learning rates.  Compared with SGD, Adam is more adaptive in how it uses momentum and the learning rate.  Namely, Adam uses the **squared gradients to scale the learning rate**, and it takes advantage of momentum by using a **moving average of the gradient** instead of the gradient itself, similar to SGD with momentum.

Whether to use Adam or SGD is still very debatable. Adam was proposed in 2015 to great success, but many more recent papers have found that SGD can generalize better than Adam, so there is no clear winner.  It's best to try both.

In [7]:
# Loss and optimizer

# CrossEntropyLoss combines softmax and cross-entropy in a single step,
# so the model's output layer must not apply softmax itself.
criterion = nn.CrossEntropyLoss()
# Adam updates every parameter of the model, using the configured learning rate
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

## 3. Training

Let's train the model

In [8]:
# Train the model
# Make the training mode explicit; it does not change this architecture's behaviour,
# but keeps the loop correct if mode-dependent layers (e.g. dropout) are added later.
model.train()
total_step = len(train_loader)  # number of batches per epoch, for printing purposes
for epoch in range(num_epochs):
    for i, (images, labels) in enumerate(train_loader):

        # images shape is [batch_size, channel, height, width]

        # Flatten every image to one feature vector per sample and move the batch
        # to the configured device. Keeping the batch dimension fixed (instead of
        # hard-coding 28*28) means a size mismatch with the model's input layer
        # raises a shape error rather than silently changing the batch size.
        images = images.reshape(images.size(0), -1).to(device)
        labels = labels.to(device)

        # Forward pass
        outputs = model(images)
        # outputs has shape [batch, num_classes] while labels has shape [batch]
        loss = criterion(outputs, labels)

        # Backward and optimize: clear old gradients, backpropagate, update weights
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Report progress every 100 batches, overwriting the same output line
        if (i+1) % 100 == 0:
            sys.stdout.write('\rEpoch [{}/{}], Step [{}/{}], Loss: {:.4f}' 
                   .format(epoch+1, num_epochs, i+1, total_step, loss.item()))

Epoch [1/1], Step [600/600], Loss: 0.2947

## 4. Testing

Let's test the model

In [9]:
# Test the model
# Switch to evaluation mode so that mode-dependent layers (if any are added)
# behave deterministically during testing.
model.eval()

correct = 0
total = 0
# In test phase, we don't need to compute gradients (for memory efficiency)
with torch.no_grad():
    for images, labels in test_loader:
        # Flatten each image to one feature vector per sample, as in training
        images = images.reshape(images.size(0), -1).to(device)
        labels = labels.to(device)
        outputs = model(images)
        # The predicted class is the index of the largest output score
        predicted = outputs.argmax(dim=1)
        total += labels.size(0)  # keep track of how many samples were seen
        correct += (predicted == labels).sum().item()  # .item() gives the raw number

print('Accuracy of the network on the 10000 test images: {} %'.format(100 * correct / total))

# Save the model checkpoint
# torch.save does not create missing directories, so make sure the target exists first
os.makedirs('models', exist_ok=True)
torch.save(model.state_dict(), 'models/dense-mnist.ckpt')

Accuracy of the network on the 10000 test images: 93.23 %
