# Deep_CNN_MNIST
The code below builds a CNN model to classify the MNIST dataset. It demonstrates how to implement and train a deep CNN model.

In this notebook, I use cross-validation to check the performance on both the training set and the validation set. To make cross-validation easy, I use K-fold cross-validation, which is supported by sklearn.


In [1]:
import torch
import numpy as np
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets,transforms
import torch.nn.init as init
from torch.utils.data import DataLoader, Dataset, SubsetRandomSampler, ConcatDataset
from sklearn.model_selection import KFold

In [2]:
# Run on the GPU when PyTorch reports one, otherwise fall back to the CPU.
# `device` is kept as a plain string because later cells compare it to 'cuda'.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print('device: {}'.format(device))

device: cuda


In [3]:
# Seed PyTorch's global random number generator with a fixed value so that
# repeated runs start from the same random state.
torch.manual_seed(777)
if device == 'cuda':
    # When running on the GPU, seed the generators of all CUDA devices as well.
    torch.cuda.manual_seed_all(777)

In [4]:
# Preprocessing applied to every image: convert to a tensor, then normalise
# the single grayscale channel with mean 0.5 and std 0.5.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5], std=[0.5]),
])

# Build the official train split first and the test split second; both share
# the same root directory and preprocessing, and are downloaded if missing.
MNIST_train, MNIST_test = (
    datasets.MNIST(root='/MNIST', train=is_train, transform=transform, download=True)
    for is_train in (True, False)
)

print('The length of MNIST train set:', len(MNIST_train))
print('The length of MNIST test set :', len(MNIST_test))

The length of MNIST train set: 60000
The length of MNIST test set : 10000


  return torch.from_numpy(parsed.astype(m[2], copy=False)).view(*s)


In [5]:
class ConvNet(nn.Module):
    """Small convolutional classifier for 1-channel 28x28 images (MNIST).

    Architecture:
        Conv2d(1 -> 16, 5x5, padding 2) -> MaxPool2d(2) -> ReLU
        Conv2d(16 -> 32, 5x5, padding 2) -> MaxPool2d(2) -> ReLU
        flatten -> Linear(32*7*7 -> 96) -> ReLU -> Dropout(0.1) -> Linear(96 -> 10)

    The output is a tensor of 10 raw class scores (logits) per sample; no
    softmax is applied inside the network.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=16, kernel_size=5, stride=1, padding=2)
        self.max_pool1 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv2 = nn.Conv2d(in_channels=16, out_channels=32, kernel_size=5, stride=1, padding=2)
        self.max_pool2 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.relu = nn.ReLU()
        # Channel-wise dropout for 4-D convolutional feature maps. It is not
        # applied in forward(); the attribute is kept so existing code that
        # refers to `drop1` keeps working.
        self.drop1 = nn.Dropout2d(p=0.5)
        # Element-wise dropout for the 2-D fully connected activations.
        # nn.Dropout2d expects 3-D/4-D inputs and warns (deprecated, future
        # error) when given a [N, 96] tensor, so plain nn.Dropout is used here.
        self.drop2 = nn.Dropout(p=0.1)

        self.fc1 = nn.Linear(32 * 7 * 7, 96)
        self.fc2 = nn.Linear(96, 10)

    def forward(self, x):
        """Compute class logits for a batch of images.

        Args:
            x: tensor of shape [N, 1, 28, 28].

        Returns:
            Tensor of shape [N, 10] holding one raw score per class.
        """
        # Output width of a conv/pool layer:
        #   (width + 2*padding - kernel_size) / stride + 1
        # conv1: (28 + 4 - 5) / 1 + 1 = 28, pool1: (28 - 2) / 2 + 1 = 14
        # -> [N, 16, 14, 14]
        out = self.relu(self.max_pool1(self.conv1(x)))

        # conv2: (14 + 4 - 5) / 1 + 1 = 14, pool2: (14 - 2) / 2 + 1 = 7
        # -> [N, 32, 7, 7]
        out = self.relu(self.max_pool2(self.conv2(out)))

        # Flatten the feature maps per sample: [N, 32, 7, 7] -> [N, 1568]
        out = out.view(out.size(0), -1)

        # Fully connected head: [N, 1568] -> [N, 96]
        out = self.relu(self.fc1(out))
        out = self.drop2(out)

        # [N, 96] -> [N, 10]
        out = self.fc2(out)
        return out


In [6]:
# Instantiate the network once with its default-initialised weights.
# NOTE(review): the cross-validation loop rebinds `model` to a fresh
# ConvNet().to(device) at the start of every fold, so this instance is never
# trained there.
model = ConvNet()

In [7]:
# Hyperparameters of the cross-validation experiment.
k = 10            # number of folds
batch_size = 64   # samples per mini-batch
num_epochs = 10   # training epochs per fold

# Loss used for both training and validation.
criterion = nn.CrossEntropyLoss()

# Merge the official train and test splits into one dataset so that K-fold
# can draw its own train/validation partitions from all samples.
dataset = ConcatDataset([MNIST_train, MNIST_test])
print("The length of total dataset:", len(dataset))

# Shuffled K-fold splitter with a fixed random state for repeatable folds.
splits = KFold(n_splits=k, shuffle=True, random_state=777)

# Per-fold metric histories, filled in by the training loop.
fold_performance = dict()

The length of total dataset: 70000


In [8]:
def train_epoch(model, device, dataloader,criterion, optimizer):
    """Train ``model`` for one pass over ``dataloader``.

    Args:
        model: network to optimise; it is switched to training mode.
        device: device that each batch is moved to before the forward pass.
        dataloader: iterable yielding ``(images, labels)`` batches. Labels are
            class indices (0..9 for MNIST), not one-hot vectors.
        criterion: loss function called as ``criterion(output, labels)``.
        optimizer: optimiser that updates the model parameters after each batch.

    Returns:
        Tuple ``(train_loss, train_correct)``: the sum over batches of
        ``loss * batch_size``, and the number of correctly classified samples.
        Neither value is divided by the dataset size; the caller does that.
    """
    model.train()
    running_loss = 0.
    n_correct = 0
    for batch_images, batch_labels in dataloader:
        batch_images = batch_images.to(device)
        batch_labels = batch_labels.to(device)

        logits = model(batch_images)
        batch_loss = criterion(logits, batch_labels)

        # Standard optimisation step: clear old gradients, backpropagate, update.
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        # The criterion used in this notebook returns the batch mean, so
        # scaling by the batch size recovers the summed loss of the batch.
        running_loss += batch_loss.item() * batch_images.size(0)

        # The predicted class is the index of the largest score in each row.
        predicted = logits.argmax(dim=1)
        n_correct += (predicted == batch_labels).sum().item()
    return running_loss, n_correct

def valid_epoch(model, device, dataloader, criterion):
    """Evaluate ``model`` on every batch of ``dataloader`` without training it.

    Args:
        model: network to evaluate; it is switched to evaluation mode and is
            left in that mode when the function returns.
        device: device that each batch is moved to before the forward pass.
        dataloader: iterable yielding ``(images, labels)`` batches with labels
            given as class indices.
        criterion: loss function called as ``criterion(output, labels)``.

    Returns:
        Tuple ``(valid_loss, valid_correct)``: the sum over batches of
        ``loss * batch_size``, and the number of correctly classified samples.
        Neither value is divided by the dataset size; the caller does that.
    """
    valid_loss, valid_correct = 0., 0
    model.eval()
    # No parameter update happens here, so autograd is disabled: this avoids
    # building a computation graph for every validation batch.
    with torch.no_grad():
        for images, labels in dataloader:
            images, labels = images.to(device), labels.to(device)
            output = model(images)
            loss = criterion(output, labels)
            # Scale the per-batch loss by the batch size to accumulate a sum.
            valid_loss += loss.item() * images.size(0)
            # The predicted class is the index of the largest score in each row.
            predictions = output.argmax(dim=1)
            valid_correct += (predictions == labels).sum().item()

    return valid_loss, valid_correct

In [9]:
# K-fold cross-validation: repeat the train/validate cycle once per fold (k = 10).
for fold, (train_idx, val_idx) in enumerate(splits.split(np.arange(len(dataset)))):
    print('Fold {}'.format(fold+1))
    # How the data for this fold is selected:
    #   1. KFold splits the sample indices of `dataset` into train_idx and val_idx.
    #   2. SubsetRandomSampler draws from those indices in random order, which
    #      gives the same effect as shuffle=True on a DataLoader.
    #   3. Both DataLoaders wrap the full `dataset`; the sampler restricts each
    #      one to its own index subset, so `shuffle` is not passed.
    # NOTE: the names `test_sampler` / `test_loader` / `test_*` below refer to
    # the validation part of the fold (val_idx), not to the official MNIST test split.
    train_sampler = SubsetRandomSampler(train_idx)
    test_sampler = SubsetRandomSampler(val_idx)
    train_loader = DataLoader(dataset, batch_size = batch_size, sampler = train_sampler)
    test_loader = DataLoader(dataset, batch_size = batch_size, sampler=test_sampler)

    # Every fold starts from a freshly initialised network and optimiser, so
    # no weights are carried over from the previous fold.
    model = ConvNet().to(device)
    optimizer = optim.Adam(model.parameters(), lr=2e-3)

    # Per-epoch metrics of the current fold.
    history = {'train_loss': [], 'test_loss': [],'train_acc':[],'test_acc':[]}

    # Each epoch iterates once over the whole training subset and once over
    # the whole validation subset of this fold.
    for epoch in range(num_epochs):
        train_loss, train_correct = train_epoch(model, device, train_loader, criterion, optimizer)
        test_loss, test_correct = valid_epoch(model, device, test_loader, criterion)
        # With 70000 samples and 10 folds, len(train_loader.sampler) is 63000
        # and len(test_loader.sampler) is 7000. Dividing the summed values by
        # these sizes yields the mean loss per sample and the accuracy in percent.

        train_loss = train_loss / len(train_loader.sampler)
        train_acc = train_correct / len(train_loader.sampler) * 100
        test_loss = test_loss / len(test_loader.sampler)
        test_acc = test_correct / len(test_loader.sampler) * 100

        print("Epoch:{}/{} AVG Training Loss: {:.4f} AVG Test Loss: {:.3f} AVG Training ACC: {:.3f}% AVG Test ACC: {:.3f}%".format(
           epoch+1, num_epochs, train_loss, test_loss, train_acc, test_acc
        ))

        history['train_loss'].append(train_loss)
        history['test_loss'].append(test_loss)
        history['train_acc'].append(train_acc)
        history['test_acc'].append(test_acc)
    # Store the finished history under the key 'fold1', 'fold2', ...
    fold_performance['fold{}'.format(fold+1)] = history

# This runs once, after the loop has finished, so only the model trained in
# the last fold is written to disk.
# NOTE(review): passing the module object pickles the whole model; consider
# saving model.state_dict() instead if the file must be loaded elsewhere.
torch.save(model,'k_cross_CNN.pt')


Fold 1


  return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)


Epoch:1/10 AVG Training Loss: 0.1500 AVG Test Loss: 0.052 AVG Training ACC: 95.440% AVG Test ACC: 98.386%
Epoch:2/10 AVG Training Loss: 0.0507 AVG Test Loss: 0.053 AVG Training ACC: 98.433% AVG Test ACC: 98.400%
Epoch:3/10 AVG Training Loss: 0.0370 AVG Test Loss: 0.031 AVG Training ACC: 98.813% AVG Test ACC: 98.886%
Epoch:4/10 AVG Training Loss: 0.0281 AVG Test Loss: 0.033 AVG Training ACC: 99.159% AVG Test ACC: 99.100%
Epoch:5/10 AVG Training Loss: 0.0248 AVG Test Loss: 0.041 AVG Training ACC: 99.190% AVG Test ACC: 98.857%
Epoch:6/10 AVG Training Loss: 0.0226 AVG Test Loss: 0.031 AVG Training ACC: 99.297% AVG Test ACC: 99.171%
Epoch:7/10 AVG Training Loss: 0.0186 AVG Test Loss: 0.030 AVG Training ACC: 99.397% AVG Test ACC: 99.314%
Epoch:8/10 AVG Training Loss: 0.0173 AVG Test Loss: 0.038 AVG Training ACC: 99.459% AVG Test ACC: 98.986%
Epoch:9/10 AVG Training Loss: 0.0158 AVG Test Loss: 0.037 AVG Training ACC: 99.484% AVG Test ACC: 99.114%
Epoch:10/10 AVG Training Loss: 0.0149 AVG Test

In [10]:
# Sanity check: number of recorded folds, then the number of per-epoch
# entries stored for each metric of each fold.
print(len(fold_performance))

for f in range(1, k + 1):
    fold_key = 'fold{}'.format(f)
    for metric in ('train_loss', 'test_loss', 'train_acc', 'test_acc'):
        print(fold_key, len(fold_performance[fold_key][metric]))

10
fold1 10
fold1 10
fold1 10
fold1 10
fold2 10
fold2 10
fold2 10
fold2 10
fold3 10
fold3 10
fold3 10
fold3 10
fold4 10
fold4 10
fold4 10
fold4 10
fold5 10
fold5 10
fold5 10
fold5 10
fold6 10
fold6 10
fold6 10
fold6 10
fold7 10
fold7 10
fold7 10
fold7 10
fold8 10
fold8 10
fold8 10
fold8 10
fold9 10
fold9 10
fold9 10
fold9 10
fold10 10
fold10 10
fold10 10
fold10 10


In [11]:
# Summarise the cross-validation run. For every fold, each metric is first
# averaged over all of its epochs; the per-fold means are then averaged over
# the folds. Because early epochs are included, these numbers are not the
# final-epoch performance.
testl_f, tl_f, testa_f, ta_f = [], [], [], []
# Use the number of folds that were actually recorded instead of a second
# hard-coded 10, so this cell cannot disagree with the fold count used for training.
k = len(fold_performance)

for f in range(1, k + 1):
    fold_key = 'fold{}'.format(f)

    tl_f.append(np.mean(fold_performance[fold_key]['train_loss']))
    testl_f.append(np.mean(fold_performance[fold_key]['test_loss']))

    ta_f.append(np.mean(fold_performance[fold_key]['train_acc']))
    testa_f.append(np.mean(fold_performance[fold_key]['test_acc']))

print('Performance of {} fold cross validation'.format(k))
print("Average Training Loss: {:.3f} \t Average Test Loss: {:.3f} \t Average Training Acc: {:.2f} \t Average Test Acc: {:.2f}".format(
    np.mean(tl_f),
    np.mean(testl_f),
    np.mean(ta_f),
    np.mean(testa_f)))

Performance of 10 fold cross validation
Average Training Loss: 0.036 	 Average Test Loss: 0.043 	 Average Training Acc: 98.87 	 Average Test Acc: 98.85
