# Mini Project 1: MNSIT Pair Comparison

In [1]:
import torch
import numpy as np
from torch import nn
import matplotlib.pyplot as plt
import dlc_practical_prologue as prologue

# we imported pandas and IPython to display and better visualize the results
import pandas as pd
from IPython.display import display, Markdown


## Definition of the Models

We define a class CNN that takes as a parameter the type of model to create. For example `CNN('deep')`.

In [2]:
#Layer used to flatten the images
class Flatten(torch.nn.Module):
    def forward(self, x):
        batch_size = x.shape[0]
        return torch.reshape(x, (batch_size, -1))

## Architectures: 

# The architecture represents the part of the model that recognizes the images.

# Note: arch1, arch2, arch3 are functions that instanciate a new architecture not  architectures 

# Deep

arch1 = lambda :  nn.Sequential(                    
            nn.Conv2d(
                in_channels=1,              
                out_channels=35,           
                kernel_size=3,             
                stride=1,                   
                padding=1,                 
            ),                           
            nn.ReLU(),                      # activation
            nn.MaxPool2d(kernel_size=2),    
            nn.Conv2d(35, 32, 5, 1, 4),   
            nn.ReLU(),                      # activation
            nn.MaxPool2d(2),                
            Flatten(),        
            nn.Linear(800,25),            
            nn.Linear(25,25), 
            nn.Linear(25, 10) ,            
            nn.BatchNorm1d(10),
            nn.ReLU())


# Fully connected

arch2 = lambda  : nn.Sequential(                     
            Flatten(),
            nn.Linear(196, 128), 
            nn.ReLU(),
            nn.Linear(128, 128), 
            nn.ReLU(),
            nn.Linear(128, 56),
            nn.ReLU(),
            nn.Linear(56, 10),              
            nn.ReLU()
            )



# Deep with sigmoids

arch3 = lambda :  nn.Sequential(           
            nn.Conv2d(
                in_channels=1,              # input height
                out_channels=35,            # n_filters
                kernel_size=3,              # filter size
                stride=1,                   # filter movement/step
                padding=1,                 
            ),                              
            nn.Sigmoid(),                      # activation
            nn.MaxPool2d(kernel_size=2),   
            nn.Conv2d(35, 32, 5, 1, 4),  
            nn.Sigmoid(),                      # activation
            nn.MaxPool2d(2),              
            Print('a'),
            Flatten(),        
            Print('b'),
            nn.Linear(800,25),              # fully connected layer, output 25 classes
            nn.Linear(25,25), 
            nn.Linear(25, 10) ,             # fully connected layer, output 10 classes
            nn.BatchNorm1d(10),
            nn.Sigmoid())







class CNN(nn.Module):
    ''' To create this class use CNN(True, 'deep) for example.
    other options are False, 'deep with sigmoids' and 'fully connected'. '''
 
    def __init__(self, weight_sharing, architecture):
        super(CNN, self).__init__()
        self.weight_sharing = weight_sharing
        self.architecture = architecture

        # select the proper architecture
        if architecture == 'deep':
            # arch_copy is used with no weight sharing only
            self.arch = arch1()
            self.arch_copy = arch1()
        if architecture == 'deep with sigmoids':
            self.arch = arch3()
            self.arch_copy = arch3()
        if architecture == 'fully connected':
            self.arch = arch2()
            self.arch_copy = arch2()

        self.fc = nn.Sequential(
            nn.Linear(20, 1) ,  
            nn.BatchNorm1d(1),
            nn.Dropout(0.05),
            nn.Sigmoid()
        )
    # this function resets the weights to random
    def reset(self):
        self.__init__(self.weight_sharing, self.architecture)
    
    
    def forward(self, x):
        #first convolutional layer


        _x = torch.reshape(x[:,0,:,:], (-1, 1, 14, 14))
        _x1 = self.arch(_x)
        _x = torch.reshape(x[:,1,:,:], (-1, 1, 14, 14))

        # if there is no weight sharing use the arch_copy layers
        if self.weight_sharing: _x2 = self.arch(_x)
        else: _x2 = self.arch_copy(_x)

        #concatenate and retrun auxilary output
        _x = torch.cat((_x1, _x2), 1)   
        aux_out = (_x1, _x2)


        #fc layer to merge the two recognitions
        _x = self.fc(_x)
        
        # we print _x[:,0] because otherwise _x is of size (N,1) which is not usefull 
        # it should be of size (N)
        return aux_out, _x[:,0]






In [1]:
# With this function we compute the number of parameters
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

    


In [4]:
#  We used the pandas library in order to better compare and plot the tables 
#  our test.py file doesn't use pandas so that it can run on the VM


DATA=pd.DataFrame(columns = ['architecture', 
                             'training mode',
                             'weight sharing', 
                             'history',
                             'model', 
                            'loss function'])

for architecture in ['deep', 'fully connected', 'deep with sigmoids']:
    for weight_sharing in [True, False]:
        for training_mode in ['without auxiliar loss', 'with auxiliar loss']:
            
            model = CNN(weight_sharing, architecture)
            
            if training_mode == 'without auxiliar loss':
                loss_function = lambda loss_main, loss_aux: loss_main
            elif training_mode == 'with auxiliar loss':
                loss_function = lambda loss_main, loss_aux: loss_main + loss_aux
            else:
                raise Exception('wrong loss function')
            
            # history is an dictionary we created to store the values required to plot the graph
            
            DATA = DATA.append({'architecture':architecture,
                               'training mode': training_mode,
                               'weight sharing': weight_sharing,
                               'history': pd.DataFrame(),
                               'model':model,
                               'loss function':loss_function,
                               'number parameters': count_parameters(model)},
                               ignore_index=True)


In [5]:
# this table contains all the models, all the training history's of the models and the loss functions
# the history objects contain the data to make visualize the training curves. Those curves can be 
# visualized in our test.py file

DATA

Unnamed: 0,architecture,training mode,weight sharing,history,model,loss function,number parameters
0,deep,without auxiliar loss,True,Empty DataFrame Columns: [] Index: [],CNN(\n (arch): Sequential(\n (0): Conv2d(1...,<function <lambda> at 0x7f777ad0cae8>,98697.0
1,deep,with auxiliar loss,True,Empty DataFrame Columns: [] Index: [],CNN(\n (arch): Sequential(\n (0): Conv2d(1...,<function <lambda> at 0x7f777acb2ae8>,98697.0
2,deep,without auxiliar loss,False,Empty DataFrame Columns: [] Index: [],CNN(\n (arch): Sequential(\n (0): Conv2d(1...,<function <lambda> at 0x7f777accea60>,98697.0
3,deep,with auxiliar loss,False,Empty DataFrame Columns: [] Index: [],CNN(\n (arch): Sequential(\n (0): Conv2d(1...,<function <lambda> at 0x7f777ac699d8>,98697.0
4,fully connected,without auxiliar loss,True,Empty DataFrame Columns: [] Index: [],CNN(\n (arch): Sequential(\n (0): Print()\...,<function <lambda> at 0x7f777ac82048>,99067.0
5,fully connected,with auxiliar loss,True,Empty DataFrame Columns: [] Index: [],CNN(\n (arch): Sequential(\n (0): Print()\...,<function <lambda> at 0x7f777ac16510>,99067.0
6,fully connected,without auxiliar loss,False,Empty DataFrame Columns: [] Index: [],CNN(\n (arch): Sequential(\n (0): Print()\...,<function <lambda> at 0x7f777ac2d9d8>,99067.0
7,fully connected,with auxiliar loss,False,Empty DataFrame Columns: [] Index: [],CNN(\n (arch): Sequential(\n (0): Print()\...,<function <lambda> at 0x7f777ac42ea0>,99067.0
8,deep with sigmoids,without auxiliar loss,True,Empty DataFrame Columns: [] Index: [],CNN(\n (arch): Sequential(\n (0): Conv2d(1...,<function <lambda> at 0x7f777abdee18>,98697.0
9,deep with sigmoids,with auxiliar loss,True,Empty DataFrame Columns: [] Index: [],CNN(\n (arch): Sequential(\n (0): Conv2d(1...,<function <lambda> at 0x7f777abfad90>,98697.0


In [6]:
# Load the data
train_input, train_target, train_classes, test_input, test_target, test_classes = prologue.generate_pair_sets(1000)

In [8]:
# function to compute classes accuracy of the digit recognition
# it allows to plot the accuracy function of the digit recognition (this was not mandatory but 
# it helps to visualize how the training is done)

def accuracy_classes(predicted, target):
    '''
    computes the accuracy of the predicted classes in %
    '''

    predicted_1 = predicted[0]
    predicted_2 = predicted[1]
    predicted_1 = predicted_1.argmax(dim=1)
    predicted_2 = predicted_2.argmax(dim=1)
    target_1=target[:,0]
    target_2=target[:,1]
    return ( 100 -( ( (target_1 != predicted_1) | (target_2 != predicted_2 ) ).sum() ).item() /target_1.shape[0] * 100 )



In [9]:
# this function calculated the accuracy for the output
def accuracy_comparison(predicted, target):
    '''computes accuracy for the output'''
    return( np.array((torch.abs(predicted - target) < 0.5).sum().float() / target.shape[0] * 100))



In [10]:
# d is a history  containing the info of the training
# this function plots the graphs of the learning curves
def plot_graphs(data_row, d):
    # plotting accuracy
    plt.figure(figsize=(10,6))

    plt.subplot(1,2,1)


    description =  'model of type '+data_row['architecture']+',\n with lr= '+str(d['learning rate'])+\
    (', with weight sharing, ' if data_row['weight sharing'] else ', without weight sharing, ') +\
        'and loss function '+ data_row['training mode'] 

    plt.suptitle('Learning curves of the '+ description)

    plt.plot(d['comparison acc'], label='comparison acc')
    if data_row['training mode']== 'with auxiliar loss': plt.plot(d['recognition acc'], label='recognition acc')
    plt.legend()
    plt.xlabel('epoch')
    plt.ylabel('accuracy  in %')
    plt.ylim((0,100))

    # plotting loss
    plt.subplot(1,2,2)



    plt.plot(d['comparison loss'], label='comparison loss')
    if data_row['training mode']== 'with auxiliar loss': plt.plot(d['recognition loss'], label='recognition loss')
    plt.legend()
    plt.xlabel('epoch')
    plt.ylabel('loss')
    plt.ylim((0,2))
    plt.show()

In [11]:
def run_number(i):
'''This function is used to train a model, save the results in the DATA
table and plot the results of the training, the input is the number of the model.
The number of the model is the index of the row of the dataframe DATA'''
    data_row = DATA.iloc[i]
    display(data_row)
    display(Markdown('''### Model:
     - '''+data_row['architecture']+'''
     - '''+data_row['training mode']+'''
     - '''+('with weight sharing' if data_row['weight sharing'] else 'without weight sharing')+'''
    '''))
    model = data_row['model']
    model.reset()
    loss_function = data_row['loss function']
    architecture = data_row['architecture']


    
    # Training of the model
    if architecture == 'fully connected': eta = 0.01
    if architecture == 'deep with sigmoids': eta = 0.01
    if architecture == 'deep': eta = 0.2
    mini_batch_size = 100
    epochs = 30

    # dictionnary to store the values a.k.a history
    d= ({'epochs': epochs,
                'comparison loss':[],
               'recognition loss':[],
               'comparison acc':[],
               'recognition acc':[],
               'learning rate':eta})
    
    
    criterion_aux = nn.CrossEntropyLoss() # criterion for digit recognition
    criterion_main = torch.nn.BCELoss() # criterion for digit comparison

    # use adam optimizer for SGD
    optimizer = torch.optim.Adam(model.parameters(), lr=eta)

    # compute minibatch test target
    minibatch_test_target = test_target.narrow(0, 0, mini_batch_size)
    minibatch_test_input = test_input.narrow(0, 0, mini_batch_size)


    # print total number of epochs
    print('epoch: (../ '+str(epochs-1)+' )')

    # necessary for loss_function
    aux_validation_acc_item = 0

    for e in range(0, epochs):
        #print current epoch
        print(str(e), sep=' ', end=' ', flush=True)
        
        if e in [epochs//2, epochs//3, epochs//4]: eta /=2
        

        # We do this with mini-batches
        for b in range(0, train_input.size(0), mini_batch_size):

            mini_batch_input = train_input.narrow(0, b, mini_batch_size)
            mini_batch_target = train_target.narrow(0, b, mini_batch_size) #classification labels Nx1
            mini_batch_target_aux = train_classes.narrow(0, b, mini_batch_size) #binary 'what number are these images' Nx20


            #output_aux is the Nx20 output of the second fc layer corresponding to what image pairs were predicted
            #output is the Nx1 output corresponding to: if image 0 or image 1 is bigger
            output_aux, output = model(mini_batch_input)  
            loss_aux = criterion_aux(output_aux[0], mini_batch_target_aux[:,0]) +\
            criterion_aux(output_aux[1], mini_batch_target_aux[:,1])
            loss_main = criterion_main(output, mini_batch_target.float())
            
            # we get the values of the losses at time 0
            if b==0 and e==0: loss_main0, loss_aux0 = loss_main.data.item(), loss_aux.data.item()
            
            # we normalize the losses 
            loss_main/=loss_main0
            loss_aux/=loss_aux0
            
            
            


        # compute validation loss and accuracy
            if b ==0:
                with torch.no_grad():


                    #compute outputs for test data
                    validation_output_aux, validation_output = model(test_input)

                    # compute loss for test data
                    main_validation_loss = criterion_main( validation_output, test_target.float()) /loss_main0
                    aux_validation_loss = criterion_aux( validation_output_aux[0], test_classes[:,0].long()) / loss_aux0 + criterion_aux(validation_output_aux[1], test_classes[:,1].long()) / loss_aux0

                    # compute accuracy for test and train data
                    main_validation_acc_item = accuracy_comparison( validation_output, test_target.float())
                    aux_validation_acc_item = accuracy_classes(validation_output_aux, test_classes)


                    # append to arrays
                # save results in d


                d['comparison loss'].append(main_validation_loss.item())
                d['recognition loss'].append(aux_validation_loss.item())
                d['comparison acc'].append(main_validation_acc_item)
                d['recognition acc'].append(aux_validation_acc_item)
            
            
            optimizer.zero_grad()
            loss = loss_function( loss_main, loss_aux )
            loss.backward()
            optimizer.step()


            
    history_of_historys = DATA.iloc[i,3]
    history_of_historys = history_of_historys.append(d, ignore_index = True)
    DATA.at[i,'history'] = history_of_historys


    plot_graphs(data_row, d )
    



In [None]:
# We used this code to compute all the data.

for j in range(10):
    # When we reimport the data the images are randomized as requested
    train_input, train_target, train_classes, test_input, test_target, test_classes = prologue.generate_pair_sets(1000)
    for i in range(len(DATA)):
            run_number(i)

In [13]:
# We used this code to save the data in a file DATA2
#DATA.iloc[:,range(4)].to_pickle('./DATA2')


In [None]:
# this code reloads the saved data
DATA = pd.read_pickle('./DATA2')


In [25]:
# DATA2 is a new dataframe containing only the columns usefull for the visualization

DATA2=DATA.loc[:,['architecture', 'training mode', 'weight sharing']]

# here we compute the mean and the std of the training results. This allows us to 
# interpret them.
DATA2['mean comparison acc'] = [np.array([ np.array(comparison_acc).max()  for comparison_acc in history['comparison acc'] ]).mean() for history in  DATA['history']]
DATA2['std comparison acc'] = [np.array([ np.array(comparison_acc).max()  for comparison_acc in history['comparison acc'] ]).std() for history in  DATA['history']]


In [33]:
# With this code we display the following tables.
display(Markdown('### Mean and std by architecture'))
display(DATA2.groupby('architecture').mean().sort_values('mean comparison acc', ascending = False))

display(Markdown('### Ranking of all models'))
display(DATA2.sort_values('mean comparison acc', ascending = False))

display(Markdown('### Weight sharing vs. no weight sharing'))
display(DATA2.groupby('weight sharing').mean().sort_values('mean comparison acc', ascending = False))


display(Markdown('### Auxiliary loss vs. normal loss with convolutional arch.'))
display(DATA2[DATA2['architecture']!='fully connected'].groupby('training mode').mean().sort_values('mean comparison acc', ascending = False))


display(Markdown('### Before tricks'))

DATA2[(DATA2['training mode']=='without auxiliar loss') & (DATA2['weight sharing']==False)].sort_values('mean comparison acc', ascending = False)

### Mean and std by architecture

Unnamed: 0_level_0,mean comparison acc,std comparison acc
architecture,Unnamed: 1_level_1,Unnamed: 2_level_1
deep,80.197504,0.925939
deep with sigmoids,76.534317,0.856359
fully connected,73.787498,4.430183


### Ranking of all models

Unnamed: 0,architecture,training mode,weight sharing,mean comparison acc,std comparison acc
1,deep,with auxiliar loss,True,81.910004,1.208679
0,deep,without auxiliar loss,True,80.62001,0.442268
3,deep,with auxiliar loss,False,79.430008,1.337198
2,deep,without auxiliar loss,False,78.829994,0.71561
9,deep with sigmoids,with auxiliar loss,True,78.027275,0.598619
4,fully connected,without auxiliar loss,True,76.709999,1.060612
8,deep with sigmoids,without auxiliar loss,True,76.639999,1.078146
11,deep with sigmoids,with auxiliar loss,False,75.909996,0.703492
10,deep with sigmoids,without auxiliar loss,False,75.559998,1.045181
6,fully connected,without auxiliar loss,False,74.579994,0.491528


### Weight sharing vs. no weight sharing

Unnamed: 0_level_0,mean comparison acc,std comparison acc
weight sharing,Unnamed: 1_level_1,Unnamed: 2_level_1
True,77.369548,2.797872
False,76.309999,1.343782


### Auxiliary loss vs. normal loss with convolutional arch.

Unnamed: 0_level_0,mean comparison acc,std comparison acc
training mode,Unnamed: 1_level_1,Unnamed: 2_level_1
with auxiliar loss,78.819321,0.961997
without auxiliar loss,77.9125,0.820301


### Before tricks

Unnamed: 0,architecture,training mode,weight sharing,mean comparison acc,std comparison acc
2,deep,without auxiliar loss,False,78.829994,0.71561
10,deep with sigmoids,without auxiliar loss,False,75.559998,1.045181
6,fully connected,without auxiliar loss,False,74.579994,0.491528
