In [31]:
import torch
import math

# Just use for the plots at the end
import matplotlib.pyplot as plt
from numpy import exp,arange
from pylab import meshgrid,cm,imshow,contour,clabel,colorbar,axis,title,show

torch.set_grad_enabled(False)

<torch.autograd.grad_mode.set_grad_enabled at 0x21862185088>

In [32]:
class Module(object) :
    def __init__(self):
        super().__init__()
    
    def forward(self , *input):
        raise  NotImplementedError
        
    def backward(self , *gradwrtoutput):
        raise  NotImplementedError
        
    def param(self): # These are the layers of the network
        return  []

In [33]:
class Layer(Module):
    def __init__(self):
        super().__init__()
        self.dropout = False
        self.linear = False
        
    def is_dropout(self):
        self.dropout = True
    
    def is_linear(self):
        self.linear = True

In [34]:
class Sequential(Module):
    def __init__(self, param, Loss):
        super().__init__()
        self.model = (param)
        self.loss = Loss
    
    def forward(self, x):
        for layer in self.model:
            x = layer.forward(x)
        return x
    
    def backward(self, output, target):
        grad = self.loss.backward(target, output)
        
        for layer in reversed(self.model):
            grad = layer.backward(grad)
        
        Loss = self.loss.forward(target, output)
        return Loss
    
    def Train(self):
        for layer in self.model:
            if layer.dropout:
                layer.Train()
        
    def Eval(self):
        for layer in self.model:
            if layer.dropout:
                layer.Eval()
    
    def lr_method(self, method, lr):
        for layer in self.model:
            if layer.linear:
                layer.change_lr_method(method, lr)

In [35]:
class Linear(Layer):
    def __init__(self, in_, out_):
        super().__init__()
        self.in_ = in_
        self.out_ = out_
        self.is_linear()
        self.lr = 0.005
        self.lr_method = 'constant'
        
        # Capture the term at each layer before the passage in the layer
        # and the activation function.
        self.x = torch.zeros(out_)
        
        # Initialization of Adam for weight and bias
        self.beta1 = 0.9
        self.beta2 = 0.999
        self.eps = 1.0e-8
        self.eta = 1.0e-1
        self.mw = torch.zeros(out_)
        self.mb = torch.zeros(out_)
        self.vw = 0.0
        self.vb = 0.0
        
        # Initialization of the weights and the bias
        param = 1. / math.sqrt(in_)
        self.weight = torch.empty(self.in_, self.out_).uniform_(-param, param)
        self.bias = torch.empty(self.out_).uniform_(-param, param)
        
    def forward(self, x):
        self.x = x
        return x.mm(self.weight) + self.bias
    
    def set_Lr(self, lr):
        self.lr = lr
        return
        
    def backward(self, grad):
        
        if self.lr_method == "Adam":
            
            # Adam method for the learning rate
            gw = self.x.t().mm(grad)
            self.mw = ((self.beta1 * self.mw) + ((1 - self.beta1) * gw))
            mh = (1 / (1 - self.beta1)) * self.mw
            self.vw = ((self.beta2 * self.vw) + ((1 - self.beta2) * (gw.norm()**2)))
            vh = (1 / (1 - self.beta2)) * self.vw
            self.weight = self.weight - ((self.eta / (vh.sqrt() + self.eps)) * mh)

            self.mb = ((self.beta1 * self.mb) + ((1 - self.beta1) * grad))
            mh = (1 / (1 - self.beta1)) * self.mb
            self.vb = ((self.beta2 * self.vb) + ((1 - self.beta2) * (grad.norm()**2)))
            vh = (1 / (1 - self.beta2)) * self.vb
            self.bias = self.bias - ((self.eta / (vh.sqrt() + self.eps)) * mh)
            grad = grad.mm(self.weight.t())
            
        elif self.lr_method == "constant":
            
            # Constant learning rate
            self.weight = self.weight - self.lr * self.x.t().mm(grad)
            self.bias = self.bias - self.lr * grad * 1
            grad = grad.mm(self.weight.t())
            
        return grad
    
    def weight(self):
        return self.weight
    
    def bias(self):
        return self.bias
    
    def change_lr_method(self, method, lr):
        self.lr = lr
        self.lr_method = method

In [36]:
class Dropout(Layer) :
    def __init__(self):
        super().__init__()
        self.p = 0.
        self.is_dropout()
        self.train = True
        
    
    def forward(self, x):
        n = torch.ones(x.size())
        if self.train:
            n = torch.bernoulli(n) * (1 - self.p)
        return x * n
        
    def backward(self, x):
        return x
    
    def Train(self):
        self.train = True
        
    def Eval(self):
        self.train = False
        

In [37]:
class LossMSE(Layer):
    def __init__(self):
        super().__init__() 
    
    def forward(self, data_target, data_output):
        loss = (data_output - data_target).pow(2).sum()
        return loss
    
    def backward(self, data_target, data_output):
        dloss = 2 * (data_output - data_target)
        return dloss
    
    def is_MSE(self):
        return True

In [38]:
class CrossEntropyLoss(Module):
    def __init__(self):
        super().__init__()
        
    def forward(self, data_target, data_output):
        output = data_output.to(dtype=torch.float)
        target = data_target.resize_(data_target.size(0), 1)
        
        zer = torch.zeros(target.size()).int()
        target = torch.cat((target,zer), 1)
    
        first_column = torch.tensor([0])
        loss = output.gather(1,target).index_select(1,first_column).exp()
        
        # To avoid numerical error in the computation
        maxx = loss.max()
        
        loss = (loss * maxx) / (output.exp().sum(1) * maxx)
        loss = -(loss.log().mean())
        return loss
    
    def backward(self, data_target, data_output):
        # New version
        N = data_target.size(0)
        dloss = data_output.exp()
        dloss = dloss / dloss.sum(1).resize_(N,1)
        
        add = data_target-1
        add = torch.cat((add, -data_target), 1)
        dloss = (1/N) * (dloss + add)
        return dloss
    
    def is_MSE(self):
        return False

In [39]:
class ReLU(Layer):
    
    def __init__(self ):
        super().__init__()
        self.save = 0
        
    def forward(self, x):
        y = x.clamp(min = 0)
        self.save = x
        return y
    
    def backward(self, x):
        y = self.save > 0
        return y.float() * x
         
    def print(self):
        return

In [40]:
class Leaky_ReLU(Layer):
    
    def __init__(self ):
        super().__init__()
        self.s = 0
        self.alpha = 0.01
        
    def forward(self, x):
        y = torch.max(self.alpha * x, x)
        self.s = x
        return y
    
    def backward(self, x):
        y = ((self.s > 0) * (1 - self.alpha)) + self.alpha
        return y.float() * x
         
    def print(self):
        return

In [41]:
class ELU(Layer):
    
    def __init__(self):
        super().__init__()
        self.s = 0
        self.alpha = 0.01
        
    def forward(self, x):
        y = ((x > 0).float() * x) + (0 >= x) * self.alpha * (torch.exp(x) - 1)
        self.s = x
        return y
    
    def backward(self, x):
        y = ((self.s > 0) * (1 - self.alpha * torch.exp(self.s))) + self.alpha * torch.exp(self.s)
        return y.float() * x

In [42]:
class Tanh(Layer) :
    def __init__(self, ):
        super().__init__()
        self.save = 0
    
    def  forward(self, x):
        self.save = x
        return torch.div(x.exp() - (-x).exp(), x.exp() + (-x).exp())
        
    def  backward(self, x):
        return (1 - torch.div(self.save.exp() - 
                    (-self.save).exp(), self.save.exp() + (-self.save).exp())**2) * x
        
    def print(self):
        return

In [43]:
class Sigmoid(Layer):
    
    def __init__(self):
        super().__init__()
        self.s = 0
        self.lbd = 3
        
    def forward(self, x):
        y = 1 / (1 + torch.exp(-self.lbd * x))
        self.s = x
        return y
    
    def backward(self, x):
        y = self.lbd * torch.exp(-self.s) / ((1 + torch.exp(-self.lbd * self.s))**2)
        return y.float() * x  

In [44]:
def create_random_batch(input_size, mini_batch_size):
    
    # This function return a 2D tensor that is the rando selection of inputs for our
    # stochastic gradient method, taking in count the number of mini_batches.
    
    # We suppose here that our mini_batch_size is well chosen taking in count the fact
    # that it divides input_size.
    
    # Initialization
    L = int(input_size / mini_batch_size)
    new_batch = torch.ones(L, mini_batch_size)
    
    indices = torch.randperm(input_size)
    for k in range(L):
        new_batch[k] = indices[k * mini_batch_size : (k+1) * mini_batch_size]
    
    return new_batch

In [45]:
def train_model(model, train_input, train_classes, nb_epochs, mini_batch_size):
    
    h_step = 1e-3
    
    for epoch in range(nb_epochs):
        random_batches = create_random_batch(train_input.size(0), mini_batch_size).tolist()
        for batch in range(0, train_input.size(0), mini_batch_size):
            output = model.forward(train_input.narrow(0, batch, mini_batch_size))
            loss = model.backward(output, train_classes.narrow(0, batch, mini_batch_size))

In [46]:
def compute_nb_errors(model, data_input, data_target, mini_batch_size):
    
    nb_data_errors = 0

    for b in range(0, data_input.size(0), mini_batch_size):
        result = model.forward(data_input.narrow(0, b, mini_batch_size))
        
        if model.loss.is_MSE():
            # If the loss function is MSE
            predicted_classes = (result >= 0.5).int()
        else:
            # If the loss function is CrossEntropy
            _, predicted_classes = torch.max(result, 1)
        
        for k in range(mini_batch_size):
            if data_target[b + k] != predicted_classes[k]:
                nb_data_errors = nb_data_errors + 1
                
    return nb_data_errors

In [47]:
def create_problem(nb_samples):
    
    # Remark: the function .uniform return a uniform distribution on [0,1) instead of [0,1],
    # but in our case it's not a problem since it is only a train and a test set on a circle
    # that do not touch the border of the set [0,1]^2.
    train_input = torch.empty(nb_samples, 2).uniform_(0, 1)
    test_input = torch.empty(nb_samples, 2).uniform_(0, 1)
    
    # Radius of our circle
    R = 1 / math.sqrt(2 * math.pi)
    
    train_classes = train_input.sub(0.5).pow(2).sum(1).sub(R**2).sign().sub(1).div(-2).long().resize_((nb_samples,1))
    test_classes = test_input.sub(0.5).pow(2).sum(1).sub(R**2).sign().sub(1).div(-2).long().resize_((nb_samples,1))
    
    return train_input, train_classes, test_input, test_classes

In [48]:
def get_tests(n):
    M = []
    for k in range (n):
        L = []
        _, _, test_input, test_classes =  create_problem(1000)
        L.append(test_input)
        L.append(test_classes)
        M.append(L)
    return M

In [49]:
model_ReLU = Sequential([Linear(2,25), ReLU(), Linear(25,25), ReLU(), Linear(25,25), ReLU(), Linear(25,1), ReLU()], LossMSE())
model_Tanh = Sequential([Linear(2,25), Tanh(), Linear(25,25), Tanh(), Linear(25,25), Tanh(), Linear(25,1), Tanh()], LossMSE())
model_Sigmoid = Sequential([Linear(2,25), Sigmoid(), Linear(25,25), Sigmoid(), Linear(25,25), Sigmoid(), Linear(25,1), Sigmoid()], LossMSE())
model_Leaky_ReLU = Sequential([Linear(2,25), Leaky_ReLU(), Linear(25,25), Leaky_ReLU(), Linear(25,25), Leaky_ReLU(), Linear(25,1), Leaky_ReLU()], LossMSE())
model_ELU = Sequential([Linear(2,25), ELU(), Linear(25,25), ELU(), Linear(25,25), ELU(), Linear(25,1), ELU()], LossMSE())

In [50]:
train_input_list = []
train_classes_list = []
test_input_list = []
test_classes_list = []

for i in range (0, 20):
    train_input, train_classes, test_input, test_classes = create_problem(1000)
    train_input_list.append(train_input)
    train_classes_list.append(train_classes)
    test_input_list.append(test_input)
    test_classes_list.append(test_classes)

In [51]:
Model_ReLU_List = []
Model_Tanh_List = []
Model_Sigmoid_List = []
Model_Leaky_ReLU_List = []
Model_ELU_List = []
Loss = LossMSE()

for k in range (0, 20):
    Model_ReLU_List.append(Sequential([Linear(2,25), ReLU(), Linear(25,25), ReLU(), Linear(25,25), ReLU(), Linear(25,1), ReLU()], LossMSE()))
    Model_Tanh_List.append(Sequential([Linear(2,25), Tanh(), Linear(25,25), Tanh(), Linear(25,25), Tanh(), Linear(25,1), Tanh()], LossMSE()))
    Model_Sigmoid_List.append(Sequential([Linear(2,25), Sigmoid(), Linear(25,25), Sigmoid(), Linear(25,25), Sigmoid(), Linear(25,1), Sigmoid()], LossMSE()))
    Model_Leaky_ReLU_List.append(Sequential([Linear(2,25), Leaky_ReLU(), Linear(25,25), Leaky_ReLU(), Linear(25,25), Leaky_ReLU(), Linear(25,1), Leaky_ReLU()], LossMSE()))
    Model_ELU_List.append(Sequential([Linear(2,25), ELU(), Linear(25,25), ELU(), Linear(25,25), ELU(), Linear(25,1), ELU()], LossMSE()))

In [52]:
def train_and_test_model(Model, train_input_list, train_classes_list, test_input_list, test_classes_list):
    
    nb_epochs = 100
    mini_batch_size = 10
    
    Train_error = []
    Test_error = []
    std_deviation = 0.0
    train_error = 0.0
    avg_nb_test_error = torch.tensor(())
    
    for i in range (0, len(Model)):
        Model[i].lr_method("Adam", 1.0e-3)
        
        train_model(Model[i], train_input_list[i], train_classes_list[i], nb_epochs, mini_batch_size)
        
        nb_train_errors = compute_nb_errors(Model[i], train_input_list[i], train_classes_list[i], mini_batch_size)
        train_error += nb_train_errors / 10
            
        nb_test_errors = compute_nb_errors(Model[i], test_input_list[i], test_classes_list[i], mini_batch_size)
        nb_test_errors = torch.tensor([nb_test_errors/10]).float()
        #print('train error {:f}'.format(nb_train_errors))
        #print('test error {:f}'.format(nb_test_errors.item()))
        avg_nb_test_error = torch.cat((avg_nb_test_error, nb_test_errors), 0)
        
    Train_error.append(train_error / len(Model))
    Test_error.append(avg_nb_test_error.mean().tolist())
    std_deviation = avg_nb_test_error.std().tolist()
    
    return Train_error, Test_error, std_deviation

In [30]:
Train_error_ReLU, Test_error_ReLU, std_deviation_ReLU = train_and_test_errors(Model_ReLU_List, train_input_list, train_classes_list, test_input_list, test_classes_list)

print(Train_error_ReLU)
print(Test_error_ReLU)
print(std_deviation_ReLU)

[36.790000000000006]
[37.77000045776367]
19.75116539001465


In [28]:
Train_error_ELU, Test_error_ELU, std_deviation_ELU = train_and_test_errors(Model_ELU_List, train_input_list, train_classes_list, test_input_list, test_classes_list)
print(Train_error_ELU)
print(Test_error_ELU)
print(std_deviation_ELU)

[2.9400000000000004]
[6.2049994468688965]
1.0927873849868774


In [29]:
Train_error_Leaky_ReLU, Test_error_Leaky_ReLU, std_deviation_Leaky_ReLU = train_and_test_errors(Model_Leaky_ReLU_List, train_input_list, train_classes_list, test_input_list, test_classes_list)
print(Train_error_Leaky_ReLU)
print(Test_error_Leaky_ReLU)
print(std_deviation_Leaky_ReLU)

[2.795]
[6.059999942779541]
1.23048996925354


In [87]:
Train_error_Sigmoid, Test_error_Sigmoid, std_deviation_Sigmoid = train_and_test_errors(Model_Sigmoid_List, train_input_list, train_classes_list, test_input_list, test_classes_list)
print(Train_error_Sigmoid)
print(Test_error_Sigmoid)
print(std_deviation_Sigmoid)

train error 47.000000
test error 7.900000
train error 43.000000
test error 7.500000
train error 89.000000
test error 12.500000
train error 40.000000
test error 5.900000
train error 38.000000
test error 6.600000
train error 59.000000
test error 10.000000
train error 141.000000
test error 15.000000
train error 74.000000
test error 10.100000
train error 32.000000
test error 5.200000
train error 61.000000
test error 8.300000
train error 42.000000
test error 6.500000
train error 52.000000
test error 7.400000
train error 27.000000
test error 7.600000
train error 103.000000
test error 15.800000
train error 45.000000
test error 7.400000
train error 354.000000
test error 34.599998
train error 462.000000
test error 49.599998
train error 29.000000
test error 5.800000
train error 48.000000
test error 8.700000
train error 67.000000
test error 8.400000
[9.265000000000002]
[12.039999008178711]
10.938507080078125


In [88]:
Train_error_Tanh, Test_error_Tanh, std_deviation_Tanh = train_and_test_errors(Model_Tanh_List, train_input_list, train_classes_list, test_input_list, test_classes_list)
print(Train_error_Tanh)
print(Test_error_Tanh)
print(std_deviation_Tanh)

train error 25.000000
test error 5.900000
train error 25.000000
test error 7.600000
train error 30.000000
test error 5.500000
train error 13.000000
test error 5.600000
train error 21.000000
test error 5.200000
train error 11.000000
test error 5.300000
train error 36.000000
test error 4.900000
train error 31.000000
test error 7.300000
train error 24.000000
test error 4.900000
train error 22.000000
test error 4.100000
train error 24.000000
test error 4.500000
train error 29.000000
test error 4.700000
train error 22.000000
test error 5.300000
train error 37.000000
test error 7.500000
train error 27.000000
test error 5.900000
train error 34.000000
test error 4.800000
train error 19.000000
test error 4.600000
train error 13.000000
test error 4.800000
train error 15.000000
test error 4.800000
train error 25.000000
test error 4.400000
[2.415]
[5.380000591278076]
1.0175305604934692


In [53]:
Model_ReLU_List_CE = []
Model_Tanh_List_CE = []
Model_Sigmoid_List_CE = []
Model_Leaky_ReLU_List_CE = []
Model_ELU_List_CE = []

for k in range (0, 20):
    Model_ReLU_List_CE.append(Sequential([Linear(2,25), ReLU(), Linear(25,25), ReLU(), Linear(25,25), ReLU(), Linear(25,2), ReLU()], CrossEntropyLoss()))
    Model_Tanh_List_CE.append(Sequential([Linear(2,25), Tanh(), Linear(25,25), Tanh(), Linear(25,25), Tanh(), Linear(25,2), Tanh()], CrossEntropyLoss()))
    Model_Sigmoid_List_CE.append(Sequential([Linear(2,25), Sigmoid(), Linear(25,25), Sigmoid(), Linear(25,25), Sigmoid(), Linear(25,2), Sigmoid()], CrossEntropyLoss()))
    Model_Leaky_ReLU_List_CE.append(Sequential([Linear(2,25), Leaky_ReLU(), Linear(25,25), Leaky_ReLU(), Linear(25,25), Leaky_ReLU(), Linear(25,2), Leaky_ReLU()], CrossEntropyLoss()))
    Model_ELU_List_CE.append(Sequential([Linear(2,25), ELU(), Linear(25,25), ELU(), Linear(25,25), ELU(), Linear(25,2), ELU()], CrossEntropyLoss()))

In [None]:
Train_error_ReLU_CE, Test_error_ReLU_CE, std_deviation_ReLU_CE = train_and_test_errors(Model_ReLU_List_CE, train_input_list, train_classes_list, test_input_list, test_classes_list)

print(Train_error_ReLU_CE)
print(Test_error_ReLU_CE)
print(std_deviation_ReLU_CE)

In [None]:
Train_error_ELU_CE, Test_error_ELU_CE, std_deviation_ELU_CE = train_and_test_errors(Model_ELU_List_CE, train_input_list, train_classes_list, test_input_list, test_classes_list)
print(Train_error_ELU_CE)
print(Test_error_ELU_CE)
print(std_deviation_ELU_CE)

In [None]:
Train_error_Tanh_CE, Test_error_Tanh_CE, std_deviation_Tanh_CE = train_and_test_errors(Model_Tanh_List_CE, train_input_list, train_classes_list, test_input_list, test_classes_list)
print(Train_error_Tanh_CE)
print(Test_error_Tanh_CE)
print(std_deviation_Tanh_CE)

In [58]:
nb_train_errors = compute_nb_errors(model, train_input, train_classes, mini_batch_size)
print('train error {:0.2f}% {:f}/{:f}'.format((100 * nb_train_errors) / train_input.size(0), nb_train_errors, train_classes.size(0)))

L = get_tests(10)
average_nb_test_error = 0
for k in range (0, len(L)):
    nb_test_errors = compute_nb_errors(model, L[k][0], L[k][1], mini_batch_size)
    average_nb_test_error += nb_test_errors
    print('test error {:0.2f}% {:f}/{:f}'.format((100 * nb_test_errors) / L[k][0].size(0), nb_test_errors, L[k][0].size(0)))
print('Average test error {:0.2f}% {:0.1f}/{:d}'.format((100*average_nb_test_error/len(L)) / L[0][0].size(0), average_nb_test_error/len(L), L[0][0].size(0)))

train error 48.20% 482.000000/1000.000000
test error 50.50% 505.000000/1000.000000
test error 48.60% 486.000000/1000.000000
test error 47.20% 472.000000/1000.000000
test error 51.50% 515.000000/1000.000000
test error 53.30% 533.000000/1000.000000
test error 51.90% 519.000000/1000.000000
test error 51.70% 517.000000/1000.000000
test error 48.70% 487.000000/1000.000000
test error 49.10% 491.000000/1000.000000
test error 49.90% 499.000000/1000.000000
Average test error 50.24% 502.4/1000
