> 

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from matplotlib import pyplot as plt
import torch 

# Tensors
PyTorch uses tensors to represent data in a very efficient way; you can perform many mathematical operations on them efficiently and fast.

In [None]:
# Create some random input data: a 1x5 tensor filled by torch.randn
inputs=torch.randn(1,5)
# Show both the values and the shape of the tensor
print(inputs,"\n",inputs.shape)

We can build neural networks by performing the corresponding mathematical operations between tensors. For example, let's say we want to create a single-layer network with 3 hidden units (neurons) and 4 inputs.
In this case our inputs X should be multiplied by the weight matrix W of the layer:
$$
\begin{equation*}
{X} \times {W}= \begin{bmatrix}
\mathbf{x}_1 & \mathbf{x}_2 & \mathbf{x}_3 & \mathbf{x}_4\\
\end{bmatrix}
\times
\begin{vmatrix}
\mathbf{h}_1 & \mathbf{h}_2 & \mathbf{h}_3 \\
{w}_{11}&{w}_{21}&{w}_{31}\\
{w}_{12}&{w}_{22}&{w}_{32}\\
{w}_{13}&{w}_{23}&{w}_{33}\\
{w}_{14}&{w}_{24}&{w}_{34}
\end{vmatrix}
\end{equation*}
$$
Where each column of W holds the 4 weights of one hidden unit $h_{1-3}$, one weight for each input $x_{1-4}$.

W will have as many columns as neurons and as many rows as inputs. We are performing the dot product of each column with the inputs to get a score for each neuron; those scores are stored in a hidden vector h.

Then we add the bias vector b to the hidden vector h = X * W:
$$
\begin{bmatrix}
\mathbf{h}_1 & \mathbf{h}_2 & \mathbf{h}_3 \\
\end{bmatrix}
+
\begin{bmatrix}
\mathbf{b}_1 & \mathbf{b}_2 & \mathbf{b}_3 \\
\end{bmatrix}
$$
at the end our network will be 
$$
\mathbf{Y}_{prediction}=X\times W + b
$$



Implementing the basic dense layer

In [None]:
def no_activation(x):
    """Identity activation: hand the pre-activation scores back unchanged.

    Used as the default activation of a layer, so that the layer outputs its
    raw linear scores when no non-linearity is requested.
    """
    return x

In [None]:
def sigmoid(x):
    """Element-wise logistic sigmoid, ``1 / (1 + exp(-x))``.

    Args:
        x: tensor of pre-activation scores.

    Returns:
        Tensor of the same shape with every value squashed into (0, 1).
    """
    # The exponent must be negated: 1 / (1 + exp(x)) is 1 - sigmoid(x), i.e. the
    # curve mirrored around x = 0, which maps large positive scores to ~0.
    return 1 / (1 + torch.exp(-x))

In [None]:

class dense():
    """Hand-made fully connected layer: ``activation(inputs @ weights + bias)``.

    The weights have shape (input_d, units) and the bias has shape (1, units);
    both are filled with random values from torch.randn.
    """

    def __init__(self, input_d, units):
        weight_shape = (input_d, units)
        bias_shape = (1, units)
        self.weights = torch.randn(weight_shape)
        self.bias = torch.randn(bias_shape)

    def __call__(self, inputs, activation=no_activation):
        # Linear scores first, then the requested activation on top of them.
        scores = torch.mm(inputs, self.weights)
        scores = scores + self.bias
        return activation(scores)

Creating a simple model using 1 hidden layer with two neurons and an output layer with one neuron

In [None]:
class model():
    """Two-layer network built from `dense` layers: 2 hidden units, 1 output unit.

    Both layers use the sigmoid activation.
    """

    def __init__(self, input_d):
        hidden_units = 2
        self.layer1 = dense(input_d, hidden_units)
        # The output layer takes as many inputs as the hidden layer has units.
        hidden_width = self.layer1.weights.shape[1]
        self.layer2 = dense(hidden_width, 1)

    def __call__(self, inputs):
        hidden = self.layer1(inputs, activation=sigmoid)
        return self.layer2(hidden, activation=sigmoid)

In [None]:

# Fix the random seed so the features and the layer weights are reproducible
torch.manual_seed(7)
# One sample with 3 features
features = torch.randn((1, 3))
# Size the network's input from the number of feature columns
firstnet=model(features.shape[1])
# Forward pass; the notebook displays the result of this last expression
firstnet(features)

we can create tensors from numpy arrays 

In [None]:
# Build a 4x3 NumPy array of random values to convert into a tensor
a=np.random.rand(4,3)
a

In this case the new tensor will share memory with the NumPy array, so whatever change we make to either of them will affect the other.

In [None]:
# torch.from_numpy builds a tensor on top of the array's memory (no copy is made)
b=torch.from_numpy(a)
b

In [None]:
# In-place multiplication (trailing underscore); `a` sees the change because it shares memory with b
b.mul_(2)

In [None]:
a

We can also create new tensor from numpy arrays without sharing memory between them, so this new tensor will have its own memory, and the changes on it will not affect the numpy array

In [None]:
# torch.Tensor(a) copies the values into a new tensor with its own memory
b_nosharedmemory = torch.Tensor(a)

In [None]:
b_nosharedmemory

In [None]:
# In-place multiplication of the copy; the NumPy array `a` is expected to stay unchanged
b_nosharedmemory.mul_(2)

In [None]:
a

In [None]:
b_nosharedmemory

Let's create a neural net to solve the MNIST problem of predicting the digit that appears in an image.

We define a transform to preprocess the data: each image is converted to a tensor (and later flattened into a vector) and normalized with image = (image - mean) / std. The function Normalize receives two parameters, mean and std; in our case mean=0.5 and std=0.5, which rescales the pixel values from [0, 1] to the range [-1, 1] (this is a rescaling, not a true standard normal distribution). This normalization gives gradient descent a better field to work on: in the end we want to capture the distribution of the data, and that is easier when the inputs are centered around zero with a similar spread.

In [None]:
from torchvision import datasets,transforms



# Preprocessing pipeline: convert each image to a tensor, then apply (x - 0.5) / 0.5
transform = transforms.Compose([transforms.ToTensor(),
                              transforms.Normalize((0.5,), (0.5,)),
                              ])
# Download (if needed) and load the MNIST training split, applying the transform to every image
data= datasets.MNIST('~/.pytorch/MNIST_data/', download=True, train=True, transform=transform)


In [None]:
# This DataLoader serves the dataset in shuffled batches of 64 samples
dataloader = torch.utils.data.DataLoader(data, batch_size=64, shuffle=True)



As we can see the data loader iterates giving us a batch of 64  images  each time

In [None]:
# Pull a single batch out of the loader to inspect it
dataiter = iter(dataloader)
# Use the built-in next(): DataLoader iterators do not provide a .next() method
# in current PyTorch releases, so dataiter.next() raises AttributeError.
images, labels = next(dataiter)
print(type(images))
print(images.shape)
print(labels.shape)

In [None]:
# Show the second image of the batch in grayscale; squeeze() drops the size-1 dimensions
plt.imshow(images[1].numpy().squeeze(), cmap='Greys_r');

[](http://)

We will apply exp to all the elements of our data. Each row of the data is one prediction, so we sum along each row (dim=1). To divide each row of 10 values by its own sum, we reshape the vector of row sums into a column (one value per row), so that broadcasting divides every row by a single value.

In [None]:
def softmax(x):
    """Row-wise softmax of a 2-D tensor of scores.

    Args:
        x: tensor of shape (n_samples, n_classes).

    Returns:
        Tensor of the same shape whose rows are probabilities summing to 1.
    """
    # Subtracting the row maximum leaves the result mathematically unchanged but
    # keeps exp() from overflowing to inf (and the division from yielding NaN).
    shifted = x - torch.max(x, dim=1, keepdim=True).values
    exps = torch.exp(shifted)  # computed once and reused for numerator and denominator
    # keepdim=True keeps the sums as a column so each row is divided by its own sum.
    return exps / torch.sum(exps, dim=1, keepdim=True)

In [None]:
class model(object):
    """Hand-made classifier: a 256-unit sigmoid hidden layer and a 10-way softmax output."""

    def __init__(self, input_d):
        hidden_units = 256
        n_classes = 10
        self.layer1 = dense(input_d, hidden_units)
        # The output layer is sized from the hidden layer's weight matrix.
        self.outputlayer = dense(self.layer1.weights.shape[1], n_classes)

    def __call__(self, inputs):
        hidden = self.layer1(inputs, activation=sigmoid)
        return self.outputlayer(hidden, activation=softmax)

In [None]:
# Flatten every image of the batch into one row: (batch, ...) -> (batch, features)
inputs = images.view(images.shape[0], -1)
inputs.shape

In [None]:
# Size the network from the number of flattened features and run one forward pass
mnist_net=model(inputs.shape[1])
outputs=mnist_net(inputs)

In [None]:
# Sanity check: one row of class scores per sample, and each softmax row should sum to 1
print(outputs.shape)
print(torch.sum(outputs,dim=1))

Using the nn module to create models

In [None]:
import torch.nn as nn
class model(torch.nn.Module):
    """Two-layer classifier built with torch.nn: sigmoid hidden layer, softmax output.

    Args:
        input_d: number of input features per sample.
    """

    def __init__(self, input_d):
        super().__init__()
        # Use the constructor argument rather than a hard-coded 784 so the
        # network matches whatever input width the caller passes.
        self.layer1 = nn.Linear(input_d, 256)
        self.out_Layer = nn.Linear(self.layer1.out_features, 10)

    def forward(self, inputs):
        """Return class probabilities of shape (batch, 10).

        Implemented as `forward` (not `__call__`) so that nn.Module.__call__
        keeps running its hook machinery around the computation.
        """
        h1 = torch.sigmoid(self.layer1(inputs))
        # Softmax over dim=1: one probability vector per row (sample).
        out = torch.softmax(self.out_Layer(h1), dim=1)
        return out
    

In [None]:
# Instantiate the nn.Module version; displaying it lists its layers
torchmodel=model(inputs.shape[1])
torchmodel

In [None]:
# Forward pass over the flattened batch
outputs=torchmodel(inputs)
outputs.shape

In [None]:
# Each row is a softmax output, so every row sum should be 1
torch.sum(outputs,dim=1)

Lets build a more complex architecture also using other activation functions such as ReLU

In [None]:
import torch.nn.functional as F
class Model(torch.nn.Module):
    """Three-layer classifier: two ReLU hidden layers (128, 64) and a softmax output.

    Args:
        input_d: number of input features per sample.
    """

    def __init__(self, input_d):
        super().__init__()
        self.layer1 = nn.Linear(input_d, 128)
        # Each layer is sized from the previous layer's output width.
        self.layer2 = nn.Linear(self.layer1.out_features, 64)
        self.out_layer = nn.Linear(self.layer2.out_features, 10)

    def forward(self, inputs):
        """Return class probabilities of shape (batch, 10).

        Implemented as `forward` (not `__call__`) so that nn.Module.__call__
        keeps running registered hooks around the computation.
        """
        h1 = F.relu(self.layer1(inputs))
        h2 = F.relu(self.layer2(h1))
        out = torch.softmax(self.out_layer(h2), dim=1)
        return out
        
        
    

In [None]:
# Instantiate the ReLU network with the flattened input width; displaying it lists its layers
relumodel=Model(inputs.shape[1])
relumodel

In [None]:
# Forward pass, then sum each output row; softmax rows should sum to 1
torch.sum(relumodel(inputs),dim=1)


Let's build a model and calculate the loss. Notice that we calculate the loss with the raw output scores (logits) and not with the probabilities coming from the activation function (softmax), because `nn.CrossEntropyLoss` applies the log-softmax itself.

In [None]:

# Build a feed-forward network whose last layer outputs raw scores (no softmax)
model = nn.Sequential(nn.Linear(784, 128),
                      nn.ReLU(),
                      nn.Linear(128, 64),
                      nn.ReLU(),
                      nn.Linear(64, 10))

# Define the loss; CrossEntropyLoss is applied directly to the raw scores
criterion = nn.CrossEntropyLoss()

# Get one batch of data
images, labels = next(iter(dataloader))
# Flatten images into rows of features
images = images.view(images.shape[0], -1)

# Forward pass, get our logits
logits = model(images)
# Calculate the loss with the logits and the labels
loss = criterion(logits, labels)

print(loss)

It's more convenient to build the model with a log-softmax output; you can then get the actual probabilities by taking the exponential, torch.exp(output). With a log-softmax output, we should use the negative log likelihood loss (`nn.NLLLoss`).
 
 
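The softmax function returns probabilities in the range [0, 1].
The log of these probabilities returns values in the range [-inf, 0], since log(0) = -inf and log(1) = 0.
Because the logarithm is monotonically increasing, the order of the classes won't change: the class with the highest probability also has the highest log-probability.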

However, you should use the NLLLoss with a log_softmax output
or CrossEntropyLoss with logits if you prefer not to add an extra log_softmax layer into your model.

In [None]:

# Build a feed-forward network that ends in LogSoftmax, so its outputs are log-probabilities
model = nn.Sequential(nn.Linear(images.shape[1],250),
                      nn.ReLU(),
                      nn.Linear(250,120),
                      nn.ReLU(),
                      nn.Linear(120,10),
                      nn.LogSoftmax(dim=1) 
                      # dim=1 because we want the softmax by row (each row contains the scores of one input);
                      # there is one row per sample in the batch
                      )

# Define the loss; NLLLoss is the one to pair with a log-softmax output
criterion = nn.NLLLoss()


# Forward pass; despite the variable name these are log-probabilities, not raw logits
logits = model(images)
print(logits.shape)
# exp() turns log-probabilities back into probabilities, so each row should sum to 1
print(torch.sum(torch.exp(logits),dim=1))
# Calculate the loss with the log-probabilities and the labels
loss = criterion(logits, labels)

print(loss)

![](http://)

In [None]:
images.shape

Autograd

Autograd allows us to track the operations done on a tensor. When you tell it to do a backward pass, it goes backwards through each of these operations and calculates the gradients with respect to the input parameters.

In torch you need to indicate that you require to calculate the gradients of a tensor

In [None]:
# requires_grad=True asks autograd to track the operations performed on this tensor
tensor=torch.randn(2,2,requires_grad=True)
tensor

You can also turn gradient tracking off and then on again for a tensor.

In [None]:
# Toggle gradient tracking through the requires_grad attribute and print the tensor each time
tensor.requires_grad=False
print("grads off:\n",tensor)
tensor.requires_grad=True
print("grads on:\n",tensor)

you can also use contexts to control the grads tracking

In [None]:
# Operations run inside torch.no_grad() are not tracked by autograd
with torch.no_grad():
    y=tensor**2
# Displays whether the result is tracked
y.requires_grad

In [None]:
print(tensor)
# Square the tensor without tracking and rebind the name to the untracked result
with torch.no_grad():
    tensor=tensor**2
print(tensor)
# Switch tracking back on for the new tensor
tensor.requires_grad=True

In this case we see that torch will track the operations on the tensor, since we set requires_grad to True.
We can also enable or disable gradient tracking globally using
```python
torch.set_grad_enabled(True)  # or torch.set_grad_enabled(False)
```

In [None]:
# requires_grad was set to True on this tensor, so this operation is tracked by autograd
tensor=torch.sqrt(tensor)
tensor

then let's say we have a function 
$$
y=x^2
$$

and also
$$
z = \left[\frac{1}{n}\sum_i^n y_i\right]
$$

so then we want to calculate the gradients of z with respect x

$$
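\frac{\partial z}{\partial x_i} = \frac{\partial}{\partial x_i}\left[\frac{1}{n}\sum_j^n x_j^2\right] = \frac{2x_i}{n} = \frac{x_i}{2} \quad \text{(for the } n = 4 \text{ elements of a } 2\times 2 \text{ tensor)}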
$$

we can easily perfom this with autograd in pytorch

In [None]:
# Build the small computation x -> y = x**2 -> z = mean(y) with gradient tracking on x
x=torch.randn(2,2,requires_grad=True)
print(f"x: \n{x}")
y=x**2
print(f"y: \n{y}")
# z is the mean over all 4 elements of y
z=torch.mean(y)
print(f"z: \n{z}")


So, to compute the gradients of z with respect to x we call z.backward() and check the result in x.grad. We know that the derivative of z with respect to x is 2x/n, which is x/2 for our n = 4 elements, so let's check that they are the same.

In [None]:
# Backward pass: fills x.grad with dz/dx
z.backward()
print(f"gradients of z with respect x: \n{x.grad}")
# Analytic gradient for comparison: 2x/n with n = 4 elements
print(f"x/2: \n{x/2}")


## Optimizers

We now know that this is how we can calculate the gradients of the loss and then update our parameters.
PyTorch provides a set of optimizers that perform this update automatically, such as stochastic gradient descent or Adam.

In [None]:
from torch import optim

# Stochastic gradient descent over the parameters of the current model, learning rate 0.01
Optimizer = optim.SGD(model.parameters(),lr=0.01)

Then using this optimizer we can actually train our network:
First we need to perform a feed-forward pass and calculate the loss with those values, then backpropagate to calculate our gradients (backward pass), and finally update our parameters (weights and biases).


In [None]:
# Get one batch of data
images, labels = next(iter(dataloader))
# Flatten images into rows of features
images = images.view(images.shape[0], -1)

In [None]:
# Clear the gradients left over from any previous backward pass
Optimizer.zero_grad()
output=model(images)
loss=criterion(output,labels) # argument order matters: model output first, then the labels
loss.backward() # calculate the gradients of the loss with respect to the parameters
print(f"Weights before optimize(layer1): {model[0].weight}")
print(f"Example gradients(layer1): {model[0].weight.grad}")

Then, after calculating the loss and its gradients, we use our optimizer to update the parameters.
It's important to clear the gradients held by the optimizer because they accumulate across backward passes. Clear them before calculating the gradients of the loss (before loss.backward()); if you clear them here, after the backward pass, you will delete the new gradients before they are used.

In [None]:
# Apply one update to the parameters using the gradients computed by loss.backward()
Optimizer.step()
print(f"Updated Weights after optimize(layer1): {model[0].weight}")

OK, now let's put it all together below: first let's import our data, define our model, define our loss, create an optimizer, and code the training loop.

In [None]:
from torchvision import datasets,transforms



# Preprocessing pipeline: convert each image to a tensor, then apply (x - 0.5) / 0.5
transform = transforms.Compose([transforms.ToTensor(),
                              transforms.Normalize((0.5,), (0.5,)),
                              ])
# Download and load the data
data= datasets.MNIST('~/.pytorch/MNIST_data/', download=True, train=True, transform=transform)
dataloader = torch.utils.data.DataLoader(data, batch_size=64, shuffle=True)
# Read the raw images through `.data`; `train_data` is a deprecated alias for it
dataloader.dataset.data.shape

In [None]:

# Build a feed-forward network for flattened 784-feature images, ending in log-probabilities
model = nn.Sequential(nn.Linear(784,250),
                      nn.ReLU(),
                      nn.Linear(250,120),
                      nn.ReLU(),
                      nn.Linear(120,10),
                      nn.LogSoftmax(dim=1) 
                      # dim=1 because we want the softmax by row (each row contains the scores of one input);
                      # there is one row per sample in the batch
                      )

In [None]:
# Negative log likelihood loss, to be used with the model's LogSoftmax output
criterion = nn.NLLLoss()

In [None]:
def train(model, n_epochs, criterion, loader=None, lr=0.003):
    """Train `model` with SGD and print the mean batch loss of every epoch.

    Args:
        model: network to optimize; it is called on flattened image batches.
        n_epochs: number of passes over the data.
        criterion: loss called as ``criterion(predictions, labels)``.
        loader: sized iterable of ``(images, labels)`` batches. Defaults to the
            module-level ``dataloader`` so existing calls keep working.
        lr: learning rate handed to ``optim.SGD``.
    """
    if loader is None:
        loader = dataloader  # fall back to the notebook-level loader
    Optimizer = optim.SGD(model.parameters(), lr=lr)
    for epoch in range(n_epochs):  # epochs
        epoch_loss = 0
        for images, labels in loader:  # batches
            # Gradients accumulate across backward passes, so clear them first.
            Optimizer.zero_grad()
            images = images.view(images.shape[0], -1)  # flatten each image into a row
            predictions = model(images)
            loss = criterion(predictions, labels)
            loss.backward()
            Optimizer.step()
            epoch_loss += loss.item()  # we want just the number inside the tensor
        print(f"Epoch: {epoch} Training loss: {epoch_loss/len(loader)}")

In [None]:
train(model,5,criterion)

In [None]:
def view_classify(img, ps, version="MNIST"):
    ''' Function for viewing an image and its predicted classes.

    Draws the image on the left axis and a horizontal bar chart of the ten
    class probabilities on the right axis.

    Args:
        img: tensor holding one 28x28 image (784 values in any shape).
        ps: tensor with the 10 class probabilities of that image.
        version: "MNIST" labels the bars 0-9, "Fashion" uses the clothing
            class names; any other value leaves the default tick labels.
    '''
    # detach() gives a graph-free tensor, so this also works when `ps` was
    # computed with gradient tracking enabled.
    ps = ps.detach().numpy().squeeze()

    fig, (ax1, ax2) = plt.subplots(figsize=(6,9), ncols=2)
    # reshape() returns a new view/copy; the previous resize_() call rewrote the
    # shape of the caller's tensor in place as a side effect.
    ax1.imshow(img.detach().reshape(1, 28, 28).numpy().squeeze())
    ax1.axis('off')
    ax2.barh(np.arange(10), ps)
    ax2.set_aspect(0.1)
    ax2.set_yticks(np.arange(10))
    if version == "MNIST":
        ax2.set_yticklabels(np.arange(10))
    elif version == "Fashion":
        ax2.set_yticklabels(['T-shirt/top',
                            'Trouser',
                            'Pullover',
                            'Dress',
                            'Coat',
                            'Sandal',
                            'Shirt',
                            'Sneaker',
                            'Bag',
                            'Ankle Boot'], size='small')
    ax2.set_title('Class Probability')
    ax2.set_xlim(0, 1.1)


In [None]:
%matplotlib inline
import helper

# Take one batch and keep its first image
images, labels = next(iter(dataloader))

# Flatten the image into a single row of 784 features
img = images[0].view(1, 784)
# Turn off gradients to speed up this part
with torch.no_grad():
    logps = model(img)

# Output of the network are log-probabilities, need to take exponential for probabilities
ps = torch.exp(logps)
# Plot the image next to its predicted class probabilities
view_classify(img.view(1, 28, 28), ps)

Now let's move on to a more complex model: we will try to predict over 10 different classes of clothes, where there is also more variance in the data.

In [None]:
import torch
from torchvision import datasets, transforms
import helper

# Define a transform to normalize the data: to tensor, then (x - 0.5) / 0.5
transform = transforms.Compose([transforms.ToTensor(),
                                transforms.Normalize((0.5,), (0.5,))])
# Download and load the training data, served in shuffled batches of 64
trainset = datasets.FashionMNIST('~/.pytorch/F_MNIST_data/', download=True, train=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=64, shuffle=True)

# Download and load the test data (train=False), also served in shuffled batches of 64
testset = datasets.FashionMNIST('~/.pytorch/F_MNIST_data/', download=True, train=False, transform=transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=64, shuffle=True)

In [None]:
import torch.nn.functional as F
from torch import optim

class fashionModel(torch.nn.Module):
    """Branched feed-forward classifier that outputs log-probabilities for 10 classes.

    A shared 256-unit layer feeds two parallel 80-unit branches; the branches
    are concatenated, reduced to 50 units, concatenated again with the shared
    layer's output, reduced to 34 units and finally mapped to 10 classes.

    Args:
        input_d: number of input features per sample.
    """

    def __init__(self, input_d):
        super().__init__()
        self.layer1 = torch.nn.Linear(input_d, 256)
        self.nivel1Layer1 = torch.nn.Linear(256, 80)
        self.nivel1Layer2 = torch.nn.Linear(256, 80)
        # Takes the concatenated outputs of the two 80-unit branches (80 + 80 = 160)
        self.Layer3 = torch.nn.Linear(80 + 80, 50)
        # Takes Layer3's output concatenated with layer1's output (50 + 256 = 306)
        self.Layer4 = torch.nn.Linear(256 + 50, 34)
        self.outLayer = torch.nn.Linear(34, 10)

    def forward(self, inputs):
        """Return log-probabilities of shape (batch, 10) for flattened inputs."""
        h1 = F.relu(self.layer1(inputs))
        h2_1 = F.relu(self.nivel1Layer1(h1))
        h2_2 = F.relu(self.nivel1Layer2(h1))
        h3 = F.relu(self.Layer3(torch.cat((h2_1, h2_2), dim=1)))
        h4 = F.relu(self.Layer4(torch.cat((h3, h1), dim=1)))
        outputs = F.log_softmax(self.outLayer(h4), dim=1)
        return outputs

    def fit(self, batch_generator, num_epochs, criterion, Optimizer=None):
        """Train the model and print the summed batch loss of every epoch.

        Args:
            batch_generator: iterable of ``(images, labels)`` batches, e.g. a
                DataLoader; images are flattened to (batch, features) here.
            num_epochs: number of passes over ``batch_generator``.
            criterion: loss called as ``criterion(outputs, labels)``.
            Optimizer: optimizer over this model's parameters; when omitted an
                Adam optimizer with lr=0.003 is created.
        """
        if Optimizer is None:
            Optimizer = optim.Adam(self.parameters(), lr=0.003)
        # Count the samples of the generator actually being trained on; the
        # previous code read the size of the unrelated global `testloader`.
        dataset = getattr(batch_generator, "dataset", None)
        if dataset is not None:
            train_samples = len(dataset)
            print(f"{train_samples} training samples")
        for epoch in range(num_epochs):
            epoch_loss = 0
            for images, labels in batch_generator:
                Optimizer.zero_grad()  # clean the gradients of the optimizer
                # Call the module itself (not .forward) so registered hooks run.
                outputs = self(images.view(images.shape[0], -1))
                loss = criterion(outputs, labels)  # calculates the loss
                loss.backward()  # gradients of the loss with respect to the model parameters
                Optimizer.step()  # we update the model parameters
                epoch_loss += loss.item()
            print(f"EPOCH:{epoch} loss:{epoch_loss}")
                

In [None]:
# Smoke test: push a dummy batch of 20 all-zero samples through a fresh model
x = torch.zeros(20,784, requires_grad=False)
print("Test tensor: ",x.shape)
model=fashionModel(784)
out=model(x)
print("Test output:",out.shape)
# The outputs are log-probabilities, so exp() of each row should sum to 1
print("Check if output its ok (sum1):\n",torch.sum(torch.exp(out),dim=1))

In [None]:
# Negative log likelihood loss, matching the model's log_softmax output
criterion = torch.nn.NLLLoss()

In [None]:
# Train for 7 epochs on the training loader
model.fit(trainloader,7,criterion)

In [None]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import helper

# Test out your network!

dataiter = iter(testloader)
# Built-in next(): DataLoader iterators do not provide .next() in current PyTorch
images, labels = next(dataiter)
# Convert the 2D image to a 1D vector; view() returns a reshaped tensor instead
# of resizing the tensor in place the way resize_() does
img = images[0].view(1, 784)

# Calculate the class probabilities for img: the model outputs log-probabilities,
# so exponentiate them. No gradients are needed for a prediction.
with torch.no_grad():
    ps = torch.exp(model(img))
print(ps)
# Plot the image and probabilities
view_classify(img.view(1, 28, 28), ps, version='Fashion')

In [None]:
pip install hiddenlayer

In [None]:
import hiddenlayer as hl
# Build a graph of the model by passing it a dummy input of one all-zero 784-feature sample
hl.build_graph(model, torch.zeros(1, 784))

## Validation
We also have to validate our models using some metrics in order to know whether they are generalizing well, or whether we have to apply techniques that improve generalization, such as dropout or L1/L2 regularization.

### Accuracy
we'll calculate the accuracy on our test set for our previous model

In [None]:
dataiter = iter(testloader)
# Built-in next(): DataLoader iterators do not provide .next() in current PyTorch
images, labels = next(dataiter)

In [None]:
with torch.no_grad(): # we don't need grads in the predictions so let's speed up this part
    # Flatten the batch and run the forward pass (the model outputs log-probabilities)
    predictions=model(images.view(images.shape[0],-1))
    # Convert the log-probabilities to probabilities
    predictions=torch.exp(predictions)

Our predictions give us a probability for each class, so to get the predicted class we can pick the highest probability using the method .topk(n, dim) of the prediction tensor. This method returns a tuple (values, indices), which for us is (probability, class).

In [None]:
probabilities,p_classes=predictions.topk(1,dim=1) # pick the top 1 of each row (each row is the probability vector of one image)

In [None]:
# Shape of the predicted classes and the first 10 of them
print(p_classes.shape,"\n",p_classes[:10])

In [None]:
# Shape of the true labels, to compare against the shape of p_classes
print(labels.shape)

Then we want to know how many samples have been correctly classified. To perform the == operation, p_classes and labels must have the same shape, so we can either squeeze p_classes or unsqueeze labels (or use labels.view(*p_classes.shape)).

In [None]:
# Drop the size-1 column of p_classes so it can be compared element-wise with labels
equals = p_classes.squeeze(1) == labels

Then we just calculate the mean of the correctly classified samples.
Note: we cast the equals vector to a float vector because the mean cannot be computed on a boolean tensor.

In [None]:
# Fraction of correct predictions in this batch, as a percentage
print(f"Accuracy: {torch.mean(equals.type(torch.FloatTensor)).item()*100}")

Let's implement the validation step in the training pass.

In [None]:
import torch.nn.functional as F
from torch import optim

class fashionModel(torch.nn.Module):
    """Branched feed-forward classifier with training and validation reporting.

    A shared 256-unit layer feeds two parallel 80-unit branches; the branches
    are concatenated, reduced to 50 units, concatenated again with the shared
    layer's output, reduced to 34 units and finally mapped to 10 classes. The
    model outputs log-probabilities.

    Args:
        input_d: number of input features per sample.
    """

    def __init__(self, input_d):
        super().__init__()
        self.layer1 = torch.nn.Linear(input_d, 256)
        self.nivel1Layer1 = torch.nn.Linear(256, 80)
        self.nivel1Layer2 = torch.nn.Linear(256, 80)
        # Takes the concatenated outputs of the two 80-unit branches (80 + 80 = 160)
        self.Layer3 = torch.nn.Linear(80 + 80, 50)
        # Takes Layer3's output concatenated with layer1's output (50 + 256 = 306)
        self.Layer4 = torch.nn.Linear(256 + 50, 34)
        self.outLayer = torch.nn.Linear(34, 10)

    def forward(self, inputs):
        """Return log-probabilities of shape (batch, 10) for flattened inputs."""
        h1 = F.relu(self.layer1(inputs))
        h2_1 = F.relu(self.nivel1Layer1(h1))
        h2_2 = F.relu(self.nivel1Layer2(h1))
        h3 = F.relu(self.Layer3(torch.cat((h2_1, h2_2), dim=1)))
        h4 = F.relu(self.Layer4(torch.cat((h3, h1), dim=1)))
        outputs = F.log_softmax(self.outLayer(h4), dim=1)
        return outputs

    def accuracy(self, predictions, labels):
        """Return the percentage (0-100) of rows whose top class equals the label.

        Args:
            predictions: (batch, classes) log-probabilities from `forward`.
            labels: (batch,) true class indices.
        """
        # exp() is monotonic, so the top class of the log-probabilities is
        # already the top class of the probabilities.
        _, predicted = predictions.topk(1, dim=1)
        # Flatten both sides: comparing (batch, 1) with (batch,) would broadcast
        # to a (batch, batch) matrix and give a meaningless mean.
        equals = predicted.reshape(-1) == labels.reshape(-1)
        return equals.float().mean().item() * 100

    # todo
    def metrics(self):
        pass

    def fit(self, batch_generator, num_epochs, criterion, Optimizer=None, validation_generator=None):
        """Train the model, printing loss and accuracy after every epoch.

        Args:
            batch_generator: DataLoader yielding ``(images, labels)`` training
                batches; images are flattened to (batch, features) here.
            num_epochs: number of passes over ``batch_generator``.
            criterion: loss called as ``criterion(outputs, labels)``.
            Optimizer: optimizer over this model's parameters; when omitted an
                Adam optimizer with lr=0.003 is created.
            validation_generator: optional DataLoader evaluated without
                gradients after every epoch.

        The printed losses are sums of the batch losses; the printed accuracies
        are percentages over all samples seen in the epoch.
        """
        if Optimizer is None:
            Optimizer = optim.Adam(self.parameters(), lr=0.003)
        train_samples = len(batch_generator.dataset)
        if validation_generator is not None:
            val_samples = len(validation_generator.dataset)
            print(f"{train_samples} Training samples , {val_samples} Validation samples")
        else:
            # No validation data: nothing to count and nothing to evaluate below.
            print(f"{train_samples} Training samples")
        for epoch in range(num_epochs):
            epoch_train_loss = 0
            epoch_train_accuracy = 0
            seen = 0
            for images, labels in batch_generator:
                Optimizer.zero_grad()  # clean the gradients of the optimizer
                # Call the module itself (not .forward) so registered hooks run.
                outputs = self(images.view(images.shape[0], -1))
                loss = criterion(outputs, labels)  # calculates the loss
                loss.backward()  # gradients of the loss with respect to the model parameters
                Optimizer.step()  # we update the model parameters
                batch_size = labels.shape[0]
                epoch_train_loss += loss.item()
                # Weight each batch by its size so a smaller last batch does not skew the mean.
                epoch_train_accuracy += self.accuracy(outputs, labels) * batch_size
                seen += batch_size
            print(f"EPOCH:{epoch} Train loss:{epoch_train_loss} Train accuracy:{epoch_train_accuracy/max(seen, 1)}")
            if validation_generator is not None:
                epoch_val_loss = 0
                epoch_val_accuracy = 0
                val_seen = 0
                with torch.no_grad():  # evaluation only, no gradients needed
                    for images_val, labels_val in validation_generator:
                        outputs = self(images_val.view(images_val.shape[0], -1))
                        val_loss = criterion(outputs, labels_val)
                        batch_size = labels_val.shape[0]
                        epoch_val_loss += val_loss.item()
                        epoch_val_accuracy += self.accuracy(outputs, labels_val) * batch_size
                        val_seen += batch_size
                print(f"Validation loss:{epoch_val_loss} , Validation accuracy:{epoch_val_accuracy/max(val_seen, 1)}")
                        
                    
            

Let's train the model. Remember that we have a batch generator for training called trainloader and another one for validation called testloader.

In [None]:
# Fresh model for 784-feature (flattened 28x28) inputs
model=fashionModel(784)
# Negative log likelihood loss, matching the model's log_softmax output
criterion = torch.nn.NLLLoss()
# Train for 7 epochs, evaluating on the test loader as validation data
model.fit(trainloader,7,criterion,validation_generator=testloader)