In [3]:
import torch
from torch import nn
from torch import optim
from torchvision import datasets, transforms
from torch.utils.data import random_split, DataLoader
from tqdm import tqdm

# get and prepare data
training_data = datasets.MNIST('data', train=True, download=True, transform=transforms.ToTensor())
train, val = random_split(training_data, [55000, 5000])
train_loader = DataLoader(train, batch_size=32)
val_loader = DataLoader(val, batch_size=32)

#  Definir modelo
model = nn.Sequential(
  nn.Linear(28 * 28, 64),
  nn.ReLU(),
  nn.Linear(64, 64),
  nn.ReLU(),
  nn.Linear(64, 10),
  nn.ReLU()
).cuda()
  
params = model.parameters()
optimizer = optim.SGD(params, lr=1e-2)

# Definir funcion de perdida
loss = nn.CrossEntropyLoss()


# definir ciclo de entrenamiento
n_epochs = 5


for epoch in range(n_epochs):
  losses = list()
  accuracies = list()
  for batch in tqdm(train_loader):
    x, y = batch
    # x = batchsize * 1 * 28 * 28
    b = x.size(0)
    x = x.view(b, -1).cuda() # el -1 multiplica los otros numeros no declarados

    # 1) forward
    # l = logit = funcion que devuelve una probabilidad entre 0 y 1 (log odds function) el output de la ultima capa
    l = model(x)

    # 2) compute the objective function
    # the loss is the distance from the network prediction and the real value
    # the objective funtion is equal to the loss in value but it's a function of it's parameters not the output
    J = loss(l, y.cuda())

    # 3) cleaning the gradient
    model.zero_grad()
    # optimizer.zero_grad()
    

    # 4) accumulate the partial derivatives of J with respect to the parameters   
    J.backward()
    # parameters.grad._sum(dJ/dparams)
    # basicamente como cambia el output de la red respecto de los parametros
    # como no queremos acumular todas las perdidas (ya que queremos hacer SGD) es que ejecutamos el paso 3! 
    
    # pytorch acumula porque te permite hacer cosas mas chetas
    # 5) step in the opposite direction of the gradient
    optimizer.step()

    # si usamos J vamos a lekear memoria (tiene todo el grafo)
    # por eso appendeamos J.items
    losses.append(J.item())
    
    accuracies.append(y.eq(l.detach().argmax(dim=1).cpu()).float().mean())
  # torch.tensor(J).mean():.2f (:.2f es para imprimir 2 decimales )
  # generamos un tensor de la lista y despues computamos el promedio
  print(f"train epoch: {epoch+1}, train_loss: {torch.tensor(losses).mean():.2f}, train_accuracy: {torch.tensor(accuracies).mean():.2f}")

  losses = list()
  accuracies = list()
  for batch in tqdm(val_loader):
    x, y = batch
    # x = batchsize * 1 * 28 * 28
    b = x.size(0)
    x = x.view(b, -1).cuda() # el -1 multiplica los otros numeros no declarados

    # 1) forward
    # l = logit = funcion que devuelve una probabilidad entre 0 y 1 (log odds function) el output de la ultima capa
    with torch.no_grad():
      l = model(x)

    # 2) compute the objective function
    # the loss is the distance from the network prediction and the real value
    # the objective funtion is equal to the loss in value but it's a function of it's parameters not the output
    J = loss(l, y.cuda())
    losses.append(J.item())
    accuracies.append(y.eq(l.detach().argmax(dim=1).cpu()).float().mean())

  print(f"validation epoch: {epoch+1}, validation_loss: {torch.tensor(losses).mean():.2f}, validation_accuracy: {torch.tensor(accuracies).mean():.2f}")
  




You should consider upgrading via the '/home/gocandra/workspace/chess/venv/bin/python -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.
You should consider upgrading via the '/home/gocandra/workspace/chess/venv/bin/python -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [1]:
import torch
from torch import nn
from torch import optim
from torchvision import datasets, transforms
from torch.utils.data import random_split, DataLoader
from tqdm import tqdm
import pdb
# get and prepare data
training_data = datasets.MNIST('data', train=True, download=True, transform=transforms.ToTensor())
train, val = random_split(training_data, [55000, 5000])
train_loader = DataLoader(train, batch_size=32)
val_loader = DataLoader(val, batch_size=32)

#  Definir modelo
class ResNet(nn.Module):
  def __init__(self):
    super().__init__()
    self.l1 = nn.Linear(28 * 28, 64)
    self.l2 = nn.Linear(64, 64)
    self.l3 = nn.Linear(64, 10)
    self.do = nn.Dropout(0.1)
  
  def forward(self, x):
    h1 = nn.functional.relu(self.l1(x))
    h2 = nn.functional.relu(self.l2(h1))
    do = self.do(h2 +h1)
    logits = self.l3(do)
    return logits

model = ResNet() #.cuda()
  

# ResNet (highway networks lo precede)
# entrena mas rapido porque la velocidad de aprendizaje para cada capa cuanto mas cercana al input aprende mas lento, 
# se dice que el input es abajo y el output (head) arriba.
# los gradientes cerca del output son mas cercanos a la perdida
# pero a medida que va bajando (acercandose al input) van decreciendo estos gradientes (ya que los pesos son menores a 1).
# por lo que las capas mas lejanas del output aprenden mas lento

params = model.parameters()
optimizer = optim.SGD(params, lr=1e-2)

# Definir funcion de perdida
loss = nn.CrossEntropyLoss()


# definir ciclo de entrenamiento
n_epochs = 5


for epoch in range(n_epochs):
  losses = list()
  accuracies = list()
  
  # para que funcione el dropout
  model.train() 
  # tambien hay que hacerlo cuando tenemos batch_norm 
  for batch in tqdm(train_loader):
    x, y = batch
    # x = batchsize * 1 * 28 * 28
    b = x.size(0)
    x = x.view(b, -1) #.cuda() 
    # el -1 multiplica los otros numeros no declarados

    # 1) forward
    # l = logit = funcion que devuelve una probabilidad entre 0 y 1 (log odds function) el output de la ultima capa
    l = model(x)
    pdb.set_trace()
    # 2) compute the objective function
    # the loss is the distance from the network prediction and the real value
    # the objective funtion is equal to the loss in value but it's a function of it's parameters not the output
    J = loss(l, y) #.cuda())

    # 3) cleaning the gradient
    model.zero_grad()
    # optimizer.zero_grad()
    

    # 4) accumulate the partial derivatives of J with respect to the parameters   
    J.backward()
    # parameters.grad._sum(dJ/dparams)
    # basicamente como cambia el output de la red respecto de los parametros
    # como no queremos acumular todas las perdidas (ya que queremos hacer SGD) es que ejecutamos el paso 3! 
    
    # pytorch acumula porque te permite hacer cosas mas chetas
    # 5) step in the opposite direction of the gradient
    optimizer.step()

    # si usamos J vamos a lekear memoria (tiene todo el grafo)
    # por eso appendeamos J.items
    losses.append(J.item())
    
    accuracies.append(y.eq(l.detach().argmax(dim=1).cpu()).float().mean())
  # torch.tensor(J).mean():.2f (:.2f es para imprimir 2 decimales )
  # generamos un tensor de la lista y despues computamos el promedio
  print(f"train epoch: {epoch+1}, train_loss: {torch.tensor(losses).mean():.2f}, train_accuracy: {torch.tensor(accuracies).mean():.2f}")

  losses = list()
  accuracies = list()
  model.eval()
  for batch in tqdm(val_loader):
    x, y = batch
    # x = batchsize * 1 * 28 * 28
    b = x.size(0)
    x = x.view(b, -1) #.cuda() # el -1 multiplica los otros numeros no declarados

    # 1) forward
    # l = logit = funcion que devuelve una probabilidad entre 0 y 1 (log odds function) el output de la ultima capa
    with torch.no_grad():
      l = model(x)

    # 2) compute the objective function
    # the loss is the distance from the network prediction and the real value
    # the objective funtion is equal to the loss in value but it's a function of it's parameters not the output
    J = loss(l, y) #.cuda())
    losses.append(J.item())
    accuracies.append(y.eq(l.detach().argmax(dim=1).cpu()).float().mean())

  print(f"validation epoch: {epoch+1}, validation_loss: {torch.tensor(losses).mean():.2f}, validation_accuracy: {torch.tensor(accuracies).mean():.2f}")
  




  0%|          | 0/1719 [00:00<?, ?it/s]

> [0;32m/tmp/ipykernel_552304/50047693.py[0m(72)[0;36m<cell line: 51>[0;34m()[0m
[0;32m     70 [0;31m    [0;31m# the loss is the distance from the network prediction and the real value[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     71 [0;31m    [0;31m# the objective funtion is equal to the loss in value but it's a function of it's parameters not the output[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 72 [0;31m    [0mJ[0m [0;34m=[0m [0mloss[0m[0;34m([0m[0ml[0m[0;34m,[0m [0my[0m[0;34m)[0m [0;31m#.cuda())[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     73 [0;31m[0;34m[0m[0m
[0m[0;32m     74 [0;31m    [0;31m# 3) cleaning the gradient[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m


  0%|          | 1/1719 [00:25<12:09:54, 25.49s/it]

--KeyboardInterrupt--

KeyboardInterrupt: Interrupted by user
> [0;32m/tmp/ipykernel_551386/1932041763.py[0m(68)[0;36m<cell line: 51>[0;34m()[0m
[0;32m     66 [0;31m    [0;31m# l = logit = funcion que devuelve una probabilidad entre 0 y 1 (log odds function) el output de la ultima capa[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     67 [0;31m    [0ml[0m [0;34m=[0m [0mmodel[0m[0;34m([0m[0mx[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 68 [0;31m    [0;32mimport[0m [0mpdb[0m[0;34m;[0m [0mpdb[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     69 [0;31m    [0;31m# 2) compute the objective function[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     70 [0;31m    [0;31m# the loss is the distance from the network prediction and the real value[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m
