In [1]:
import torch
import torch.nn as nn
# Seed PyTorch's global RNG so the sampled batch below is reproducible.
torch.manual_seed(42)

# Toy batch: 10 rows x 3 columns of integers drawn from [1, 20), cast to float.
X = torch.randint(low=1, high=20, size=(10, 3)).float()

print("=" * 60, "Data antes de ser Normalizada: ", "=" * 60, sep="\n")
X

Data antes de ser Normalizada: 


tensor([[16., 14., 12.],
        [15.,  3., 15.],
        [16.,  7.,  6.],
        [14.,  3., 16.],
        [ 4.,  4., 18.],
        [ 8., 11.,  9.],
        [18., 18., 14.],
        [ 5., 17., 11.],
        [15.,  9., 11.],
        [10.,  7.,  7.]])

## 1. BatchNorm

In [2]:
# BatchNorm over the 3 feature columns. `.train()` returns the module itself,
# so the layer is built and put in training mode in one expression; in this
# mode the forward pass normalizes with the statistics of the current batch.
bn = nn.BatchNorm1d(num_features=3).train()
output_pytorch = bn(X)

print("=" * 60, "Forward Pass Train utilizando Pytorch", "=" * 60, sep="\n")
output_pytorch

Forward Pass Train utilizando Pytorch


tensor([[ 0.8261,  0.8912,  0.0270],
        [ 0.6142, -1.1946,  0.8378],
        [ 0.8261, -0.4361, -1.5946],
        [ 0.4024, -1.1946,  1.1081],
        [-1.7157, -1.0050,  1.6486],
        [-0.8684,  0.3224, -0.7838],
        [ 1.2497,  1.6498,  0.5676],
        [-1.5038,  1.4601, -0.2432],
        [ 0.6142, -0.0569, -0.2432],
        [-0.4448, -0.4361, -1.3243]], grad_fn=<NativeBatchNormBackward0>)

In [3]:
# Show the learnable affine parameters (scale and shift) of the BatchNorm layer.
print("=" * 60, "Parámetros Utilizados: ", "=" * 60, sep="\n")
(bn.weight.data, bn.bias.data)

Parámetros Utilizados: 


(tensor([1., 1., 1.]), tensor([0., 0., 0.]))

In [4]:
# Show the running mean / variance buffers the layer keeps for evaluation mode.
print("=" * 60, "Promedio y Varianza Acumuladas", "=" * 60, sep="\n")
(bn.running_mean, bn.running_var)

Promedio y Varianza Acumuladas


(tensor([1.2100, 0.9300, 1.1900]), tensor([3.3767, 3.9900, 2.4211]))

### Cálculo Manual

In [5]:
# Reproduce the training-mode BatchNorm forward pass by hand.
eps = 1e-5  # small constant added to the variance for numerical stability

# Per-feature statistics over the batch dimension (dim=0); the normalization
# itself uses the biased (population) variance.
batch_mean = X.mean(dim=0, keepdim=True)
batch_var_train = X.var(dim=0, unbiased=False, keepdim=True)
x_norm = (X - batch_mean) / (batch_var_train + eps).sqrt()

# Affine parameters written out explicitly: scale of ones, shift of zeros.
w = torch.ones(3, dtype=torch.int64)
b = torch.zeros(3, dtype=torch.int64)

print("=" * 60, "Forward Pass Train obtenido Manualmente", "=" * 60, sep="\n")
x_norm * w + b

Forward Pass Train obtenido Manualmente


tensor([[ 0.8261,  0.8912,  0.0270],
        [ 0.6142, -1.1946,  0.8378],
        [ 0.8261, -0.4361, -1.5946],
        [ 0.4024, -1.1946,  1.1081],
        [-1.7157, -1.0050,  1.6486],
        [-0.8684,  0.3224, -0.7838],
        [ 1.2497,  1.6498,  0.5676],
        [-1.5038,  1.4601, -0.2432],
        [ 0.6142, -0.0569, -0.2432],
        [-0.4448, -0.4361, -1.3243]])

In [6]:
# Display the per-feature biased (unbiased=False) batch variance computed in the manual cell above.
batch_var_train

tensor([[22.2900, 27.8100, 13.6900]])

In [7]:
print("=" * 60, "Test Time: ", "=" * 60, sep="\n")

# The running variance is updated with the unbiased (n - 1) batch variance.
batch_var_eval = X.var(dim=0, unbiased=True, keepdim=True)

# One exponential-moving-average update with momentum `alpha`, starting from
# a running mean of zeros and a running variance of ones.
alpha = 0.1
rm = alpha * batch_mean + (1 - alpha) * torch.zeros(3, dtype=torch.int64)
rv = alpha * batch_var_eval + (1 - alpha) * torch.ones(3, dtype=torch.int64)
print("Media: ", rm, "Varianza: ", rv, sep="\n")

print("Normalización en Modo Evaluación obtenido de manera manual...", "=" * 60, sep="\n")
# Evaluation mode normalizes with the running statistics instead of the batch ones.
x_normalized_eval = (X - rm) / (rv + eps).sqrt()
x_normalized_eval * bn.weight.data + bn.bias.data

Test Time: 
Media: 
tensor([[1.2100, 0.9300, 1.1900]])
Varianza: 
tensor([[3.3767, 3.9900, 2.4211]])
Normalización en Modo Evaluación obtenido de manera manual...


tensor([[ 8.0487,  6.5432,  6.9473],
        [ 7.5045,  1.0363,  8.8753],
        [ 8.0487,  3.0388,  3.0913],
        [ 6.9603,  1.0363,  9.5180],
        [ 1.5183,  1.5369, 10.8034],
        [ 3.6951,  5.0413,  5.0193],
        [ 9.1370,  8.5457,  8.2327],
        [ 2.0625,  8.0451,  6.3046],
        [ 7.5045,  4.0400,  6.3046],
        [ 4.7835,  3.0388,  3.7339]])

In [8]:
print("=" * 60, "Forward Pass en Modo Evaluación usando Pytorch...", "=" * 60, sep="\n")

# Switch the layer to evaluation mode before the forward pass so that it
# normalizes with its stored running statistics rather than the batch ones.
bn.eval()
bn(X)

Forward Pass en Modo Evaluación usando Pytorch...


tensor([[ 8.0487,  6.5432,  6.9473],
        [ 7.5045,  1.0363,  8.8753],
        [ 8.0487,  3.0388,  3.0913],
        [ 6.9603,  1.0363,  9.5180],
        [ 1.5183,  1.5369, 10.8034],
        [ 3.6951,  5.0413,  5.0193],
        [ 9.1370,  8.5457,  8.2327],
        [ 2.0625,  8.0451,  6.3046],
        [ 7.5045,  4.0400,  6.3046],
        [ 4.7835,  3.0388,  3.7339]], grad_fn=<NativeBatchNormBackward0>)

## 2. Layer Norm

In [9]:
# LayerNorm over the last dimension (the 3 features of each sample).
ln = nn.LayerNorm(normalized_shape=3)
output_pytorch = ln(X)

print("=" * 60, "Forward Pass obtenido utilizando Pytorch", "=" * 60, sep="\n")
output_pytorch

Forward Pass obtenido utilizando Pytorch


tensor([[ 1.2247,  0.0000, -1.2247],
        [ 0.7071, -1.4142,  0.7071],
        [ 1.4084, -0.5930, -0.8154],
        [ 0.5249, -1.3997,  0.8748],
        [-0.7071, -0.7071,  1.4142],
        [-1.0690,  1.3363, -0.2673],
        [ 0.7071,  0.7071, -1.4142],
        [-1.2247,  1.2247,  0.0000],
        [ 1.3363, -1.0690, -0.2673],
        [ 1.4142, -0.7071, -0.7071]], grad_fn=<NativeLayerNormBackward0>)

In [10]:
# Show the learnable affine parameters (scale and shift) of the LayerNorm layer.
print("=" * 60, "Parámetros Iniciales: ", "=" * 60, sep="\n")
(ln.weight.data, ln.bias.data)

Parámetros Iniciales: 


(tensor([1., 1., 1.]), tensor([0., 0., 0.]))

In [11]:
# Reproduce the LayerNorm forward pass by hand.
eps = 1e-5  # small constant added to the variance for numerical stability

# Statistics are taken per sample, across its features (dim=1), using the
# biased variance.
sample_mean = X.mean(dim=1, keepdim=True)
sample_var = X.var(dim=1, unbiased=False, keepdim=True)
x_normalized = (X - sample_mean) / (sample_var + eps).sqrt()

print("=" * 60, "Forward Pass Obtenido de manera Manual", "=" * 60, sep="\n")
x_normalized * ln.weight.data + ln.bias.data

Forward Pass Obtenido de manera Manual


tensor([[ 1.2247,  0.0000, -1.2247],
        [ 0.7071, -1.4142,  0.7071],
        [ 1.4084, -0.5930, -0.8154],
        [ 0.5249, -1.3997,  0.8748],
        [-0.7071, -0.7071,  1.4142],
        [-1.0690,  1.3363, -0.2673],
        [ 0.7071,  0.7071, -1.4142],
        [-1.2247,  1.2247,  0.0000],
        [ 1.3363, -1.0690, -0.2673],
        [ 1.4142, -0.7071, -0.7071]])

## 3. RMSNorm

In [12]:
# RMSNorm over the last dimension (the 3 features of each sample).
rms_layer = nn.RMSNorm(normalized_shape=3)

print("=" * 60, "Forward Pass Obtenido utilizando Pytorch", "=" * 60, sep="\n")
rms_layer(X)

Forward Pass Obtenido utilizando Pytorch


tensor([[1.1352, 0.9933, 0.8514],
        [1.2127, 0.2425, 1.2127],
        [1.5007, 0.6566, 0.5628],
        [1.1294, 0.2420, 1.2907],
        [0.3672, 0.3672, 1.6524],
        [0.8496, 1.1682, 0.9558],
        [1.0732, 1.0732, 0.8347],
        [0.4152, 1.4118, 0.9135],
        [1.2573, 0.7544, 0.9220],
        [1.2309, 0.8616, 0.8616]], grad_fn=<MulBackward0>)

In [13]:
print("=" * 60, "Forward Pass Obtenido de manera Manual", "=" * 60, sep="\n")

# Root-mean-square of each sample across its features (dim=1). No epsilon
# term is added in this manual version.
rms = X.pow(2).mean(dim=1, keepdim=True).sqrt()

# Scale by the layer's weight, then divide by the per-sample RMS.
(rms_layer.weight.data * X) / rms

Forward Pass Obtenido de manera Manual


tensor([[1.1352, 0.9933, 0.8514],
        [1.2127, 0.2425, 1.2127],
        [1.5007, 0.6566, 0.5628],
        [1.1294, 0.2420, 1.2907],
        [0.3672, 0.3672, 1.6524],
        [0.8496, 1.1682, 0.9558],
        [1.0732, 1.0732, 0.8347],
        [0.4152, 1.4118, 0.9135],
        [1.2573, 0.7544, 0.9220],
        [1.2309, 0.8616, 0.8616]])

## Dropout

In [14]:
# Re-seed so the dropout mask sampled below is reproducible.
torch.manual_seed(42)

p = 0.5  # probability of zeroing each element
# `.train()` returns the module itself; dropout is only active in training mode.
do = nn.Dropout(p=p).train()
output_pytorch = do(X)

print("=" * 60, "Output obtenido utilizando Pytorch", "=" * 60, sep="\n")
output_pytorch

Output obtenido utilizando Pytorch


tensor([[32., 28., 24.],
        [30.,  0., 30.],
        [ 0.,  0., 12.],
        [28.,  6., 32.],
        [ 0.,  0., 36.],
        [ 0., 22.,  0.],
        [ 0., 36., 28.],
        [ 0., 34., 22.],
        [30., 18.,  0.],
        [20., 14., 14.]])

In [15]:
print("=" * 60, "Se genera una máscara aleatoria de los Elementos a mantenerse...", "=" * 60, sep="\n")

# The mask is recovered from the PyTorch output: 1 where an element survived
# (non-zero), 0 where it was dropped. This relies on X containing no zeros.
mask = torch.where(output_pytorch.ne(0), 1, 0)
mask

Se genera una máscara aleatoria de los Elementos a mantenerse...


tensor([[1, 1, 1],
        [1, 0, 1],
        [0, 0, 1],
        [1, 1, 1],
        [0, 0, 1],
        [0, 1, 0],
        [0, 1, 1],
        [0, 1, 1],
        [1, 1, 0],
        [1, 1, 1]])

In [16]:
# Kept elements are rescaled by 1 / (1 - p).
scale = 1 / (1 - p)

print(
    "=" * 60,
    "A todos los elementos escogidos para mantenerse son escalados...",
    "=" * 60,
    "",
    "=" * 60,
    "Output Obtenido Manualmente",
    "=" * 60,
    sep="\n",
)

# Zero the dropped positions first, then apply the rescaling factor.
x_do = X.mul(mask).mul(scale)
x_do

A todos los elementos escogidos para mantenerse son escalados...

Output Obtenido Manualmente


tensor([[32., 28., 24.],
        [30.,  0., 30.],
        [ 0.,  0., 12.],
        [28.,  6., 32.],
        [ 0.,  0., 36.],
        [ 0., 22.,  0.],
        [ 0., 36., 28.],
        [ 0., 34., 22.],
        [30., 18.,  0.],
        [20., 14., 14.]])

In [17]:
# In evaluation mode dropout acts as the identity: the input passes through unchanged.
do = nn.Dropout(p=p).eval()
output_pytorch = do(X)
output_pytorch

tensor([[16., 14., 12.],
        [15.,  3., 15.],
        [16.,  7.,  6.],
        [14.,  3., 16.],
        [ 4.,  4., 18.],
        [ 8., 11.,  9.],
        [18., 18., 14.],
        [ 5., 17., 11.],
        [15.,  9., 11.],
        [10.,  7.,  7.]])

## Inicialización de Parámetros

In [18]:
# Linear layer with 5 inputs and 3 outputs; display its default-initialized parameters.
fc = nn.Linear(in_features=5, out_features=3)
(fc.weight.data, fc.bias.data)

(tensor([[ 0.0282, -0.3052,  0.1379, -0.1540,  0.1370],
         [-0.0932,  0.3709, -0.2651, -0.2667, -0.2667],
         [ 0.4022,  0.1490,  0.4303, -0.3691, -0.4436]]),
 tensor([-0.3499, -0.3008,  0.1811]))

In [19]:
# Overwrite every weight with 1 in place; the bias is left untouched.
torch.nn.init.ones_(fc.weight)
(fc.weight.data, fc.bias.data)

(tensor([[1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1.]]),
 tensor([-0.3499, -0.3008,  0.1811]))

In [20]:
# In-place initialization: bias filled with 0, weights filled with 1.
torch.nn.init.zeros_(fc.bias)
torch.nn.init.ones_(fc.weight)
(fc.weight.data, fc.bias.data)

(tensor([[1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1.]]),
 tensor([0., 0., 0.]))

In [21]:
# Same initialization applied again (weights to 1, bias to 0); the operation
# is idempotent, so the displayed parameters do not change.
torch.nn.init.zeros_(fc.bias)
torch.nn.init.ones_(fc.weight)
(fc.weight.data, fc.bias.data)

(tensor([[1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1.]]),
 tensor([0., 0., 0.]))

In [22]:
# Fill the parameters in place with constants: 0.5 for the bias, 0.3 for the weights.
torch.nn.init.constant_(fc.bias, val=0.5)
torch.nn.init.constant_(fc.weight, val=0.3)
(fc.weight.data, fc.bias.data)

(tensor([[0.3000, 0.3000, 0.3000, 0.3000, 0.3000],
         [0.3000, 0.3000, 0.3000, 0.3000, 0.3000],
         [0.3000, 0.3000, 0.3000, 0.3000, 0.3000]]),
 tensor([0.5000, 0.5000, 0.5000]))

In [23]:
# Resample every weight in place from a uniform distribution with bounds 1 and 100.
torch.nn.init.uniform_(fc.weight, a=1, b=100)
fc.weight.data

tensor([[68.2248, 91.6308, 24.9369, 16.7553, 76.7636],
        [30.4919, 80.5427, 38.7536, 78.8163, 12.0401],
        [25.5198, 65.5914, 60.9647, 37.8795, 80.0054]])

In [24]:
# Xavier (Glorot) uniform initialization, scaled by the gain recommended for
# ReLU. The gain is computed once and passed on explicitly.
relu_gain = nn.init.calculate_gain("relu")

# Initializing through `.data` keeps the displayed result a plain tensor;
# the call returns the tensor it filled, which becomes the cell's output.
nn.init.xavier_uniform_(fc.weight.data, gain=relu_gain)

tensor([[ 0.8326, -0.8882, -0.6539,  1.1215, -0.4133],
        [-0.4342, -1.1851, -0.7014,  0.3059, -0.1617],
        [-0.8890,  0.0287, -0.8366, -1.0391, -0.6744]])

In [25]:
# Kaiming (He) uniform initialization for a ReLU network, scaled by fan-in.
# The call returns the tensor it filled, which becomes the cell's output.
nn.init.kaiming_uniform_(
    fc.weight.data,
    mode="fan_in",
    nonlinearity="relu",
)

tensor([[-0.9587, -0.6975,  1.0950,  0.2069,  0.3376],
        [-1.0217, -0.7195, -0.3646,  0.1713, -0.9639],
        [-0.4720, -0.6558,  0.0030, -0.4076, -0.0759]])

## Checkpoint

In [26]:
## Checkpoint

# New toy dataset (replaces the earlier X): 10 samples x 3 integer-valued
# features in [1, 20), and one binary label (0. or 1.) per sample.
# X must be sampled before y so the RNG is consumed in the same order.
X = torch.randint(low=1, high=20, size=(10, 3)).float()
y = torch.randint(low=0, high=2, size=(10,)).float()

class Net(nn.Module):
    """Minimal model: a single linear layer mapping 3 features to 1 output."""

    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(in_features=3, out_features=1)

    def forward(self, x):
        """Apply the linear layer to ``x`` and return its raw output."""
        out = self.fc(x)
        return out



In [27]:
# Seed before building the model so its initial weights are reproducible.
torch.manual_seed(42)

epochs = 20
epoch_loss = []  # one scalar loss value per epoch

model = Net()
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

# Full-batch training: one optimizer step per epoch.
for e in range(epochs):
    model.train()
    optimizer.zero_grad()  # clear the gradients left by the previous step
    logits = model(X)
    # Labels are reshaped from (N,) to (N, 1) to match the logits' shape.
    loss = criterion(logits, y.unsqueeze(-1))
    loss.backward()
    optimizer.step()
    epoch_loss.append(loss.item())
    print(f"Epoch: {e+1}. Loss: {epoch_loss[-1]}")

model.fc.weight.data

Epoch: 1. Loss: 5.684591770172119
Epoch: 2. Loss: 4.632599830627441
Epoch: 3. Loss: 3.6506359577178955
Epoch: 4. Loss: 2.812757968902588
Epoch: 5. Loss: 2.171171188354492
Epoch: 6. Loss: 1.7123682498931885
Epoch: 7. Loss: 1.4345756769180298
Epoch: 8. Loss: 1.284837007522583
Epoch: 9. Loss: 1.182861089706421
Epoch: 10. Loss: 1.0974228382110596
Epoch: 11. Loss: 1.0225789546966553
Epoch: 12. Loss: 0.9574964642524719
Epoch: 13. Loss: 0.9016749262809753
Epoch: 14. Loss: 0.8543142080307007
Epoch: 15. Loss: 0.8144165277481079
Epoch: 16. Loss: 0.7809440493583679
Epoch: 17. Loss: 0.7529222965240479
Epoch: 18. Loss: 0.7294866442680359
Epoch: 19. Loss: 0.7098948359489441
Epoch: 20. Loss: 0.6935199499130249


tensor([[ 0.1235, -0.1139, -0.0554]])

In [28]:
# Bundle everything needed to resume training into one dictionary.
checkpoint = {
    'epoch': epochs,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    # Store the loss as a plain Python float: the `loss` tensor is still
    # attached to the autograd graph of the last forward pass, and only its
    # scalar value is needed when resuming.
    'loss': loss.item(),
}

# Serialize the checkpoint to disk (written to the current working directory).
torch.save(checkpoint, 'checkpoint.pth')

In [29]:
# `weights_only=True` restricts unpickling to tensors and plain containers,
# so loading the file cannot execute arbitrary pickled code. The checkpoint
# holds only state dicts, an int and the loss value, all of which are allowed.
checkpoint = torch.load('checkpoint.pth', weights_only=True)

# Restore parameters and optimizer state into the existing objects.
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
epoch = checkpoint['epoch']
loss = checkpoint['loss']

print(f"✅ Loaded checkpoint from epoch {epoch}, loss={loss}")

✅ Loaded checkpoint from epoch 20, loss=0.6935199499130249


In [30]:
# Display the linear layer's weights after restoring the checkpoint, for comparison with the values shown after training.
model.fc.weight.data

tensor([[ 0.1235, -0.1139, -0.0554]])

## Gradient Accumulation

In [31]:
## Training with Accumulation
# Seed before building the model so its initial weights are reproducible.
torch.manual_seed(42)
epochs = 20
accumulation_steps = 4  # number of backward passes accumulated per optimizer step
model = Net()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
criterion = nn.BCEWithLogitsLoss()
model.zero_grad()  # start from clean gradients
for e in range(epochs):
    logits = model(X)
    loss = criterion(logits, y.unsqueeze(-1))
    print(f"Epoch: {e+1}. Loss: {loss.item()}")
    # Scale the loss so the accumulated gradient is the average over the window.
    loss = loss / accumulation_steps
    # PyTorch adds to existing .grad values until they are explicitly zeroed.
    loss.backward()
    window_complete = (e + 1) % accumulation_steps == 0
    last_iteration = (e + 1) == epochs
    # Step at the end of every window, and also on the final iteration so a
    # trailing partial window is applied instead of being silently discarded
    # when `epochs` is not a multiple of `accumulation_steps`. A partial
    # window keeps the 1/accumulation_steps scaling, so its step is smaller.
    if window_complete or last_iteration:
        optimizer.step()   # update the weights only once per window
        model.zero_grad()  # reset the accumulated gradients


Epoch: 1. Loss: 5.684591770172119
Epoch: 2. Loss: 5.684591770172119
Epoch: 3. Loss: 5.684591770172119
Epoch: 4. Loss: 5.684591770172119
Epoch: 5. Loss: 4.632599830627441
Epoch: 6. Loss: 4.632599830627441
Epoch: 7. Loss: 4.632599830627441
Epoch: 8. Loss: 4.632599830627441
Epoch: 9. Loss: 3.6506359577178955
Epoch: 10. Loss: 3.6506359577178955
Epoch: 11. Loss: 3.6506359577178955
Epoch: 12. Loss: 3.6506359577178955
Epoch: 13. Loss: 2.812757968902588
Epoch: 14. Loss: 2.812757968902588
Epoch: 15. Loss: 2.812757968902588
Epoch: 16. Loss: 2.812757968902588
Epoch: 17. Loss: 2.171171188354492
Epoch: 18. Loss: 2.171171188354492
Epoch: 19. Loss: 2.171171188354492
Epoch: 20. Loss: 2.171171188354492


In [32]:
# Display the linear layer's weights after the gradient-accumulation run above.
model.fc.weight.data

tensor([[ 0.2156,  0.1241, -0.2904]])

In [33]:
## Training with Accumulation
# Same accumulation loop with 80 iterations, i.e. 80 / 4 = 20 optimizer steps.
# Seed before building the model so its initial weights are reproducible.
torch.manual_seed(42)
epochs = 80
accumulation_steps = 4  # number of backward passes accumulated per optimizer step
model = Net()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
criterion = nn.BCEWithLogitsLoss()
model.zero_grad()  # start from clean gradients
for e in range(epochs):
    logits = model(X)
    loss = criterion(logits, y.unsqueeze(-1))
    print(f"Epoch: {e+1}. Loss: {loss.item()}")
    # Scale the loss so the accumulated gradient is the average over the window.
    loss = loss / accumulation_steps
    # PyTorch adds to existing .grad values until they are explicitly zeroed.
    loss.backward()
    window_complete = (e + 1) % accumulation_steps == 0
    last_iteration = (e + 1) == epochs
    # Step at the end of every window, and also on the final iteration so a
    # trailing partial window is applied instead of being silently discarded
    # when `epochs` is not a multiple of `accumulation_steps`. A partial
    # window keeps the 1/accumulation_steps scaling, so its step is smaller.
    if window_complete or last_iteration:
        optimizer.step()   # update the weights only once per window
        model.zero_grad()  # reset the accumulated gradients

model.fc.weight.data

Epoch: 1. Loss: 5.684591770172119
Epoch: 2. Loss: 5.684591770172119
Epoch: 3. Loss: 5.684591770172119
Epoch: 4. Loss: 5.684591770172119
Epoch: 5. Loss: 4.632599830627441
Epoch: 6. Loss: 4.632599830627441
Epoch: 7. Loss: 4.632599830627441
Epoch: 8. Loss: 4.632599830627441
Epoch: 9. Loss: 3.6506359577178955
Epoch: 10. Loss: 3.6506359577178955
Epoch: 11. Loss: 3.6506359577178955
Epoch: 12. Loss: 3.6506359577178955
Epoch: 13. Loss: 2.812757968902588
Epoch: 14. Loss: 2.812757968902588
Epoch: 15. Loss: 2.812757968902588
Epoch: 16. Loss: 2.812757968902588
Epoch: 17. Loss: 2.171171188354492
Epoch: 18. Loss: 2.171171188354492
Epoch: 19. Loss: 2.171171188354492
Epoch: 20. Loss: 2.171171188354492
Epoch: 21. Loss: 1.7123682498931885
Epoch: 22. Loss: 1.7123682498931885
Epoch: 23. Loss: 1.7123682498931885
Epoch: 24. Loss: 1.7123682498931885
Epoch: 25. Loss: 1.4345756769180298
Epoch: 26. Loss: 1.4345756769180298
Epoch: 27. Loss: 1.4345756769180298
Epoch: 28. Loss: 1.4345756769180298
Epoch: 29. Loss: 

tensor([[ 0.1235, -0.1139, -0.0554]])