In [1]:
import torchvision
from torchvision.models import resnet152, ResNet152_Weights
from torchvision.models import vit_b_16, ViT_B_16_Weights
import torch
import torch.nn as nn
from torchinfo import summary
import timm
from tqdm import tqdm
import torchvision.transforms as transforms

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def train_model(model, dataloader, criterion, optimizer, device, num_epochs=2):
    """
    Train ``model`` on ``dataloader`` and print the mean batch loss of every epoch.

    The model is switched to training mode at the start of each epoch, so the
    function also works on a model that was previously put in eval mode.
    Loaders that do not implement ``__len__`` are supported (the progress bar
    simply has no total), and an epoch that yields no batches is reported
    instead of raising ``ZeroDivisionError``.

    Args:
        model (torch.nn.Module): The model to train (updated in place).
        dataloader (Iterable): Yields ``(inputs, labels)`` batches, e.g. a
            ``torch.utils.data.DataLoader``.
        criterion (Callable): Loss function called as ``criterion(outputs, labels)``.
        optimizer (torch.optim.Optimizer): Optimizer that updates the model parameters.
        device (torch.device): Device every batch is moved to before the forward pass.
        num_epochs (int): Number of passes over ``dataloader``. Default is 2.

    Returns:
        None
    """
    # A sized loader lets tqdm show a total; an unsized one raises TypeError on len().
    try:
        total_batches = len(dataloader)
    except TypeError:
        total_batches = None

    for epoch in range(num_epochs):
        # Re-assert training mode every epoch in case the caller evaluated in between.
        model.train()
        running_loss = 0.0
        batches_seen = 0

        # Loop through dataloader with tqdm for batch progress
        batch_iterator = tqdm(
            enumerate(dataloader, 0),
            desc=f"Epoch {epoch+1}",
            leave=False,
            total=total_batches,
        )

        for i, data in batch_iterator:
            inputs, labels = data
            inputs, labels = inputs.to(device), labels.to(device)

            # Zero the parameter gradients
            optimizer.zero_grad()

            # Forward + backward + optimize
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            batches_seen = i + 1

            # Show the running mean loss next to the progress bar
            batch_iterator.set_postfix(loss=running_loss / batches_seen)

        if batches_seen == 0:
            # Nothing was yielded, so there is no mean loss to report.
            print(f'Epoch {epoch + 1}/{num_epochs} - no batches to train on')
            continue

        epoch_loss = running_loss / batches_seen
        print(f'Epoch {epoch + 1}/{num_epochs} - Loss: {epoch_loss:.4f}')

    print('Finished Training')

## Transfer Learning

### resnet152

In [19]:
# Pretrained ResNet-152 together with the preprocessing pipeline shipped with the same weights.
weights = ResNet152_Weights.DEFAULT
preprocess = weights.transforms()
model = resnet152(weights=weights)

In [20]:
# Display the preprocessing pipeline that comes with the pretrained weights.
preprocess

ImageClassification(
    crop_size=[224]
    resize_size=[232]
    mean=[0.485, 0.456, 0.406]
    std=[0.229, 0.224, 0.225]
    interpolation=InterpolationMode.BILINEAR
)

In [21]:
# Print the architecture as loaded, before any layer is replaced.
print(model)

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

In [22]:
# New 10-class head; its input width is read from the layer it replaces instead of being hard-coded.
model.fc = nn.Linear(in_features=model.fc.in_features, out_features=10, bias=True)

# Single-channel stem for grayscale input. Rather than starting from random weights,
# sum the current filters over their input-channel axis so the pretrained stem is kept.
# Re-running this cell is harmless: summing over a single channel is the identity.
gray_conv1 = nn.Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
with torch.no_grad():
    gray_conv1.weight.copy_(model.conv1.weight.sum(dim=1, keepdim=True))
model.conv1 = gray_conv1

In [23]:
# Print the architecture again to check the replaced conv1 and fc layers.
print(model)

ResNet(
  (conv1): Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

In [25]:
# torchinfo summary: parameter count per layer.
summary(model)

Layer (type:depth-idx)                   Param #
ResNet                                   --
├─Conv2d: 1-1                            3,136
├─BatchNorm2d: 1-2                       128
├─ReLU: 1-3                              --
├─MaxPool2d: 1-4                         --
├─Sequential: 1-5                        --
│    └─Bottleneck: 2-1                   --
│    │    └─Conv2d: 3-1                  4,096
│    │    └─BatchNorm2d: 3-2             128
│    │    └─Conv2d: 3-3                  36,864
│    │    └─BatchNorm2d: 3-4             128
│    │    └─Conv2d: 3-5                  16,384
│    │    └─BatchNorm2d: 3-6             512
│    │    └─ReLU: 3-7                    --
│    │    └─Sequential: 3-8              16,896
│    └─Bottleneck: 2-2                   --
│    │    └─Conv2d: 3-9                  16,384
│    │    └─BatchNorm2d: 3-10            128
│    │    └─Conv2d: 3-11                 36,864
│    │    └─BatchNorm2d: 3-12            128
│    │    └─Conv2d: 3-13               

In [27]:
# Preprocessing for the single-channel ResNet: grayscale conversion, then tensor conversion.
to_single_channel = transforms.Grayscale(num_output_channels=1)  # convert to grayscale (1 channel)
to_tensor = transforms.ToTensor()  # convert to tensor
preprocess = transforms.Compose([to_single_channel, to_tensor])

In [28]:
# CIFAR-10 training split, transformed by `preprocess`, served in shuffled batches of 16.
train_dataset = torchvision.datasets.CIFAR10(
    root='cifar10/',
    train=True,
    download=True,
    transform=preprocess,
)
dataloader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    batch_size=16,
    shuffle=True,
)

Files already downloaded and verified


In [29]:
# Pull a single batch to check the tensor shape the model will receive.
data_iter = iter(dataloader)
first_batch = next(data_iter)
images, labels = first_batch
print(images.shape)

torch.Size([16, 1, 32, 32])


In [30]:
# Use the first CUDA GPU when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

# Move the model before building the optimizer over its parameters.
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [31]:
# Train the adapted ResNet for two epochs.
train_model(
    model=model,
    dataloader=dataloader,
    criterion=criterion,
    optimizer=optimizer,
    device=device,
    num_epochs=2,
)

                                                                      

KeyboardInterrupt: 

### vit_b_16

In [2]:
# Pretrained ViT-B/16 together with the preprocessing pipeline shipped with the same weights.
weights = ViT_B_16_Weights.DEFAULT
preprocess = weights.transforms()
model = vit_b_16(weights=weights)

In [3]:
# Display the preprocessing pipeline that comes with the pretrained ViT weights.
preprocess

ImageClassification(
    crop_size=[224]
    resize_size=[256]
    mean=[0.485, 0.456, 0.406]
    std=[0.229, 0.224, 0.225]
    interpolation=InterpolationMode.BILINEAR
)

In [4]:
# Print the architecture as loaded, before the head is replaced.
print(model)

VisionTransformer(
  (conv_proj): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
  (encoder): Encoder(
    (dropout): Dropout(p=0.0, inplace=False)
    (layers): Sequential(
      (encoder_layer_0): EncoderBlock(
        (ln_1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (self_attention): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (dropout): Dropout(p=0.0, inplace=False)
        (ln_2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (mlp): MLPBlock(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU(approximate='none')
          (2): Dropout(p=0.0, inplace=False)
          (3): Linear(in_features=3072, out_features=768, bias=True)
          (4): Dropout(p=0.0, inplace=False)
        )
      )
      (encoder_layer_1): EncoderBlock(
        (ln_1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (self_a

In [5]:
# Replace the classifier: new 10-class linear head, sized from the layer it replaces.
previous_head = model.heads[0]
model.heads[0] = nn.Linear(
    in_features=previous_head.in_features,
    out_features=10,
    bias=True,
)

In [6]:
# Print the architecture again after swapping the classification head.
print(model)

VisionTransformer(
  (conv_proj): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
  (encoder): Encoder(
    (dropout): Dropout(p=0.0, inplace=False)
    (layers): Sequential(
      (encoder_layer_0): EncoderBlock(
        (ln_1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (self_attention): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (dropout): Dropout(p=0.0, inplace=False)
        (ln_2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (mlp): MLPBlock(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU(approximate='none')
          (2): Dropout(p=0.0, inplace=False)
          (3): Linear(in_features=3072, out_features=768, bias=True)
          (4): Dropout(p=0.0, inplace=False)
        )
      )
      (encoder_layer_1): EncoderBlock(
        (ln_1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (self_a

In [7]:
# torchinfo summary: parameter count per layer.
summary(model)

Layer (type:depth-idx)                                            Param #
VisionTransformer                                                 768
├─Conv2d: 1-1                                                     590,592
├─Encoder: 1-2                                                    151,296
│    └─Dropout: 2-1                                               --
│    └─Sequential: 2-2                                            --
│    │    └─EncoderBlock: 3-1                                     7,087,872
│    │    └─EncoderBlock: 3-2                                     7,087,872
│    │    └─EncoderBlock: 3-3                                     7,087,872
│    │    └─EncoderBlock: 3-4                                     7,087,872
│    │    └─EncoderBlock: 3-5                                     7,087,872
│    │    └─EncoderBlock: 3-6                                     7,087,872
│    │    └─EncoderBlock: 3-7                                     7,087,872
│    │    └─EncoderBlock: 3-8         

In [9]:
# CIFAR-10 training split, passed through the ViT weights' own preprocessing, in shuffled batches of 16.
train_dataset = torchvision.datasets.CIFAR10(
    root='cifar10/',
    train=True,
    download=True,
    transform=preprocess,
)
dataloader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    batch_size=16,
    shuffle=True,
)

Files already downloaded and verified


In [10]:
# Use the first CUDA GPU when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

# Move the model before building the optimizer over its parameters.
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [11]:
# Train the ViT with its new head for two epochs.
train_model(
    model=model,
    dataloader=dataloader,
    criterion=criterion,
    optimizer=optimizer,
    device=device,
    num_epochs=2,
)

                                                                      

KeyboardInterrupt: 

### xception

In [3]:
# Pretrained Xception from timm, created directly with a 10-class head.
model = timm.create_model(
    'xception',
    pretrained=True,
    num_classes=10,
)

  model = create_fn(


In [4]:
# Print the Xception architecture.
print(model)

Xception(
  (conv1): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), bias=False)
  (bn1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (act1): ReLU(inplace=True)
  (conv2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), bias=False)
  (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (act2): ReLU(inplace=True)
  (block1): Block(
    (skip): Conv2d(64, 128, kernel_size=(1, 1), stride=(2, 2), bias=False)
    (skipbn): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (rep): Sequential(
      (0): SeparableConv2d(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=64, bias=False)
        (pointwise): Conv2d(64, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
      )
      (1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU(inplace=True)
      (3): SeparableConv2d(
        (conv1): Conv

In [5]:
# torchinfo summary: parameter count per layer.
summary(model)

Layer (type:depth-idx)                   Param #
Xception                                 --
├─Conv2d: 1-1                            864
├─BatchNorm2d: 1-2                       64
├─ReLU: 1-3                              --
├─Conv2d: 1-4                            18,432
├─BatchNorm2d: 1-5                       128
├─ReLU: 1-6                              --
├─Block: 1-7                             --
│    └─Conv2d: 2-1                       8,192
│    └─BatchNorm2d: 2-2                  256
│    └─Sequential: 2-3                   --
│    │    └─SeparableConv2d: 3-1         8,768
│    │    └─BatchNorm2d: 3-2             256
│    │    └─ReLU: 3-3                    --
│    │    └─SeparableConv2d: 3-4         17,536
│    │    └─BatchNorm2d: 3-5             256
│    │    └─MaxPool2d: 3-6               --
├─Block: 1-8                             --
│    └─Conv2d: 2-4                       32,768
│    └─BatchNorm2d: 2-5                  512
│    └─Sequential: 2-6                   --
│  

In [7]:
# CIFAR-10 training split with tensor conversion only, served in shuffled batches of 32.
train_dataset = torchvision.datasets.CIFAR10(
    root='cifar10/',
    train=True,
    download=True,
    transform=torchvision.transforms.ToTensor(),
)
dataloader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    batch_size=32,
    shuffle=True,
)

Files already downloaded and verified


In [8]:
# Use the first CUDA GPU when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

# Move the model before building the optimizer over its parameters.
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [9]:
# Train Xception for two epochs.
train_model(
    model=model,
    dataloader=dataloader,
    criterion=criterion,
    optimizer=optimizer,
    device=device,
    num_epochs=2,
)

                                                                       

Epoch 1/2 - Loss: 1.3656


                                                                        

Epoch 2/2 - Loss: 0.7531
Finished Training




## Fine Tuning

### vit_b_16

In [32]:
# Fresh pretrained ViT-B/16 for the fine-tuning experiment, plus its bundled preprocessing.
weights = ViT_B_16_Weights.DEFAULT
preprocess = weights.transforms()
model = vit_b_16(weights=weights)

In [33]:
# Print the architecture as loaded, before freezing or replacing anything.
print(model)

VisionTransformer(
  (conv_proj): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
  (encoder): Encoder(
    (dropout): Dropout(p=0.0, inplace=False)
    (layers): Sequential(
      (encoder_layer_0): EncoderBlock(
        (ln_1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (self_attention): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (dropout): Dropout(p=0.0, inplace=False)
        (ln_2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (mlp): MLPBlock(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU(approximate='none')
          (2): Dropout(p=0.0, inplace=False)
          (3): Linear(in_features=3072, out_features=768, bias=True)
          (4): Dropout(p=0.0, inplace=False)
        )
      )
      (encoder_layer_1): EncoderBlock(
        (ln_1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (self_a

In [34]:
# Freeze every parameter of the pretrained network.
# The trailing semicolon keeps the notebook from echoing the returned module.
model.requires_grad_(False);

In [35]:
# Replace the classifier: new 10-class head, sized from the layer it replaces.
# A freshly created layer is trainable even though the rest of the model was frozen.
model.heads[0] = nn.Linear(in_features=model.heads[0].in_features, out_features=10, bias=True)

# Single-channel patch projection for grayscale input. Instead of random weights,
# sum the current kernels over their input-channel axis and keep the bias, so the
# frozen encoder still receives the patch embeddings it was trained on.
# Re-running this cell is harmless: summing over a single channel is the identity.
gray_conv_proj = nn.Conv2d(1, model.conv_proj.out_channels, kernel_size=(16, 16), stride=(16, 16))
with torch.no_grad():
    gray_conv_proj.weight.copy_(model.conv_proj.weight.sum(dim=1, keepdim=True))
    gray_conv_proj.bias.copy_(model.conv_proj.bias)
model.conv_proj = gray_conv_proj

In [36]:
# Print the architecture again to check the replaced conv_proj and head.
print(model)

VisionTransformer(
  (conv_proj): Conv2d(1, 768, kernel_size=(16, 16), stride=(16, 16))
  (encoder): Encoder(
    (dropout): Dropout(p=0.0, inplace=False)
    (layers): Sequential(
      (encoder_layer_0): EncoderBlock(
        (ln_1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (self_attention): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (dropout): Dropout(p=0.0, inplace=False)
        (ln_2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (mlp): MLPBlock(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU(approximate='none')
          (2): Dropout(p=0.0, inplace=False)
          (3): Linear(in_features=3072, out_features=768, bias=True)
          (4): Dropout(p=0.0, inplace=False)
        )
      )
      (encoder_layer_1): EncoderBlock(
        (ln_1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (self_a

In [37]:
# torchinfo summary: parameter count per layer (the frozen encoder blocks are printed in parentheses below).
summary(model)

Layer (type:depth-idx)                                            Param #
VisionTransformer                                                 768
├─Conv2d: 1-1                                                     197,376
├─Encoder: 1-2                                                    151,296
│    └─Dropout: 2-1                                               --
│    └─Sequential: 2-2                                            --
│    │    └─EncoderBlock: 3-1                                     (7,087,872)
│    │    └─EncoderBlock: 3-2                                     (7,087,872)
│    │    └─EncoderBlock: 3-3                                     (7,087,872)
│    │    └─EncoderBlock: 3-4                                     (7,087,872)
│    │    └─EncoderBlock: 3-5                                     (7,087,872)
│    │    └─EncoderBlock: 3-6                                     (7,087,872)
│    │    └─EncoderBlock: 3-7                                     (7,087,872)
│    │    └─EncoderBlock

In [42]:
# Preprocessing for the single-channel ViT: grayscale, resize, then tensor conversion.
to_single_channel = transforms.Grayscale(num_output_channels=1)  # convert to grayscale (1 channel)
to_model_resolution = transforms.Resize((224, 224))  # resize to 224x224 pixels
to_tensor = transforms.ToTensor()  # convert to tensor
preprocess = transforms.Compose([to_single_channel, to_model_resolution, to_tensor])

In [43]:
# CIFAR-10 training split, transformed by `preprocess`, served in shuffled batches of 16.
train_dataset = torchvision.datasets.CIFAR10(
    root='cifar10/',
    train=True,
    download=True,
    transform=preprocess,
)
dataloader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    batch_size=16,
    shuffle=True,
)

Files already downloaded and verified


In [44]:
# Use the first CUDA GPU when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

# Move the model before building the optimizer over its parameters.
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [45]:
# Train the partially frozen ViT for two epochs.
train_model(
    model=model,
    dataloader=dataloader,
    criterion=criterion,
    optimizer=optimizer,
    device=device,
    num_epochs=2,
)

                                                                       

KeyboardInterrupt: 

In [2]:
def _adapt_conv_proj(conv, n_channels):
    """Return a patch projection that accepts ``n_channels`` inputs, initialised from ``conv``.

    When ``conv`` already takes ``n_channels`` inputs it is returned unchanged, so its
    pretrained weights are kept. Otherwise a new ``nn.Conv2d`` with the same geometry is
    created and its kernels are derived from the existing ones: summed over the input
    channels for a single-channel input, or tiled, truncated and rescaled for any other
    count. The bias, when present, is copied as is. Assumes ``conv.groups == 1``.
    """
    if n_channels < 1:
        raise ValueError(f"n_channels must be a positive integer, got {n_channels!r}")
    if conv.in_channels == n_channels:
        return conv

    adapted = nn.Conv2d(
        n_channels,
        conv.out_channels,
        kernel_size=conv.kernel_size,
        stride=conv.stride,
        padding=conv.padding,
        dilation=conv.dilation,
        bias=conv.bias is not None,
    )
    with torch.no_grad():
        if n_channels == 1:
            new_weight = conv.weight.sum(dim=1, keepdim=True)
        else:
            # Ceil division without importing math: tile enough copies, then cut to size.
            repeats = -(-n_channels // conv.in_channels)
            new_weight = conv.weight.repeat(1, repeats, 1, 1)[:, :n_channels]
            # Rescale so the summed response keeps roughly the original magnitude.
            new_weight = new_weight * (conv.in_channels / n_channels)
        adapted.weight.copy_(new_weight)
        if conv.bias is not None:
            adapted.bias.copy_(conv.bias)
    return adapted


def vit(n_channels: int, num_classes: int, fine_tune: str = 'classifier'):
    """Build a pretrained ViT-B/16 with a new classification head and a chosen set of trainable layers.

    Args:
        n_channels: Number of input image channels. Only used by the
            ``'classifier+conv1'`` and ``'full'`` modes, which adapt ``conv_proj``;
            in ``'classifier'`` mode the pretrained projection is left untouched.
        num_classes: Output size of the new linear head.
        fine_tune: Which parameters stay trainable:
            ``'classifier'``        - only the head;
            ``'classifier+conv1'``  - the head and the patch projection ``conv_proj``;
            ``'full'``              - every parameter.

    Returns:
        The configured model.

    Raises:
        ValueError: If ``fine_tune`` is not one of the three modes above, or if
            ``n_channels`` is not positive in a mode that adapts ``conv_proj``.
    """
    valid_modes = ('classifier', 'classifier+conv1', 'full')
    # Fail before loading weights; an unknown mode used to fall through silently
    # and return a fully trainable model.
    if fine_tune not in valid_modes:
        raise ValueError(f"fine_tune must be one of {valid_modes}, got {fine_tune!r}")

    weights = ViT_B_16_Weights.DEFAULT
    model = vit_b_16(weights=weights)

    # Adapt the patch projection only in the modes that also train it.
    if fine_tune != 'classifier':
        model.conv_proj = _adapt_conv_proj(model.conv_proj, n_channels)

    # Replace the classifier, sizing it from the layer it replaces.
    model.heads[0] = nn.Linear(
        in_features=model.heads[0].in_features,
        out_features=num_classes,
        bias=True,
    )

    # Decide trainability by parameter name, once the final layers are in place.
    if fine_tune == 'classifier':
        trainable_prefixes = ('heads.',)
    elif fine_tune == 'classifier+conv1':
        trainable_prefixes = ('conv_proj.', 'heads.')
    else:
        trainable_prefixes = None  # 'full': everything trains

    for name, param in model.named_parameters():
        param.requires_grad = trainable_prefixes is None or name.startswith(trainable_prefixes)

    return model

In [11]:
# Build a single-channel ViT where only the patch projection and the head are trainable.
# NOTE(review): num_classes=1 gives a one-output head - confirm this is intended.
model = vit(
    n_channels=1,
    num_classes=1,
    fine_tune='classifier+conv1',
)

In [12]:
# Print the architecture returned by vit().
print(model)

VisionTransformer(
  (conv_proj): Conv2d(1, 768, kernel_size=(16, 16), stride=(16, 16))
  (encoder): Encoder(
    (dropout): Dropout(p=0.0, inplace=False)
    (layers): Sequential(
      (encoder_layer_0): EncoderBlock(
        (ln_1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (self_attention): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (dropout): Dropout(p=0.0, inplace=False)
        (ln_2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (mlp): MLPBlock(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU(approximate='none')
          (2): Dropout(p=0.0, inplace=False)
          (3): Linear(in_features=3072, out_features=768, bias=True)
          (4): Dropout(p=0.0, inplace=False)
        )
      )
      (encoder_layer_1): EncoderBlock(
        (ln_1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (self_a

In [13]:
# torchinfo summary: parameter count per layer (the frozen encoder blocks are printed in parentheses below).
summary(model)

Layer (type:depth-idx)                                            Param #
VisionTransformer                                                 768
├─Conv2d: 1-1                                                     197,376
├─Encoder: 1-2                                                    151,296
│    └─Dropout: 2-1                                               --
│    └─Sequential: 2-2                                            --
│    │    └─EncoderBlock: 3-1                                     (7,087,872)
│    │    └─EncoderBlock: 3-2                                     (7,087,872)
│    │    └─EncoderBlock: 3-3                                     (7,087,872)
│    │    └─EncoderBlock: 3-4                                     (7,087,872)
│    │    └─EncoderBlock: 3-5                                     (7,087,872)
│    │    └─EncoderBlock: 3-6                                     (7,087,872)
│    │    └─EncoderBlock: 3-7                                     (7,087,872)
│    │    └─EncoderBlock