In [1]:
import torch
import json
import os

def save_model(model, optimizer, path: str, epoch: int):

    # Salvar o estado do modelo e do otimizador junto com a época
    state = {
        "epoch": epoch,
        "model_state": model.state_dict(),
        "optimizer_state": optimizer.state_dict()
    }
    torch.save(state, path)
    print(f"Modelo e estado salvos em: {path}")

def load_model(model, optimizer, path: str):

    if not os.path.exists(path):
        raise FileNotFoundError(f"O arquivo {path} não foi encontrado.")
    
    # Carregar o estado salvo
    checkpoint = torch.load(path)
    model.load_state_dict(checkpoint["model_state"])
    optimizer.load_state_dict(checkpoint["optimizer_state"])
    epoch = checkpoint["epoch"]

    print(f"Modelo carregado de: {path}")
    print(f"Época carregada: {epoch}")
    
    return model, optimizer, epoch


In [2]:
""" NoisyNN in PyTorch

'NoisyNN: Exploring the Influence of Information Entropy Change in Learning Systems'
- https://arxiv.org/pdf/2309.10625v2.pdf

Note that it's not an official implementation
"""
import torch
from timm.models.vision_transformer import VisionTransformer

def quality_matrix(k, alpha=0.3):
    """r
    Quality matrix Q. Described in the eq (17) so that eps = QX, where X is the input. 
    Alpha is 0.3, as mentioned in Appendix D.
    """
    identity = torch.diag(torch.ones(k))
    shift_identity = torch.zeros(k, k)
    for i in range(k):
        shift_identity[(i+1)%k, i] = 1
    opt = -alpha * identity + alpha * shift_identity
    return opt

def optimal_quality_matrix(k):
    """r
    Optimal Quality matrix Q. Described in the eq (19) so that eps = QX, where X is the input. 
    Suppose 1_(kxk) is torch.ones
    """
    return torch.diag(torch.ones(k)) * -k/(k+1) + torch.ones(k, k) / (k+1)

class NoisyViT(VisionTransformer):
    """r
    Args:
        optimal: Determine the linear transform noise is produced by the quality matrix or the optimal quality matrix.
        res: Inference resolution. Ensure the aspect ratio = 1
    
    """
    def __init__(self, optimal: bool, res: int, **kwargs):
        self.stage3_res = res // 16
        if optimal:
            linear_transform_noise = optimal_quality_matrix(self.stage3_res)
        else:
            linear_transform_noise = quality_matrix(self.stage3_res)
        super().__init__(**kwargs)
        self.linear_transform_noise = torch.nn.Parameter(linear_transform_noise, requires_grad=False)

    def forward_features(self, x: torch.Tensor) -> torch.Tensor:
        if self.grad_checkpointing and not torch.jit.is_scripting():
            return super().forward_features(x)
        
        x = self.patch_embed(x)
        x = self._pos_embed(x)
        x = self.patch_drop(x)
        x = self.norm_pre(x)
        # Add noise when training/testing
        # See https://openreview.net/forum?id=Ce0dDt9tUT for more detail
        x = self.blocks[:-1](x)
        # Suppose the token dim = 1
        token = x[:, 0, :].unsqueeze(1)
        x = x[:, 1:, :].permute(0, 2, 1)
        B, C, L = x.shape
        x = x.reshape(B, C, self.stage3_res, self.stage3_res)
        x = self.linear_transform_noise@x + x
        x = x.flatten(2).transpose(1, 2).contiguous()
        x = torch.cat([token, x], dim=1)
        x = self.blocks[-1](x)

        x = self.norm(x)
        return x

# We don't specify more args because the paper didn't reveal more details
def vit_t(optimal=True, res=224) -> NoisyViT:
    model = NoisyViT(
        optimal=optimal, 
        res=res, 
        patch_size=4, 
        embed_dim=192, 
        depth=12, 
        num_heads=3
    )
    return model

def vit_s(optimal=True, res=224) -> NoisyViT:
    model = NoisyViT(
        optimal=optimal, 
        res=res, 
        patch_size=4, 
        embed_dim=384, 
        depth=12, 
        num_heads=6
    )
    return model

def vit_b(optimal=True, res=224) -> NoisyViT:
    model = NoisyViT(
        optimal=optimal, 
        res=res, 
        patch_size=4, 
        embed_dim=768, 
        depth=12, 
        num_heads=12
    )
    return model

def vit_l(optimal=True, res=224) -> NoisyViT:
    model = NoisyViT(
        optimal=optimal, 
        res=res, 
        patch_size=16, 
        embed_dim=1024, 
        depth=24, 
        num_heads=16
    )
    return model

# Easy test
# if __name__ == '__main__':
#     model = vit_l().cuda()
#     inputs = torch.rand((2, 3, 224, 224)).cuda()
#     output = model(inputs)
#     print('Pass')

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
import torch
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from timm.loss import LabelSmoothingCrossEntropy
from timm.scheduler.cosine_lr import CosineLRScheduler
from tqdm import tqdm
import os
import torch.nn as nn
import torch.optim as optim

# Configurar transformações para treino e validação
transformacoes_de_imagens = {
    'treino': transforms.Compose([
        transforms.RandomResizedCrop((224, 224), scale=(0.05, 1.0)),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
    ]),
    'validacao': transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
    ])
}

# Carregar datasets Fer-2013
dataset = '../data/Fer-2013'
pasta_treino = os.path.join(dataset, 'treino')
pasta_validacao = os.path.join(dataset, 'validacao')

data = {
    'treino': datasets.ImageFolder(root=pasta_treino, transform=transformacoes_de_imagens['treino']),
    'validacao': datasets.ImageFolder(root=pasta_validacao, transform=transformacoes_de_imagens['validacao'])
}

# Criar DataLoaders
batch_size = 8
data_loader_treino = DataLoader(data['treino'], batch_size=batch_size, shuffle=True, num_workers=4)
data_loader_validacao = DataLoader(data['validacao'], batch_size=batch_size, shuffle=False, num_workers=4)

# Verificar dispositivo
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


# Função de treinamento para NoisyViT
def train_noisy_vit(model, epochs=10, lr=1e-4, weight_decay=1e-4, save_path="noisy_vit.pth",resume=True):
    # Configurar otimizador, scheduler e função de perda
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
    scheduler = CosineLRScheduler(optimizer, t_initial=epochs, lr_min=1e-6, warmup_t=2)
    loss_fn = LabelSmoothingCrossEntropy(0.1)

    model.to(device)
    start_epoch = 0
    if resume:
        try:
            model, optimizer, start_epoch = load_model(model, optimizer, save_path)
            print(f"Retomando o treinamento a partir da época {start_epoch + 1}")
        except FileNotFoundError:
            print("Nenhum modelo salvo encontrado. Iniciando do zero.")    
    for epoch in range(start_epoch, epochs):
        model.train()
        train_loss = 0
        correct = 0
        total = 0

        print(f"Época {epoch + 1}/{epochs}")
        for inputs, labels in tqdm(data_loader_treino, desc="Treinando"):
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = loss_fn(outputs, labels)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()
            _, predicted = outputs.max(1)
            total += labels.size(0)
            correct += predicted.eq(labels).sum().item()
        
        train_acc = 100. * correct / total
        print(f"Treino - Loss: {train_loss / len(data_loader_treino):.4f}, Acc: {train_acc:.2f}%")

        # Validação
        model.eval()
        val_loss = 0
        correct = 0
        total = 0

        with torch.no_grad():
            for inputs, labels in tqdm(data_loader_validacao, desc="Validando"):
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                loss = loss_fn(outputs, labels)

                val_loss += loss.item()
                _, predicted = outputs.max(1)
                total += labels.size(0)
                correct += predicted.eq(labels).sum().item()

        val_acc = 100. * correct / total
        print(f"Validação - Loss: {val_loss / len(data_loader_validacao):.4f}, Acc: {val_acc:.2f}%")
        save_model(model, optimizer, save_path, epoch=epoch + 1)
        scheduler.step(epoch + 1)

# Treinar NoisyViT
if __name__ == '__main__':
    # Criar o modelo NoisyViT
    
    model = vit_l()  # Escolha do modelo: vit_t, vit_s, vit_b, vit_l
    optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-4)
    train_noisy_vit(model, epochs=10, lr=1e-4, weight_decay=1e-4)


Nenhum modelo salvo encontrado. Iniciando do zero.
Época 1/10


  x = F.scaled_dot_product_attention(
Treinando: 100%|██████████| 3589/3589 [7:02:04<00:00,  7.06s/it]  


Treino - Loss: 7.5230, Acc: 0.00%


Validando: 100%|██████████| 449/449 [03:29<00:00,  2.15it/s]


Validação - Loss: 7.5285, Acc: 0.00%
Modelo e estado salvos em: noisy_vit.pth
Época 2/10


Treinando:  52%|█████▏    | 1878/3589 [4:06:01<2:58:43,  6.27s/it] 

In [None]:
import torch
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from timm.loss import LabelSmoothingCrossEntropy
from timm.scheduler.cosine_lr import CosineLRScheduler
from tqdm import tqdm
import os

# Configurações
batch_size = 8  # Reduzido para evitar OOM
epochs = 10
lr = 1e-4
weight_decay = 1e-4

# Transformações
transformacoes_de_imagens = {
    'treino': transforms.Compose([
        transforms.RandomResizedCrop((224, 224), scale=(0.05, 1.0)),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
    ]),
    'validacao': transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
    ])
}

# Dataset
dataset = '../data/Fer-2013'
pasta_treino = os.path.join(dataset, 'treino')
pasta_validacao = os.path.join(dataset, 'validacao')

data = {
    'treino': datasets.ImageFolder(root=pasta_treino, transform=transformacoes_de_imagens['treino']),
    'validacao': datasets.ImageFolder(root=pasta_validacao, transform=transformacoes_de_imagens['validacao'])
}

data_loader_treino = DataLoader(data['treino'], batch_size=batch_size, shuffle=True, num_workers=4, pin_memory=True)
data_loader_validacao = DataLoader(data['validacao'], batch_size=batch_size, shuffle=False, num_workers=4, pin_memory=True)

# Verificar dispositivo
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Função de Treinamento
def train_noisy_vit(model, epochs, lr, weight_decay):
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
    scheduler = CosineLRScheduler(optimizer, t_initial=epochs, lr_min=1e-6, warmup_t=2)
    loss_fn = LabelSmoothingCrossEntropy(0.1)

    scaler = torch.cuda.amp.GradScaler()  # Mixed Precision
    model.to(device)
    model.set_grad_checkpointing(enable=True)  # Ativar Gradient Checkpointing

    for epoch in range(epochs):
        model.train()
        train_loss = 0
        correct = 0
        total = 0

        print(f"Época {epoch + 1}/{epochs}")
        for inputs, labels in tqdm(data_loader_treino, desc="Treinando"):
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()
            with torch.cuda.amp.autocast():  # Mixed Precision
                outputs = model(inputs)
                loss = loss_fn(outputs, labels)
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

            train_loss += loss.item()
            _, predicted = outputs.max(1)
            total += labels.size(0)
            correct += predicted.eq(labels).sum().item()

        train_acc = 100. * correct / total
        print(f"Treino - Loss: {train_loss / len(data_loader_treino):.4f}, Acc: {train_acc:.2f}%")

        # Validação
        model.eval()
        val_loss = 0
        correct = 0
        total = 0

        with torch.no_grad():
            for inputs, labels in tqdm(data_loader_validacao, desc="Validando"):
                inputs, labels = inputs.to(device), labels.to(device)
                with torch.cuda.amp.autocast():
                    outputs = model(inputs)
                    loss = loss_fn(outputs, labels)

                val_loss += loss.item()
                _, predicted = outputs.max(1)
                total += labels.size(0)
                correct += predicted.eq(labels).sum().item()

        val_acc = 100. * correct / total
        print(f"Validação - Loss: {val_loss / len(data_loader_validacao):.4f}, Acc: {val_acc:.2f}%")

        scheduler.step(epoch + 1)

# Treinar o modelo
if __name__ == '__main__': # Altere para o modelo menor
    model = vit_t()  # Escolha um modelo menor para evitar OOM
    train_noisy_vit(model, epochs=epochs, lr=lr, weight_decay=weight_decay)
