In [4]:
import os
import random
import shutil

# Directories: the class folders live directly under original_dir; the split is
# written to original_dir/train/<class> and original_dir/val/<class>.
original_dir = "/content/drive/MyDrive/Colab Notebooks/Base"
train_dir = os.path.join(original_dir, "train")
val_dir = os.path.join(original_dir, "val")

os.makedirs(train_dir, exist_ok=True)
os.makedirs(val_dir, exist_ok=True)

random.seed(42)  # Reproducibility

# Copy 80% of each class into train/ and the remaining 20% into val/.
# The originals are copied, not moved, so the source folders stay intact.
# Class folders are visited in sorted order because every shuffle consumes state
# from the same seeded RNG: a different visiting order would change every split.
for class_name in sorted(os.listdir(original_dir)):
    class_path = os.path.join(original_dir, class_name)
    # Skip stray files and the output folders created above.
    if not os.path.isdir(class_path) or class_name in ["train", "val"]:
        continue

    # os.listdir returns entries in arbitrary order, so the listing is sorted
    # before the seeded shuffle; otherwise a re-run could pick a different split
    # and leave the same image in both train/ and val/.
    # Only regular files are kept: shutil.copy cannot copy a sub-directory.
    images = sorted(
        entry
        for entry in os.listdir(class_path)
        if os.path.isfile(os.path.join(class_path, entry))
    )
    random.shuffle(images)
    split_idx = int(0.8 * len(images))
    train_imgs = images[:split_idx]
    val_imgs = images[split_idx:]

    os.makedirs(os.path.join(train_dir, class_name), exist_ok=True)
    os.makedirs(os.path.join(val_dir, class_name), exist_ok=True)

    for img in train_imgs:
        shutil.copy(os.path.join(class_path, img), os.path.join(train_dir, class_name, img))
    for img in val_imgs:
        shutil.copy(os.path.join(class_path, img), os.path.join(val_dir, class_name, img))

print("Divisão concluída.")


Divisão concluída.


In [5]:
# Mount Google Drive at /content/drive; the train/val folders used below live under it.
# NOTE(review): the dataset-split cell above already reads from /content/drive,
# so on a fresh runtime this mount presumably has to run before that cell — confirm.
from google.colab import drive
drive.mount('/content/drive')

import torch
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
from torchvision.transforms import Compose, Resize, ToTensor, Normalize, RandomHorizontalFlip
from transformers import ViTForImageClassification, ViTImageProcessor
from torch import nn, optim
from sklearn.metrics import classification_report

# Settings
train_dir = "/content/drive/MyDrive/Colab Notebooks/Base/train"
val_dir = "/content/drive/MyDrive/Colab Notebooks/Base/val"
batch_size = 16
num_epochs = 10
learning_rate = 1e-4
seed = 42

# Seed torch's global RNG, which drives DataLoader shuffling, the random flip
# augmentation and the initialisation of the new classifier head.
torch.manual_seed(seed)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Usando dispositivo: {device}")

# ViT normalisation: reuse the mean/std stored with the pretrained checkpoint
feature_extractor = ViTImageProcessor.from_pretrained('google/vit-base-patch16-224')
normalize = Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std)

# Training pipeline: resize + random horizontal flip as augmentation
transform = Compose([
    Resize((224, 224)),
    RandomHorizontalFlip(),
    ToTensor(),
    normalize
])

# Validation pipeline: same preprocessing but WITHOUT random augmentation, so
# validation metrics are measured on the real images and are repeatable.
val_transform = Compose([
    Resize((224, 224)),
    ToTensor(),
    normalize
])

# Datasets and dataloaders
train_dataset = datasets.ImageFolder(root=train_dir, transform=transform)
val_dataset   = datasets.ImageFolder(root=val_dir, transform=val_transform)

# ImageFolder derives class indices from the folder names of each root; if the
# two roots disagree, label indices would silently mean different classes.
if train_dataset.classes != val_dataset.classes:
    raise ValueError(
        "As classes de treino e validação não coincidem: "
        f"{train_dataset.classes} != {val_dataset.classes}"
    )
num_classes = len(train_dataset.classes)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader   = DataLoader(val_dataset, batch_size=batch_size)

# ViT model: load the ImageNet checkpoint and swap the 1000-way head for a
# num_classes-way head (ignore_mismatched_sizes lets the old head be dropped).
model = ViTForImageClassification.from_pretrained(
    'google/vit-base-patch16-224',
    num_labels=num_classes,
    ignore_mismatched_sizes=True
)
# NOTE(review): from_pretrained(num_labels=...) already creates a head with
# num_classes outputs, so this replacement looks redundant; it is kept so the
# head initialisation matches earlier runs.
model.classifier = nn.Linear(model.config.hidden_size, num_classes)
model.to(device)

# Optimizer and loss function
optimizer = optim.AdamW(model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()


def evaluate(model, loader, device):
    """Run ``model`` over ``loader`` in eval mode with gradients disabled.

    Returns:
        (labels, preds, accuracy): two lists of int class indices in loader
        order, and the fraction of predictions equal to their label.
    """
    model.eval()
    true_labels = []
    pred_labels = []
    with torch.no_grad():
        for imgs, labels in loader:
            outputs = model(imgs.to(device))
            _, preds = torch.max(outputs.logits, dim=1)
            pred_labels.extend(preds.cpu().tolist())
            true_labels.extend(labels.tolist())
    hits = sum(p == t for p, t in zip(pred_labels, true_labels))
    return true_labels, pred_labels, hits / len(true_labels)


# Training
for epoch in range(num_epochs):
    model.train()
    running_loss = 0
    for imgs, labels in train_loader:
        imgs, labels = imgs.to(device), labels.to(device)

        outputs = model(imgs)
        loss = criterion(outputs.logits, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Mean of the per-batch losses for this epoch
    avg_loss = running_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{num_epochs} - Loss: {avg_loss:.4f}")

    # Validation after every epoch
    _, _, val_acc = evaluate(model, val_loader, device)
    print(f"Val Accuracy: {val_acc:.4f}")

# Final evaluation with report; val_acc is recomputed here so the accuracy in
# the saved file name always describes the weights that are actually saved.
all_labels, all_preds, val_acc = evaluate(model, val_loader, device)

print("\nRelatório de Classificação:\n")
# labels= pins one row per class even if a class never appears in the
# predictions or targets, keeping target_names aligned with the indices.
print(classification_report(
    all_labels,
    all_preds,
    labels=list(range(num_classes)),
    target_names=val_dataset.classes
))

# Save model weights (state_dict only) to the current working directory
model_path = f"vit_finetuned_acc{val_acc:.2f}.pth"
torch.save(model.state_dict(), model_path)
print(f"Modelo salvo: {model_path}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Usando dispositivo: cuda


Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([10]) in the model instantiated
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([10, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10 - Loss: 0.3672
Val Accuracy: 1.0000
Epoch 2/10 - Loss: 0.0259
Val Accuracy: 0.9950
Epoch 3/10 - Loss: 0.0035
Val Accuracy: 1.0000
Epoch 4/10 - Loss: 0.0020
Val Accuracy: 1.0000
Epoch 5/10 - Loss: 0.0014
Val Accuracy: 1.0000
Epoch 6/10 - Loss: 0.0011
Val Accuracy: 1.0000
Epoch 7/10 - Loss: 0.0009
Val Accuracy: 0.9950
Epoch 8/10 - Loss: 0.0007
Val Accuracy: 0.9950
Epoch 9/10 - Loss: 0.0006
Val Accuracy: 0.9950
Epoch 10/10 - Loss: 0.0005
Val Accuracy: 0.9950

Relatório de Classificação:

              precision    recall  f1-score   support

     cavalos       1.00      1.00      1.00        20
      comida       1.00      1.00      1.00        20
        dino       1.00      1.00      1.00        20
    elefante       1.00      1.00      1.00        20
      flores       1.00      1.00      1.00        20
     humanos       0.95      1.00      0.98        20
   montanhas       1.00      1.00      1.00        20
       obras       1.00      1.00      1.00        20
      onibus

In [7]:
from sklearn.metrics import confusion_matrix
import pandas as pd

# Build the confusion matrix (rows = true class, columns = predicted class).
# labels= forces one row/column per dataset class; without it, a class missing
# from both all_labels and all_preds shrinks the matrix and the DataFrame
# construction below fails on the index/shape mismatch.
class_indices = list(range(len(val_dataset.classes)))
cm = confusion_matrix(all_labels, all_preds, labels=class_indices)

# Wrap in a DataFrame so rows and columns carry the class names
cm_df = pd.DataFrame(cm, index=val_dataset.classes, columns=val_dataset.classes)

# Show in the notebook
print("Matriz de Confusão:\n")
print(cm_df)


Matriz de Confusão:

           cavalos  comida  dino  elefante  flores  humanos  montanhas  obras  \
cavalos         20       0     0         0       0        0          0      0   
comida           0      20     0         0       0        0          0      0   
dino             0       0    20         0       0        0          0      0   
elefante         0       0     0        20       0        0          0      0   
flores           0       0     0         0      20        0          0      0   
humanos          0       0     0         0       0       20          0      0   
montanhas        0       0     0         0       0        0         20      0   
obras            0       0     0         0       0        0          0     20   
onibus           0       0     0         0       0        0          0      0   
praia            0       0     0         0       0        1          0      0   

           onibus  praia  
cavalos         0      0  
comida          0      0  
dino  

In [9]:
# 1. Mount Google Drive at /content/drive (train_dir / val_dir below point inside it)
from google.colab import drive
drive.mount('/content/drive')

# 2. Imports
import torch
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
from torchvision.transforms import Compose, Resize, ToTensor, Normalize, RandomHorizontalFlip
from transformers import ViTForImageClassification, ViTImageProcessor
from torch import nn, optim
from sklearn.metrics import classification_report, confusion_matrix
import pandas as pd

# 3. Settings
train_dir = "/content/drive/MyDrive/Colab Notebooks/Base/train"
val_dir   = "/content/drive/MyDrive/Colab Notebooks/Base/val"
batch_size = 16
num_epochs = 10
learning_rate = 1e-4
seed = 42

# Seed torch's global RNG, which drives DataLoader shuffling, the random flip
# augmentation and the initialisation of the new classifier head.
torch.manual_seed(seed)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Usando dispositivo: {device}")

# 4. Transforms: normalise with the mean/std stored with the pretrained checkpoint
feature_extractor = ViTImageProcessor.from_pretrained('google/vit-base-patch16-224')
normalize = Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std)

# Training pipeline: resize + random horizontal flip as augmentation
transform = Compose([
    Resize((224, 224)),
    RandomHorizontalFlip(),
    ToTensor(),
    normalize
])

# Validation pipeline: same preprocessing but WITHOUT random augmentation, so
# validation metrics are measured on the real images and are repeatable.
val_transform = Compose([
    Resize((224, 224)),
    ToTensor(),
    normalize
])

# 5. Datasets and loaders
train_dataset = datasets.ImageFolder(root=train_dir, transform=transform)
val_dataset   = datasets.ImageFolder(root=val_dir, transform=val_transform)

# ImageFolder derives class indices from the folder names of each root; if the
# two roots disagree, label indices would silently mean different classes.
if train_dataset.classes != val_dataset.classes:
    raise ValueError(
        "As classes de treino e validação não coincidem: "
        f"{train_dataset.classes} != {val_dataset.classes}"
    )
num_classes = len(train_dataset.classes)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader   = DataLoader(val_dataset, batch_size=batch_size)

# 6. ViT model: load the ImageNet checkpoint and swap the 1000-way head for a
# num_classes-way head (ignore_mismatched_sizes lets the old head be dropped).
model = ViTForImageClassification.from_pretrained(
    'google/vit-base-patch16-224',
    num_labels=num_classes,
    ignore_mismatched_sizes=True
)
# NOTE(review): from_pretrained(num_labels=...) already creates a head with
# num_classes outputs, so this replacement looks redundant; it is kept so the
# head initialisation matches earlier runs.
model.classifier = nn.Linear(model.config.hidden_size, num_classes)
model.to(device)

# 7. Optimizer and loss
optimizer = optim.AdamW(model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

# 8. Training
for epoch in range(num_epochs):
    model.train()
    running_loss = 0
    for imgs, labels in train_loader:
        imgs, labels = imgs.to(device), labels.to(device)

        outputs = model(imgs)
        loss = criterion(outputs.logits, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    # Mean of the per-batch losses for this epoch
    avg_loss = running_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{num_epochs} - Loss: {avg_loss:.4f}")

    # Validation. The prediction/label lists are rebuilt every epoch, so after
    # the loop they hold the results of the last epoch (used by the report).
    model.eval()
    correct = 0
    total = 0
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for imgs, labels in val_loader:
            imgs, labels = imgs.to(device), labels.to(device)
            outputs = model(imgs)
            _, preds = torch.max(outputs.logits, dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

            all_preds.extend(preds.cpu().tolist())
            all_labels.extend(labels.cpu().tolist())

    val_acc = correct / total
    print(f"Val Accuracy: {val_acc:.4f}")

# labels= pins one row/column per class even when a class never shows up in the
# predictions or targets, keeping the class names aligned with the indices.
class_indices = list(range(num_classes))

# 9. Classification report (last epoch's validation predictions)
print("\nRelatório de Classificação:")
print(classification_report(
    all_labels,
    all_preds,
    labels=class_indices,
    target_names=val_dataset.classes
))

# 10. Confusion matrix as a table (rows = true class, columns = predicted class)
cm = confusion_matrix(all_labels, all_preds, labels=class_indices)
cm_df = pd.DataFrame(cm, index=val_dataset.classes, columns=val_dataset.classes)
print("\nMatriz de Confusão:")
print(cm_df)

# 11. Save model weights (state_dict only) to the current working directory
torch.save(model.state_dict(), "vit_finetuned.pth")
print("Modelo salvo como vit_finetuned.pth")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Usando dispositivo: cuda


Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([10]) in the model instantiated
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([10, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10 - Loss: 0.3668
Val Accuracy: 1.0000
Epoch 2/10 - Loss: 0.0093
Val Accuracy: 0.9950
Epoch 3/10 - Loss: 0.0033
Val Accuracy: 0.9950
Epoch 4/10 - Loss: 0.0019
Val Accuracy: 0.9950
Epoch 5/10 - Loss: 0.0014
Val Accuracy: 0.9950
Epoch 6/10 - Loss: 0.0011
Val Accuracy: 0.9950
Epoch 7/10 - Loss: 0.0009
Val Accuracy: 0.9950
Epoch 8/10 - Loss: 0.0007
Val Accuracy: 0.9950
Epoch 9/10 - Loss: 0.0006
Val Accuracy: 0.9950
Epoch 10/10 - Loss: 0.0005
Val Accuracy: 0.9950

Relatório de Classificação:
              precision    recall  f1-score   support

     cavalos       1.00      1.00      1.00        20
      comida       1.00      1.00      1.00        20
        dino       1.00      1.00      1.00        20
    elefante       1.00      1.00      1.00        20
      flores       1.00      1.00      1.00        20
     humanos       0.95      1.00      0.98        20
   montanhas       1.00      1.00      1.00        20
       obras       1.00      1.00      1.00        20
      onibus 