In [21]:
import torch
import torchvision
from torchvision.models.detection import maskrcnn_resnet50_fpn
from torchvision.transforms import functional as F
import numpy as np
from PIL import Image
import cv2
import os
import xml.etree.ElementTree as ET

In [22]:
class TrafficLightDataset(torch.utils.data.Dataset):
    """Traffic-light detection dataset backed by JPEG images and Pascal VOC XML files.

    Every ``*.jpg`` file found in ``imgs_path`` is one sample; its annotation is
    the XML file with the same base name inside ``annot_path``.

    Args:
        imgs_path: Directory containing the ``.jpg`` images.
        annot_path: Directory containing the ``.xml`` annotations.
        transforms: Optional callable invoked as ``transforms(img, target)`` and
            expected to return the (possibly modified) ``(img, target)`` pair.
    """

    def __init__(self, imgs_path, annot_path, transforms=None):
        self.imgs_path = imgs_path
        self.annot_path = annot_path
        self.transforms = transforms

        # Sorted so that a given index always maps to the same file.
        self.imgs = sorted([img for img in os.listdir(imgs_path) if img.endswith('.jpg')])

    def __getitem__(self, idx):
        """Return ``(img, target)`` for sample ``idx``.

        ``img`` is the RGB image converted to a tensor. ``target`` is a dict with
        the keys ``boxes`` (N, 4), ``labels`` (N,), ``masks`` (N, H, W),
        ``image_id``, ``area`` (N,) and ``iscrowd`` (N,), where N is the number of
        'traffic light' objects in the annotation (N may be 0).
        """
        img_name = self.imgs[idx]

        # Load the image and convert it to a tensor right away.
        img = Image.open(os.path.join(self.imgs_path, img_name)).convert("RGB")
        img = F.to_tensor(img)

        # Swap only the extension; str.replace would also rewrite a '.jpg'
        # occurring in the middle of the file name.
        ann_name = os.path.splitext(img_name)[0] + '.xml'
        root = ET.parse(os.path.join(self.annot_path, ann_name)).getroot()

        boxes = []
        labels = []

        # Collect the bounding box of every 'traffic light' object.
        for obj in root.findall('object'):
            # findtext tolerates an <object> without a <name> child.
            name = obj.findtext('name') or ''
            if name.lower() != 'traffic light':
                continue
            bbox = obj.find('bndbox')
            boxes.append([float(bbox.find(tag).text)
                          for tag in ('xmin', 'ymin', 'xmax', 'ymax')])
            labels.append(1)  # 1 = traffic light

        # reshape(-1, 4) keeps the tensor two-dimensional even when the image has
        # no traffic lights; without it the column indexing below raises
        # IndexError on the resulting (0,) tensor.
        boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
        labels = torch.as_tensor(labels, dtype=torch.int64)

        image_id = torch.tensor([idx])
        area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
        iscrowd = torch.zeros((len(boxes),), dtype=torch.int64)

        # Placeholder masks: one all-ones (H, W) mask per box, since the XML
        # annotations carry boxes only.
        h, w = img.shape[-2:]
        masks = torch.ones((len(boxes), h, w), dtype=torch.uint8)

        target = {
            "boxes": boxes,
            "labels": labels,
            "masks": masks,
            "image_id": image_id,
            "area": area,
            "iscrowd": iscrowd,
        }

        if self.transforms is not None:
            img, target = self.transforms(img, target)

        return img, target

    def __len__(self):
        """Number of ``.jpg`` images found in ``imgs_path``."""
        return len(self.imgs)

In [53]:
from torchvision.transforms import transforms as T

class _RandomHorizontalFlipWithTarget:
    """Randomly mirror an image tensor together with its detection target."""

    def __init__(self, prob=0.5):
        self.prob = prob

    def __call__(self, img, target=None):
        # Accept non-tensor input (e.g. a PIL image) as well as the tensor the
        # dataset already produces.
        if not isinstance(img, torch.Tensor):
            img = F.to_tensor(img)

        if torch.rand(1).item() < self.prob:
            width = img.shape[-1]
            img = img.flip(-1)
            if target is not None:
                # Work on a shallow copy so the caller's dict is left untouched.
                target = dict(target)
                boxes = target.get("boxes")
                if boxes is not None and boxes.numel() > 0:
                    flipped = boxes.clone()
                    # Mirroring swaps the roles of xmin and xmax.
                    flipped[:, 0] = width - boxes[:, 2]
                    flipped[:, 2] = width - boxes[:, 0]
                    target["boxes"] = flipped
                if "masks" in target:
                    target["masks"] = target["masks"].flip(-1)

        if target is None:
            return img
        return img, target


def get_transform(flip_prob=0.5):
    """Build the training-time augmentation.

    The returned callable is invoked as ``transform(img, target)`` and returns
    ``(img, target)``; with probability ``flip_prob`` the image is mirrored
    horizontally and the ``boxes`` / ``masks`` entries of ``target`` are mirrored
    with it. Called with an image only, it returns just the image.

    Args:
        flip_prob: Probability of applying the horizontal flip.

    Returns:
        A callable usable as the ``transforms`` argument of a dataset that calls
        ``transforms(img, target)``.
    """
    return _RandomHorizontalFlipWithTarget(flip_prob)

# Build the training dataset from the image and Pascal VOC annotation folders.
# The optional `transforms` argument is left unset, so no augmentation runs.
dataset = TrafficLightDataset(
    './../data/openimages/traffic light/images',
    './../data/openimages/traffic light/pascal',
)

In [54]:
# Data loader for training. The custom collate function turns a batch of
# (image, target) pairs into (images, targets) tuples instead of stacking them,
# so the samples of a batch are kept as separate tensors.
train_data_loader = torch.utils.data.DataLoader(
    dataset,
    shuffle=True,
    batch_size=2,
    collate_fn=lambda batch: tuple(zip(*batch)),
)

In [55]:
def train_model(model, data_loader, device, num_epochs=10):
    """Train ``model`` in place with SGD.

    Args:
        model: Module called as ``model(images, targets)`` that returns a dict of
            loss tensors.
        data_loader: Iterable yielding ``(images, targets)`` batches, where
            ``images`` is a sequence of tensors and ``targets`` a sequence of
            dicts of tensors.
        device: Device every image and target tensor is moved to.
        num_epochs: Number of passes over ``data_loader``.

    Prints the epoch number at the start of each epoch and the summed loss on
    every 10th iteration. Returns nothing.
    """
    # Only parameters that require gradients are handed to the optimizer.
    trainable = [param for param in model.parameters() if param.requires_grad]
    optimizer = torch.optim.SGD(
        trainable,
        lr=0.005,
        momentum=0.9,
        weight_decay=0.0005,
    )

    for epoch in range(num_epochs):
        model.train()
        print(f"Epoch {epoch}")

        for i, (images, targets) in enumerate(data_loader):
            images = [image.to(device) for image in images]
            targets = [
                {key: value.to(device) for key, value in target.items()}
                for target in targets
            ]

            # The individual loss terms are summed into a single scalar.
            loss_dict = model(images, targets)
            losses = sum(loss_dict.values())

            optimizer.zero_grad()
            losses.backward()
            optimizer.step()

            if i % 10 == 0:
                print(f"Iteration {i}, Loss: {losses.item()}")

In [56]:
# Runtime configuration for the model: use the GPU when CUDA is available,
# otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
num_classes = 2  # background + traffic light

In [57]:
def get_model_instance_segmentation(num_classes):
    """Return a pretrained Mask R-CNN whose heads predict ``num_classes`` classes.

    Both the box predictor and the mask predictor are replaced, so the two heads
    agree on the number of classes.

    Args:
        num_classes: Number of output classes, background included.

    Returns:
        The modified Mask R-CNN model.
    """
    # Load the pretrained model.
    # NOTE(review): newer torchvision releases prefer `weights=` over
    # `pretrained=True` — confirm against the installed version.
    model = maskrcnn_resnet50_fpn(pretrained=True)

    # Replace the box classification / regression head.
    in_features = model.roi_heads.box_predictor.cls_score.in_features
    model.roi_heads.box_predictor = torchvision.models.detection.faster_rcnn.FastRCNNPredictor(
        in_features, num_classes
    )

    # Replace the mask head as well; otherwise it keeps the class count of the
    # pretrained checkpoint while the box head uses `num_classes`.
    in_features_mask = model.roi_heads.mask_predictor.conv5_mask.in_channels
    hidden_layer = 256
    model.roi_heads.mask_predictor = torchvision.models.detection.mask_rcnn.MaskRCNNPredictor(
        in_features_mask, hidden_layer, num_classes
    )

    return model

In [58]:
# Build the model and move it to the selected device.
model = get_model_instance_segmentation(num_classes)
model.to(device)
# Train the model.
train_model(model, train_data_loader, device, num_epochs=10)
# Save the trained weights.
torch.save(model.state_dict(), 'traffic_light_model.pth')
# Reload the trained weights. map_location lets a checkpoint written on a GPU
# machine be loaded when only the CPU is available.
model.load_state_dict(torch.load('traffic_light_model.pth', map_location=device))
# Switch to inference mode; the trailing semicolon keeps the notebook from
# echoing the full module repr as the cell output.
model.eval();

Epoch 0
Iteration 0, Loss: 2.910612106323242


KeyboardInterrupt: 

In [13]:
# Testar o modelo em uma imagem
def test_model(model, image_path):
    img = Image.open(image_path).convert("RGB")
    transform = torchvision.transforms.Compose([
        torchvision.transforms.ToTensor(),
    ])
    img_tensor = transform(img).unsqueeze(0).to(device)
    
    with torch.no_grad():
        prediction = model(img_tensor)
    
    return prediction

In [14]:
# Try the model on one example image from the dataset folder.
test_image_path = './../data/openimages/traffic light/images/000f4b8faa997297.jpg'
prediction = test_model(model, image_path=test_image_path)
# Visualize the predictions
def visualize_predictions(image_path, prediction, threshold=0.5):
    """Draw the confident detections on the image and show it in an OpenCV window.

    Args:
        image_path: Path of the image the prediction was computed for.
        prediction: Model output; ``prediction[0]`` must hold the tensors
            ``boxes``, ``labels`` and ``scores``.
        threshold: Only detections with a score strictly above this value are
            drawn.

    Raises:
        FileNotFoundError: If the image cannot be read from ``image_path``.

    NOTE(review): ``cv2.imshow`` needs a GUI session; it may not work from a
    remote or headless notebook kernel — confirm for the target environment.
    """
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread signals failure by returning None rather than raising.
        raise FileNotFoundError(f"Could not read image: {image_path}")

    detections = prediction[0]
    boxes = detections['boxes'].detach().cpu().numpy()
    labels = detections['labels'].detach().cpu().numpy()
    scores = detections['scores'].detach().cpu().numpy()

    for box, label, score in zip(boxes, labels, scores):
        if score <= threshold:  # confidence cut-off
            continue
        # Plain Python ints for the pixel coordinates handed to OpenCV.
        x1, y1, x2, y2 = (int(v) for v in box)
        cv2.rectangle(img, (x1, y1), (x2, y2), (0, 255, 0), 2)
        # Keep the caption inside the image when the box touches the top edge.
        text_y = max(y1 - 10, 12)
        cv2.putText(img, f'Class: {label}, Score: {score:.2f}', (x1, text_y),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 2)

    cv2.imshow('Predictions', img)
    cv2.waitKey(0)
    cv2.destroyAllWindows()

AssertionError: targets should not be none when in training mode

In [5]:
def detect_traffic_lights(model, image_path, threshold=0.5):
    """Detect traffic lights in one image and keep the confident detections.

    Args:
        model: Detection model; it is switched to eval mode and called with a
            one-element list of image tensors. ``prediction[0]`` must hold the
            tensors ``boxes``, ``scores`` and ``masks``.
        image_path: Path of the image file to load.
        threshold: Detections with a score not strictly above this value are
            dropped.

    Returns:
        ``(boxes, masks)`` restricted to the detections that passed the
        threshold.
    """
    model.eval()
    img = Image.open(image_path).convert("RGB")
    img_tensor = torchvision.transforms.ToTensor()(img)

    # The input has to live on the same device as the model's weights;
    # otherwise the forward pass fails once the model has been moved to a GPU.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        img_tensor = img_tensor.to(first_param.device)

    with torch.no_grad():
        prediction = model([img_tensor])

    # Filter the detections by score; the mask is computed once and reused.
    result = prediction[0]
    keep = result['scores'] > threshold

    return result['boxes'][keep], result['masks'][keep]

# Usage example: replace the placeholder path below with a real test image.
image_path = "caminho/para/imagem_teste.jpg"
boxes, masks = detect_traffic_lights(model, image_path=image_path)

NameError: name 'model' is not defined