In [37]:
import os
import xml.etree.ElementTree as ET

In [38]:
root_path = 'e:\\COMP\\COMP9444\\project\\dumpsite_data\\VOC2012'
train_path = os.path.join(root_path, "train")
test_path = os.path.join(root_path, "test")
file_Annotations = os.path.join(train_path, "Annotations")

In [39]:
object_class = []
for each_xml in os.listdir(file_Annotations):
    pic_xml = os.path.join(file_Annotations, each_xml)
    tree = ET.parse(pic_xml)
    root = tree.getroot()
    for object_elem in root.findall('object'):
        name_elem_value = object_elem.find('name').text
        object_class.append(name_elem_value)
classes = set(object_class)
print(classes)


{'construction waste', 'industry waste', 'disposed garbage', 'mining waste', 'agriculture forestry', 'domestic garbage'}


In [40]:
label_maps = {}
for index, class_name in enumerate(classes):
    label_maps[class_name] = index
print(label_maps)

{'construction waste': 0, 'industry waste': 1, 'disposed garbage': 2, 'mining waste': 3, 'agriculture forestry': 4, 'domestic garbage': 5}


##### Due to the problem of severe sample imbalance in the dumpsite dataset (Fig. 1a), we propose two training strategies, data augmentation (vertical flipping, horizontal
##### flipping, forward 90° rotation and reverse 90° rotation) and category balancing, to ensure the model’sefficiency during the training process

In [41]:
from PIL import Image, ImageDraw
import torch
from torch.utils.data import Dataset
import torchvision.transforms as T

In [42]:
class  VOCDataset(Dataset):
    def __init__(self, root, transforms=None):
        self.root = root
        self.transforms = transforms
        txt_file = os.path.join(root, "train.txt")
        with open(txt_file, 'r') as f:
            self.image_ids = f.read().strip().split()
        self.image_folder = os.path.join(root, "JPEGImages")
        self.ann_folder = os.path.join(root, "Annotations")

    def __len__(self):
        return len(self.image_folder)
    
    def __getitem__(self, idx):
        img_id = self.image_ids[idx]
        img_path = os.path.join(self.image_folder, f"{img_id}.jpg")
        ann_path = os.path.join(self.ann_folder, f"{img_id}.xml")

        img = Image.open(img_path).convert("RGB")
        tree = ET.parse(ann_path)
        root = tree.getroot()
        boxes = []
        labels = []
        for obj in root.findall("object"):
            label = obj.find("name").text
            print("label是:", label)
            labels.append(label_maps[label])
            bbox = obj.find("bndbox")
            xmin = int(bbox.find("xmin").text)
            ymin = int(bbox.find("ymin").text)
            xmax = int(bbox.find("xmax").text)
            ymax = int(bbox.find("ymax").text)
            if self.transforms:
                transform = self.transforms.transforms[-2]
                print(transform)
                if isinstance(transform, T.Resize):
                    print("等比例缩放")
                    scale_factor = transform.size[0] / img.width
                    print(scale_factor)
                    xmin = int(xmin * scale_factor)
                    ymin = int(ymin * scale_factor)
                    xmax = int(xmax * scale_factor)
                    ymax = int(ymax * scale_factor)
            box = [xmin, ymin, xmax, ymax]
            boxes.append(box)
        
        boxes = torch.tensor(boxes, dtype=torch.float32)
        target = {"boxes":boxes, "labels":labels}
        if self.transforms:
            img = self.transforms(img)
        return img, target

In [43]:
import random
from torchvision.transforms import functional as F

In [1]:
class DetectionTransforms:
    def __init__(self):
        self.augment = T.Compose([
            T.RandomVerticalFlip(p=0.5),
            T.RandomHorizontalFlip(p=0.5), 
            T.RandomRotation(degrees=(0, 90), expand=False, center=None),
            T.RandomRotation(degrees=(-90, 0), expand=False, center=None), 
        ])
        self.normalize = T.Compose([
            T.ToTensor(),
            T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])
    
    def __call__(self, image, targets):
        image = self.augment(image)
        image = self.normalize(image)
        for box, label in zip(*targets):
            # augment process, the same seed
            # verticalfilp

            # horizontalfilp

            # rotation
            # rotation
        return image

IndentationError: expected an indented block after 'for' statement on line 17 (3316825155.py, line 22)

In [45]:
mean = [0.485, 0.456, 0.406]
std = [0.229, 0.224, 0.225]

def unnormalize(image, mean, std):
    mean = torch.tensor(mean).view(3, 1, 1)
    std = torch.tensor(std).view(3, 1, 1)
    return image * std * mean

In [46]:
from torch.utils.data import DataLoader

def collate_fn(batch):
    images = []
    targets = []
    for obj in batch:
        images.append(obj[0])
        targets.append(obj[1])
    images = torch.stack(images, dim=0)
    return images, targets


transforms = DetectionTransforms()
dataset = VOCDataset(root='./dumpsite_data/VOC2012/train', transforms=transforms)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)

for images, targets in dataloader:
    PIL_image = unnormalize(images[0], mean, std)
    PIL_image = F.to_pil_image(PIL_image)
    # PIL_image.show()
    print(targets[0]['boxes'])
    print(targets[0]['labels'])

label是: domestic garbage


AttributeError: 'DetectionTransforms' object has no attribute 'transforms'

In [47]:
transforms.augment.transforms

[RandomVerticalFlip(p=0.5),
 RandomHorizontalFlip(p=0.5),
 RandomRotation(degrees=[0.0, 90.0], interpolation=nearest, expand=False, fill=0),
 RandomRotation(degrees=[-90.0, 0.0], interpolation=nearest, expand=False, fill=0)]

In [48]:
test = Image.open("E:/COMP/COMP9444/project/dumpsite_data/VOC2012/train/JPEGImages/CS7500.jpg")
print(test.size)
test.show()

(1024, 1024)


In [53]:
def label_name(label_num):
    class_name = {k for k, v in label_maps.items() if v == label_num}
    return list(class_name)[0]

In [57]:
transforms = T.Compose([
    T.transforms.Resize((448, 448)),
    T.transforms.ToTensor()
])
dataset = VOCDataset(root='./dumpsite_data/VOC2012/train', transforms=transforms)
random_idx = random.randint(0, len(dataset) - 1)
img, target = dataset[random_idx]
image = F.to_pil_image(img)
draw = ImageDraw.Draw(image)
boxes = target['boxes']
labels = target['labels']
for box, label in zip(boxes, labels):
    xmin, ymin, xmax, ymax = box
    draw.rectangle([xmin, ymin, xmax, ymax], outline="red")
    draw.text((xmin - 10, ymin - 10), label_name(label), fill="red")
print(image.size)
image.show()

label是: domestic garbage
Resize(size=(448, 448), interpolation=bilinear, max_size=None, antialias=True)
等比例缩放
0.4375
tensor([[ 79., 268., 117., 318.]]) [5]
(448, 448)


In [None]:
from torchvision.models.detection import fasterrcnn_resnet50_fpn