In [11]:
import os
import xml.etree.ElementTree as ET

In [12]:
root_path = 'e:\\COMP\\COMP9444\\project\\dumpsite_data\\VOC2012'
train_path = os.path.join(root_path, "train")
test_path = os.path.join(root_path, "test")
file_Annotations = os.path.join(train_path, "Annotations")

In [13]:
object_class = []
for each_xml in os.listdir(file_Annotations):
    pic_xml = os.path.join(file_Annotations, each_xml)
    tree = ET.parse(pic_xml)
    root = tree.getroot()
    for object_elem in root.findall('object'):
        name_elem_value = object_elem.find('name').text
        object_class.append(name_elem_value)
print(set(object_class))


{'domestic garbage', 'disposed garbage', 'agriculture forestry', 'industry waste', 'construction waste', 'mining waste'}


##### Due to the problem of severe sample imbalance in the dumpsite dataset (Fig. 1a), we propose two training strategies, data augmentation (vertical flipping, horizontal
##### flipping, forward 90° rotation and reverse 90° rotation) and category balancing, to ensure the model’sefficiency during the training process

In [36]:
from PIL import Image, ImageDraw
import torch
from torch.utils.data import Dataset
import torchvision.transforms as T

In [116]:
class  VOCDataset(Dataset):
    def __init__(self, root, transforms=None):
        self.root = root
        self.transforms = transforms
        txt_file = os.path.join(root, "train.txt")
        with open(txt_file, 'r') as f:
            self.image_ids = f.read().strip().split()
        self.image_folder = os.path.join(root, "JPEGImages")
        self.ann_folder = os.path.join(root, "Annotations")

    def __len__(self):
        return len(self.image_folder)
    
    def __getitem__(self, idx):
        img_id = self.image_ids[idx]
        img_path = os.path.join(self.image_folder, f"{img_id}.jpg")
        ann_path = os.path.join(self.ann_folder, f"{img_id}.xml")

        img = Image.open(img_path).convert("RGB")
        tree = ET.parse(ann_path)
        root = tree.getroot()
        boxes = []
        labels = []
        for obj in root.findall("object"):
            label = obj.find("name").text
            print("label是:", label)
            labels.append(label)
            bbox = obj.find("bndbox")
            xmin = int(bbox.find("xmin").text)
            ymin = int(bbox.find("ymin").text)
            xmax = int(bbox.find("xmax").text)
            ymax = int(bbox.find("ymax").text)
            if self.transforms:
                transform = self.transforms.normalize.transforms
                print(transform)
                if isinstance(transform, T.Resize):
                    print("等比例缩放")
                    scale_factor = transform.size[0] / img.width
                    print(scale_factor)
                    xmin = int(xmin * scale_factor)
                    ymin = int(ymin * scale_factor)
                    xmax = int(xmax * scale_factor)
                    ymax = int(ymax * scale_factor)
            box = [xmin, ymin, xmax, ymax]
            boxes.append(box)
        
        boxes = torch.tensor(boxes, dtype=torch.float32)
        target = {"boxes":boxes, "labels":labels}
        if self.transforms:
            img = self.transforms(img)
        return img, target

In [57]:
import random
from torchvision.transforms import functional as F

In [112]:
class DetectionTransforms:
    def __init__(self):
        self.augment = T.Compose([
            T.RandomVerticalFlip(p=0.5),
            T.RandomHorizontalFlip(p=0.5), 
            T.RandomRotation(degrees=(0, 90), expand=False, center=None),
            T.RandomRotation(degrees=(-90, 0), expand=False, center=None), 
        ])
        self.normalize = T.Compose([
            T.ToTensor(),
            T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])
    
    def __call__(self, image):
        # image = self.augment(image)
        image = self.normalize(image)
        return image

In [59]:
mean = [0.485, 0.456, 0.406]
std = [0.229, 0.224, 0.225]

def unnormalize(image, mean, std):
    mean = torch.tensor(mean).view(3, 1, 1)
    std = torch.tensor(std).view(3, 1, 1)
    return image * std * mean

In [117]:
from torch.utils.data import DataLoader

def collate_fn(batch):
    images = []
    targets = []
    for obj in batch:
        images.append(obj[0])
        targets.append(obj[1])
    images = torch.stack(images, dim=0)
    return images, targets


transforms = DetectionTransforms()
dataset = VOCDataset(root='./dumpsite_data/VOC2012/train', transforms=transforms)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)

for images, targets in dataloader:
    PIL_image = unnormalize(images[0], mean, std)
    PIL_image = F.to_pil_image(PIL_image)
    PIL_image.show()
    print(targets[0]['boxes'])
    print(targets[0]['labels'])

label是: domestic garbage
[ToTensor(), Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])]
label是: domestic garbage
[ToTensor(), Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])]
label是: agriculture forestry
[ToTensor(), Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])]
label是: agriculture forestry
[ToTensor(), Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])]
label是: domestic garbage
[ToTensor(), Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])]
label是: domestic garbage
[ToTensor(), Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])]
label是: construction waste
[ToTensor(), Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])]
label是: agriculture forestry
[ToTensor(), Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])]
label是: agriculture forestry
[ToTensor(), Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])]
label是: construction waste
[ToTensor(), Normal

In [78]:
transforms.augment.transforms

[RandomHorizontalFlip(p=0.5),
 ColorJitter(brightness=(0.8, 1.2), contrast=(0.8, 1.2), saturation=(0.8, 1.2), hue=(-0.1, 0.1)),
 RandomRotation(degrees=[-30.0, 30.0], interpolation=nearest, expand=False, fill=0),
 RandomResizedCrop(size=(300, 300), scale=(0.0, 1.0), ratio=(0.75, 1.3333), interpolation=bilinear, antialias=True)]

In [48]:
test = Image.open("E:/COMP/COMP9444/project/dumpsite_data/VOC2012/train/JPEGImages/CS7500.jpg")
print(test.size)
test.show()

(1024, 1024)


In [106]:
transforms = T.Compose([
    T.transforms.Resize((448, 448)),
    T.transforms.ToTensor()
])
dataset = VOCDataset(root='./dumpsite_data/VOC2012/train', transforms=transforms)
random_idx = random.randint(0, len(dataset) - 1)
img, target = dataset[random_idx]
image = F.to_pil_image(img)
draw = ImageDraw.Draw(image)
boxes = target['boxes']
labels = target['labels']
for box, label in zip(boxes, labels):
    xmin, ymin, xmax, ymax = box
    draw.rectangle([xmin, ymin, xmax, ymax], outline="red")
    draw.text((xmin - 10, ymin - 10), label, fill="red")
print(image.size)
image.show()

label是: domestic garbage
Resize(size=(448, 448), interpolation=bilinear, max_size=None, antialias=True)
等比例缩放
0.4375
(448, 448)
