In [56]:
#torch_vision
#データ前処理
import torch as t
from torchvision import models
from torchvision import transforms
from torchvision import datasets
from torch.utils import data

import cv2
import PIL
import os
import sys
import numpy as np
import pandas as pd
import seaborn as sns

from PIL import Image
from io import BytesIO
from matplotlib import pyplot as plt
import json
from collections import Counter

path = "C:/Users/dso-s.gao/Desktop/signate"
sys.path.append(path)

In [20]:
#jsonファイルの中身を解読
def read_json_file(file_name):
    with open(file_name, "rb") as f:
        file = json.load(f)

    address = file["attributes"]['route']
    time = file["attributes"]['timeofday']

    bbox = [row["box2d"] for row in file["labels"]]
    category = [row["category"] for row in file["labels"]]
    
    return address, time, bbox, category

In [29]:
#カテゴリのラベルを定義します。
dict_category = {
    'Car': 1,
    'Bicycle': 2,
    'Pedestrian': 3,
    'Signal': 4,
    'Signs': 5,
    'Truck': 6,
    'Bus': 7,
    'SVehicle': 8,
    'Motorbike': 9,
    'Train': 10
}

In [72]:
import os
import numpy as np
import torch
from PIL import Image

#統一前処理 
class PennFudanDataset(object):
    def __init__(self, root, anotation_root, transforms=True, train=True, category_dict=dict_category):
        self.root = root
        self.anotation_root = anotation_root
        self.category_dict = category_dict
        self.transforms = transforms
        # 下载所有图像文件，为其排序
        # 确保它们对齐
        self.imgs = list(sorted(os.listdir(root)))
        self.anotation = list(sorted(os.listdir(anotation_root)))
    
    def __getitem__(self, index):
        #画像データ、PILで読み、TO_TENSORで返す
        self.img_path = os.path.join( self.root + self.imgs[index] )
        img = Image.open(self.img_path).convert("RGB")
        json = self.anotation[index]
        
        #画像を編集するときであれば
        if self.transforms:
            transform = transforms.Compose([
                            #transforms.CenterCrop((100, 100)), #中心クロップ
                            #transforms.Grayscale(num_output_channels=1), #灰色化
                            #transforms.RandomHorizontalFlip(), #水平反転
                            #transforms.Scale(224), #resize
                            #transforms.TenCrop(3), # 十分割
                            #transforms.Lambda(lambda crops : t.stack([transforms.ToTensor()(crop) for crop in crops])),
                            #transforms.Normalize(mean = [ 0.485, 0.456, 0.406 ], #標準化
                            #std = [ 0.229, 0.224, 0.225 ]),
                            #transforms.ToPILImage(mode="RGBA") 
                            transforms.ToTensor() #テンソル化
                            ])
            img = transform(img)
        
        #データのBBOXを書き込み
        address, time, bbox, category = read_json_file(self.anotation_root + self.anotation[index])
        
        #boxes (FloatTensor[N, 4]): the predicted boxes in [x1, y1, x2, y2] format, 
        #with values between 0 and H and 0 and W
        #boxes の座標をembedingします。
        num_objs = len(bbox)
        boxes = []
        for i in range(num_objs):
            x1 = bbox[i]["x1"]
            x2 = bbox[i]["x2"]
            y1 = bbox[i]["y1"]
            y2 = bbox[i]["y2"]
            boxes.append([x1, y1, x2, y2])
        
        #bounding_boxをテンソル化
        boxes = t.as_tensor(boxes, dtype=torch.float32)
        
        #categoryをテンソル化します。
        category_transed = [self.category_dict[cate] for cate in category]
        category_transed = t.as_tensor(category_transed, dtype=t.int)
 
        return img, boxes, category_transed
    
    def __len__(self):
        return len(self.imgs)

In [73]:
test = PennFudanDataset(
    root=path + "/dtc_train_images_0/dtc_train_images/", 
    anotation_root=path + "/dtc_train_annotations/dtc_train_annotations/",
                )

In [74]:
test[1]

(tensor([[[0.0235, 0.0314, 0.0314,  ..., 0.0314, 0.0235, 0.0196],
          [0.0314, 0.0353, 0.0353,  ..., 0.0196, 0.0196, 0.0157],
          [0.0314, 0.0353, 0.0392,  ..., 0.0078, 0.0118, 0.0118],
          ...,
          [0.0196, 0.0196, 0.0196,  ..., 0.0196, 0.0235, 0.0235],
          [0.0196, 0.0235, 0.0196,  ..., 0.0235, 0.0235, 0.0157],
          [0.0235, 0.0275, 0.0235,  ..., 0.0235, 0.0235, 0.0157]],
 
         [[0.0275, 0.0275, 0.0275,  ..., 0.0314, 0.0235, 0.0196],
          [0.0275, 0.0314, 0.0314,  ..., 0.0196, 0.0196, 0.0157],
          [0.0314, 0.0353, 0.0392,  ..., 0.0078, 0.0118, 0.0118],
          ...,
          [0.0000, 0.0039, 0.0039,  ..., 0.0196, 0.0196, 0.0196],
          [0.0000, 0.0000, 0.0000,  ..., 0.0157, 0.0157, 0.0078],
          [0.0000, 0.0039, 0.0039,  ..., 0.0157, 0.0157, 0.0078]],
 
         [[0.0431, 0.0471, 0.0471,  ..., 0.0314, 0.0235, 0.0275],
          [0.0471, 0.0510, 0.0510,  ..., 0.0196, 0.0196, 0.0157],
          [0.0392, 0.0431, 0.0471,  ...,

In [16]:
import os
import numpy as np
import torch
from PIL import Image
 
class PennFudanDataset(object):
    def __init__(self, root, transforms):
        self.root = root
        self.transforms = transforms
        # 下载所有图像文件，为其排序
        # 确保它们对齐
        self.imgs = list(sorted(os.listdir(os.path.join(root, "JPGImages"))))
        #self.masks = list(sorted(os.listdir(os.path.join(root, "PedMasks"))))
 
    def __getitem__(self, idx):
        # load images ad masks
        img_path = os.path.join(self.root, "PNGImages", self.imgs[idx])
        #mask_path = os.path.join(self.root, "PedMasks", self.masks[idx])
        img = Image.open(img_path).convert("RGB")
        # 请注意我们还没有将mask转换为RGB,
        # 因为每种颜色对应一个不同的实例
        # 0是背景
        #mask = Image.open(mask_path)
        # 将PIL图像转换为numpy数组
        #mask = np.array(mask)
        # 实例被编码为不同的颜色
        #obj_ids = np.unique(mask)
        # 第一个id是背景，所以删除它
        #obj_ids = obj_ids[1:]
 
        # 将颜色编码的mask分成一组
        # 二进制格式
        #masks = mask == obj_ids[:, None, None]
 
        # 获取每个mask的边界框坐标
        #num_objs = len(obj_ids)
        boxes = []
        for i in range(num_objs):
            pos = np.where(masks[i])
            xmin = np.min(pos[1])
            xmax = np.max(pos[1])
            ymin = np.min(pos[0])
            ymax = np.max(pos[0])
            boxes.append([xmin, ymin, xmax, ymax])
 
        # 将所有转换为torch.Tensor
        boxes = torch.as_tensor(boxes, dtype=torch.float32)
        # 这里仅有一个类
        labels = torch.ones((num_objs,), dtype=torch.int64)
        #masks = torch.as_tensor(masks, dtype=torch.uint8)
 
        image_id = torch.tensor([idx])
        area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
        # 假设所有实例都不是人群
        iscrowd = torch.zeros((num_objs,), dtype=torch.int64)
 
        target = {}
        target["boxes"] = boxes
        target["labels"] = labels
        target["masks"] = masks
        target["image_id"] = image_id
        target["area"] = area
        target["iscrowd"] = iscrowd
 
        if self.transforms is not None:
            img, target = self.transforms(img, target)
 
        return img, target
 
    def __len__(self):
        return len(self.imgs)

SyntaxError: invalid character in identifier (<ipython-input-16-7c04d216464a>, line 6)

In [3]:
# 画像ファイルパスから読み込み
def read(filename): 
    return Image.open(filename)



Constructs a Faster R-CNN model with a ResNet-50-FPN backbone.

The input to the model is expected to be a list of tensors, each of shape [C, H, W], one for each image, and should be in 0-1 range. Different images can have different sizes.

The behavior of the model changes depending if it is in training or evaluation mode.

During training, the model expects both the input tensors, as well as a targets (list of dictionary), containing:

boxes (FloatTensor[N, 4]): the ground-truth boxes in [x1, y1, x2, y2] format, with values between 0 and H and 0 and W

labels (Int64Tensor[N]): the class label for each ground-truth box

The model returns a Dict[Tensor] during training, containing the classification and regression losses for both the RPN and the R-CNN.

During inference, the model requires only the input tensors, and returns the post-processed predictions as a List[Dict[Tensor]], one for each input image. The fields of the Dict are as follows:

boxes (FloatTensor[N, 4]): the predicted boxes in [x1, y1, x2, y2] format, with values between 0 and H and 0 and W

labels (Int64Tensor[N]): the predicted labels for each image

scores (Tensor[N]): the scores or each prediction

In [16]:
from torchvision import models
models.

In [24]:
from torchvision import datasets
from torchvision import models
data = datasets.VOCDetection('./VOC_Detection',year='2012', image_set='train', download=True)

Downloading http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar to ./VOC_Detection\VOCtrainval_11-May-2012.tar


1999642624it [10:20, 4304268.42it/s]                                                                                   

In [42]:
data[0][1]

{'annotation': {'folder': 'VOC2012',
  'filename': '2008_000008.jpg',
  'source': {'database': 'The VOC2008 Database',
   'annotation': 'PASCAL VOC2008',
   'image': 'flickr'},
  'size': {'width': '500', 'height': '442', 'depth': '3'},
  'segmented': '0',
  'object': [{'name': 'horse',
    'pose': 'Left',
    'truncated': '0',
    'occluded': '1',
    'bndbox': {'xmin': '53', 'ymin': '87', 'xmax': '471', 'ymax': '420'},
    'difficult': '0'},
   {'name': 'person',
    'pose': 'Unspecified',
    'truncated': '1',
    'occluded': '0',
    'bndbox': {'xmin': '158', 'ymin': '44', 'xmax': '289', 'ymax': '167'},
    'difficult': '0'}]}}