In [2]:
#torch_vision
#データ前処理
import torch as t
from torchvision import models
from torchvision import transforms
from torchvision import datasets
from torch.utils.data import DataLoader

import cv2
import PIL
import os
import sys
import numpy as np
import pandas as pd
import seaborn as sns

from PIL import Image
from io import BytesIO
from matplotlib import pyplot as plt
import json
from collections import Counter

# Root folder that holds the competition data. The original machine-specific
# location stays as the default; set the SIGNATE_DATA_DIR environment variable
# to run the notebook elsewhere without editing this cell.
path = os.environ.get("SIGNATE_DATA_DIR", "C:/Users/dso-s.gao/Desktop/signate")
# Make modules stored under the data folder importable. The membership guard
# keeps re-running this cell from appending the same entry again and again.
if path not in sys.path:
    sys.path.append(path)

  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)


### Constructs a Faster R-CNN model with a ResNet-50-FPN backbone.
---
---
 - The input to the model is expected to be a list of tensors, each of shape [C, H, W], <br>
   one for each image, and should be in 0-1 range.   Different images can have different sizes.

 - The behavior of the model changes depending on whether it is in training or evaluation mode.

 - During training, the model expects both the input tensors and the targets (a list of dictionaries, one per image), each containing:

 - boxes (FloatTensor[N, 4]): the ground-truth boxes in [x1, y1, x2, y2] format, <br>
   with values between 0 and H and 0 and W

 - labels (Int64Tensor[N]): the class label for each ground-truth box

 - The model returns a Dict[Tensor] during training, containing the classification and regression losses for both the RPN and the R-CNN.

 - During inference, the model requires only the input tensors, and returns the post-processed predictions as a List[Dict[Tensor]], <br>
   one for each input image. The fields of the Dict are as follows:

 - boxes (FloatTensor[N, 4]): the predicted boxes in [x1, y1, x2, y2] format, with values between 0 and H and 0 and W

 - labels (Int64Tensor[N]): the predicted label for each detected box

 - scores (Tensor[N]): the score of each prediction

In [3]:
# Decode the contents of one annotation JSON file.
def read_json_file(file_name):
    """Load an annotation file and pull out the fields used downstream.

    Args:
        file_name: Path of the JSON annotation file to read.

    Returns:
        A 4-tuple ``(address, time, bbox, category)`` where ``address`` is
        ``attributes.route``, ``time`` is ``attributes.timeofday``, ``bbox`` is
        the list of every label's ``box2d`` value and ``category`` is the list
        of every label's ``category`` value, both in file order.

    Raises:
        KeyError: If any of the keys read here is missing from the file.
    """
    with open(file_name, "rb") as handle:
        annotation = json.load(handle)

    attributes = annotation["attributes"]
    address = attributes['route']
    time = attributes['timeofday']

    labels = annotation["labels"]
    # Two separate passes: all boxes first, then all category names.
    bbox = []
    for entry in labels:
        bbox.append(entry["box2d"])
    category = []
    for entry in labels:
        category.append(entry["category"])

    return address, time, bbox, category

In [4]:
# Define the integer label of every annotation category.
# Ids are assigned in the listed order, starting at 1.
# NOTE(review): presumably 0 is left free for the detector's background class - confirm.
dict_category = dict(
    zip(
        (
            'Car',
            'Bicycle',
            'Pedestrian',
            'Signal',
            'Signs',
            'Truck',
            'Bus',
            'SVehicle',
            'Motorbike',
            'Train',
        ),
        range(1, 11),
    )
)

In [5]:
import os
import numpy as np
import torch
from PIL import Image

# Shared preprocessing applied to every sample.
class PennFudanDataset(object):
    """Map-style dataset that pairs each image with its JSON annotation.

    Image files are listed from ``root`` and annotation files from
    ``anotation_root``; both listings are sorted and then matched by position.

    NOTE(review): the pairing relies on the two sorted listings lining up
    one-to-one. Confirm both folders contain exactly the same samples.

    Args:
        root: Directory containing the image files.
        anotation_root: Directory containing the JSON annotation files.
        transforms: When truthy, the image is converted to a tensor with
            ``transforms.ToTensor()``; otherwise the PIL image is returned.
        train: Accepted for interface compatibility; not used by this class.
        category_dict: Mapping from category name to integer label.
    """

    def __init__(self, root, anotation_root, transforms=True, train=True, category_dict=dict_category):
        self.root = root
        self.anotation_root = anotation_root
        self.category_dict = category_dict
        self.transforms = transforms
        # List every file and sort both listings so that position i refers to
        # the same sample in the image list and in the annotation list.
        self.imgs = list(sorted(os.listdir(root)))
        self.anotation = list(sorted(os.listdir(anotation_root)))

    def __getitem__(self, index):
        """Return ``(img, boxes, labels)`` for the sample at ``index``.

        Returns:
            img: RGB image, as a tensor when ``self.transforms`` is truthy and
                as a PIL image otherwise.
            boxes: float32 tensor of shape ``[N, 4]`` holding
                ``[x1, y1, x2, y2]`` per annotated object (``[0, 4]`` when the
                annotation has no objects).
            labels: int64 tensor of shape ``[N]`` with the integer label of
                each object, looked up in ``self.category_dict``.

        Raises:
            KeyError: If an annotation uses a category missing from
                ``self.category_dict``.
        """
        # Read the image with PIL. os.path.join inserts the separator when
        # ``root`` does not already end with one (plain string concatenation
        # silently produced a wrong path in that case).
        # The path is kept on the instance because the original class exposed it.
        self.img_path = os.path.join(self.root, self.imgs[index])
        img = Image.open(self.img_path).convert("RGB")

        # The detection model expects a list of tensors, each of shape
        # [C, H, W] and in the 0-1 range; images may differ in size.
        # Further augmentations (crop, flip, normalise, ...) can be added to
        # this pipeline when needed.
        if self.transforms:
            transform = transforms.Compose([
                transforms.ToTensor(),  # convert the PIL image to a tensor
            ])
            img = transform(img)

        # Load the bounding boxes and category names of this sample.
        annotation_path = os.path.join(self.anotation_root, self.anotation[index])
        address, time, bbox, category = read_json_file(annotation_path)

        # boxes (FloatTensor[N, 4]): [x1, y1, x2, y2] per object.
        boxes = [[box["x1"], box["y1"], box["x2"], box["y2"]] for box in bbox]
        # reshape(-1, 4) keeps the [N, 4] layout even when there are no objects.
        boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)

        # labels (Int64Tensor[N]): the detection model requires int64 class ids.
        category_transed = [self.category_dict[cate] for cate in category]
        category_transed = torch.as_tensor(category_transed, dtype=torch.int64)

        return img, boxes, category_transed

    def __len__(self):
        """Return the number of image files found in ``root``."""
        return len(self.imgs)

In [69]:
# Build the training dataset from the image folder and the annotation folder.
# Both folder strings keep their trailing "/".
data_set = PennFudanDataset(
    anotation_root=f"{path}/dtc_train_annotations/dtc_train_annotations/",
    root=f"{path}/dtc_train_images_0/dtc_train_images/",
)

In [68]:
# Custom collate function used when batching object-detection samples.
def my_collate_fn(batch):
    """Collate ``(image, boxes, labels)`` samples into one batch.

    Each element of ``batch`` must unpack into exactly three items. The images
    are stacked along a new leading dimension, so every image in the batch
    must have the same shape. Boxes and labels stay as per-image lists because
    the number of objects differs from image to image.

    Args:
        batch: Iterable of ``(image, boxes, labels)`` samples.

    Returns:
        ``[images, boxes, category]`` where ``images`` is the stacked image
        tensor, ``boxes`` is the list of per-image box tensors and
        ``category`` is the list of per-image label tensors.
    """
    samples = list(batch)
    image_list = [picture for picture, _, _ in samples]
    boxes = [box for _, box, _ in samples]
    category = [cate for _, _, cate in samples]

    images = torch.stack(image_list, dim=0)
    return [images, boxes, category]

In [59]:
# Training loader: shuffled batches of two samples, loaded in the main process
# (num_workers=0). A trailing incomplete batch is dropped, and my_collate_fn
# assembles each batch.
train_dataloader = DataLoader(
    data_set,
    batch_size=2,
    shuffle=True,
    num_workers=0,
    collate_fn=my_collate_fn,
    drop_last=True,
)

In [63]:
# Sanity check: print the boxes and labels of the first batch only.
# The loop always stops after its first iteration.
for step, (img, boxes, category_transed) in enumerate(train_dataloader):
    print(boxes, category_transed)
    break

[tensor([[ 869.,  592., 1097.,  779.],
        [1067.,  516., 1180.,  649.],
        [1173.,  584., 1382.,  725.]]), tensor([[   0.,  537.,   58.,  787.],
        [  74.,  540.,  157.,  769.],
        [ 132.,  522.,  193.,  699.],
        [ 173.,  545.,  216.,  714.],
        [ 800.,  497.,  950.,  608.],
        [1656.,  581., 1929.,  671.],
        [ 412.,  557.,  434.,  600.],
        [ 594.,  560.,  627.,  606.],
        [ 627.,  562.,  650.,  606.],
        [ 651.,  563.,  668.,  606.],
        [ 658.,  565.,  679.,  605.],
        [ 676.,  567.,  696.,  606.],
        [ 712.,  572.,  731.,  611.],
        [ 726.,  567.,  745.,  615.],
        [ 752.,  567.,  770.,  610.],
        [ 764.,  567.,  788.,  615.],
        [ 778.,  567.,  801.,  611.],
        [ 793.,  567.,  811.,  618.],
        [ 821.,  567.,  839.,  603.],
        [ 843.,  567.,  858.,  601.],
        [ 858.,  583.,  877.,  615.],
        [ 810.,  484.,  826.,  519.],
        [ 887.,  437.,  907.,  485.],
        [