In [None]:
!which pip
!which python
# !pip install numpy torch scikit-learn tqdm
# !pip install pandas matplotlib torchvision IPython opencv-python opencv-contrib-python

In [None]:
import os
import numpy as np
import gc
import torch.distributed as dist
import pandas as pd
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP
import matplotlib.pyplot as plt
from typing import List
import cv2
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torchvision
from torchvision import transforms
import pandas as pd
import json
from PIL.Image import Image
import PIL
from tqdm import tqdm
from PIL import ImageDraw
import engine

## Config

In [None]:
class Config:
    batch_size = 16
    num_epochs = 5
    learning_rate = 1e-4
    weight_decay = 1e-5
    num_classes = 10
    input_size = (800, 800)
    
    def __init__(self) -> None:
        pass
    
config = Config()

In [None]:
def parse_json_file(label_path: str) -> pd.DataFrame:
    with open(label_path, "r") as f:
        data = json.load(f)
        
    attributes = pd.Series(data["attributes"])
    boxes = pd.DataFrame(list(map(lambda d: d["box2d"], data["labels"])))
    categories = pd.Series(list(map(lambda d: d["category"], data["labels"])))

    df = pd.DataFrame(
        data={
            "category": categories,
            "x1": boxes.x1,
            "x2": boxes.x2,
            "y1": boxes.y1,
            "y2": boxes.y2,    
        },
    )
    for (key, val) in attributes.items():
        df[key] = val
    df["image_id"] = int((label_path.split("/")[-1].split(".")[0]).replace("_", "").replace("train", ""))
    return df

## Dataset

In [None]:
import torchvision.transforms.functional as F

def get_padding(image):    
    w, h = image.size
    max_wh = np.max([w, h])
    h_padding = (max_wh - w) / 2
    v_padding = (max_wh - h) / 2    
    r_pad = h_padding if h_padding % 1 == 0 else h_padding-0.5
    b_pad = v_padding if v_padding % 1 == 0 else v_padding-0.5
    r_pad *= 2
    b_pad *= 2
    padding = (0, 0, int(r_pad), int(b_pad))
    return padding

class NewPad(object):
    def __init__(self, fill=0, padding_mode='constant'):
        assert padding_mode in ['constant', 'edge', 'reflect', 'symmetric']

        self.fill = fill
        self.padding_mode = padding_mode
        
    def __call__(self, img):
        """
        Args:
            img (PIL Image): Image to be padded.

        Returns:
            PIL Image: Padded image.
        """
        return F.pad(img, get_padding(img), self.fill, self.padding_mode)
    
    def __repr__(self):
        return self.__class__.__name__ + '(padding={0}, fill={1}, padding_mode={2})'.\
            format(self.fill, self.padding_mode)

In [None]:
from PIL.Image import open as pil_open

def label_str_to_num(label: str) -> int:
    return int(label[0])

class CustomDataset(Dataset):
    def __init__(self, data: List[str], labels: List[str] = None, transform: torchvision.transforms.Compose = None, has_label: bool = False, resizing: bool = True) -> None:
        super().__init__()
        self.data = data
        self.labels = labels
        self.transform = transform
        self.has_label = has_label
        self.resizing = resizing
        
    def __len__(self):
        return len(self.data)
        
    def __getitem__(self, index: int) -> torch.Tensor:
        image_path = self.data[index]
        img = pil_open(image_path).convert("RGB")
        # width = img.size[0]
        # height = img.size[1]
        # print(f"width: {width}, height: {height}")
        
        if not self.has_label:
            img = self.transform(img)
            return img, {}

        label_info = parse_json_file(self.labels[index])
        image_id = label_info["image_id"].values[0]
        label = torch.tensor(np.array(list(map(label_str_to_num, label_info.category.values))))        
        ratio = config.input_size[0]/max(img.size) if self.resizing else 1
        boxes = [
            # label_info.x1 * config.input_size[1] / width,
            # label_info.y1 * config.input_size[0] / height, 
            # label_info.x2 * config.input_size[1] / width + 1, 
            # label_info.y2 * config.input_size[0] / height + 1,
            label_info.x1 * ratio,
            label_info.y1 * ratio, 
            label_info.x1 * ratio + (label_info.x2 - label_info.x1) * ratio + 1, 
            label_info.y1 * ratio + (label_info.y2 - label_info.y1) * ratio + 1, 
        ]
        for i in range(len(boxes)):
            boxes[i] = torch.tensor((boxes[i]).values)
        boxes = torch.stack(boxes, dim=1)
        area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
        N = label.size()[0]
        iscrowd = torch.zeros((N, ), dtype=torch.int64)
        targets = {
            "boxes": boxes,
            "labels": label,
            "area": area,
            "iscrowd": iscrowd,
            "image_id": torch.tensor([image_id])            
        }
        
        if self.transform is not None:
            img = self.transform(img)
        else:
            assert False
        
        return img, targets


## Model

In [None]:
import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor, FasterRCNN_ResNet50_FPN_Weights


class CustomModel(nn.Module):
    def __init__(self, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)        
        model = torchvision.models.detection.fasterrcnn_resnet50_fpn(weights=FasterRCNN_ResNet50_FPN_Weights.DEFAULT)
        num_classes = config.num_classes
        in_features = model.roi_heads.box_predictor.cls_score.in_features
        model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)
        self.model = model
        
    def forward(self, img, targets = [{}], train = False):
        if train:            
            return self.model(img, targets)
        else:
            return self.model(img)

## Data preparation

In [None]:
from glob import glob
import pathlib
from sklearn.model_selection import train_test_split

annotation_path = pathlib.Path("train_annotations") / "*.json"
labels = list(glob(annotation_path.absolute().as_posix()))
new_labels = []

images_path = pathlib.Path("train_images") / "*.jpg"
images = list(glob(images_path.absolute().as_posix()))
# new_images = []
# for i in range(len(images)):
#     # 1600 x 1200ではない解像度を除外
#     img_size = pil_open(images[i]).convert("RGB").size
#     if img_size == (1600, 1200):
#         new_images.append(images[i])
#         new_labels.append(labels[i])
# images = new_images
# labels = new_labels

In [None]:
train_images, val_images, train_labels, val_labels = train_test_split(images, labels)

del images
del labels
gc.collect()

In [None]:
def custom_collate(batch):
    images = [item[0] for item in batch]
    targets = [item[1] for item in batch]
    images = torch.stack(images, dim=0)
    return images, targets


In [None]:
device = torch.device("cuda")
model = CustomModel().to(device)
model = nn.DataParallel(model).to(device)

## Train

In [None]:
dataset = CustomDataset(
    labels=train_labels,
    data=train_images,
    has_label=True,
    transform=torchvision.transforms.Compose(transforms=[   
        NewPad(),
        transforms.Resize(config.input_size),
        transforms.ToTensor(),
    ])
)
dataloader = DataLoader(dataset=dataset, batch_size=config.batch_size, shuffle=True, num_workers=4, collate_fn=custom_collate)
optimizer = optim.Adam(model.parameters(), lr=config.learning_rate)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)

### Val preparation

In [None]:
val_dataset = CustomDataset(
    labels=val_labels, 
    data=val_images, 
    has_label=True,
    resizing=True,
    transform=torchvision.transforms.Compose(transforms=[
        NewPad(),
        transforms.Resize(config.input_size),
        transforms.ToTensor(),
    ])
)
val_dataloader = DataLoader(dataset=dataset, batch_size=config.batch_size, shuffle=False, num_workers=4, collate_fn=custom_collate)


In [None]:
def show(img, target):
    img = torchvision.transforms.ToPILImage()(img)
    draw = ImageDraw.Draw(img)

    for bbox, label in zip(target["boxes"], target["labels"]):
        label = label.item()
        origin = (bbox[0], bbox[1])
        rect = (origin, (bbox[2], bbox[3]))
        draw.text(xy=origin, text=str(label))
        draw.rectangle(xy=rect, outline=(255, 0, 0))
        
    img.show()

In [None]:
img, target = dataset[1]
show(img, target)

In [16]:
def train(epoch: int):
    bar = tqdm(val_dataloader)
    # train  
    model.train()
    for imgs, d_targets in bar: 
        imgs = imgs.to(device)
        targets = [{k: v.to(device) for k, v in t.items()} for t in d_targets]
        loss_dict = model(imgs, targets, True)
        loss = sum(loss_dict.values())
        bar.set_description(f"train loss: {loss}")
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()        
        
    scheduler.step()

    # val
    # 検証フェーズ
    model.eval()    
    with torch.no_grad():
        eval = engine.evaluate(model, val_dataloader, device)        
        imgs = eval.eval_imgs
        for iou_type in eval.iou_types:
            print("Result:")
            print(" iou type:", iou_type)
            print(" ", imgs[iou_type])

main_bar = tqdm(range(1, config.num_epochs+1))
for epoch in main_bar:
    train(epoch)

train loss: tensor([0.3084], device='cuda:0', dtype=torch.float64, grad_fn=<AddBackward0>): 100%|██████████| 108/108 [00:38<00:00,  2.77it/s]


creating index...
index created!
Test:  [  0/108]  eta: 0:05:08  model_time: 0.6154 (0.6154)  evaluator_time: 0.6456 (0.6456)  time: 2.8531  data: 1.5590  max mem: 5436
Test:  [100/108]  eta: 0:00:10  model_time: 0.5826 (0.5894)  evaluator_time: 0.6425 (0.6758)  time: 1.3271  data: 0.0434  max mem: 5436
Test:  [107/108]  eta: 0:00:01  model_time: 0.5824 (0.5850)  evaluator_time: 0.5653 (0.6635)  time: 1.2126  data: 0.0438  max mem: 5436
Test: Total time: 0:02:24 (1.3355 s / it)
Averaged stats: model_time: 0.5824 (0.5850)  evaluator_time: 0.5653 (0.6635)
Accumulating evaluation results...
DONE (t=3.21s).
IoU metric: bbox
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.103
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.169
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.108
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.017
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=med

 60%|██████    | 3/5 [13:35<09:03, 271.51s/it]

[[[{'image_id': 8299820032, 'category_id': 1, 'aRng': [0, 10000000000.0], 'maxDet': 100, 'dtIds': [1626, 1684, 1742], 'gtIds': [310], 'dtMatches': array([[310.,   0.,   0.],
          [310.,   0.,   0.],
          [310.,   0.,   0.],
          [310.,   0.,   0.],
          [310.,   0.,   0.],
          [310.,   0.,   0.],
          [310.,   0.,   0.],
          [  0.,   0.,   0.],
          [  0.,   0.,   0.],
          [  0.,   0.,   0.]]), 'gtMatches': array([[1626.],
          [1626.],
          [1626.],
          [1626.],
          [1626.],
          [1626.],
          [1626.],
          [   0.],
          [   0.],
          [   0.]]), 'dtScores': [0.22822630405426025, 0.22822630405426025, 0.22822630405426025], 'gtIgnore': array([0]), 'dtIgnore': array([[False, False, False],
          [False, False, False],
          [False, False, False],
          [False, False, False],
          [False, False, False],
          [False, False, False],
          [False, False, False],
          [

train loss: tensor([0.2955], device='cuda:0', dtype=torch.float64, grad_fn=<AddBackward0>): 100%|██████████| 108/108 [00:38<00:00,  2.78it/s]


creating index...
index created!
Test:  [  0/108]  eta: 0:05:55  model_time: 0.6108 (0.6108)  evaluator_time: 0.6235 (0.6235)  time: 3.2881  data: 2.0206  max mem: 5436
Test:  [100/108]  eta: 0:00:10  model_time: 0.5848 (0.5903)  evaluator_time: 0.5757 (0.6109)  time: 1.2472  data: 0.0401  max mem: 5436
Test:  [107/108]  eta: 0:00:01  model_time: 0.5848 (0.5861)  evaluator_time: 0.5111 (0.5991)  time: 1.1503  data: 0.0421  max mem: 5436
Test: Total time: 0:02:17 (1.2726 s / it)
Averaged stats: model_time: 0.5848 (0.5861)  evaluator_time: 0.5111 (0.5991)
Accumulating evaluation results...
DONE (t=2.66s).
IoU metric: bbox
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.120
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.170
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.139
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.020
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=med

 80%|████████  | 4/5 [17:59<04:28, 268.15s/it]

[[[{'image_id': 8299820032, 'category_id': 1, 'aRng': [0, 10000000000.0], 'maxDet': 100, 'dtIds': [1150, 1194, 1238], 'gtIds': [310], 'dtMatches': array([[310.,   0.,   0.],
          [310.,   0.,   0.],
          [310.,   0.,   0.],
          [310.,   0.,   0.],
          [310.,   0.,   0.],
          [310.,   0.,   0.],
          [310.,   0.,   0.],
          [  0.,   0.,   0.],
          [  0.,   0.,   0.],
          [  0.,   0.,   0.]]), 'gtMatches': array([[1150.],
          [1150.],
          [1150.],
          [1150.],
          [1150.],
          [1150.],
          [1150.],
          [   0.],
          [   0.],
          [   0.]]), 'dtScores': [0.07911579310894012, 0.07911579310894012, 0.07911579310894012], 'gtIgnore': array([0]), 'dtIgnore': array([[False, False, False],
          [False, False, False],
          [False, False, False],
          [False, False, False],
          [False, False, False],
          [False, False, False],
          [False, False, False],
          [

train loss: tensor([0.2762], device='cuda:0', dtype=torch.float64, grad_fn=<AddBackward0>): 100%|██████████| 108/108 [00:38<00:00,  2.79it/s]


creating index...
index created!
Test:  [  0/108]  eta: 0:05:35  model_time: 0.8524 (0.8524)  evaluator_time: 0.6173 (0.6173)  time: 3.1091  data: 1.6069  max mem: 5436
Test:  [100/108]  eta: 0:00:10  model_time: 0.5866 (0.5962)  evaluator_time: 0.5691 (0.6012)  time: 1.2581  data: 0.0427  max mem: 5436
Test:  [107/108]  eta: 0:00:01  model_time: 0.5850 (0.5916)  evaluator_time: 0.4806 (0.5896)  time: 1.1445  data: 0.0421  max mem: 5436
Test: Total time: 0:02:16 (1.2670 s / it)
Averaged stats: model_time: 0.5850 (0.5916)  evaluator_time: 0.4806 (0.5896)
Accumulating evaluation results...
DONE (t=2.36s).
IoU metric: bbox
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.121
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.170
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.140
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.020
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=med

100%|██████████| 5/5 [22:20<00:00, 268.05s/it]

[[[{'image_id': 8299820032, 'category_id': 1, 'aRng': [0, 10000000000.0], 'maxDet': 100, 'dtIds': [1100, 1142, 1184], 'gtIds': [310], 'dtMatches': array([[310.,   0.,   0.],
          [310.,   0.,   0.],
          [310.,   0.,   0.],
          [310.,   0.,   0.],
          [310.,   0.,   0.],
          [310.,   0.,   0.],
          [310.,   0.,   0.],
          [  0.,   0.,   0.],
          [  0.,   0.,   0.],
          [  0.,   0.,   0.]]), 'gtMatches': array([[1100.],
          [1100.],
          [1100.],
          [1100.],
          [1100.],
          [1100.],
          [1100.],
          [   0.],
          [   0.],
          [   0.]]), 'dtScores': [0.06359256058931351, 0.06359256058931351, 0.06359256058931351], 'gtIgnore': array([0]), 'dtIgnore': array([[False, False, False],
          [False, False, False],
          [False, False, False],
          [False, False, False],
          [False, False, False],
          [False, False, False],
          [False, False, False],
          [




In [17]:
import time

torch.save(model, f'model_weight-{time.time()}.pth')

: 