In [1]:
# Global variables. Set with respect to your circumstances
A2D2_PATH = "/home/g.leontiev/a2d2/"

# Benchmark creation. Dataset split.

In [2]:
# Import libs
import json
import numpy as np
from tutorial_modules import *
from os.path import sep as os_sep
from os.path import join as join_path
from os.path import exists as path_exists
from glob import glob as gg
from random import choice as r_ch
from tqdm import tqdm
import pickle

# Load files
with open(join_path(A2D2_PATH, "cams_lidars.json"), "rb") as f:
    config = json.load(f)
    
# Load files
with open(join_path(A2D2_PATH, "camera_lidar_semantic", "class_list.json"), "rb") as f:
     class_list= json.load(f)

INFO - 2022-06-13 15:47:35,739 - utils - Note: NumExpr detected 16 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
INFO - 2022-06-13 15:47:35,740 - utils - NumExpr defaulting to 8 threads.


В датасете присутствуют данные для семантической сегментации, при этом не для каждого изображения есть разбивки и данные дальномера.
В будущем для сравнения эффективности разных подходов будут использоваться в том числе техники слияния данных с разных сенсоров, поэтому для нашего "бенчмарка" нужно подобрать те наблюдения, которые представлены с разных сенсоров и для которых есть маска сегментации.

In [3]:
ss_p = join_path(A2D2_PATH, "camera_lidar_semantic")
parent_folders, sensor_types, sensor_aligns, all_files = [np.unique([p.split("/")[5:][i] for p in gg(join_path(ss_p, "*/*/*/*"))]) for i in range(4)]

Для однозначной идентификации наблюдения достаточно указать через "_" дату, положение камеры и timestamp

In [4]:
ids = sorted(set([ # remove duplicates and sort
    "_".join([a,d,c]) for a,b,c,d in [ # create id
        x.split(os_sep)[-1].split(".")[0].split("_") for x in # split filename into parts
        gg(join_path(ss_p, f"*{os_sep}*{os_sep}*{os_sep}*")) # find all data from all sensors
    ]]))

In [5]:
sens_ext = {
    "camera": ".png",
    "label": ".png",
    "lidar": ".npz"
}

rel_ = lambda __p: join_path(*__p.split(os_sep)[__p.split(os_sep).index('camera_lidar_semantic'):])
abs_ = lambda __p: join_path(A2D2_PATH, __p)

def sensor_p(_id, s_type):
    if s_type not in sens_ext.keys(): raise ValueError("Wrong sensor type: s_type")
    d,t,s = _id.split("_")
    _p = "_".join([d, s_type, s, t]) + sens_ext[s_type]
    _p = join_path(ss_p, f"{d[:8]}_{d[8:]}", s_type, f"cam_{sa_(s)}", _p)
    return _p

def sa_(x):
    als = ["center", "left", "right"]
    for o in als:
        if o in x:
            return x.replace(o, "_" + o)
    raise ValueError(f"Bad index contains wrong sensor align: {x}")

In [6]:
ds_ids = []
for _id in ids:
    _exists = True
    for i in sensor_types:
        _exists = _exists and path_exists(sensor_p(_id, i))
    if _exists:
        ds_ids.append(_id)

При разбиении датасета на выборки, важно учитывать временную составляющую, поскольку многие потенциально эффективные нейронные сети используют рекуррентные слои. Поэтому важно отсортировать id по дате и дню, а также не разбрасывать данные одного наблюдения разных сенсоров по выборкам

In [7]:
if not path_exists("bm_ds.pkl"):
    ds_pool = sorted(set(["_".join(i.split("_")[:-1]) for i in ds_ids]))
    val_size = test_size = round(len(ds_pool) * 0.15)
    subsets = test_ids, val_ids, train_ids = [], [], []
    namesplits = test_list, val_list, train_list = ds_pool[-test_size:], ds_pool[-test_size-val_size:-test_size], ds_pool[:-test_size-val_size]
    for ss, ns in zip(subsets, namesplits):
        for n in tqdm(ns):
            for p in ds_ids:
                if n in p:
                    ss.append(p)
    bm_ds = {"test_ids" : test_ids, "val_ids" : val_ids, "train_ids" : train_ids}
    with open("bm_ds.pkl", "wb") as f:
        pickle.dump(bm_ds, f)
else:
    with open("bm_ds.pkl", "rb") as f:
        bm_ds = pickle.load(f)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 5655/5655 [00:17<00:00, 323.95it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 5655/5655 [00:17<00:00, 326.27it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████| 26388/26388 [01:21<00:00, 325.00it/s]


# Semantic segmentation. Baseline model.

In [8]:
import torch
from PIL import Image
from torchvision import transforms
from torchvision.models.segmentation.deeplabv3 import DeepLabHead
from torchvision.models.segmentation import deeplabv3_resnet101

In [9]:
NUM_CLASSES = len(class_list)
random_image_id = r_ch(bm_ds['train_ids'])

image_p = sensor_p(random_image_id, "camera")
label_p = sensor_p(random_image_id, "label")

In [11]:
from PIL import Image
from torchvision import transforms
model = deeplabv3_resnet101(pretrained=True, progress=True)
input_image = Image.open(image_p)
input_image = input_image.convert("RGB")
preprocess = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

input_tensor = preprocess(input_image)
input_batch = input_tensor.unsqueeze(0) # create a mini-batch as expected by the model

# move the input and model to GPU for speed if available
if torch.cuda.is_available():
    input_batch = input_batch.to('cuda')
    model.to('cuda')

with torch.no_grad():
    output = model(input_batch)['out'][0]
output_predictions = output.argmax(0)

ValueError: Expected more than 1 value per channel when training, got input size torch.Size([1, 256, 1, 1])

In [None]:
# model = deeplabv3_resnet101(pretrained=True, progress=True)
# model.classifier = DeepLabHead(2048, NUM_CLASSES)
# _ = model.train()

# import torch
# model = torch.hub.load('pytorch/vision:v0.10.0', 'deeplabv3_resnet50', pretrained=True)
# # or any of these variants
# # model = torch.hub.load('pytorch/vision:v0.10.0', 'deeplabv3_resnet101', pretrained=True)
# # model = torch.hub.load('pytorch/vision:v0.10.0', 'deeplabv3_mobilenet_v3_large', pretrained=True)
# model.eval()




# input_image = Image.open(image_p)
# input_image = input_image.convert("RGB")

# preprocess = transforms.Compose([
#     transforms.ToTensor(),
#     transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
# ])

# input_tensor = preprocess(input_image)
# input_batch = input_tensor.unsqueeze(0)

# if torch.cuda.is_available():
#     input_batch = input_batch.to('cuda')
#     model.to('cuda')

# with torch.no_grad():
#     output = model(input_batch)['out'][0]
# output_predictions = output.argmax(0)

# # create a color pallette, selecting a color for each class
# palette = torch.tensor([2 ** 25 - 1, 2 ** 15 - 1, 2 ** 21 - 1])
# colors = torch.as_tensor([i for i in range(21)])[:, None] * palette
# colors = (colors % 255).numpy().astype("uint8")

# # plot the semantic segmentation predictions of 21 classes in each color
# r = Image.fromarray(output_predictions.byte().cpu().numpy()).resize(input_image.size)
# r.putpalette(colors)

# import matplotlib.pyplot as plt
# plt.imshow(r)
# # plt.show()

# input_image