Split Mapillary data into training and validation areas.

In [3]:
%load_ext autoreload
%autoreload 2
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import json
import sys
sys.path.append('fbsource/fbcode/surreal/')
from maploc.utils.geo import BoundaryBox
from maploc.osm.tiling_v2 import TileManager

In [2]:
# List the contents of the Mapillary dump directory (IPython automagic shell command).
ls data/mapillary_dumps_v2

In [4]:
# Root folder holding one sub-directory of dumped data per location.
dump_root = Path("data/mapillary_dumps_v2")
# locations = ["sanfrancisco_soma", "sanfrancisco_hayes", "amsterdam", "berlin", "lemans", "montrouge", "toulouse", "nantes"]
locations = [
    "sanfrancisco_soma",
    "sanfrancisco_hayes",
    "amsterdam",
    "berlin",
    "lemans",
    "montrouge",
    "toulouse",
    "nantes",
    "vilnius",
    "avignon",
    "helsinki",
    "milan",
]

# For every location, load its tile manager and its per-sequence JSON dump.
tilers, dumps = {}, {}
for loc in locations:
    loc_dir = dump_root / loc
    tilers[loc] = TileManager.load(loc_dir / "tiles.pkl")
    with open(loc_dir / "outputs_per_sequence.json", "r") as fp:
        dumps[loc] = json.load(fp)

In [5]:
# Split parameters, expressed in the same units as the `t_c2w` positions
# (presumably meters -- TODO confirm).
size = 80  # half-width of the initial box around the median position
margin = 10  # amount added to the validation box before selecting training views
target = 100  # rank (0-based) of the distance used as the validation box half-width
max_train = 50000  # upper bound on the number of training views kept per location

# Each split is a flat list of (location, sequence, view name) keys.
splits = {
    "train": [],
    "val": [],
}
for loc in locations:
    # Flatten all views of all sequences, keyed by (location, sequence, view name).
    views = {(loc, seq, n): v for seq, vs in dumps[loc].items() for n, v in vs["views"].items()}
    names = list(views.keys())
    # Keep only the first two components of each camera-to-world translation.
    pos = np.array([views[n]["t_c2w"][:2] for n in names])
    center = np.median(pos, 0)

    # Look at the views inside an initial box around the center and rank them
    # by their per-axis maximum (Chebyshev) distance to the center.
    # NOTE(review): BoundaryBox is assumed to take (min corner, max corner) -- confirm.
    bbox = BoundaryBox(center - size, center + size)
    mask = bbox.contains(pos)
    dist = np.abs(pos[mask] - center).max(-1)
    # NOTE(review): raises IndexError when `target` or fewer views fall inside
    # the initial box; behavior intentionally left unchanged.
    thresh = np.sort(dist)[target]

    # Validation views lie inside the shrunken box; training views lie outside
    # of that box grown by `margin` (assuming `BoundaryBox + scalar` pads the box
    # -- TODO confirm), which leaves a buffer between the two sets.
    bbox2 = BoundaryBox(center - thresh, center + thresh)
    val = [names[i] for i in np.where(bbox2.contains(pos))[0]]
    train = [names[i] for i in np.where(~(bbox2+margin).contains(pos))[0]]
    if len(train) > max_train:
        # Subsample without replacement: the default (replace=True) would put
        # duplicated views in the training split and yield fewer than
        # `max_train` unique views. The fixed seed keeps the split reproducible.
        idxs = np.random.RandomState(0).choice(len(train), max_train, replace=False)
        train = [train[i] for i in idxs]

    splits["train"].extend(train)
    splits["val"].extend(val)
    print(loc, len(val), len(train), thresh)
len(splits["val"]), len(splits["train"])

In [7]:
# Visual sanity check: for each location, plot both splits and the distribution
# of distances from every validation view to its closest training view.
for scene in locations:
    # Collect the 2D positions of this scene's views, separately per split.
    xy_by_split = {}
    for split_name in ["train", "val"]:
        xy_by_split[split_name] = np.stack([
            dumps[sc][se]["views"][n]["t_c2w"][:2]
            for sc, se, n in splits[split_name]
            if sc == scene
        ])
    loc_train = xy_by_split["train"]
    loc_val = xy_by_split["val"]

    plt.figure(dpi=150, figsize=(10, 4))

    # Left panel: training views in blue, validation views in red.
    plt.subplot(121)
    plt.scatter(loc_train[:, 0], loc_train[:, 1], c="blue", s=1)
    plt.scatter(loc_val[:, 0], loc_val[:, 1], c="r", s=1)
    plt.gca().set_aspect("equal")
    plt.ylabel(scene)

    # Right panel: histogram of the distance from each validation view to the
    # nearest training view (pairwise matrix is train x val, reduced over train).
    plt.subplot(122)
    nearest_train = np.linalg.norm(loc_val[None] - loc_train[:, None], axis=-1).min(0)
    plt.hist(nearest_train, bins=20)
    print(scene, np.median(nearest_train), nearest_train.min())

In [20]:
from maploc.utils.io import write_json

# Persist the train/val split lists as JSON next to the per-location dumps.
split_path = dump_root.joinpath("splits_mly12_100each.json")
write_json(split_path, splits)

In [21]:
# Upload the split file with the `manifold put` CLI; IPython expands $split_path
# to the Python variable `split_path`, which must already be defined.
!manifold put $split_path psarlin/tree/maploc/data/mapillary_v2/splits_mly12_100each.json