<a href="https://colab.research.google.com/github/dariadementeva/NQ_Working/blob/main/mapillary_streetscapes_lux_city_new.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Installs the package
!pip install mapillary

Collecting mapillary
  Downloading mapillary-1.0.15-py3-none-any.whl.metadata (10 kB)
Collecting mapbox-vector-tile>=2.1.0 (from mapillary)
  Downloading mapbox_vector_tile-2.2.0-py3-none-any.whl.metadata (16 kB)
Collecting mercantile>=1.2.1 (from mapillary)
  Downloading mercantile-1.2.1-py3-none-any.whl.metadata (4.8 kB)
Collecting vt2geojson>=0.2.1 (from mapillary)
  Downloading vt2geojson-0.2.1-py3-none-any.whl.metadata (1.5 kB)
Collecting haversine>=2.3.1 (from mapillary)
  Downloading haversine-2.9.0-py2.py3-none-any.whl.metadata (5.8 kB)
Collecting turfpy>=0.0.7 (from mapillary)
  Downloading turfpy-0.0.8-py3-none-any.whl.metadata (4.2 kB)
Collecting geojson>=2.5.0 (from mapillary)
  Downloading geojson-3.2.0-py3-none-any.whl.metadata (16 kB)
Collecting protobuf<7.0.0,>=6.31.1 (from mapbox-vector-tile>=2.1.0->mapillary)
  Downloading protobuf-6.33.5-cp39-abi3-manylinux2014_x86_64.whl.metadata (593 bytes)
Collecting pyclipper<2.0.0,>=1.3.0 (from mapbox-vector-tile>=2.1.0->mapilla

In [None]:
from collections import Counter
import json
import mapillary.interface as mly
import math
import pandas as pd
import pprint, re
import random
import requests
import requests
import time

In [None]:
ACCESS_TOKEN = "MLY|25517726937837208|3f8625861dba933ed6b04631405be5bd"

# Luxembourg City (rough) bbox
WEST, SOUTH, EAST, NORTH = 6.05, 49.55, 6.20, 49.67

# 6.056671, 49.570060,6.188507, 49.645435: lux city
# 6.080446,49.599675,6.160870,49.622686: lux centre


FIELDS = "id,geometry,captured_at,thumb_1024_url"
LIMIT = 2000  # keep at max = 2000

# define grids to get the images per grid

def grid_bboxes(w, s, e, n, nx=4, ny=4):
    dx = (e - w) / nx
    dy = (n - s) / ny
    for ix in range(nx):
        for iy in range(ny):
            yield (w + ix*dx, s + iy*dy, w + (ix+1)*dx, s + (iy+1)*dy)

def get_images_bbox(bbox, retries=5):
    url = "https://graph.mapillary.com/images"
    params = {
        "access_token": ACCESS_TOKEN,
        "bbox": ",".join(map(str, bbox)),
        "fields": FIELDS,
        "limit": LIMIT,
    }

    for attempt in range(retries + 1):
        try:
            r = requests.get(url, params=params, timeout=60)
            if r.status_code in (429, 500, 502, 503, 504):
                raise requests.HTTPError(f"{r.status_code} {r.text[:200]}", response=r)
            r.raise_for_status()
            return r.json()
        except Exception as ex:
            if attempt == retries:
                raise
            time.sleep((2 ** attempt) + random.random())

def paginate(first_json):
    out = []
    data = first_json
    while True:
        out.extend(data.get("data", []))
        nxt = data.get("paging", {}).get("next")
        if not nxt:
            break
        r = requests.get(nxt, timeout=60)
        if r.status_code in (429, 500, 502, 503, 504):
            time.sleep(2 + random.random())
            r = requests.get(nxt, timeout=60)
        r.raise_for_status()
        data = r.json()
    return out

all_images = []
for bbox in grid_bboxes(WEST, SOUTH, EAST, NORTH, nx=5, ny=5):
    js = get_images_bbox(bbox)
    imgs = paginate(js)
    all_images.extend(imgs)
    print("tile bbox", bbox, "->", len(imgs), "images")

# deduplicate by id (grid overlaps, grid border duplicates)
uniq = {img["id"]: img for img in all_images}
images = list(uniq.values())
print("Unique images, total:", len(images))

tile bbox (6.05, 49.55, 6.08, 49.574) -> 1892 images
tile bbox (6.05, 49.574, 6.08, 49.598) -> 1133 images
tile bbox (6.05, 49.598, 6.08, 49.622) -> 1536 images
tile bbox (6.05, 49.622, 6.08, 49.646) -> 1613 images
tile bbox (6.05, 49.646, 6.08, 49.67) -> 1874 images
tile bbox (6.08, 49.55, 6.11, 49.574) -> 1815 images
tile bbox (6.08, 49.574, 6.11, 49.598) -> 1944 images
tile bbox (6.08, 49.598, 6.11, 49.622) -> 1526 images
tile bbox (6.08, 49.622, 6.11, 49.646) -> 1604 images
tile bbox (6.08, 49.646, 6.11, 49.67) -> 1946 images
tile bbox (6.11, 49.55, 6.14, 49.574) -> 1898 images
tile bbox (6.11, 49.574, 6.14, 49.598) -> 1775 images
tile bbox (6.11, 49.598, 6.14, 49.622) -> 1865 images
tile bbox (6.11, 49.622, 6.14, 49.646) -> 1934 images
tile bbox (6.11, 49.646, 6.14, 49.67) -> 1938 images
tile bbox (6.14, 49.55, 6.17, 49.574) -> 1724 images
tile bbox (6.14, 49.574, 6.17, 49.598) -> 1795 images
tile bbox (6.14, 49.598, 6.17, 49.622) -> 1763 images
tile bbox (6.14, 49.622, 6.17, 49.6

In [None]:
# save to the json file

with open("mapillary_luxembourg_bbox.json", "w", encoding="utf-8") as f:
    json.dump(images, f, ensure_ascii=False, indent=2)

print("Saved", len(images), "records to mapillary_luxembourg_bbox.json")

Saved 43081 records to mapillary_luxembourg_bbox.json


In [None]:
JSON_PATH = "/content/mapillary_luxembourg_bbox.json"

with open(JSON_PATH, "r", encoding="utf-8") as f:
    data = json.load(f)

print("Top-level type:", type(data))

def pick_items(data):
    if isinstance(data, list):
        return data
    if isinstance(data, dict):
        for k in ["features", "images", "items", "data", "results"]:
            v = data.get(k)
            if isinstance(v, list):
                print(f"Using data['{k}'] as records")
                return v
        if all(isinstance(v, dict) for v in data.values()):
            print("Using dict values as records (id -> obj)")
            return list(data.values())
    return []

items = pick_items(data)
print("Number of records:", len(items))
print("First record type:", type(items[0]) if items else None)
print("Top keys in first record:", list(items[0].keys())[:30] if items and isinstance(items[0], dict) else None)


Top-level type: <class 'list'>
Number of records: 43081
First record type: <class 'dict'>
Top keys in first record: ['id', 'geometry', 'captured_at', 'thumb_1024_url']


In [None]:
pp = pprint.PrettyPrinter(depth=5, width=120)

if isinstance(data, dict):
    print("Top-level keys:", list(data.keys())[:50])
    print("GeoJSON type:", data.get("type"))
    if "features" in data:
        print("features type:", type(data["features"]), "len:", len(data["features"]))
        items = data["features"]

print("example")
pp.pprint(items[0])


example
{'captured_at': 1680175272403,
 'geometry': {'coordinates': [6.0612534, 49.5592706], 'type': 'Point'},
 'id': '135115332855948',
 'thumb_1024_url': 'https://scontent-bru2-1.xx.fbcdn.net/m1/v/t6/An_aOcle52VSckWX-8Aj-80wq10VoFE98EFRqZVAWSs8ejWfz9v8dPCDXZ4yfwNHGXUN_Exh43v5Ol-c-DnM68MUduU4QwYhcEU-yKf4KgGeB0JX-fLzSj4B4o11vFk1KYT6B9H_nyT4qqhWfVVcpw?stp=s1024x576&edm=AOnQwmMEAAAA&_nc_gid=OCaOMKjbGr4CxLCNFvhJtA&_nc_oc=AdnUTMO1eWSaSSXKb1LOEQ7qsrmFpL5OUwDqf5L21v86b_qE9R7eILq7mxuM2egkXWE&ccb=10-5&oh=00_AfqsUtro9aKF3d2O38w8TS_7oo1FtvIhTqduyCwdHAffow&oe=69A2C415&_nc_sid=201bca'}


In [None]:
# Create a csv for geocoding, keep ids, lon and lats, timestamps and urls

JSON_PATH = "/content/mapillary_luxembourg_bbox.json"
OUT_CSV = "/content/mapillary_all_43k_coords.csv"

with open(JSON_PATH, "r", encoding="utf-8") as f:
    data = json.load(f)

features = data["features"] if isinstance(data, dict) and "features" in data else data
print("Total features:", len(features))

rows = []
skipped = 0
missing_url = 0

for feat in features:
    image_id = ( # get ID
        feat.get("id") or
        feat.get("image_id") or
        feat.get("properties", {}).get("id")
    )

    geom = feat.get("geometry", {}) # get geometry
    coords = geom.get("coordinates")

    #
    thumb_url = ( # get url
        feat.get("thumb_1024_url") or
        feat.get("thumb_2048_url") or
        feat.get("url")
    )


    captured_at = ( # get timestamp
        feat.get("captured_at") or
        feat.get("properties", {}).get("captured_at") or
        feat.get("properties", {}).get("timestamp") or
        ""
    )

    if image_id is None or not isinstance(coords, list) or len(coords) != 2:
        skipped += 1
        continue

    if not thumb_url:
        missing_url += 1
        thumb_url = ""

    lon, lat = coords

    rows.append({
        "id": str(image_id),
        "captured_at": captured_at,
        "lon": float(lon),
        "lat": float(lat),
        "thumb_1024_url": thumb_url
    })

df = (
    pd.DataFrame(rows)
    .drop_duplicates(subset=["id"])
    .reset_index(drop=True)
)

df.to_csv(OUT_CSV, index=False)

print("Saved:", OUT_CSV)
print({
    "rows_written": len(df),
    "skipped_invalid": skipped,
    "missing_url": missing_url
})

df.head()


Total features: 43081
✅ Saved: /content/mapillary_all_43k_coords.csv
{'rows_written': 43081, 'skipped_invalid': 0, 'missing_url': 45}


Unnamed: 0,id,captured_at,lon,lat,thumb_1024_url
0,135115332855948,1680175272403,6.061253,49.559271,https://scontent-bru2-1.xx.fbcdn.net/m1/v/t6/A...
1,209139725534508,1698313506346,6.061008,49.559129,https://scontent-bru2-1.xx.fbcdn.net/m1/v/t6/A...
2,311225928352720,1698313553399,6.078481,49.562919,https://scontent-bru2-1.xx.fbcdn.net/m1/v/t6/A...
3,385057477049231,1658824856830,6.050998,49.55309,https://scontent-bru2-1.xx.fbcdn.net/m1/v/t6/A...
4,679104880390821,1692888443941,6.051723,49.553629,https://scontent-bru2-1.xx.fbcdn.net/m1/v/t6/A...


In [None]:
from google.colab import files
files.download("/content/mapillary_all_43k_coords.csv") # download in csv for geocoding

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Fetch the images


In [None]:
import pandas as pd

csv_path = "/content/sample_data/geo_all_lux.csv"
df = pd.read_csv(csv_path)

df.head()


  df = pd.read_csv(csv_path)


Unnamed: 0,id,captured_at,lon,lat,thumb_1024_url,batch_id,address,place_id,licence,osm_type,...,commercial,historic,craft,man_made,neighbourhood,farm,residential,place,allotments,industrial
0,778445043542389,1681798905521,6.058698,49.591418,https://scontent-bru2-1.xx.fbcdn.net/m1/v/t6/A...,1,"CR 163, Grevels-Barrière, Lorentzscheuer, Bert...",108642548,"Data © OpenStreetMap contributors, ODbL 1.0. h...",way,...,,,,,,,,,,
1,787405862846807,1681798845498,6.053924,49.597758,https://scontent-bru2-1.xx.fbcdn.net/m1/v/t6/A...,1,"Route de Longwy, Grevels-Barrière, Lorentzsche...",108421508,"Data © OpenStreetMap contributors, ODbL 1.0. h...",way,...,,,,,,,,,,
2,1351241788777971,1678778758505,6.060283,49.594162,https://scontent-bru2-1.xx.fbcdn.net/m1/v/t6/A...,1,"CR 163, Grevels-Barrière, Lorentzscheuer, Bert...",108642548,"Data © OpenStreetMap contributors, ODbL 1.0. h...",way,...,,,,,,,,,,
3,2233262170140424,1493733556070,6.050446,49.597099,https://scontent-bru2-1.xx.fbcdn.net/m1/v/t6/A...,1,"N 5, Grevels-Barrière, Lorentzscheuer, Bertran...",108132672,"Data © OpenStreetMap contributors, ODbL 1.0. h...",way,...,,,,,,,,,,
4,157685697113031,1677654436569,6.051386,49.597229,https://scontent-bru2-1.xx.fbcdn.net/m1/v/t6/A...,1,"Route de Longwy, Grevels-Barrière, Lorentzsche...",108834957,"Data © OpenStreetMap contributors, ODbL 1.0. h...",way,...,,,,,,,,,,


In [None]:
for batch_id, batch_df in df.groupby("batch_id"):
    print(f"Batch {batch_id} → {len(batch_df)} rows")


Batch 1 → 5157 rows
Batch 2 → 5627 rows
Batch 3 → 8141 rows
Batch 4 → 8577 rows
Batch 5 → 8616 rows


In [None]:
MAPILLARY_TOKEN = "MLY|25517726937837208|3f8625861dba933ed6b04631405be5bd"

In [None]:
# GET THE IMAGES FROM BATCH 1

In [None]:
import requests
import time
import os
from tqdm import tqdm

# Create output folder
os.makedirs("images_batch_1", exist_ok=True)

# Filter batch 1
batch_1_df = df[df["batch_id"] == 1].dropna(subset=["thumb_1024_url"])

print(f"Downloading {len(batch_1_df)} images from batch 1")

for _, row in tqdm(
    batch_1_df.iterrows(),
    total=len(batch_1_df),
    desc="Downloading images"
):
    image_url = row["thumb_1024_url"]
    image_id = str(row["id"])

    file_path = f"images_batch_1/{image_id}.jpg"

    # Skip already downloaded images
    if os.path.exists(file_path):
        continue

    try:
        r = requests.get(image_url, timeout=10)
        if r.status_code == 200:
            with open(file_path, "wb") as f:
                f.write(r.content)
    except Exception as e:
        print(f"Error {image_id}: {e}")

    time.sleep(0.3)  # avoid rate limits


Downloading 5157 images from batch 1


Downloading images: 100%|██████████| 5157/5157 [00:00<00:00, 16040.99it/s]


In [None]:
import os
import pandas as pd
import re

# Folder with existing images
IMAGE_DIR = "images_batch_1"

# Filter batch 1
batch_1_df = df[
    (df["batch_id"] == 1) &
    df["id"].notna() &
    df["image_name"].notna()
]

def safe_filename(name):
    name = str(name)
    return re.sub(r'[\\/*?:"<>|]', "_", name)

renamed = 0
skipped = 0
missing = 0

for _, row in batch_1_df.iterrows():
    old_name = f"{row['id']}.jpg"
    new_name = f"{safe_filename(row['image_name'])}.jpg"

    old_path = os.path.join(IMAGE_DIR, old_name)
    new_path = os.path.join(IMAGE_DIR, new_name)

    if not os.path.exists(old_path):
        missing += 1
        continue

    if os.path.exists(new_path):
        skipped += 1
        continue

    os.rename(old_path, new_path)
    renamed += 1

print(f"Renamed: {renamed}")
print(f"Skipped (already exists): {skipped}")
print(f"Missing original files: {missing}")


Renamed: 5157
Skipped (already exists): 0
Missing original files: 0


In [None]:
import shutil

shutil.make_archive(
    base_name="images_batch_1",  # output zip name (without .zip)
    format="zip",
    root_dir="images_batch_1"
)

from google.colab import files

files.download("images_batch_1.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# GET THE IMAGES FROM BATCH 2

In [None]:
import requests
import time
import os
from tqdm import tqdm

# Create output folder
os.makedirs("images_batch_2", exist_ok=True)

# Filter batch 2
batch_2_df = df[df["batch_id"] == 2].dropna(subset=["thumb_1024_url"])

print(f"Downloading {len(batch_2_df)} images from batch 2")

for _, row in tqdm(
    batch_2_df.iterrows(),
    total=len(batch_2_df),
    desc="Downloading images"
):
    image_url = row["thumb_1024_url"]
    image_id = str(row["id"])

    file_path = f"images_batch_2/{image_id}.jpg"

    # Skip already downloaded images
    if os.path.exists(file_path):
        continue

    try:
        r = requests.get(image_url, timeout=10)
        if r.status_code == 200:
            with open(file_path, "wb") as f:
                f.write(r.content)
    except Exception as e:
        print(f"Error {image_id}: {e}")

    time.sleep(0.3)  # avoid rate limits


Downloading 5627 images from batch 2


Downloading images: 100%|██████████| 5627/5627 [2:24:10<00:00,  1.54s/it]


In [None]:
import os
import pandas as pd
import re

# Folder with existing images
IMAGE_DIR = "images_batch_2"

# Filter batch 2
batch_1_df = df[
    (df["batch_id"] == 2) &
    df["id"].notna() &
    df["image_name"].notna()
]

def safe_filename(name):
    name = str(name)
    return re.sub(r'[\\/*?:"<>|]', "_", name)

renamed = 0
skipped = 0
missing = 0

for _, row in batch_2_df.iterrows():
    old_name = f"{row['id']}.jpg"
    new_name = f"{safe_filename(row['image_name'])}.jpg"

    old_path = os.path.join(IMAGE_DIR, old_name)
    new_path = os.path.join(IMAGE_DIR, new_name)

    if not os.path.exists(old_path):
        missing += 1
        continue

    if os.path.exists(new_path):
        skipped += 1
        continue

    os.rename(old_path, new_path)
    renamed += 1

print(f"Renamed: {renamed}")
print(f"Skipped (already exists): {skipped}")
print(f"Missing original files: {missing}")


Renamed: 5627
Skipped (already exists): 0
Missing original files: 0


In [None]:
import shutil

shutil.make_archive(
    base_name="images_batch_2",  # output zip name (without .zip)
    format="zip",
    root_dir="images_batch_2"
)

from google.colab import files

files.download("images_batch_2.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# GET THE IMAGES, BATCH 3

In [None]:
import requests
import time
import os
from tqdm import tqdm

# Create output folder
os.makedirs("images_batch_3", exist_ok=True)

# Filter batch 3
batch_3_df = df[df["batch_id"] == 3].dropna(subset=["thumb_1024_url"])

print(f"Downloading {len(batch_3_df)} images from batch 3")

for _, row in tqdm(
    batch_3_df.iterrows(),
    total=len(batch_3_df),
    desc="Downloading images"
):
    image_url = row["thumb_1024_url"]
    image_id = str(row["id"])

    file_path = f"images_batch_3/{image_id}.jpg"

    # Skip already downloaded images
    if os.path.exists(file_path):
        continue

    try:
        r = requests.get(image_url, timeout=10)
        if r.status_code == 200:
            with open(file_path, "wb") as f:
                f.write(r.content)
    except Exception as e:
        print(f"Error {image_id}: {e}")

    time.sleep(0.3)  # avoid rate limits


Downloading 8141 images from batch 3


Downloading images: 100%|██████████| 8141/8141 [3:32:57<00:00,  1.57s/it]


In [None]:
import os
import pandas as pd
import re

# Folder with existing images
IMAGE_DIR = "images_batch_3"

# Filter batch 3
batch_3_df = df[
    (df["batch_id"] == 3) &
    df["id"].notna() &
    df["image_name"].notna()
]

def safe_filename(name):
    name = str(name)
    return re.sub(r'[\\/*?:"<>|]', "_", name)

renamed = 0
skipped = 0
missing = 0

for _, row in batch_3_df.iterrows():
    old_name = f"{row['id']}.jpg"
    new_name = f"{safe_filename(row['image_name'])}.jpg"

    old_path = os.path.join(IMAGE_DIR, old_name)
    new_path = os.path.join(IMAGE_DIR, new_name)

    if not os.path.exists(old_path):
        missing += 1
        continue

    if os.path.exists(new_path):
        skipped += 1
        continue

    os.rename(old_path, new_path)
    renamed += 1

print(f"Renamed: {renamed}")
print(f"Skipped (already exists): {skipped}")
print(f"Missing original files: {missing}")

import shutil

shutil.make_archive(
    base_name="images_batch_3",  # output zip name (without .zip)
    format="zip",
    root_dir="images_batch_3"
)

from google.colab import files

files.download("images_batch_3.zip")

Renamed: 8141
Skipped (already exists): 0
Missing original files: 0


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# GET THE IMAGES, BATCH 4

In [None]:
import requests
import time
import os
from tqdm import tqdm

# Create output folder
os.makedirs("images_batch_4", exist_ok=True)

# Filter batch 4
batch_4_df = df[df["batch_id"] == 4].dropna(subset=["thumb_1024_url"])

print(f"Downloading {len(batch_4_df)} images from batch 4")

for _, row in tqdm(
    batch_4_df.iterrows(),
    total=len(batch_4_df),
    desc="Downloading images"
):
    image_url = row["thumb_1024_url"]
    image_id = str(row["id"])

    file_path = f"images_batch_4/{image_id}.jpg"

    # Skip already downloaded images
    if os.path.exists(file_path):
        continue

    try:
        r = requests.get(image_url, timeout=10)
        if r.status_code == 200:
            with open(file_path, "wb") as f:
                f.write(r.content)
    except Exception as e:
        print(f"Error {image_id}: {e}")

    time.sleep(0.3)  # avoid rate limits


Downloading 8535 images from batch 4


Downloading images: 100%|██████████| 8535/8535 [2:57:43<00:00,  1.25s/it]


In [None]:
import os
import pandas as pd
import re

# Folder with existing images
IMAGE_DIR = "images_batch_4"

# Filter batch 4
batch_4_df = df[
    (df["batch_id"] == 4) &
    df["id"].notna() &
    df["image_name"].notna()
]

def safe_filename(name):
    name = str(name)
    return re.sub(r'[\\/*?:"<>|]', "_", name)

renamed = 0
skipped = 0
missing = 0

for _, row in batch_4_df.iterrows():
    old_name = f"{row['id']}.jpg"
    new_name = f"{safe_filename(row['image_name'])}.jpg"

    old_path = os.path.join(IMAGE_DIR, old_name)
    new_path = os.path.join(IMAGE_DIR, new_name)

    if not os.path.exists(old_path):
        missing += 1
        continue

    if os.path.exists(new_path):
        skipped += 1
        continue

    os.rename(old_path, new_path)
    renamed += 1

print(f"Renamed: {renamed}")
print(f"Skipped (already exists): {skipped}")
print(f"Missing original files: {missing}")

import shutil

shutil.make_archive(
    base_name="images_batch_4",  # output zip name (without .zip)
    format="zip",
    root_dir="images_batch_4"
)

from google.colab import files

files.download("images_batch_4.zip")

Renamed: 8520
Skipped (already exists): 0
Missing original files: 57


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# FETCH THE IMAGES, BATCH 5

In [None]:
import requests
import time
import os
from tqdm import tqdm

# Create output folder
os.makedirs("images_batch_5", exist_ok=True)

# Filter batch 5
batch_5_df = df[df["batch_id"] == 5].dropna(subset=["thumb_1024_url"])

print(f"Downloading {len(batch_5_df)} images from batch 5")

for _, row in tqdm(
    batch_5_df.iterrows(),
    total=len(batch_5_df),
    desc="Downloading images"
):
    image_url = row["thumb_1024_url"]
    image_id = str(row["id"])

    file_path = f"images_batch_5/{image_id}.jpg"

    # Skip already downloaded images
    if os.path.exists(file_path):
        continue

    try:
        r = requests.get(image_url, timeout=10)
        if r.status_code == 200:
            with open(file_path, "wb") as f:
                f.write(r.content)
    except Exception as e:
        print(f"Error {image_id}: {e}")

    time.sleep(0.3)  # avoid rate limits


Downloading 8616 images from batch 5


Downloading images: 100%|██████████| 8616/8616 [2:53:33<00:00,  1.21s/it]


In [None]:
import os
import pandas as pd
import re

# Folder with existing images
IMAGE_DIR = "images_batch_5"

# Filter batch 5
batch_5_df = df[
    (df["batch_id"] == 5) &
    df["id"].notna() &
    df["image_name"].notna()
]

def safe_filename(name):
    name = str(name)
    return re.sub(r'[\\/*?:"<>|]', "_", name)

renamed = 0
skipped = 0
missing = 0

for _, row in batch_5_df.iterrows():
    old_name = f"{row['id']}.jpg"
    new_name = f"{safe_filename(row['image_name'])}.jpg"

    old_path = os.path.join(IMAGE_DIR, old_name)
    new_path = os.path.join(IMAGE_DIR, new_name)

    if not os.path.exists(old_path):
        missing += 1
        continue

    if os.path.exists(new_path):
        skipped += 1
        continue

    os.rename(old_path, new_path)
    renamed += 1

print(f"Renamed: {renamed}")
print(f"Skipped (already exists): {skipped}")
print(f"Missing original files: {missing}")

import shutil

shutil.make_archive(
    base_name="images_batch_5",  # output zip name (without .zip)
    format="zip",
    root_dir="images_batch_5"
)

from google.colab import files

files.download("images_batch_5.zip")

Renamed: 8601
Skipped (already exists): 0
Missing original files: 15


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Scrape neighborhoods
# GARE

In [1]:
from google.colab import files
uploaded = files.upload("gare_boundary.shp.zip")

Saving gare_boundary.shp.zip to gare_boundary.shp.zip/gare_boundary.shp.zip


In [8]:
!find /content/unzipped_gare -maxdepth 3 -type f -name "*.shp" -print

In [2]:
import os, getpass
os.environ["MLY|25517726937837208|3f8625861dba933ed6b04631405be5bd"] = os.environ.get("MMLY|25517726937837208|3f8625861dba933ed6b04631405be5bd") or getpass.getpass("Paste Mapillary token:")


Paste Mapillary token:··········


In [17]:
!pip -q install geopandas mapillary tqdm

import os, json, getpass
import geopandas as gpd
import mapillary.interface as mly
from shapely.geometry import Polygon, MultiPolygon
from tqdm.auto import tqdm

# 1) Set your shapefile path (CHANGE THIS)
SHAPEFILE = "/content/gare_boundary/gare_boundary.shp/gare_boundary.shp"
OUT_JSON = "/content/mapillary_images.json"


# --- token ---
token = os.environ.get("MAPILLARY_TOKEN", "").strip() or getpass.getpass("Mapillary token (starts with MLY|): ")
os.environ["MAPILLARY_TOKEN"] = token
mly.set_access_token(token)

# --- AOI to EPSG:4326 ---
gdf = gpd.read_file(SHAPEFILE)
aoi = gdf.dissolve().geometry.iloc[0]
aoi = gpd.GeoSeries([aoi], crs=gdf.crs).to_crs(4326).iloc[0]

# --- FeatureCollection with nesting your SDK expects ---
if isinstance(aoi, Polygon):
    geom = {"type": "Polygon", "coordinates": [list(aoi.exterior.coords)]}
elif isinstance(aoi, MultiPolygon):
    geom = {"type": "MultiPolygon", "coordinates": [[list(p.exterior.coords)] for p in aoi.geoms]}
else:
    raise TypeError(type(aoi))

shape_fc = {"type": "FeatureCollection", "features": [{"type": "Feature", "properties": {}, "geometry": geom}]}

# --- get IDs inside AOI ---
fc = mly.images_in_shape(shape=shape_fc, image_type="flat", zoom=14)
features = list(fc.features)
print("Found features:", len(features))

# --- Graph API helper (bypasses SDK id validation) ---
FIELDS = "captured_at,thumb_1024_url,thumb_original_url,sequence"
BASE = "https://graph.mapillary.com/"

session = requests.Session()
session.headers.update({"Authorization": f"OAuth {token}"})

def graph_image_meta(image_id: str):
    r = session.get(f"{BASE}{image_id}", params={"fields": FIELDS}, timeout=30)
    # If an id is not an image id, Graph will usually 400/404; handle gracefully
    if not r.ok:
        return {"_error_status": r.status_code, "_error_text": r.text[:200]}
    return r.json()

# --- fetch metadata with progress ---
images = []
for f in tqdm(features, desc="Graph API lookups", unit="item"):
    f = f.to_dict() if hasattr(f, "to_dict") else dict(f)

    any_id = f["properties"]["id"]
    lon, lat = f["geometry"]["coordinates"]

    meta = graph_image_meta(any_id)

    images.append({
        "id": any_id,
        "lon": lon,
        "lat": lat,
        **meta,  # captured_at, thumb urls, sequence, or _error_*
    })

# --- save ---
with open(OUT_JSON, "w", encoding="utf-8") as fp:
    json.dump({"count": len(images), "images": images}, fp, indent=2)

print("Saved:", OUT_JSON, "items:", len(images))

from google.colab import files
files.download(OUT_JSON)




[Vector Tiles API] Fetching 4 tilesfor images ...
Requesting GET to https://tiles.mapillary.com/maps/vtp/mly1_public/2/14/8470/5584/?access_token=MLY%7C25517726937837208%7C3f8625861dba933ed6b04631405be5bd


INFO:mapillary.utils.client:Requesting GET to https://tiles.mapillary.com/maps/vtp/mly1_public/2/14/8470/5584/?access_token=MLY%7C25517726937837208%7C3f8625861dba933ed6b04631405be5bd


Response 200 OK received in 200ms


INFO:mapillary.utils.client:Response 200 OK received in 200ms


Requesting GET to https://tiles.mapillary.com/maps/vtp/mly1_public/2/14/8470/5585/?access_token=MLY%7C25517726937837208%7C3f8625861dba933ed6b04631405be5bd


INFO:mapillary.utils.client:Requesting GET to https://tiles.mapillary.com/maps/vtp/mly1_public/2/14/8470/5585/?access_token=MLY%7C25517726937837208%7C3f8625861dba933ed6b04631405be5bd


Response 200 OK received in 193ms


INFO:mapillary.utils.client:Response 200 OK received in 193ms


Requesting GET to https://tiles.mapillary.com/maps/vtp/mly1_public/2/14/8471/5584/?access_token=MLY%7C25517726937837208%7C3f8625861dba933ed6b04631405be5bd


INFO:mapillary.utils.client:Requesting GET to https://tiles.mapillary.com/maps/vtp/mly1_public/2/14/8471/5584/?access_token=MLY%7C25517726937837208%7C3f8625861dba933ed6b04631405be5bd


Response 200 OK received in 223ms


INFO:mapillary.utils.client:Response 200 OK received in 223ms


Requesting GET to https://tiles.mapillary.com/maps/vtp/mly1_public/2/14/8471/5585/?access_token=MLY%7C25517726937837208%7C3f8625861dba933ed6b04631405be5bd


INFO:mapillary.utils.client:Requesting GET to https://tiles.mapillary.com/maps/vtp/mly1_public/2/14/8471/5585/?access_token=MLY%7C25517726937837208%7C3f8625861dba933ed6b04631405be5bd


Response 200 OK received in 204ms


INFO:mapillary.utils.client:Response 200 OK received in 204ms


Found features: 8908


Graph API lookups:   0%|          | 0/8908 [00:00<?, ?item/s]

Saved: /content/mapillary_images.json items: 8908


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [18]:
import json, os, requests
from tqdm.auto import tqdm

JSON_FILE = "/content/mapillary_images.json"
OUT_DIR = "/content/gare"

os.makedirs(OUT_DIR, exist_ok=True)

with open(JSON_FILE, "r", encoding="utf-8") as f:
    images = json.load(f)["images"]

session = requests.Session()

for item in tqdm(images, desc="Downloading images", unit="img"):
    img_id = item.get("id")
    url = (
        item.get("thumb_1024_url")
        or item.get("thumb_2048_url")
        or item.get("thumb_256_url")
        or item.get("thumb_original_url")
    )

    if not img_id or not url:
        continue

    out_path = os.path.join(OUT_DIR, f"{img_id}.jpg")
    if os.path.exists(out_path):
        continue

    r = session.get(url, timeout=30)
    if r.ok:
        with open(out_path, "wb") as f:
            f.write(r.content)

print("Images saved in directory:", OUT_DIR)


Downloading images:   0%|          | 0/8908 [00:00<?, ?img/s]

Images saved in directory: /content/gare


In [21]:
!zip -r gare.zip gare

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
updating: gare/747668889257973.jpg (deflated 0%)
updating: gare/1636371230155658.jpg (deflated 0%)
updating: gare/970954904621567.jpg (deflated 1%)
updating: gare/676626940671730.jpg (deflated 0%)
updating: gare/569011688698407.jpg (deflated 0%)
updating: gare/601435324567910.jpg (deflated 0%)
updating: gare/2014156548726083.jpg (deflated 0%)
updating: gare/1188587054916004.jpg (deflated 0%)
updating: gare/916350805822297.jpg (deflated 1%)
updating: gare/575586260567306.jpg (deflated 1%)
updating: gare/728549998967126.jpg (deflated 0%)
updating: gare/286429837606728.jpg (deflated 0%)
updating: gare/1562215094562216.jpg (deflated 1%)
updating: gare/581310941557112.jpg (deflated 0%)
updating: gare/488702845909412.jpg (deflated 0%)
updating: gare/164116822216283.jpg (deflated 1%)
updating: gare/104260435272240.jpg (deflated 0%)
updating: gare/1607637359971421.jpg (deflated 0%)
updating: gare/195890965715005.jpg (deflated 0%)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [31]:
from google.colab import files
files.download("/content/gare.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [24]:
from google.colab import files
uploaded = files.upload("belair_boundary.shp.zip")

Saving belair_boundary.shp.zip to belair_boundary.shp.zip/belair_boundary.shp.zip


In [28]:
!mkdir -p /content/belair_boundary
!unzip /content/belair_boundary.shp.zip/belair_boundary.shp.zip -d /content/belair_boundary

Archive:  /content/belair_boundary.shp.zip/belair_boundary.shp.zip
   creating: /content/belair_boundary/belair_boundary.shp/
  inflating: /content/belair_boundary/belair_boundary.shp/belair_boundary.dbf  
  inflating: /content/belair_boundary/belair_boundary.shp/belair_boundary.prj  
  inflating: /content/belair_boundary/belair_boundary.shp/belair_boundary.shp  
  inflating: /content/belair_boundary/belair_boundary.shp/belair_boundary.shx  


In [30]:
!pip -q install geopandas mapillary tqdm

import os, json, getpass
import geopandas as gpd
import mapillary.interface as mly
from shapely.geometry import Polygon, MultiPolygon
from tqdm.auto import tqdm

# 1) Set your shapefile path (CHANGE THIS)
SHAPEFILE = "/content/belair_boundary/belair_boundary.shp/belair_boundary.shp"
OUT_JSON = "/content/mapillary_images_belair.json"


# --- token ---
token = os.environ.get("MAPILLARY_TOKEN", "").strip() or getpass.getpass("Mapillary token (starts with MLY|): ")
os.environ["MAPILLARY_TOKEN"] = token
mly.set_access_token(token)

# --- AOI to EPSG:4326 ---
gdf = gpd.read_file(SHAPEFILE)
aoi = gdf.dissolve().geometry.iloc[0]
aoi = gpd.GeoSeries([aoi], crs=gdf.crs).to_crs(4326).iloc[0]

# --- FeatureCollection with nesting your SDK expects ---
if isinstance(aoi, Polygon):
    geom = {"type": "Polygon", "coordinates": [list(aoi.exterior.coords)]}
elif isinstance(aoi, MultiPolygon):
    geom = {"type": "MultiPolygon", "coordinates": [[list(p.exterior.coords)] for p in aoi.geoms]}
else:
    raise TypeError(type(aoi))

shape_fc = {"type": "FeatureCollection", "features": [{"type": "Feature", "properties": {}, "geometry": geom}]}

# --- get IDs inside AOI ---
fc = mly.images_in_shape(shape=shape_fc, image_type="flat", zoom=14)
features = list(fc.features)
print("Found features:", len(features))

# --- Graph API helper (bypasses SDK id validation) ---
FIELDS = "captured_at,thumb_1024_url,thumb_original_url,sequence"
BASE = "https://graph.mapillary.com/"

session = requests.Session()
session.headers.update({"Authorization": f"OAuth {token}"})

def graph_image_meta(image_id: str):
    r = session.get(f"{BASE}{image_id}", params={"fields": FIELDS}, timeout=30)
    # If an id is not an image id, Graph will usually 400/404; handle gracefully
    if not r.ok:
        return {"_error_status": r.status_code, "_error_text": r.text[:200]}
    return r.json()

# --- fetch metadata with progress ---
images = []
for f in tqdm(features, desc="Graph API lookups", unit="item"):
    f = f.to_dict() if hasattr(f, "to_dict") else dict(f)

    any_id = f["properties"]["id"]
    lon, lat = f["geometry"]["coordinates"]

    meta = graph_image_meta(any_id)

    images.append({
        "id": any_id,
        "lon": lon,
        "lat": lat,
        **meta,  # captured_at, thumb urls, sequence, or _error_*
    })

# --- save ---
with open(OUT_JSON, "w", encoding="utf-8") as fp:
    json.dump({"count": len(images), "images": images}, fp, indent=2)

print("Saved:", OUT_JSON, "items:", len(images))

from google.colab import files
files.download(OUT_JSON)

[Vector Tiles API] Fetching 4 tilesfor images ...
Requesting GET to https://tiles.mapillary.com/maps/vtp/mly1_public/2/14/8469/5583/?access_token=MLY%7C25517726937837208%7C3f8625861dba933ed6b04631405be5bd


INFO:mapillary.utils.client:Requesting GET to https://tiles.mapillary.com/maps/vtp/mly1_public/2/14/8469/5583/?access_token=MLY%7C25517726937837208%7C3f8625861dba933ed6b04631405be5bd


Response 200 OK received in 656ms


INFO:mapillary.utils.client:Response 200 OK received in 656ms


Requesting GET to https://tiles.mapillary.com/maps/vtp/mly1_public/2/14/8469/5584/?access_token=MLY%7C25517726937837208%7C3f8625861dba933ed6b04631405be5bd


INFO:mapillary.utils.client:Requesting GET to https://tiles.mapillary.com/maps/vtp/mly1_public/2/14/8469/5584/?access_token=MLY%7C25517726937837208%7C3f8625861dba933ed6b04631405be5bd


Response 200 OK received in 561ms


INFO:mapillary.utils.client:Response 200 OK received in 561ms


Requesting GET to https://tiles.mapillary.com/maps/vtp/mly1_public/2/14/8470/5583/?access_token=MLY%7C25517726937837208%7C3f8625861dba933ed6b04631405be5bd


INFO:mapillary.utils.client:Requesting GET to https://tiles.mapillary.com/maps/vtp/mly1_public/2/14/8470/5583/?access_token=MLY%7C25517726937837208%7C3f8625861dba933ed6b04631405be5bd


Response 200 OK received in 683ms


INFO:mapillary.utils.client:Response 200 OK received in 683ms


Requesting GET to https://tiles.mapillary.com/maps/vtp/mly1_public/2/14/8470/5584/?access_token=MLY%7C25517726937837208%7C3f8625861dba933ed6b04631405be5bd


INFO:mapillary.utils.client:Requesting GET to https://tiles.mapillary.com/maps/vtp/mly1_public/2/14/8470/5584/?access_token=MLY%7C25517726937837208%7C3f8625861dba933ed6b04631405be5bd


Response 200 OK received in 552ms


INFO:mapillary.utils.client:Response 200 OK received in 552ms


Found features: 9628


Graph API lookups:   0%|          | 0/9628 [00:00<?, ?item/s]

Saved: /content/mapillary_images_belair.json items: 9628


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [32]:
import json, os, requests
from tqdm.auto import tqdm

JSON_FILE = "/content/mapillary_images_belair.json"
OUT_DIR = "/content/belair"

os.makedirs(OUT_DIR, exist_ok=True)

with open(JSON_FILE, "r", encoding="utf-8") as f:
    images = json.load(f)["images"]

session = requests.Session()

for item in tqdm(images, desc="Downloading images", unit="img"):
    img_id = item.get("id")
    url = (
        item.get("thumb_1024_url")
        or item.get("thumb_2048_url")
        or item.get("thumb_256_url")
        or item.get("thumb_original_url")
    )

    if not img_id or not url:
        continue

    out_path = os.path.join(OUT_DIR, f"{img_id}.jpg")
    if os.path.exists(out_path):
        continue

    r = session.get(url, timeout=30)
    if r.ok:
        with open(out_path, "wb") as f:
            f.write(r.content)

print("Images saved in directory:", OUT_DIR)

Downloading images:   0%|          | 0/9628 [00:00<?, ?img/s]

Images saved in directory: /content/belair


In [33]:
!zip -r belair.zip belair

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  adding: belair/487129665773038.jpg (deflated 0%)
  adding: belair/788229005137851.jpg (deflated 0%)
  adding: belair/1065718031121887.jpg (deflated 1%)
  adding: belair/2326309157557958.jpg (deflated 0%)
  adding: belair/267446069063054.jpg (deflated 1%)
  adding: belair/1177909060253097.jpg (deflated 0%)
  adding: belair/1867595870367580.jpg (deflated 1%)
  adding: belair/874980993053732.jpg (deflated 0%)
  adding: belair/2747889135358963.jpg (deflated 1%)
  adding: belair/962485205437668.jpg (deflated 0%)
  adding: belair/1168085873652655.jpg (deflated 1%)
  adding: belair/974478047145989.jpg (deflated 1%)
  adding: belair/623341669029246.jpg (deflated 0%)
  adding: belair/174160774610854.jpg (deflated 0%)
  adding: belair/1418309195557864.jpg (deflated 1%)
  adding: belair/308512614251761.jpg (deflated 1%)
  adding: belair/1469668573483596.jpg (deflated 1%)
  adding: belair/426715522814066.jpg (deflated 0%)
  adding:

In [34]:
from google.colab import files
files.download("/content/belair.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>