# Create dataset

## Download Global Streetscapes Metadata from [NUS-UAL/global-streetscapes](https://huggingface.co/datasets/NUS-UAL/global-streetscapes/tree/main/data)

In [None]:
from huggingface_hub import hf_hub_download
import pandas as pd

The following files were chosen because they seemed the most useful for this project (admittedly a subjective selection).

In [None]:
# Define dataset ID and target filenames
dataset_id = "NUS-UAL/global-streetscapes"
files = [
    "metadata_common_attributes.csv",
    "metadata_mly1.csv",
    "metadata_mly2.csv",
    "osm.csv",
    "perception.csv",
    "places365.csv",
    "season.csv",
    "simplemaps.csv",
    "contextual.csv",
]

# Download each file from the "data/" folder in the dataset repo.
# hf_hub_download recreates the repo-relative path (here "data/<name>") below
# local_dir, so local_dir has to be the working directory for the files to end
# up in "./data/<name>", which is where the following cells read them from.
# With local_dir="data/" they would be written to "data/data/<name>" instead.
# The former local_dir_use_symlinks argument is omitted: it is deprecated in
# recent huggingface_hub releases, which always write real files to local_dir.
local_paths = []
for fname in files:
    path = hf_hub_download(
        repo_id=dataset_id,
        repo_type="dataset",
        filename=f"data/{fname}",
        local_dir=".",
    )
    local_paths.append(path)
    print(f"Downloaded {fname} -> {path}")

# Optional: load one of the CSVs into pandas as a quick sanity check
df = pd.read_csv(local_paths[0])
df.head()

## Combine metadata in one file

In [None]:
# Base CSV: load only the columns needed later and keep the rows whose
# source is 'Mapillary'
metadata_common = pd.read_csv(
    "data/metadata_common_attributes.csv",
    usecols=['uuid', 'lat', 'lon', 'heading', 'orig_id', 'source'],
).loc[lambda frame: frame['source'] == 'Mapillary']

# Left-join selected columns of another CSV onto the base frame via uuid
def merge_csv(df_base, file_path, usecols):
    """Read `file_path` and left-merge it onto `df_base` on the 'uuid' column.

    Args:
        df_base: DataFrame to extend; must contain a 'uuid' column.
        file_path: Path of the CSV file to read.
        usecols: Columns to load from the CSV (forwarded to `pd.read_csv`);
            `None` loads every column.

    Returns:
        The merged DataFrame. Because this is a left merge, rows of `df_base`
        without a matching uuid get NaN in the added columns.
    """
    extra = pd.read_csv(file_path, usecols=usecols)
    merged = pd.merge(df_base, extra, how='left', on='uuid')
    return merged

# Sequentially merge all additional files.
# Each entry is (CSV path, columns to load); None loads every column.
# NOTE(review): later cells query a column named orig_id_x, so presumably one
# of these merges (likely the all-columns perception.csv) brings in a second
# orig_id column that pandas suffixes with _x/_y -- confirm before changing
# the column selection here.
for _csv_path, _columns in (
    ("data/metadata_mly1.csv", ['uuid', 'mly_quality_score']),
    ("data/metadata_mly2.csv", ['uuid', 'mly_computed_compass_angle']),
    ("data/osm.csv", ['uuid', 'type_highway']),
    ("data/perception.csv", None),
    ("data/places365.csv", ['uuid', 'place']),
    ("data/season.csv", ['uuid', 'season']),
    ("data/simplemaps.csv", ['uuid', 'city_ascii', 'city_id', 'iso3', 'admin_name']),
    ("data/contextual.csv", ['uuid', 'platform', 'view_direction', 'quality']),
):
    metadata_common = merge_csv(metadata_common, _csv_path, _columns)

# Save final result (without the DataFrame index)
metadata_common.to_csv("data/joined_metadata.csv", index=False)

print("Final dataset shape:", metadata_common.shape)
print(metadata_common.head())

## Build a GeoDataFrame (geopandas)

In [None]:
# Load the joined metadata CSV and preview its first rows
data = pd.read_csv(filepath_or_buffer="data/joined_metadata.csv")
print(data.head(n=5))

In [None]:
# Create gdf
# `gpd` is used below but is not imported anywhere else in this notebook,
# so import geopandas here to avoid a NameError.
import geopandas as gpd

# Point geometries are built from the lon/lat columns (x = lon, y = lat)
# and tagged with CRS EPSG:4326.
gdf = gpd.GeoDataFrame(
    data,
    geometry=gpd.points_from_xy(data["lon"], data["lat"]),
    crs="EPSG:4326",
)

## Connect to PostgreSQL

In [None]:
# Database connection info
# Change credentials if necessary
#
# getpass and quote_plus are used below but are not imported anywhere else in
# this notebook; import them here so the cell does not fail with a NameError.
from getpass import getpass
from urllib.parse import quote_plus

from sqlalchemy import create_engine

host = "localhost"
database = "gis"
user = "moritz"
port = "25432"
# Prompt for the password so it is not stored in the notebook
password = getpass("Enter your password: ")

# URL-encode the password to handle special characters
encoded_password = quote_plus(password)

# Add the port in the connection string
connection_string = f"postgresql://{user}:{encoded_password}@{host}:{port}/{database}"

# Engine used by the pandas/geopandas and SQLAlchemy cells below
engine = create_engine(connection_string)

# If you want to use %sql magic in Jupyter:
# (re)load the sql extension, connect it using the connection_string variable,
# and set SqlMagic's `style` option to 'DEFAULT'
%reload_ext sql
%sql $connection_string
%config SqlMagic.style = 'DEFAULT'

In [None]:
# Write the GeoDataFrame to the table 'global_streetscapes', replacing any
# existing table of that name and leaving out the DataFrame index
gdf.to_postgis(
    name='global_streetscapes',
    con=engine,
    if_exists='replace',
    index=False,
)

In [None]:
%%sql
-- Create spatial index (the cell magic has to be the first line of the cell,
-- so this note lives below it as an SQL comment)
DROP INDEX IF EXISTS idx_global_streetscapes_spgist_geometry;
CREATE INDEX idx_global_streetscapes_spgist_geometry ON global_streetscapes USING SPGIST (geometry);

In [None]:
%%sql
-- Test if it worked (the cell magic has to be the first line of the cell)
SELECT * FROM global_streetscapes LIMIT 1;

# Create table 'berlin'

In [None]:
%%sql
DROP TABLE IF EXISTS berlin;
CREATE TABLE berlin AS
SELECT *
FROM global_streetscapes
WHERE city_ascii = 'Berlin'
AND mly_quality_score >= 0.95
-- Keep rows whose two heading values agree within 10 degrees, measured on the
-- circle, so that e.g. 359 and 1 count as 2 apart instead of 358.
-- The difference is folded into 0..360 and then mirrored around 180.
-- (assumes both columns are angles in degrees -- TODO confirm)
AND 180 - ABS(
        MOD(CAST(ABS(heading - mly_computed_compass_angle) AS numeric), 360) - 180
    ) <= 10;

In [None]:
%%sql
-- UTM for meters (the cell magic has to be the first line of the cell)
-- IF NOT EXISTS lets this cell be re-run without failing on the existing column
ALTER TABLE berlin ADD COLUMN IF NOT EXISTS geometry_32633 geometry(Geometry, 32633);
UPDATE berlin
SET geometry_32633 = ST_Transform(geometry, 32633);

In [None]:
%%sql
-- Create spatial index (the cell magic has to be the first line of the cell)
-- IF NOT EXISTS lets this cell be re-run without failing on the existing index
CREATE INDEX IF NOT EXISTS idx_berlin_spgist_geometry_32633 ON berlin USING SPGIST (geometry_32633);

In [None]:
%%sql
-- Test if it worked (the cell magic has to be the first line of the cell)
SELECT * FROM berlin LIMIT 1;

## 'berlin' needs to be modified 

The `lat` and `lon` columns are imprecise, so we additionally store the computed latitude and longitude provided by Mapillary.

In [None]:
%%sql
-- Add two new columns for the computed coordinates
-- (the cell magic has to be the first line of the cell;
-- IF NOT EXISTS lets this cell be re-run without failing)
ALTER TABLE berlin
ADD COLUMN IF NOT EXISTS comp_lat double precision,
ADD COLUMN IF NOT EXISTS comp_lon double precision;

In [None]:
import os
import requests
from sqlalchemy import text
from tqdm import tqdm

## Set Mapillary token

In [None]:
# Replace the placeholder with your own key; the value is kept both in the
# `token` variable and in the MAPILLARY_TOKEN environment variable
token = "<YOUR_API_KEY>"
os.environ["MAPILLARY_TOKEN"] = token

def fetch_metadata(img_id: str, token: str) -> tuple[float, float] | None:
    """
    Return (lon, lat) from Mapillary’s computed_geometry or None if missing.

    Args:
        img_id: Mapillary image ID, used as the path segment of the request URL.
        token: Mapillary access token, sent as the ``access_token`` query parameter.

    Returns:
        ``(lon, lat)`` when the response contains a two-element
        ``computed_geometry.coordinates`` list, otherwise ``None``.

    Raises:
        requests.HTTPError: via ``raise_for_status()`` on a 4xx/5xx response.
        requests.RequestException: for connection problems or the 10 s timeout.
    """
    # Let requests build and percent-encode the query string instead of
    # concatenating the token into the URL by hand.
    r = requests.get(
        f"https://graph.mapillary.com/{img_id}",
        params={"access_token": token, "fields": "id,computed_geometry"},
        timeout=10,
    )
    r.raise_for_status()
    # `or {}` also covers a JSON null value for the key; chaining .get() on
    # None would raise AttributeError instead of reporting "missing".
    geometry = r.json().get("computed_geometry") or {}
    coords = geometry.get("coordinates")
    if coords and len(coords) == 2:
        lon, lat = coords      # Mapillary returns [lon, lat]
        return lon, lat
    return None

## Collect the IDs that need updating

In [None]:
# Fetch the orig_id_x of every berlin row that still lacks a computed coordinate
with engine.connect() as conn:
    result = conn.execute(text("""
        SELECT orig_id_x
        FROM berlin
        WHERE comp_lat IS NULL
           OR comp_lon IS NULL;
    """))
    # scalars() yields the single selected column; all() materialises the list
    ids_to_update = result.scalars().all()

print(f"Found {len(ids_to_update)} images that need coordinates.")

## Call mapillary & build a VALUES list for bulk update

In [None]:
# Collect (img_id, lon, lat) tuples for every image Mapillary returns coordinates for
values = []
for img_id in tqdm(ids_to_update, desc="Fetching Mapillary metadata"):
    try:
        coords = fetch_metadata(img_id, token)
    except Exception as e:
        # Best effort: report the failing image and carry on with the next one
        print(f"{img_id}: {e}")
        coords = None

    if not coords:
        continue

    lon, lat = coords
    values.append((img_id, lon, lat))

print(f"Prepared {len(values)} rows for UPDATE.")

## One bulk UPDATE back into PostGIS

In [None]:
# Write the fetched coordinates back to the berlin table.
# The values originate from an external API response, so they are passed as
# bound parameters instead of being formatted into the SQL text; the driver
# then takes care of quoting and typing (this also works if orig_id_x is not
# a numeric column).
if values:
    update_stmt = text("""
        UPDATE berlin
           SET comp_lon = :lon,
               comp_lat = :lat
         WHERE orig_id_x = :img_id
    """)
    params = [
        {"img_id": img_id, "lon": lon, "lat": lat}
        for img_id, lon, lat in values
    ]

    # A list of parameter dicts runs the statement once per entry
    # ("executemany"); engine.begin() commits when the block succeeds and
    # rolls back if it raises.
    with engine.begin() as conn:
        conn.execute(update_stmt, params)

    print("PostGIS table updated.")

else:
    print("Nothing to update – all rows already have comp_lat/comp_lon.")

## Create the new geom column

In [None]:
%%sql
-- Point column (SRID 4326) for the computed coordinates;
-- IF NOT EXISTS lets this cell be re-run without failing on the existing column
ALTER TABLE berlin ADD COLUMN IF NOT EXISTS geom_comp geometry(Point, 4326);

In [None]:
%%sql
-- Build an SRID 4326 point from comp_lon / comp_lat for every row
UPDATE berlin
   SET geom_comp = ST_SetSRID(ST_MakePoint(comp_lon, comp_lat), 4326);

In [None]:
%%sql
-- Computed geometry reprojected to EPSG 32633, next to geometry_32633;
-- IF NOT EXISTS lets this cell be re-run without failing on the existing column
ALTER TABLE berlin ADD COLUMN IF NOT EXISTS geometry_comp_32633 geometry(Geometry, 32633);
UPDATE berlin
SET geometry_comp_32633 = ST_Transform(geom_comp, 32633);