In [None]:
# Get the list of all OSM tags in the NYC bbox. How does it compare with the full US?

# Get the distribution of the different OSM tags across the bbox. How does it compare with the full US?

# How sparse is each bbox (how many pixels are empty)? How does that compare with the full US?

# Are the tags consistent with the OSM images? Do images sharing a tag have similar colors?

In [None]:

# # OSM + Satellite Imagery EDA for ControlNet Conditioning
# This notebook explores:
# 1. Spatial coverage & tag density
# 2. Tag semantics & distribution
# 3. Image-tag alignment
# 4. Model-relevant tag embedding quality

import os
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
from shapely.geometry import box
import seaborn as sns
import numpy as np
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import warnings
from PIL import Image
# Suppress every warning category for the rest of the kernel session.
# NOTE(review): this also hides deprecation and projection warnings raised by
# the libraries used below — consider narrowing the filter if results look odd.
warnings.filterwarnings('ignore')

In [None]:

# --- CONFIG ---
# Input file locations, all relative to the notebook's working directory.
# The two GeoJSON files are read with geopandas; the plotting cells expect each
# to expose a "tag" column and a geometry column.
# Presumably NYC is the region of interest and US the comparison baseline.
NYC_TAGS_PATH = "./data/nyc_tags.geojson"
US_TAGS_PATH = "./data/us_tags.geojson"
# Read by compute_mean_colors, which uses its "image_path" and "tag" columns.
IMAGE_METADATA_PATH = "./data/image_metadata.csv"  # Includes image paths + associated tags
# Rows are assumed to line up one-to-one with TAG_LIST_PATH — TODO confirm.
TAG_EMBEDDINGS_PATH = "./data/tag_embeddings.npy"  # numpy array of shape (n_tags, d)
# Only the "tag_text" column is used in this notebook (as t-SNE point labels).
TAG_LIST_PATH = "./data/tag_list.csv"  # CSV with columns: tag_id, tag_text


In [None]:

# --- LOAD DATA ---
# Load every input once; later cells only read from these module-level names.
# Vector tag layers (GeoDataFrames) for the NYC subset and the US-wide set.
nyc_gdf = gpd.read_file(NYC_TAGS_PATH)
us_gdf = gpd.read_file(US_TAGS_PATH)
# Per-image metadata table used for the image/tag colour comparison.
img_df = pd.read_csv(IMAGE_METADATA_PATH)
# Tag embedding matrix and the table holding the matching tag texts.
# NOTE(review): nothing here checks that both have the same number of rows.
tag_embeddings = np.load(TAG_EMBEDDINGS_PATH)
tag_text = pd.read_csv(TAG_LIST_PATH)


In [None]:
# --- 1. SPATIAL COVERAGE & DENSITY ---
def compute_tag_density(gdf, region_name, grid_size_km=1):
    """Plot a heatmap of feature counts on a regular grid over the frame's extent.

    The frame is reprojected to EPSG:3857, a grid of square cells is laid over
    its total bounds, and each feature is counted in the cell that contains its
    centroid. Features with a missing or empty geometry are ignored.

    Args:
        gdf: GeoDataFrame of features; it is reprojected with ``to_crs``.
        region_name: Label inserted into the plot title.
        grid_size_km: Cell edge length, in kilometres of projected (EPSG:3857)
            distance. Must be positive.

    Returns:
        None. The heatmap is drawn and shown immediately.

    Raises:
        ValueError: If ``grid_size_km`` is not positive, or the frame has no
            finite bounds (for example because it is empty).
    """
    if grid_size_km <= 0:
        raise ValueError("grid_size_km must be positive")

    # EPSG:3857 coordinates are expressed in metres, which makes a fixed-size
    # grid easy to build. NOTE: Web Mercator stretches distances away from the
    # equator, so a cell covers less than grid_size_km on the ground at
    # mid-latitudes.
    projected = gdf.to_crs(epsg=3857)
    xmin, ymin, xmax, ymax = projected.total_bounds
    if not np.all(np.isfinite([xmin, ymin, xmax, ymax])):
        raise ValueError(
            f"Cannot compute tag density for {region_name}: no finite bounds"
        )

    cell_m = grid_size_km * 1000
    x_edges = np.arange(xmin, xmax, cell_m)
    y_edges = np.arange(ymin, ymax, cell_m)
    # A zero-extent axis (e.g. a single point) makes arange return nothing;
    # keep one cell so the feature is still counted and the plot is valid.
    if x_edges.size == 0:
        x_edges = np.array([xmin])
    if y_edges.size == 0:
        y_edges = np.array([ymin])

    centroids = [
        geom.centroid
        for geom in projected.geometry
        if geom is not None and not geom.is_empty
    ]
    xs = np.array([pt.x for pt in centroids], dtype=float)
    ys = np.array([pt.y for pt in centroids], dtype=float)

    # side="right" puts a coordinate that sits exactly on an edge into the cell
    # that starts there; with the default side="left" every feature on the
    # minimum x or y edge got index -1 and was silently dropped.
    col_idx = np.searchsorted(x_edges, xs, side="right") - 1
    row_idx = np.searchsorted(y_edges, ys, side="right") - 1
    # NaN coordinates would otherwise be sorted into the last cell.
    inside = np.isfinite(xs) & np.isfinite(ys) & (col_idx >= 0) & (row_idx >= 0)

    density_matrix = np.zeros((len(y_edges), len(x_edges)))
    # add.at accumulates correctly when several features share a cell.
    np.add.at(density_matrix, (row_idx[inside], col_idx[inside]), 1)

    plt.figure(figsize=(12, 8))
    sns.heatmap(density_matrix, cmap="viridis")
    # Row 0 holds the smallest y (south) but the heatmap draws row 0 at the
    # top; flip the axis so north is up.
    plt.gca().invert_yaxis()
    plt.title(f"Tag Density Heatmap ({region_name})")
    plt.xlabel("X")
    plt.ylabel("Y")
    plt.show()

# Draw the density heatmap for the NYC layer first, then for the US layer.
for region_gdf, region_label in ((nyc_gdf, "NYC"), (us_gdf, "US")):
    compute_tag_density(region_gdf, region_label)


In [None]:
# --- 2. TAG SEMANTICS & DISTRIBUTION ---
def plot_tag_distribution(gdf, region_name, top_n=20):
    """Plot a horizontal bar chart of the most frequent values in the "tag" column.

    Args:
        gdf: Frame with a ``tag`` column.
        region_name: Label inserted into the plot title.
        top_n: How many of the most frequent tags to show (default 20).

    Returns:
        None. The chart is shown immediately; if there are no tags to plot a
        short message is printed instead.
    """
    top_tags = gdf["tag"].value_counts().head(top_n)
    if top_tags.empty:
        # pandas refuses to plot an empty series; report it instead of raising.
        print(f"No tags to plot for {region_name}")
        return

    ax = top_tags.plot(kind="barh", figsize=(8, 6), title=f"Top Tags in {region_name}")
    # value_counts is sorted descending and barh draws the first row at the
    # bottom, so flip the axis to put the most frequent tag on top.
    ax.invert_yaxis()
    ax.set_xlabel("Count")
    plt.show()

# Show the most frequent tags for the NYC layer first, then for the US layer.
for region_gdf, region_label in ((nyc_gdf, "NYC"), (us_gdf, "US")):
    plot_tag_distribution(region_gdf, region_label)


In [None]:
# --- 3. IMAGE-TAG ALIGNMENT ---

def compute_mean_colors(image_metadata, sample_size=100, random_state=0):
    """Estimate the average RGB colour of the images associated with each tag.

    A random subset of rows is drawn, every referenced image is opened and
    converted to RGB, and the per-image channel means are averaged per tag.
    Each image carries equal weight regardless of its pixel count.

    Args:
        image_metadata: DataFrame with a ``tag`` column and an ``image_path``
            column.
        sample_size: Maximum number of rows to sample. Capped at the number of
            rows available, so small tables no longer raise.
        random_state: Seed forwarded to ``DataFrame.sample`` so reruns pick the
            same images. Pass ``None`` for a fresh random sample each call.

    Returns:
        dict mapping each tag to a length-3 array of mean (R, G, B) values.
        Tags for which no image could be read are omitted.
    """
    n_rows = min(sample_size, len(image_metadata))
    sample = image_metadata.sample(n_rows, random_state=random_state)

    tag_colors = {}
    skipped = []
    # One pass over the sample; sort=False keeps tags in order of appearance.
    for tag, paths in sample.groupby("tag", sort=False)["image_path"]:
        per_image_means = []
        for path in paths:
            try:
                # The context manager closes the file handle promptly.
                with Image.open(path) as img:
                    pixels = np.array(img.convert("RGB"))
                per_image_means.append(pixels.mean(axis=(0, 1)))
            except Exception as err:
                # Best effort: one unreadable image must not abort the scan,
                # but remember it so the loss is reported below.
                skipped.append((path, err))
        if per_image_means:
            tag_colors[tag] = np.mean(per_image_means, axis=0)

    if skipped:
        first_path, first_err = skipped[0]
        print(
            f"compute_mean_colors: skipped {len(skipped)} unreadable image(s); "
            f"first failure {first_path!r}: {first_err!r}"
        )
    return tag_colors

mean_colors = compute_mean_colors(img_df)

if mean_colors:
    # One row per tag, one column per channel. Naming the columns makes the
    # legend read R/G/B instead of 0/1/2, and each bar is drawn in its own
    # channel colour rather than the unrelated default palette.
    mean_colors_df = pd.DataFrame.from_dict(
        mean_colors, orient="index", columns=["R", "G", "B"]
    )
    mean_colors_df.plot(
        kind="bar",
        figsize=(10, 5),
        title="Mean Image Color by Tag",
        color=["red", "green", "blue"],
    )
    plt.xlabel("Tag")
    plt.ylabel("RGB Mean")
    plt.show()
else:
    # pandas raises on an empty frame; say why there is no chart instead.
    print("No readable images in the sample; nothing to plot.")

In [None]:

# --- 4. MODEL-RELEVANT FEATURES: TAG EMBEDDING CLUSTERING ---
def plot_tag_embedding_tsne(embeddings, tag_texts, perplexity=5, random_state=0):
    """Project tag embeddings to 2-D with t-SNE and draw a labelled scatter plot.

    Args:
        embeddings: Array-like with one row per tag.
        tag_texts: Labels, one per embedding row and in the same order.
        perplexity: Requested t-SNE perplexity (default 5). It is lowered to
            ``n_tags - 1`` when there are too few tags for the requested value.
        random_state: Seed passed to ``TSNE`` so the layout is reproducible.

    Returns:
        None. The scatter plot is shown immediately.

    Raises:
        ValueError: If the number of labels differs from the number of
            embeddings, or fewer than two embeddings are given.
    """
    embeddings = np.asarray(embeddings)
    tag_texts = list(tag_texts)
    n_tags = len(embeddings)
    if n_tags != len(tag_texts):
        # A mismatch would attach labels to the wrong points (or run off the
        # end of the projection), so fail loudly instead.
        raise ValueError(
            f"Got {n_tags} embeddings but {len(tag_texts)} tag texts; "
            "they must align one-to-one"
        )
    if n_tags < 2:
        raise ValueError("t-SNE needs at least two embeddings")

    # Recent scikit-learn versions reject perplexity >= n_samples, so clamp it
    # for small tag vocabularies.
    effective_perplexity = min(perplexity, n_tags - 1)
    tsne = TSNE(
        n_components=2,
        perplexity=effective_perplexity,
        random_state=random_state,
    )
    reduced = tsne.fit_transform(embeddings)

    plt.figure(figsize=(10, 8))
    plt.scatter(reduced[:, 0], reduced[:, 1])
    for (x, y), txt in zip(reduced, tag_texts):
        plt.annotate(txt, (x, y), fontsize=9, alpha=0.6)
    plt.title("t-SNE of Tag Embeddings")
    plt.show()

# Label every projected point with the matching entry of the "tag_text" column.
tag_labels = tag_text["tag_text"].tolist()
plot_tag_embedding_tsne(tag_embeddings, tag_labels)
