# Metadata mapping

This notebook sets up methods that add metadata from the original fMoW dataset to the metadata of the WILDS version of the dataset:
- coordinates of the image center 
- image span in degrees

Both pieces of information are derived from the bounding box information contained in the fMoW metadata.

In [None]:
import json
import os
import logging

import pandas as pd
from wilds import get_dataset

In [None]:
# Resolve the project layout relative to the notebook's working directory:
# the project root is two directory levels above the current working directory.
# `os.path.dirname` is used instead of splitting on '/' so the lookup does not
# depend on the platform's path separator.
PROJECT_ROOT = os.path.dirname(os.path.dirname(os.getcwd()))
DATA_DIR = os.path.join(PROJECT_ROOT, "data")
METADATA_DIR = os.path.join(DATA_DIR, "groundtruth")

# Fail early and name every directory that is missing, so that a wrong working
# directory is easy to diagnose.
_missing_dirs = [
    path
    for path in (PROJECT_ROOT, DATA_DIR, METADATA_DIR)
    if not os.path.isdir(path)
]
if _missing_dirs:
    raise NotADirectoryError(
        f"Required project directories not found: {_missing_dirs}")

# Module-level logger; INFO and above are written to 'metadata_mapping.log'.
LOGGER = logging.getLogger(__name__)
logging.basicConfig(filename='metadata_mapping.log', level=logging.INFO)

In [17]:
# Load the "fmow" dataset through the WILDS `get_dataset` helper and keep a
# handle on its metadata table.
dataset = get_dataset(dataset="fmow")
metadata = dataset.metadata
# Keep the first rows (`head()`) as a small sample to try the mapping on.
h = metadata.head()
# Bare expression: shown as the cell output.
h

Unnamed: 0,split,img_filename,img_path,spatial_reference,epsg,category,visible,img_width,img_height,country_code,cloud_cover,timestamp,lat,lon,region,y,year
0,train,tunnel_opening_370_6_rgb.jpg,train/tunnel_opening/tunnel_opening_370/tunnel...,GCS_WGS_1984,4326,tunnel_opening,True,1359,1222,CHN,0,2017-04-12T04:14:15Z,26.604456,101.733373,0,57,15.0
1,train,tunnel_opening_370_1_rgb.jpg,train/tunnel_opening/tunnel_opening_370/tunnel...,GCS_WGS_1984,4326,tunnel_opening,True,900,809,CHN,0,2017-01-30T04:18:04Z,26.604456,101.733373,0,57,15.0
2,train,tunnel_opening_370_0_rgb.jpg,train/tunnel_opening/tunnel_opening_370/tunnel...,GCS_WGS_1984,4326,tunnel_opening,True,1058,951,CHN,1,2015-02-20T04:11:50Z,26.604456,101.733373,0,57,13.0
3,train,tunnel_opening_370_3_rgb.jpg,train/tunnel_opening/tunnel_opening_370/tunnel...,GCS_WGS_1984,4326,tunnel_opening,True,1568,1409,CHN,11,2015-11-23T04:10:38Z,26.604456,101.733373,0,57,13.0
4,train,tunnel_opening_370_4_rgb.jpg,train/tunnel_opening/tunnel_opening_370/tunnel...,GCS_WGS_1984,4326,tunnel_opening,True,1624,1460,CHN,0,2016-04-04T04:12:15Z,26.604456,101.733373,0,57,14.0


In [None]:
from typing import Self
import re
from dataclasses import dataclass


@dataclass(frozen=True)
class BoundingBox:
    north_west: tuple[float, float]
    north_east: tuple[float, float]
    south_east: tuple[float, float]
    south_west: tuple[float, float]

    @classmethod
    def from_raw(cls, raw_location: str) -> Self:
        """Extract coordinates from raw location string and check for validity.

        A raw location string is considered valid, if the order of points in the raw string is:
            1. North west
            2. North east
            3. South east
            4. South west

        and any additional points after the first four are duplicates.
        """
        points = re.findall(r"[-\d.]+ [-\d.]+", raw_location)
        coords = [tuple(map(float, p.split())) for p in points]

        if len(coords) < 4:
            raise ValueError(
                f"Expected at least 4 coordinate pairs, got {len(coords)}")

        # Check if any additional points are duplicates.
        first_four = coords[:4]
        extras = coords[4:]
        if extras:
            for coord in extras:
                if coord not in first_four:
                    raise ValueError(
                        f"Extra coordinate {coord} is not one of the first four; "
                        "inconsistent bounding box"
                    )

        if contains_pole(first_four):
            raise ValueError(
                "Bounding box contains or intersects north/south pole "
                "(each corner in different longitude sector)"
            )

        if crosses_180th_meridian(first_four):
            LOGGER.info('Found bounding box crossing the 180th meridian!')
            # Shift bounding box to 0th meridian
            first_four = shift_bounding_box_lon(first_four, 180)

        # Create the expected canonical form from the first four points.
        lons = [c[0] for c in first_four]
        lats = [c[1] for c in first_four]
        min_lon, max_lon = min(lons), max(lons)
        min_lat, max_lat = min(lats), max(lats)

        canonical = [
            (min_lon, max_lat),    # NW
            (max_lon, max_lat),    # NE
            (max_lon, min_lat),    # SE
            (min_lon, min_lat),    # SW
        ]

        if first_four != canonical:
            raise ValueError(
                "First four points do not form an axis-aligned rectangle in "
                "north_west, north_east, south_east, south_west configuration"
            )

        return cls(
            north_west=first_four[0],
            north_east=first_four[1],
            south_east=first_four[2],
            south_west=first_four[3]
        )

    def as_list(self) -> list[tuple[float, float]]:
        """Return the four points in north_west, north_east, south_east, south_west order."""
        return [self.north_west, self.north_east, self.south_east, self.south_west]

    def __iter__(self):
        return iter(self.as_list())
    
    def get_width_deg(self) -> float:
        return abs(self.north_east[0] - self.north_west[0])
    
    def get_height_deg(self) -> float:
        return abs(self.north_west[1] - self.south_west[1])


def lies_any_lon_within(coords: list[tuple[float, float]], lower: float, upper: float) -> bool:
    """Tell whether at least one coordinate has its longitude inside ``[lower, upper]``.

    Args:
        coords (list): Coordinates given as (lon, lat) tuples.
        lower (float): Smallest longitude that still counts as inside (inclusive).
        upper (float): Largest longitude that still counts as inside (inclusive).

    Returns:
        bool: True as soon as one longitude satisfies ``lower <= lon <= upper``,
            False otherwise (also for an empty list).
    """
    for point in coords:
        lon = point[0]
        if lower <= lon <= upper:
            return True
    return False


def contains_pole(bounding_box: list[tuple[float, float]]) -> bool:
    """Heuristically detect a bounding box around the north or south pole.

    The longitude range is cut into four sectors of 90 degrees each
    (bounds inclusive). The box is reported as containing or intersecting a
    pole when every sector holds the longitude of at least one corner.

    Args:
        bounding_box: List of four coordinate tuples (lon, lat) describing a rectangle.

    Returns:
        bool: True, if each of the four longitude sectors is hit by some corner.
    """
    longitude_sectors = ((-180, -90), (-90, 0), (0, 90), (90, 180))
    return all(
        lies_any_lon_within(bounding_box, sector_start, sector_end)
        for sector_start, sector_end in longitude_sectors
    )


def crosses_180th_meridian(bounding_box: list[tuple[float, float]]) -> bool:
    """Detect an otherwise well-formed box that straddles the 180th meridian.

    The corners are expected in north_west, north_east, south_east,
    south_west order as (lon, lat) tuples.

    Assumptions taken:
        - Bounding box does not span more than 20 deg in longitude, which is
          why "close to the meridian" is tested as beyond +/-170 deg.

    Args:
        bounding_box: List of four coordinate tuples (lon, lat) describing a rectangle.

    Returns:
        bool: True, if the corners form an axis-aligned rectangle whose western
            corners lie east of 170 deg and whose eastern corners lie west of
            -170 deg.
    """
    nw, ne, se, sw = bounding_box

    # Western corners share one longitude, eastern corners another one.
    if nw[0] != sw[0] or ne[0] != se[0]:
        return False
    # Northern corners share one latitude, southern corners another one.
    if nw[1] != ne[1] or se[1] != sw[1]:
        return False
    # North has to lie strictly above south.
    if not nw[1] > sw[1]:
        return False

    west_side_near_meridian = nw[0] > 170 and sw[0] > 170
    east_side_near_meridian = ne[0] < -170 and se[0] < -170
    return west_side_near_meridian and east_side_near_meridian


def shift_bounding_box_lon(bounding_box: list[tuple[float, float]], s: float) -> list[tuple[float, float]]:
    """Move the western and eastern edge of a box by ``s`` degrees longitude.

    The first and fourth corner (west side) are moved by ``-s``, the second
    and third corner (east side) by ``+s``. Latitudes are left untouched.
    With ``s = 180`` a box lying on both sides of the 180th meridian ends up
    around the 0th meridian.

    Args:
        bounding_box: List of four coordinate tuples (lon, lat) describing a rectangle.
        s: Amount of the shift in degrees longitude.

    Returns:
        New list with the four moved corners in the original order.
    """
    nw, ne, se, sw = bounding_box
    return [
        (nw[0] - s, nw[1]),
        (ne[0] + s, ne[1]),
        (se[0] + s, se[1]),
        (sw[0] - s, sw[1]),
    ]

In [None]:
def build_fmow_metadata_path(wilds_metadata_sample: pd.core.series.Series) -> str:
    """Return the location of the original fmow metadata file of one sample.

    The original fmow metadata consists of one json file per image. Its
    location mirrors the image path stored in the wilds metadata: the file
    extension is replaced by `.json` and the result is placed below the
    directory holding the original fmow metadata.

    Args:
        wilds_metadata_sample (pd.core.series.Series): Metadata for a sample of the fmow WILDS dataset.
            Only its "img_path" entry is read.

    Returns:
        str: Path to the original fmow metadata file.
    """
    image_dir, image_name = os.path.split(wilds_metadata_sample["img_path"])
    stem = os.path.splitext(image_name)[0]
    relative_json_path = os.path.join(image_dir, stem + ".json")
    return os.path.join(METADATA_DIR, relative_json_path)


def compute_center_coordinates_and_span(fmow_metadata_sample: dict) -> tuple[tuple[float, float], float]:
    """Compute the coordinates of the image center and the image span in degrees.

    The first bounding box of the sample is used as geographic reference: its
    pixel rectangle (`box`) and its corner coordinates (`raw_location`) give
    the degrees covered per pixel box. From that the position of the image
    center is interpolated and the extent of the image is scaled up.

    Args:
        fmow_metadata_sample (dict): Metadata of the original fmow dataset for
            one image, as parsed from its json file. The entries
            'bounding_boxes', 'img_width', 'img_height' and 'img_filename'
            are read.

    Returns:
        tuple: ``((center_lon, center_lat), img_span_deg)``. ``center_lon`` is
            wrapped into [-180, 180]. ``img_span_deg`` is the extent of the
            shorter image side: degrees of latitude of the image height when
            the image is wider than high, otherwise degrees of longitude of
            the image width.

    Raises:
        ValueError: If the raw location of the bounding box is rejected by
            `BoundingBox.from_raw`.
    """
    box_info = fmow_metadata_sample.get('bounding_boxes')[0]
    # The code treats `box` as (x, y, width, height) in pixels, with (x, y)
    # being the offset of the box's top-left corner inside the image.
    box_pos_x, box_pos_y, box_width, box_height = box_info.get('box')

    img_width = fmow_metadata_sample.get('img_width')
    img_height = fmow_metadata_sample.get('img_height')

    # Offset of the image center from the box's top-left corner, expressed as
    # a fraction of the box size. Values outside [0, 1] mean that the center
    # lies outside of the box.
    center_x, center_y = img_width / 2, img_height / 2
    frac_x = (center_x - box_pos_x) / box_width
    frac_y = (center_y - box_pos_y) / box_height

    bbox = BoundingBox.from_raw(box_info['raw_location'])

    # Longitude grows to the east (increasing x), latitude shrinks to the
    # south (increasing y).
    center_lon = bbox.north_west[0] + frac_x * bbox.get_width_deg()
    center_lat = bbox.north_west[1] - frac_y * bbox.get_height_deg()

    # The image may extend past the 180th meridian even if the reference
    # corner does not; wrap the longitude back into [-180, 180]. Values inside
    # that range are left untouched.
    if center_lon > 180:
        center_lon -= 360
    elif center_lon < -180:
        center_lon += 360

    if img_width > img_height:
        img_span_deg = img_height / box_height * bbox.get_height_deg()
    else:
        img_span_deg = img_width / box_width * bbox.get_width_deg()

    if img_span_deg > 0.1:
        LOGGER.info(
            f"Very large image span of {img_span_deg} found. Probably miscalculation at. {fmow_metadata_sample.get('img_filename')}!")

    return (center_lon, center_lat), img_span_deg


def extract_center_coords_and_img_span(wilds_metadata_sample: pd.Series) -> pd.Series:
    """Look up the original fmow metadata of a sample and derive center and span.

    Args:
        wilds_metadata_sample (pd.Series): Metadata for a sample of the fmow
            WILDS dataset; used to locate the original fmow json file.

    Returns:
        pd.Series: Entries "img_center_lon", "img_center_lat" and
            "img_span_deg" for the sample.
    """
    # Read the json file explicitly as UTF-8 instead of relying on the
    # platform's default text encoding.
    with open(build_fmow_metadata_path(wilds_metadata_sample), 'r', encoding='utf-8') as file:
        fmow_metadata_sample = json.load(file)

    (center_lon, center_lat), img_span_deg = compute_center_coordinates_and_span(
        fmow_metadata_sample)

    return pd.Series({"img_center_lon": center_lon, "img_center_lat": center_lat, "img_span_deg": img_span_deg})

In [None]:
# Derive center coordinates and image span for each row of the sample `h`
# (axis=1 hands every row to the function).
coords_and_span = h.apply(extract_center_coords_and_img_span, axis=1)
# Put the derived columns next to the existing metadata columns.
h_extended = pd.concat([h, coords_and_span], axis=1)
# Bare expression: shown as the cell output.
h_extended

Unnamed: 0,split,img_filename,img_path,spatial_reference,epsg,category,visible,img_width,img_height,country_code,cloud_cover,timestamp,lat,lon,region,y,year,img_center_lon,img_center_lat,img_span_deg
0,train,tunnel_opening_370_6_rgb.jpg,train/tunnel_opening/tunnel_opening_370/tunnel...,GCS_WGS_1984,4326,tunnel_opening,True,1359,1222,CHN,0,2017-04-12T04:14:15Z,26.604456,101.733373,0,57,15.0,101.733828,26.60486,0.004038
1,train,tunnel_opening_370_1_rgb.jpg,train/tunnel_opening/tunnel_opening_370/tunnel...,GCS_WGS_1984,4326,tunnel_opening,True,900,809,CHN,0,2017-01-30T04:18:04Z,26.604456,101.733373,0,57,15.0,101.73383,26.604859,0.004095
2,train,tunnel_opening_370_0_rgb.jpg,train/tunnel_opening/tunnel_opening_370/tunnel...,GCS_WGS_1984,4326,tunnel_opening,True,1058,951,CHN,1,2015-02-20T04:11:50Z,26.604456,101.733373,0,57,13.0,101.733826,26.604856,0.004038
3,train,tunnel_opening_370_3_rgb.jpg,train/tunnel_opening/tunnel_opening_370/tunnel...,GCS_WGS_1984,4326,tunnel_opening,True,1568,1409,CHN,11,2015-11-23T04:10:38Z,26.604456,101.733373,0,57,13.0,101.733824,26.604857,0.004038
4,train,tunnel_opening_370_4_rgb.jpg,train/tunnel_opening/tunnel_opening_370/tunnel...,GCS_WGS_1984,4326,tunnel_opening,True,1624,1460,CHN,0,2016-04-04T04:12:15Z,26.604456,101.733373,0,57,14.0,101.733822,26.604859,0.004069
