# Validate WILDS Metadata Extension

In [1]:
import json
import pathlib
import os
import tempfile

import pandas as pd
from wilds import get_dataset
import geemap
import ee
import numpy as np
from geopy.distance import geodesic
from geopy.point import Point

# An empty path makes os.path.abspath return the current working directory,
# i.e. the directory the notebook kernel was started in.
file_path = os.path.abspath('')

# The working directory is expected to sit two levels below the project root.
PROJECT_ROOT = pathlib.Path(file_path).parent.parent.resolve()
DATA_DIR = PROJECT_ROOT / "data" / "fmow_landsat"

# Fail early and name the offending path. `is_dir()` also rejects a regular
# file that happens to sit at one of the expected directory locations.
for required_dir in (PROJECT_ROOT, DATA_DIR):
    if not required_dir.is_dir():
        raise NotADirectoryError(f"Expected directory does not exist: {required_dir}")

EE_PROJECT_NAME = 'seeing-the-big-picture'

try:
    ee.Authenticate()
    ee.Initialize(project=EE_PROJECT_NAME)
except Exception:
    print("Please authenticate Earth Engine: earthengine authenticate")
    # Bare `raise` re-raises the active exception with its original traceback.
    raise

  from pkg_resources import parse_version


In [2]:
# WILDS fMoW dataset object; indexed further down to fetch a single sample.
dataset = get_dataset(dataset="fmow")

# Extended per-image metadata table stored alongside the project data.
metadata_csv_path = DATA_DIR / "rgb_metadata_extended.csv"
metadata = pd.read_csv(metadata_csv_path)

In [3]:
# Set to True to inspect a random sample instead of the one configured below.
CHOOSE_RANDOM = False

# Identification of the fixed sample (only used when CHOOSE_RANDOM is False).
split = "train"
category = "airport"
sample_id = "230_2"

if CHOOSE_RANDOM:
    # Draw uniformly from all rows outside the "seq" split. Filtering first is
    # equivalent to re-sampling until a non-"seq" row comes up, but cannot loop
    # forever when no such row exists (pandas raises on an empty frame instead).
    sample_meta_df = metadata.loc[metadata["split"] != "seq"].sample()
else:
    # Boolean row mask selecting the requested image within the requested split.
    cond_df = (
        (metadata["img_filename"] == f"{category}_{sample_id}_rgb.jpg")
        & (metadata["split"] == split)
    )
    sample_meta_df = metadata.loc[cond_df]
    if sample_meta_df.empty:
        raise ValueError(
            f"No metadata row found for category={category!r}, "
            f"sample_id={sample_id!r}, split={split!r}."
        )

# The index label of the selected row is used to index the WILDS dataset.
# NOTE(review): this assumes the CSV row order matches the dataset order - confirm.
sample_idx = sample_meta_df.index[0]
# First matching row as a pandas Series. Unlike `squeeze()`, this also yields
# a Series when more than one row matches.
sample_meta = sample_meta_df.iloc[0]

img_pil, y_tensor, _ = dataset[sample_idx]  # Triple: (PIL, label, metadata)

In [4]:
def compute_img_span(img_center_lon: float, img_center_lat: float, img_span_km: float) -> tuple[float, float]:
    """Convert an image span given in kilometers into degrees longitude and latitude.

    Four points are placed half the span away from the image center, towards
    north, south, east and west. The returned spans are the longitude
    difference between the east and west points and the latitude difference
    between the north and south points.

    Args:
        img_center_lon (float): Longitude of the image center.
        img_center_lat (float): Latitude of the image center.
        img_span_km (float): Span of the image in kilometers.

    Returns:
        tuple[float, float]: Span of the image in degrees longitude and latitude.
    """
    img_center = Point(img_center_lat, img_center_lon)
    half_span = geodesic(kilometers=(img_span_km / 2))

    # Bearings are in degrees clockwise from north: 0=N, 90=E, 180=S, 270=W.
    img_upper = half_span.destination(img_center, 0)
    img_lower = half_span.destination(img_center, 180)
    img_right = half_span.destination(img_center, 90)
    img_left = half_span.destination(img_center, 270)

    span_lon = np.abs(img_right.longitude - img_left.longitude)
    # When the image straddles the antimeridian the two longitudes have
    # opposite signs and their raw difference is close to 360 degrees;
    # take the short way around instead.
    if span_lon > 180:
        span_lon = 360 - span_lon
    span_lat = np.abs(img_upper.latitude - img_lower.latitude)
    return (span_lon, span_lat)

In [5]:
# Geolocation fields of the selected sample.
img_center_lon = sample_meta["img_center_lon"]
img_center_lat = sample_meta["img_center_lat"]
img_span_km = sample_meta["img_span_km"]

img_span_lon, img_span_lat = compute_img_span(img_center_lon, img_center_lat, img_span_km)

# Distance from the image center to its edges, in degrees.
half_span_lon = img_span_lon / 2
half_span_lat = img_span_lat / 2

# Bounds of the fmow image as ((min lat, min lon), (max lat, max lon)),
# the layout passed to geemap.ImageOverlay further down.
image_bounds = (
    (img_center_lat - half_span_lat, img_center_lon - half_span_lon),
    (img_center_lat + half_span_lat, img_center_lon + half_span_lon),
)

# Enlarged bounds for the Landsat8 download as [min lon, min lat, max lon, max lat],
# the layout passed to ee.Geometry.Rectangle further down.
buffer_factor = 6.0
buffered_half_lon = half_span_lon * buffer_factor
buffered_half_lat = half_span_lat * buffer_factor
extended_bounds = [
    img_center_lon - buffered_half_lon,
    img_center_lat - buffered_half_lat,
    img_center_lon + buffered_half_lon,
    img_center_lat + buffered_half_lat,
]

In [6]:
def scale_l8(image):
    """Select the SR_B2, SR_B3 and SR_B4 bands and apply a linear rescaling.

    Each selected band is multiplied by 0.0000275 and offset by -0.2.
    The metadata properties of the input image are copied onto the result so
    that property-based operations on the mapped collection (e.g. sorting by
    'CLOUD_COVER') still see them.
    NOTE(review): band arithmetic is believed not to carry properties over by
    itself - confirm against the Earth Engine documentation.

    Args:
        image: The ee.Image to rescale.

    Returns:
        ee.Image holding the three rescaled bands and the input's properties.
    """
    scaled = (image
              .select(['SR_B2', 'SR_B3', 'SR_B4'])
              .multiply(0.0000275)
              .add(-0.2))
    # copyProperties returns a generic element, so cast back to ee.Image.
    return ee.Image(scaled.copyProperties(image, image.propertyNames()))


# Interactive map centered on the selected fmow sample.
m = geemap.Map(center=[img_center_lat, img_center_lon], zoom=17)
# Enlarged rectangle around the sample that the Landsat 8 context should cover.
region = ee.Geometry.Rectangle(extended_bounds)

l8 = (
    ee.ImageCollection('LANDSAT/LC08/C02/T1_L2')
    .filterBounds(region)
    .map(scale_l8)
)

# Candidate context image: first image when sorting ascending by 'CLOUD_COVER'.
least_cloudy = l8.sort('CLOUD_COVER').first()

# Per-pixel validity of the candidate: the minimum over the three band masks,
# so a pixel only counts as valid if all three bands are unmasked.
rgb_mask = (least_cloudy
            .select(['SR_B2', 'SR_B3', 'SR_B4'])
            .mask()
            .reduce(ee.Reducer.min()))

# Mean of the validity mask over the region = fraction of valid pixels.
coverage_dict = rgb_mask.reduceRegion(
    reducer=ee.Reducer.mean(),
    geometry=region,
    scale=30,
    maxPixels=1e7
)

least_cloudy_coverage = ee.Number(coverage_dict.get('min'))
rgb_ok = least_cloudy_coverage.gte(0.99)

# Use the candidate only if it covers at least 99% of the region;
# otherwise fall back to the per-pixel median of the whole collection.
composite = l8.median()
context = ee.Image(ee.Algorithms.If(rgb_ok, least_cloudy, composite))

m.addLayer(
    context,
    {'bands': ['SR_B4', 'SR_B3', 'SR_B2'], 'min': 0, 'max': 0.3},
    'Landsat8'
)
m.addLayer(
    region
)

# Write the sample to a file inside a temporary directory rather than into an
# open NamedTemporaryFile: re-opening such a file by name while it is still
# open is not possible on Windows. The overlay is created while the file
# still exists; the directory is removed when the block exits.
# NOTE(review): presumably geemap reads the image when the overlay is
# constructed (the original code relied on the same) - confirm.
with tempfile.TemporaryDirectory() as temp_dir:
    overlay_path = os.path.join(temp_dir, "fmow_sample.png")
    img_pil.save(overlay_path)
    overlay = geemap.ImageOverlay(
        url=overlay_path,
        bounds=image_bounds,
        name="PIL overlay"
    )
    m.add_layer(overlay)

m

Map(center=[np.float64(3.3388945588021777), np.float64(31.7683164102518)], controls=(WidgetControl(options=['p…

# Image and Dataset Size

The largest sample in the dataset is `airport 135_1` with a span of `0.045022` degrees.
Dividing the area of the download region (in square meters) by $\text{scale}^2$, with $\text{scale}$ being $30$ meters per pixel, gives the number of pixels in the region.

In [7]:
# Edge length of one pixel in meters; matches the `scale` discussed above.
LANDSAT_SCALE_M = 30

# Fetch the region area once: every getInfo() call is a blocking round trip
# to the Earth Engine servers.
region_area_m2 = region.area().getInfo()
print(region_area_m2)

# Number of pixels needed to cover the region at the chosen scale.
expected_img_size = region_area_m2 / LANDSAT_SCALE_M ** 2
# Pixel count of a 224 x 224 image, the reference size used for the fmow WILDS samples here.
small_img_size = 224 ** 2
print(f"The downloaded image is expected to be {expected_img_size / small_img_size} times bigger than the samples of fmow WILDS.")
# NOTE(review): 52 is presumably the size of the fmow WILDS dataset in Gb - confirm.
print(f"The expected size of the extended dataset will be {expected_img_size / small_img_size * 52} Gb.")

897778775.4411978
The downloaded image is expected to be 19.880659532693755 times bigger than the samples of fmow WILDS.
The expected size of the extended dataset will be 1033.7942957000753 Gb.


In [8]:
# Square root of the expected pixel count, i.e. the edge length in pixels
# if the downloaded region were square.
expected_side_px = expected_img_size ** 0.5
print(f"The image size of the downloaded pixels should be: {expected_side_px}.")

The image size of the downloaded pixels should be: 998.765224020361.
