In [1]:
# !pip install rioxarray planetary_computer pystac_client

In [1]:
import pandas as pd
import geopy.distance as distance
from datetime import timedelta
import numpy as np

from collections import Counter
import rioxarray
from IPython.display import Image
from PIL import Image as PILImage
import matplotlib.pyplot as plt
import cv2
from skimage.io import imsave, imread
from multiprocessing import Pool

import planetary_computer as pc
from pystac_client import Client
import traceback
# import odc.stac
import os
import tqdm
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Create the output folder for downloaded images. exist_ok=True keeps this
# cell re-runnable: a plain `mkdir` errors once the folder already exists.
os.makedirs("Sentinel", exist_ok=True)

In [3]:
# get our bounding box to search latitude and longitude coordinates
def get_bounding_box(latitude, longitude, meter_buffer=50000):
    """
    Build a box around (latitude, longitude) whose edges lie ``meter_buffer``
    meters of ground distance away from the point in each cardinal direction.

    Returns a list ordered as [minx, miny, maxx, maxy], i.e.
    [min longitude, min latitude, max longitude, max latitude].
    """
    origin = (latitude, longitude)
    reach = distance.distance(meters=meter_buffer)

    def _travel(bearing):
        # point reached by moving `reach` from the origin along `bearing`
        return reach.destination(origin, bearing=bearing)

    # bearings: 180 = south, 270 = west, 0 = north, 90 = east
    south, west, north, east = (_travel(b) for b in (180, 270, 0, 90))

    # index 0 of a destination is its latitude, index 1 its longitude
    return [west[1], south[0], east[1], north[0]]


# get our date range to search, and format correctly for query
def get_date_range(date, time_buffer_days=30):
    """Build the search window that ends on ``date`` and starts
    ``time_buffer_days`` days earlier.

    ``date`` is anything ``pd.to_datetime`` accepts. Both ends are formatted
    with "%Y-%m-%dT" (note the trailing "T") and joined with "/".

    Returns a string such as "2020-05-30T/2020-06-29T".
    """
    stamp_format = "%Y-%m-%dT"
    range_end = pd.to_datetime(date)
    range_start = range_end - timedelta(days=time_buffer_days)

    return "/".join(
        moment.strftime(stamp_format) for moment in (range_start, range_end)
    )


def crop_sentinel_image(item, bounding_box):
    """
    Crop the "visual" asset of a Sentinel-2 STAC item to a bounding box.

    ``bounding_box`` must unpack into exactly four values,
    (minx, miny, maxx, maxy), interpreted in EPSG:4326.

    Returns the clipped raster converted to a numpy array
    (color band, height, width).
    """
    west, south, east, north = bounding_box

    # sign the asset URL before opening it
    signed_href = pc.sign(item.assets["visual"].href)
    full_raster = rioxarray.open_rasterio(signed_href)

    window = full_raster.rio.clip_box(
        minx=west, miny=south, maxx=east, maxy=north, crs="EPSG:4326"
    )
    return window.to_numpy()


def crop_sentinel_image_with_water(item, bounding_box):
    """
    Crop both the "visual" and the "SCL" assets of a Sentinel-2 STAC item to
    a bounding box given as (minx, miny, maxx, maxy) in EPSG:4326.

    Returns a 2-tuple:
      * the clipped visual raster converted to a numpy array
        (color band, height, width);
      * the first band of the clipped SCL raster, returned as indexed
        (``scl[0]``) without conversion to numpy.
    """
    west, south, east, north = bounding_box

    def _clipped(asset_key):
        # open the signed asset URL and cut it down to the requested box
        raster = rioxarray.open_rasterio(pc.sign(item.assets[asset_key].href))
        return raster.rio.clip_box(
            minx=west, miny=south, maxx=east, maxy=north, crs="EPSG:4326"
        )

    visual = _clipped("visual")
    scene_classes = _clipped("SCL")

    return visual.to_numpy(), scene_classes[0]


def crop_landsat_image(item, bounding_box):
    """
    Given a STAC item from Landsat and a bounding box tuple in the format
    (minx, miny, maxx, maxy), return a cropped portion of the item's
    red/green/blue bands in the bounding box.

    Returns the image as a numpy array with dimensions (color band, height,
    width), min-max normalized to the 0 - 255 range.

    Raises ImportError if the optional ``odc-stac`` package is not installed.
    """
    # The notebook-level `import odc.stac` is commented out, so import it here;
    # otherwise the call below fails with NameError. Keeping the import local
    # means the Sentinel-only workflow does not require odc-stac.
    import odc.stac

    (minx, miny, maxx, maxy) = bounding_box

    image = odc.stac.stac_load(
        [pc.sign(item)], bands=["red", "green", "blue"], bbox=[minx, miny, maxx, maxy]
    ).isel(time=0)
    image_array = image[["red", "green", "blue"]].to_array().to_numpy()

    # normalize to 0 - 255 values
    image_array = cv2.normalize(image_array, None, 0, 255, cv2.NORM_MINMAX)

    return image_array


def get_stats(scl_arr_):
    """
    Summarize a 2-D SCL class array as fractions of its pixel count.

    Returns a tuple (data_ratio, water_ratio, land_ratio, cloud_ratio):
      * data_ratio  - share of pixels with value > 0
      * water_ratio - share of pixels equal to 6
      * land_ratio  - share of pixels in 4..6 inclusive (so it also counts
                      the pixels counted by water_ratio)
      * cloud_ratio - share of pixels in 7..10 inclusive
    """
    n_rows, n_cols = scl_arr_.shape[0], scl_arr_.shape[1]

    def _share(mask):
        # fraction of pixels selected by a boolean mask
        return mask.sum() / n_rows / n_cols

    has_data = scl_arr_ > 0
    is_water = scl_arr_ == 6
    in_4_to_6 = (scl_arr_ <= 6) & (scl_arr_ >= 4)
    in_7_to_10 = (scl_arr_ >= 7) & (scl_arr_ <= 10)

    return _share(has_data), _share(is_water), _share(in_4_to_6), _share(in_7_to_10)

In [4]:
# STAC client for the Planetary Computer API; pc.sign_inplace is registered
# as the client's modifier.
stac_api_url = "https://planetarycomputer.microsoft.com/api/stac/v1"
catalog = Client.open(stac_api_url, modifier=pc.sign_inplace)

# Sample table (uid, latitude, longitude, date, split); show a few random rows.
metadata = pd.read_csv('../inputs/metadata.csv')
metadata.sample(5)

Unnamed: 0,uid,latitude,longitude,date,split
3208,dnot,39.14193,-95.47957,2020-06-29,test
2690,czoi,39.3992,-99.42481,2018-07-30,train
23154,znwj,37.2606,-121.992,2014-06-04,train
13801,pevh,37.4583,-120.967,2014-01-14,train
7245,hzrk,41.935,-81.478333,2016-04-05,test


In [5]:
!rm Sentinel/*

rm: cannot remove 'Sentinel/*': No such file or directory


In [6]:
def _clip_asset(item, asset_key, bounds):
    """Open one asset of a STAC item (signed URL) and clip it to ``bounds``,
    given as (minx, miny, maxx, maxy) in EPSG:4326."""
    minx, miny, maxx, maxy = bounds
    return rioxarray.open_rasterio(pc.sign(item.assets[asset_key].href)).rio.clip_box(
        minx=minx,
        miny=miny,
        maxx=maxx,
        maxy=maxy,
        crs="EPSG:4326",
    )


def get_row(x):
    """
    Download one Sentinel-2 crop for a metadata row and save it under ./Sentinel/.

    ``x`` is a mapping with at least the keys uid, latitude, longitude and date.

    Steps:
      1. Search the "sentinel-2-l2a" collection in a 50 km box around the
         sample, for the 60 days up to and including the sample date.
      2. Keep items whose bounding box strictly contains the sample point.
      3. Walk the remaining items in the order the search returned them. For
         each, clip the SCL layer to a 2 km box and compute its statistics.
      4. For the first item with data_ratio > 0.99 and cloud_ratio < 0.05,
         save the visual crop as ``{uid}_{date}_{item id}.png`` and the SCL
         crop as ``W_{uid}_{date}_{item id}.png``, then stop.

    Returns None. Any exception is reported as ``Failed: <uid>`` on stdout
    instead of being raised, so one bad row does not stop a batch.
    """
    try:
        example_row = pd.Series(x)
        bbox = get_bounding_box(example_row.latitude, example_row.longitude, meter_buffer=50000)
        # get_date_range emits a trailing "T" on each date; strip it for the query
        date_range = get_date_range(example_row.date, 60).replace('T', '')

        # search the planetary computer sentinel-2 l2a collection
        search = catalog.search(
            collections=["sentinel-2-l2a"], bbox=bbox, datetime=date_range
        )
        items = list(search.get_all_items())

        # get details of all of the items returned
        item_details = pd.DataFrame(
            [
                {
                    "datetime": item.datetime.strftime("%Y-%m-%d"),
                    "platform": item.properties["platform"],
                    "min_long": item.bbox[0],
                    "max_long": item.bbox[2],
                    "min_lat": item.bbox[1],
                    "max_lat": item.bbox[3],
                    "bbox": item.bbox,
                    "item_obj": item,
                }
                for item in items
            ]
        )
        if item_details.shape[0] == 0:
            return

        # check which rows actually contain the sample location
        item_details["contains_sample_point"] = (
            (item_details.min_lat < example_row.latitude)
            & (item_details.max_lat > example_row.latitude)
            & (item_details.min_long < example_row.longitude)
            & (item_details.max_long > example_row.longitude)
        )
        item_details = item_details[item_details["contains_sample_point"]]

        # The crop window is the same for every candidate, so compute it once.
        crop_bounds = get_bounding_box(
            example_row.latitude, example_row.longitude, meter_buffer=2000
        )

        # NOTE: the loop variable must not be called `x`; that would shadow the
        # parameter and break the failure message in the except block below.
        for _, candidate in item_details.iterrows():
            # skip anything that is not a Sentinel platform instead of falling
            # through with a stale (or unbound) `item`
            if 'Sentinel' not in candidate.platform:
                continue
            item = candidate.item_obj

            # get SCL layer first and filter
            scl_arr = _clip_asset(item, "SCL", crop_bounds).to_numpy()[0].astype(np.uint8)

            data_ratio, _, _, cloud_ratio = get_stats(scl_arr)
            if data_ratio > 0.99 and cloud_ratio < 0.05:
                vis_arr = _clip_asset(item, "visual", crop_bounds).to_numpy()
                # (band, height, width) -> (height, width, band) for saving
                vis_arr = np.transpose(vis_arr, axes=[1, 2, 0]).astype(np.uint8)

                fname = f'{example_row.uid}_{candidate.datetime}_{item.id}.png'
                fname_water = f'W_{example_row.uid}_{candidate.datetime}_{item.id}.png'
                imsave('./Sentinel/' + fname, vis_arr)
                imsave('./Sentinel/' + fname_water, scl_arr)
                break
    except Exception:
        # best-effort: report and move on (KeyboardInterrupt still propagates)
        print('Failed: {}'.format(x['uid']))

## Retrieval
* If a network exception occurs, please retry.

> _DrivenData note: remove the `.head(10)` in the following cell to not limit to 10 downloads._

In [8]:
%%time
p = Pool(processes=16)
for i, row in tqdm.tqdm(metadata.head(10).iterrows()):
    example_row = row
    p.apply_async(get_row, (dict(example_row), ))
p.close()
p.join()

10it [00:00, 9892.23it/s]


CPU times: user 12.8 ms, sys: 48.3 ms, total: 61.1 ms
Wall time: 2min 32s


In [8]:
# Re-run get_row in the notebook process for `example_row`, the variable the
# loop in the previous cell left bound to the last row it submitted. This
# cell therefore only works after that cell has been executed.
get_row(dict(example_row))