## MOSAIKS metadata extraction

In [1]:
# !pip install -q git+https://github.com/geopandas/dask-geopandas
!pip install -q pyhere

In [2]:
import warnings
import time
import os
import gc
import calendar
import re

# GDAL / rasterio environment settings for remote raster reads.
# Source: https://github.com/pangeo-data/cog-best-practices
# They are exported to the process environment here, ahead of the
# geospatial imports that follow.
RASTERIO_BEST_PRACTICES = {
    "CURL_CA_BUNDLE": "/etc/ssl/certs/ca-certificates.crt",
    "GDAL_DISABLE_READDIR_ON_OPEN": "EMPTY_DIR",
    "AWS_NO_SIGN_REQUEST": "YES",
    "GDAL_MAX_RAW_BLOCK_CACHE_SIZE": "200000000",
    "GDAL_SWATH_SIZE": "200000000",
    "VSI_CURL_CACHE_SIZE": "200000000",
}
os.environ.update(RASTERIO_BEST_PRACTICES)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pyhere import here

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from scipy import ndimage as nd

import rasterio
import rasterio.warp
import rasterio.mask
import shapely.geometry
import geopandas
import dask_geopandas
from dask.distributed import Client

from pystac import Item
import stackstac
import pyproj

warnings.filterwarnings(action="ignore", category=UserWarning, module="torch")
warnings.filterwarnings(action="ignore", category=FutureWarning)
warnings.filterwarnings(action="ignore", category=RuntimeWarning)
warnings.filterwarnings(action="ignore", category=UserWarning)

import pystac_client
import planetary_computer as pc


import random

# With benchmarking turned off, cuDNN picks its algorithm deterministically,
# possibly at the cost of reduced performance.
# See https://pytorch.org/docs/stable/notes/randomness.html
torch.backends.cudnn.benchmark = False

# Seed every random number generator used in this notebook with the same value.
torch.manual_seed(42)
np.random.seed(42)
random.seed(42)

## Set Parameters

In [3]:
# Run configuration.
num_features = 1024
country_code = 'ZMB'
satellite = "landsat-c2-l2"
bands = ["red"]

# Pixel resolution and minimum image edge depend on the chosen collection:
# (30, 6) for "landsat-c2-l2", (10, 20) for anything else.
resolution, min_image_edge = (30, 6) if satellite == "landsat-c2-l2" else (10, 20)

## Create grid and sample points to featurize

In [4]:
# Read the sampled points for the configured country. The file name is built
# from `country_code` so the input always matches the country used to name the
# output files later in the notebook.
points_path = here(
    'data', 'land_cover', f'{country_code}_cropland_percentage_20k-points.feather'
)
gdf = pd.read_feather(points_path)

# Attach a point geometry built from the lon/lat columns (WGS84).
gdf = geopandas.GeoDataFrame(
    gdf,
    geometry=geopandas.points_from_xy(x=gdf.lon, y=gdf.lat),
    crs='EPSG:4326',
)

# Total number of points; reused for progress messages and output file names.
pt_len = gdf.shape[0]
gdf.shape

(19598, 4)

In [5]:
NPARTITIONS = 250

# Compute each point's Hilbert distance on a single-partition dask frame,
# store it in an `hd` column and order the points by it.
ddf = dask_geopandas.from_geopandas(gdf, npartitions=1)
gdf = gdf.assign(hd=ddf.hilbert_distance().compute()).sort_values("hd")

# Split the ordered points into NPARTITIONS partitions, keeping that order.
dgdf = dask_geopandas.from_geopandas(gdf, npartitions=NPARTITIONS, sort=False)

# Only `dgdf` is needed from here on; release the intermediates.
del ddf, gdf
gc.collect()

125

In [6]:
buffer_size = 0.005


class CustomDataset(Dataset):
    """Per-point dataset that reports how much of an image window is NaN.

    Parameters
    ----------
    points : array-like of shape (n, 2)
        One ``(lon, lat)`` pair per row.
    items : sequence
        One STAC item (or ``None``) per point, in the same order as `points`.
    buffer : float
        Half-width of the window around each point, added to / subtracted
        from lon and lat before projecting into the image CRS.
    """

    def __init__(self, points, items, buffer=buffer_size):
        self.points = points
        self.items = items
        self.buffer = buffer

    def __len__(self):
        return self.points.shape[0]

    @staticmethod
    def _nan_fraction(pixels):
        """Return the share of NaN values in `pixels` as a float in [0, 1].

        Averages over every element, so the result does not depend on how
        many dimensions the window has.
        """
        return float(np.isnan(np.asarray(pixels)).mean())

    def __getitem__(self, idx):
        """Return the NaN fraction of the window around point `idx`.

        Returns ``None`` when the point has no STAC item.
        """
        lon, lat = self.points[idx]
        fn = self.items[idx]

        if fn is None:
            return None

        stack = stackstac.stack(fn, assets=bands, resolution=resolution)

        # Project the two opposite corners of the lon/lat box into the CRS of
        # the stack; the projection object is built once and reused.
        to_stack_crs = pyproj.Proj(stack.crs)
        x_min, y_min = to_stack_crs(lon - self.buffer, lat - self.buffer)
        x_max, y_max = to_stack_crs(lon + self.buffer, lat + self.buffer)

        # y is sliced from y_max down to y_min
        # (presumably the y coordinate is stored in descending order -- TODO confirm).
        aoi = stack.loc[..., y_max:y_min, x_min:x_max]

        # NOTE(review): stackstac arrays are expected to be lazily evaluated.
        # Loading the pixels here keeps the raster read inside __getitem__
        # (and therefore inside whichever DataLoader worker runs it) instead
        # of handing an unevaluated array back to the caller. The synchronous
        # scheduler avoids depending on a thread pool in the worker process.
        pixels = aoi.compute(scheduler="synchronous").values
        return self._nan_fraction(pixels)

In [7]:
%%time
# Time window: months start_month..12 of the first year, then full years.
start_month = 3
year_start = year_end = 2022

# DataLoader settings and the cloud-cover cutoff used in the STAC search.
batch_size = 1
workers = 6  # os.cpu_count()
cloud_limit = 20

# Echo the run configuration (leading and trailing blank line included).
print(
    "\n".join(
        [
            "",
            "Parameters:  ",
            f"    Satellite: {satellite}  ",
            f"    Pixel resolution: {resolution}  ",
            f"    Grid resolution: {buffer_size * 2} degree squared (WGS84) ",
            f"    Cloud limit: {cloud_limit}%  ",
            f"    Bands: {bands} ",
            f"    Number of points: {pt_len} ",
            f"    Number of features: {num_features} features ",
            f"    Year range: {year_start} to {year_end} ",
            "",
        ]
    )
)
def query(points, year, month_num):
    """
    Find a STAC item for each point in `points` within one calendar month.

    Parameters
    ----------
    points : geopandas.GeoDataFrame
        The points to match (one partition of the point grid).
    year : int
        Year to search.
    month_num : int
        Calendar month to search (1-12).

    Returns
    -------
    geopandas.GeoDataFrame
        A new geopandas.GeoDataFrame with a `stac_item` column containing the
        covering STAC item with the lowest cloud cover for each point, or
        None where no item covers the point.
    """
    intersects = shapely.geometry.mapping(points.unary_union.convex_hull)

    catalog = pystac_client.Client.open(
        "https://planetarycomputer.microsoft.com/api/stac/v1"
    )

    # Zero-pad month and day so both ends of the range are valid ISO 8601 dates.
    ending_day = calendar.monthrange(year, month_num)[1]
    search_start = f"{year}-{month_num:02d}-01"
    search_end = f"{year}-{month_num:02d}-{ending_day:02d}"

    search = catalog.search(
        collections=[satellite],
        intersects=intersects,
        datetime=[search_start, search_end],
        query={"eo:cloud_cover": {"lt": cloud_limit}},
        limit=500,
    )
    features = search.get_all_items_as_dict()["features"]
    features_d = {item["id"]: item for item in features}

    # Footprints indexed by item id, least cloudy first, so the first
    # covering footprint found below is also the least cloudy one.
    items = geopandas.GeoDataFrame(
        {
            "eo:cloud_cover": [item["properties"]["eo:cloud_cover"] for item in features],
            "geometry": [shapely.geometry.shape(item["geometry"]) for item in features],
        },
        index=[item["id"] for item in features],
        geometry="geometry",
    ).sort_values("eo:cloud_cover")

    point_items = []
    for point in points.geometry.tolist():
        covered_by = items[items.covers(point)]
        point_items.append(features_d[covered_by.index[0]] if len(covered_by) else None)
    return points.assign(stac_item=point_items)


for yr in range(year_start, year_end + 1):

    # The first year starts at `start_month`; later years cover all 12 months.
    first_month = start_month if yr == year_start else 1

    for mn in range(first_month, 13):

        # Always a zero-padded string (previously an int for months >= 10).
        month = f"{mn:02d}"

        tic = time.time()
        print("Matching images to points for: ", mn, "-", yr, sep = "")

        # The local cluster is opened and closed per month so that it is shut
        # down before the DataLoader below starts its own worker processes.
        with Client(n_workers=16):
            meta = dgdf._meta.assign(stac_item=[])
            df2 = dgdf.map_partitions(
                query, year=yr, month_num=mn, meta=meta
            ).compute()

        # Keep only the points for which an image was found.
        df3 = df2.dropna(subset=["stac_item"]).reset_index(drop = True)

        matching_items = [
            pc.sign(Item.from_dict(item)) for item in df3.stac_item.tolist()
        ]

        points = df3[["lon", "lat"]].to_numpy()

        print("Found acceptable images for ", 
              points.shape[0], "/", pt_len,
              " points in ", 
              f"{time.time()-tic:0.2f} seconds", 
              sep = "")

        dataset = CustomDataset(points, matching_items)

        dataloader = DataLoader(
            dataset,
            batch_size=batch_size,
            shuffle=False,
            num_workers=workers,
            collate_fn=lambda x: x,
        )

        print("Collecting metadata: ", month, "-", yr, sep = "")

        na_perc = np.zeros((points.shape[0], 1), dtype=float)
        tic = time.time()  # reset every 1000 points: time of the last chunk
        toc = time.time()  # start of the whole collection step for this month
        i = 0
        for images in dataloader:
            for image in images:

                # shuffle=False, so position i lines up with row i of df3.
                na_perc[i] = image

                if i % 1000 == 0:
                    print(
                        f"{i}/{points.shape[0]} -- {i / points.shape[0] * 100:0.2f}%"
                        + f" -- {time.time()-tic:0.2f} seconds"
                    )
                    tic = time.time()
                i += 1

        # Read the wanted fields straight from each STAC item dict instead of
        # expanding every dict into a Series; `.get` leaves a missing property
        # empty rather than raising.
        stac_items = df3["stac_item"]
        df3 = pd.DataFrame(
            df3.drop(columns=["geometry", "hd", "stac_item"])
        ).assign(
            stac_id=stac_items.map(lambda item: item["id"]),
            platform=stac_items.map(lambda item: item["properties"].get("platform")),
            cloud_cover=stac_items.map(
                lambda item: item["properties"].get("eo:cloud_cover")
            ),
            na_percent=na_perc.ravel(),
            year=yr,
            month=mn,
        )

        fn = f'{satellite}_{country_code}_{pt_len/1000:.0f}k-points_meta_{yr}_{mn}.csv'
        file_name = here('data', 'feature_meta_data', fn)
        # Create the output folder on first use so a long run cannot fail at
        # the final write.
        os.makedirs(os.path.dirname(file_name), exist_ok=True)
        df3.to_csv(file_name, index=False)
        print(f"Saving file as: {fn}\nDone in {(time.time()-toc)/60:0.2f} minutes\n")
        


Parameters:  
    Satellite: landsat-c2-l2  
    Pixel resolution: 30  
    Grid resolution: 0.01 degree squared (WGS84) 
    Cloud limit: 20%  
    Bands: ['red'] 
    Number of points: 19598 
    Number of features: 1024 features 
    Year range: 2022 to 2022 

Matching images to points for: 3-2022
Found acceptable images for 12036/19598 points in 59.71 seconds
Collecting metadata: 03-2022
0/12036 -- 0.00% -- 0.85 seconds
1000/12036 -- 8.31% -- 92.47 seconds
2000/12036 -- 16.62% -- 117.99 seconds
3000/12036 -- 24.93% -- 119.02 seconds
4000/12036 -- 33.23% -- 128.25 seconds
5000/12036 -- 41.54% -- 143.87 seconds
6000/12036 -- 49.85% -- 92.73 seconds
7000/12036 -- 58.16% -- 177.52 seconds
8000/12036 -- 66.47% -- 107.59 seconds
9000/12036 -- 74.78% -- 102.21 seconds
10000/12036 -- 83.08% -- 95.55 seconds


KeyboardInterrupt: 

In [18]:
i

10170