# Generate CMIP STAC Items and Load them into a pgSTAC database

This notebook walks through generating STAC items from [NEX GDDP CMIP6 COGs on AWS](https://aws.amazon.com/marketplace/pp/prodview-k6adk576fiwmm#resources).

As written, it uses daily data for the `tas` variable from the `GISS-E2-1-G` model and loads data for 1950 and 1951. The bucket contains more data than is used here, including monthly aggregates, other models, other variables, and more years. The code below can easily be modified to STAC-ify other data in the `nex-gddp-cmip6-cog` bucket.

In [1]:
import boto3
import fsspec
import json
from pystac import Catalog, Collection, Item, Asset, MediaType
from datetime import datetime
import rio_stac
from pprint import pprint
import concurrent.futures
import threading

In [2]:
# Select which CMIP6 model and which variable to process.
# Both values are interpolated into the S3 prefix and into local file names below.
model, variable = "GISS-E2-1-G", "tas"

In [3]:
# S3 prefix holding the daily historical COGs for the selected model and variable.
s3_path = f"s3://nex-gddp-cmip6-cog/daily/{model}/historical/r1i1p1f2/{variable}/"
# Passed to fsspec below as the `anon` option (anonymous access, no AWS credentials).
anon = True

In [4]:
# Read-only handle on S3, used below to list the COG files.
fs_read = fsspec.filesystem(protocol="s3", anon=anon)

In [5]:
# List every object directly under the model/variable prefix.
glob_pattern = s3_path + "*"
file_paths = fs_read.glob(glob_pattern)
print(f"{len(file_paths)} discovered from {s3_path}")

23725 discovered from s3://nex-gddp-cmip6-cog/daily/GISS-E2-1-G/historical/r1i1p1f2/tas/


In [6]:
# Keep only the files whose path contains a 1950 or 1951 year marker.
# The globbed paths are prefixed with 's3://' here so that the items
# reference the files by full S3 URL.
year_tags = ("_1950_", "_1951_")
subset_files = sorted(
    "s3://" + path
    for path in file_paths
    if any(tag in path for tag in year_tags)
)

In [7]:
# Fail fast when the year filter matched nothing; otherwise report the count.
if not subset_files:
    raise Exception(f"No files to process. Do COGs for the {model} model exist?")
print(f"Subseted data to files for 1950 and 1951. {len(subset_files)} files to process.")

Subseted data to files for 1950 and 1951. 730 files to process.


In [8]:
# Common prefix for the collection definition (input) and the items file (output).
file_prefix = f"CMIP6_daily_{model}_{variable}"
stac_items_file = f"{file_prefix}_stac_items.ndjson"

# Load the collection definition from a JSON file in the working directory.
# A context manager is used so the file handle is closed deterministically
# instead of being left open until garbage collection.
with open(f'{file_prefix}_collection.json') as collection_fp:
    collection_json = json.load(collection_fp)
collection = Collection.from_dict(collection_json)

In [9]:
# Truncate (or create) the items file so that the append-mode writes below
# start from an empty file.
open(stac_items_file, 'w').close()

In [10]:
def process_item(s3_file, file, lock):
    """Build a STAC item for one COG and append it to the NDJSON output.

    The item's datetime is parsed from the last three underscore-separated
    tokens of the file name (year, month, day, with the '.tif' suffix
    stripped from the day). The item is created with ``rio_stac`` and given
    an extra 'tiling' asset before being serialised as a single JSON line.

    Args:
        s3_file: Path/URL of the COG; used as the raster source and as the
            href of both assets. Callers in this notebook pass 's3://...' URLs.
        file: Open, writable text file object receiving one JSON line per item.
        lock: Context manager (a ``threading.Lock`` in this notebook) held
            while writing, so lines from concurrent workers do not interleave.

    Raises:
        ValueError: If the file name does not end in three underscore-separated
            tokens that parse as a ``%Y%m%d`` date.
    """
    print(f"Processing {s3_file}")

    # The file name doubles as the item id (extension included).
    filename = s3_file.rsplit('/', 1)[-1]
    year, month, day_token = filename.rsplit('_', 3)[-3:]
    day = day_token.replace('.tif', '')
    datetime_ = datetime.strptime(year + month + day, '%Y%m%d')

    # Create the item; rio_stac reads the raster at `s3_file` to fill it in.
    item = rio_stac.create_stac_item(
        source=s3_file,
        id=filename,
        collection=collection.id,
        input_datetime=datetime_,
        asset_name="data",
        asset_roles=["data"],
        asset_media_type="image/tiff; application=geotiff; profile=cloud-optimized",
        with_proj=True,
        with_raster=True,
    )

    # Rendering hints attached to the virtual tiling asset.
    # NOTE(review): the [210, 330] rescale range presumably targets `tas`
    # values in Kelvin -- confirm before reusing for other variables.
    tiling_hints = {
        'compose:rescale': [210, 330],
        'compose:colormap_name': 'hot',
    }
    item.assets['tiling'] = Asset(
        href=s3_file,
        title='tiling',
        description='Virtual asset for tiling',
        roles=['virtual', 'tiling'],
        extra_fields=tiling_hints,
    )

    # Serialise first, then hold the lock only for the actual write.
    ndjson_line = json.dumps(item.to_dict()) + '\n'
    with lock:
        file.write(ndjson_line)

In [None]:
# Process all selected files in a thread pool, appending one NDJSON line per item.
# The lock serialises writes to the shared output file.
lock = threading.Lock()

# Opening the file in a `with` block guarantees it is closed (and flushed)
# even when a worker raises and the exception propagates out of this cell.
with open(stac_items_file, 'a') as file, concurrent.futures.ThreadPoolExecutor() as executor:
    futures = [executor.submit(process_item, obj, file, lock) for obj in subset_files]
    for future in concurrent.futures.as_completed(futures):
        # result() re-raises any exception raised inside the worker.
        future.result()

Processing s3://nex-gddp-cmip6-cog/daily/GISS-E2-1-G/historical/r1i1p1f2/tas/tas_day_GISS-E2-1-G_historical_r1i1p1f2_gn_1950_01_01.tif
Processing s3://nex-gddp-cmip6-cog/daily/GISS-E2-1-G/historical/r1i1p1f2/tas/tas_day_GISS-E2-1-G_historical_r1i1p1f2_gn_1950_01_02.tif
Processing s3://nex-gddp-cmip6-cog/daily/GISS-E2-1-G/historical/r1i1p1f2/tas/tas_day_GISS-E2-1-G_historical_r1i1p1f2_gn_1950_01_03.tif
Processing s3://nex-gddp-cmip6-cog/daily/GISS-E2-1-G/historical/r1i1p1f2/tas/tas_day_GISS-E2-1-G_historical_r1i1p1f2_gn_1950_01_04.tif
Processing s3://nex-gddp-cmip6-cog/daily/GISS-E2-1-G/historical/r1i1p1f2/tas/tas_day_GISS-E2-1-G_historical_r1i1p1f2_gn_1950_01_05.tif
Processing s3://nex-gddp-cmip6-cog/daily/GISS-E2-1-G/historical/r1i1p1f2/tas/tas_day_GISS-E2-1-G_historical_r1i1p1f2_gn_1950_01_06.tif
Processing s3://nex-gddp-cmip6-cog/daily/GISS-E2-1-G/historical/r1i1p1f2/tas/tas_day_GISS-E2-1-G_historical_r1i1p1f2_gn_1950_01_07.tif
Processing s3://nex-gddp-cmip6-cog/daily/GISS-E2-1-G/hi

In [21]:
# Run the seeding script with the model and variable names as arguments.
# Per its output, it inserts the collection JSON and the items NDJSON
# (presumably into the pgSTAC database -- the script itself is not shown here).
!./seed-db.sh {model} {variable}

Inserting collection from CMIP6_daily_GISS-E2-1-G_tas_collection.json
Inserting items from CMIP6_daily_GISS-E2-1-G_tas_stac_items.ndjson
