In [1]:
import concurrent.futures
import json
import threading
from datetime import datetime
from pprint import pprint

import boto3
import fsspec
import rio_stac
from pystac import Catalog, Collection, Item, Asset, MediaType

In [2]:
# --- Run configuration ---
# CMIP6 model and climate variable to index.
model = "GISS-E2-1-G"
variable = "tas"
# Which CMIP6 collection to process: "daily" or "monthly".
temporal_resolution = "daily"
# Storage target label; forwarded to seed-db.sh in the last cell of the notebook.
storage_location = "remote"

In [3]:
# Resolve the S3 prefix and access mode for the selected collection.
# Fail fast on an unknown resolution instead of leaving `anon` / `s3_path` undefined
# and hitting a NameError in a later cell.
if temporal_resolution == "daily":
    print("Running STAC generation for daily CMIP6 data...")
    anon = True
    s3_path = f"s3://nex-gddp-cmip6-cog/daily/{model}/historical/r1i1p1f2/{variable}/"
elif temporal_resolution == "monthly":
    print("Running STAC generation for monthly CMIP6 data...")
    anon = True
    # The monthly prefix is the ensemble median, so `model` is not part of the path.
    s3_path = f"s3://nex-gddp-cmip6-cog/monthly/CMIP6_ensemble_median/{variable}/"
else:
    raise ValueError(
        f"Unsupported temporal_resolution {temporal_resolution!r}; expected 'daily' or 'monthly'."
    )

Running STAC generation for daily CMIP6 data...


In [4]:
# Filesystem handle used to list the source files on S3; `anon` is set in the cell above.
fs_read = fsspec.filesystem("s3", anon=anon, skip_instance_cache=False)
# Second filesystem handle, created with an empty protocol string.
# NOTE(review): presumably resolves to the local filesystem for writing outputs, but no
# later cell shown here references it — confirm whether it is still needed.
fs_write = fsspec.filesystem("")

In [5]:
# List every object under the selected S3 prefix (the trailing '*' is a glob wildcard)
# and report how many were found.
file_paths = fs_read.glob(f"{s3_path}*")
print(f"{len(file_paths)} discovered from {s3_path}")

23725 discovered from s3://nex-gddp-cmip6-cog/daily/GISS-E2-1-G/historical/r1i1p1f2/tas/


In [6]:
# Keep only the 1950 and 1951 files. Each path gets the 's3://' scheme prepended
# so that it is a full URL pointing at AWS.
if temporal_resolution == "monthly":
    subset_files = sorted(
        "s3://" + path
        for path in file_paths
        if any(marker in path for marker in ("historical_1950", "historical_1951"))
    )
elif temporal_resolution == "daily":
    subset_files = sorted(
        "s3://" + path
        for path in file_paths
        if any(marker in path for marker in ("_1950_", "_1951_"))
    )

In [7]:
# Report how many files remain after the 1950/1951 filter.
print(f"Subseted data to files for 1950 and 1951. {len(subset_files)} files to process.")

Subseted data to files for 1950 and 1951. 730 files to process.


In [8]:
# Load the Collection definition (read from disk) that the generated Items will reference.
file_prefix = f"cmip6_pgstac/CMIP6_{temporal_resolution}_{model}_{variable}"
# Use a context manager so the file handle is closed promptly instead of leaking.
with open(f'{file_prefix}_collection.json') as collection_file:
    collection_json = json.load(collection_file)
collection = Collection.from_dict(collection_json)

In [9]:
stac_items_file = f"{file_prefix}_stac_items.ndjson"
# Opening in write mode truncates (or creates) the NDJSON output, so every run
# starts from an empty file before items are appended.
with open(stac_items_file, 'w'):
    pass

In [10]:
# Serializes appends to the shared NDJSON file: process_item runs on several worker
# threads at once and unsynchronized writes could interleave partial lines.
_stac_items_write_lock = threading.Lock()


def process_item(s3_file):
    """Create a STAC Item for one GeoTIFF and append it to ``stac_items_file`` as one NDJSON line.

    The item datetime is parsed from the file name: a trailing ``_YYYYMM.tif`` for
    monthly data, or a trailing ``_YYYY_MM_DD.tif`` for daily data.

    Args:
        s3_file: Full ``s3://`` URL of the file to describe.

    Raises:
        ValueError: If ``temporal_resolution`` is neither ``'monthly'`` nor ``'daily'``,
            or if the file name does not match the expected date pattern.
    """
    print(f"Processing {s3_file}")
    filename = s3_file.split('/')[-1]
    if temporal_resolution == 'monthly':
        input_datetime = filename.split('_')[-1].replace('.tif', '')
        datetime_ = datetime.strptime(input_datetime, '%Y%m')
    elif temporal_resolution == 'daily':
        year, month, day = filename.split('_')[-3:]
        day = day.replace('.tif', '')
        datetime_ = datetime.strptime(f'{year}{month}{day}', '%Y%m%d')
    else:
        raise ValueError(
            f"Unsupported temporal_resolution {temporal_resolution!r}; expected 'daily' or 'monthly'."
        )

    # Build the item before touching the output file so the (slow) remote read
    # happens without holding the file open or the lock.
    item = rio_stac.create_stac_item(
        id=filename,
        source=s3_file,
        collection=collection.id,
        input_datetime=datetime_,
        with_proj=True,
        with_raster=True,
        asset_name="data",
        asset_roles=["data"],
        asset_media_type="image/tiff; application=geotiff; profile=cloud-optimized"
    )
    ndjson_line = json.dumps(item.to_dict()) + '\n'

    # Only one thread at a time may append, so each item lands on its own complete line.
    with _stac_items_write_lock:
        with open(stac_items_file, 'a') as f:
            f.write(ndjson_line)

In [11]:
# For each file, create an Item in a worker thread; process_item appends it to the NDJSON file.
# NOTE: the original author reports this takes about an hour on a MacBook.
with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = [executor.submit(process_item, obj) for obj in subset_files]
    source_by_future = dict(zip(futures, subset_files))
    failed_files = []
    # Calling result() re-raises any exception from the worker; without this,
    # failures are silently dropped and the output file is quietly incomplete.
    for future in concurrent.futures.as_completed(futures):
        try:
            future.result()
        except Exception as exc:
            failed_files.append(source_by_future[future])
            print(f"Failed to process {source_by_future[future]}: {exc!r}")

# Stop the notebook here if anything failed, so later cells do not run on partial output.
if failed_files:
    raise RuntimeError(
        f"{len(failed_files)} of {len(subset_files)} files failed; see messages above."
    )

Processing s3://nex-gddp-cmip6-cog/daily/GISS-E2-1-G/historical/r1i1p1f2/tas/tas_day_GISS-E2-1-G_historical_r1i1p1f2_gn_1950_01_01.tif
Processing s3://nex-gddp-cmip6-cog/daily/GISS-E2-1-G/historical/r1i1p1f2/tas/tas_day_GISS-E2-1-G_historical_r1i1p1f2_gn_1950_01_02.tif
Processing s3://nex-gddp-cmip6-cog/daily/GISS-E2-1-G/historical/r1i1p1f2/tas/tas_day_GISS-E2-1-G_historical_r1i1p1f2_gn_1950_01_03.tif
Processing s3://nex-gddp-cmip6-cog/daily/GISS-E2-1-G/historical/r1i1p1f2/tas/tas_day_GISS-E2-1-G_historical_r1i1p1f2_gn_1950_01_04.tif
Processing s3://nex-gddp-cmip6-cog/daily/GISS-E2-1-G/historical/r1i1p1f2/tas/tas_day_GISS-E2-1-G_historical_r1i1p1f2_gn_1950_01_05.tif
Processing s3://nex-gddp-cmip6-cog/daily/GISS-E2-1-G/historical/r1i1p1f2/tas/tas_day_GISS-E2-1-G_historical_r1i1p1f2_gn_1950_01_06.tif
Processing s3://nex-gddp-cmip6-cog/daily/GISS-E2-1-G/historical/r1i1p1f2/tas/tas_day_GISS-E2-1-G_historical_r1i1p1f2_gn_1950_01_07.tif
Processing s3://nex-gddp-cmip6-cog/daily/GISS-E2-1-G/hi

KeyboardInterrupt: 

Processing s3://nex-gddp-cmip6-cog/daily/GISS-E2-1-G/historical/r1i1p1f2/tas/tas_day_GISS-E2-1-G_historical_r1i1p1f2_gn_1950_02_20.tif
Processing s3://nex-gddp-cmip6-cog/daily/GISS-E2-1-G/historical/r1i1p1f2/tas/tas_day_GISS-E2-1-G_historical_r1i1p1f2_gn_1950_02_21.tif
Processing s3://nex-gddp-cmip6-cog/daily/GISS-E2-1-G/historical/r1i1p1f2/tas/tas_day_GISS-E2-1-G_historical_r1i1p1f2_gn_1950_02_22.tif
Processing s3://nex-gddp-cmip6-cog/daily/GISS-E2-1-G/historical/r1i1p1f2/tas/tas_day_GISS-E2-1-G_historical_r1i1p1f2_gn_1950_02_23.tif
Processing s3://nex-gddp-cmip6-cog/daily/GISS-E2-1-G/historical/r1i1p1f2/tas/tas_day_GISS-E2-1-G_historical_r1i1p1f2_gn_1950_02_24.tif
Processing s3://nex-gddp-cmip6-cog/daily/GISS-E2-1-G/historical/r1i1p1f2/tas/tas_day_GISS-E2-1-G_historical_r1i1p1f2_gn_1950_02_25.tif
Processing s3://nex-gddp-cmip6-cog/daily/GISS-E2-1-G/historical/r1i1p1f2/tas/tas_day_GISS-E2-1-G_historical_r1i1p1f2_gn_1950_02_26.tif
Processing s3://nex-gddp-cmip6-cog/daily/GISS-E2-1-G/hi

In [None]:
# Run the project's seed-db.sh script with the run configuration as positional arguments.
# NOTE(review): presumably loads the generated collection and NDJSON items into a database —
# confirm against seed-db.sh.
!./seed-db.sh {temporal_resolution} {storage_location} {model} {variable}