In [None]:
import numpy as np
import pandas as pd
import numpy as np
import geopandas as gpd

import time
import os
from glob import glob
from pathlib import Path

from datetime import datetime, date, timedelta
from dateutil.relativedelta import relativedelta

import xarray as xr
import rasterio as rio

import tqdm.auto as tq

In [None]:
# Processing window [start_date, end_date): the end date itself is excluded.
start_date = date(2014, 9, 1)
end_date = date(2021, 8, 1)

# Number of monthly chunks in the window: whole years folded into months.
window_span = relativedelta(end_date, start_date)
months = window_span.years * 12 + window_span.months

# Root folder of all raw inputs (relative to the working directory).
DATA_DIR = Path("data")

# Sample GRIB2 file used only to read the HRRR x/y projection grid,
# and the folder that holds the per-day HRRR downloads.
HRRR_SAMPLE = "hrrr_sample.grib2"
HRRR_DIR = DATA_DIR / "hrrr"

# MODIS product name (used in the tile file pattern and in output file names)
# and the folder that holds the downloaded tiles.
PRODUCT = 'MYD10A1'
MODIS_DIR = DATA_DIR / "modis"

# Static rasters sampled once per grid cell.
SOIL_FILE = DATA_DIR / "global_soil_regions" / "so2015v2.tif"
DEM_FILE = DATA_DIR / "copernicus_dem" / "COP90.tif"

# Grid-cell polygons read with geopandas.
GRID_FILE = "development/grid_cells.geojson"

# Destination of the monthly chunks and of the final dataset.
OUTPUT_DIR = Path("development/")

# PROJ string of the MODIS sinusoidal projection (used to reproject grid cells).
PROJ4MODIS = "+proj=sinu +lon_0=0 +x_0=0 +y_0=0 +a=6371007.181 +b=6371007.181 +units=m +no_defs"
# PROJ string of the HRRR Lambert conformal conic projection (used to reproject grid cells).
PROJ4HRRR = '+proj=lcc +lat_0=38.5 +lon_0=-97.5 +lat_1=38.5 +lat_2=38.5 +x_0=0 +y_0=0 +R=6371229 +units=m +no_defs'

In [None]:
# Load the grid-cell polygons; their `cell_id` column and geometries drive
# every extraction step below.
grid_cells = gpd.read_file(GRID_FILE)

## Gathering HRRR data

In [None]:
# Obtain the x/y projection grid from a sample HRRR file read with rasterio.
# The context manager closes the file as soon as the two coordinate vectors
# have been extracted.
with xr.open_dataset(HRRR_SAMPLE, engine='rasterio') as ds:
    # NOTE(review): y is reversed, presumably to match the row order of the
    # cfgrib-decoded files it is assigned to in get_points; confirm.
    proj_y = np.flip(ds.y)
    proj_x = ds.x

# Only the coordinate vectors are needed from here on.
del ds

# Search points: grid-cell centroids in the HRRR projection.
# Reproject and compute the centroids once, then read both axes from them.
centroids = grid_cells.to_crs(PROJ4HRRR).geometry.centroid

mid_x = xr.DataArray(centroids.x.values, dims="cell_id")
mid_y = xr.DataArray(centroids.y.values, dims="cell_id")

In [None]:
def get_points(date_range, 
               proj_x = proj_x, proj_y = proj_y,
               mid_x = mid_x, mid_y = mid_y):
    """Extract daily HRRR analysis fields at the grid-cell search points.

    For every day in ``date_range`` one "midday" file (first existing cycle
    among 12z, 11z, 10z) and one "midnight" file (first existing cycle among
    00z, 01z, 02z) is looked up under ``HRRR_DIR``. Days without a file are
    skipped at read time and reappear as missing values after the final
    reindex on ``date_range``.

    Parameters
    ----------
    date_range : iterable of datetime-like
        Days to read; also used as the time index of the result.
    proj_x, proj_y : xarray.DataArray
        Projection coordinates assigned to the ``x`` / ``y`` axes of the
        decoded fields.
    mid_x, mid_y : xarray.DataArray
        Search-point coordinates; the nearest grid point is selected for
        each of them.

    Returns
    -------
    xarray.Dataset
        Variables ``t12``, ``u``, ``v``, ``sdwe``, ``pwat``, ``refc``
        (midday files) and ``t00``, ``sdwea`` (midnight files) sampled at the
        search points.

    Raises
    ------
    FileNotFoundError
        If no midday or no midnight file exists for the whole range.
    """
    def first_available(day, cycles):
        """Return the POSIX path of the first existing f00 file among ``cycles``, else None."""
        for cycle in cycles:
            filename = HRRR_DIR / f"hrrr.{day:%Y%m%d}/conus/hrrr.t{cycle:02}z.wrfsfcf00.grib2"
            if filename.is_file():
                return filename.as_posix()
        return None

    def round_time(ds):
        """Floor the time coordinate to the day so all cycles of a day share one timestamp."""
        ds.coords['time'] = ds.coords['time'].dt.floor('D')
        return ds

    def open_field(filenames, filter_by_keys, rename=None):
        """Open one GRIB message type across ``filenames``, stacked along ``time``."""
        field = xr.open_mfdataset(
            filenames, engine='cfgrib',
            backend_kwargs={'indexpath': ''},
            drop_variables=['latitude', 'longitude', 'valid_time', 'step'],
            filter_by_keys=filter_by_keys,
            preprocess=round_time,
            concat_dim='time', combine='nested', parallel=True)
        return field.rename(rename) if rename else field

    fnamest12 = []
    fnamest00 = []
    for day in date_range:
        # 2016-08-05 is pinned to the 10z cycle (special case kept from the
        # previous implementation, which overrode the cycle on that day).
        midday_cycles = [10] if f"{day:%Y%m%d}" == "20160805" else [12, 11, 10]
        for cycles, bucket in ((midday_cycles, fnamest12), ([0, 1, 2], fnamest00)):
            path = first_available(day, cycles)
            if path is not None:
                bucket.append(path)

    # Fail early with an explicit message instead of an opaque backend error.
    if not fnamest12 or not fnamest00:
        raise FileNotFoundError(
            f"No HRRR files found under {HRRR_DIR} for the requested date range")

    # (file list, cfgrib message filter, optional variable rename)
    fields = [
        # Temperature T12
        (fnamest12,
         {'stepType': 'instant', 'typeOfLevel': 'surface', 'shortName': 't'},
         {'t': 't12'}),
        # U component of wind
        (fnamest12,
         {'stepType': 'instant', 'typeOfLevel': 'heightAboveGround', 'shortName': 'u'},
         None),
        # V component of wind
        (fnamest12,
         {'stepType': 'instant', 'typeOfLevel': 'heightAboveGround', 'shortName': 'v'},
         None),
        # Water equivalent of accumulated snow depth
        (fnamest12,
         {'stepType': 'instant', 'typeOfLevel': 'surface', 'shortName': 'sdwe'},
         None),
        # Precipitable water
        (fnamest12,
         {'stepType': 'instant', 'typeOfLevel': 'atmosphereSingleLayer', 'shortName': 'pwat'},
         None),
        # Maximum/Composite radar reflectivity
        (fnamest12,
         {'stepType': 'instant', 'typeOfLevel': 'atmosphere', 'shortName': 'refc'},
         None),
        # Temperature T00
        (fnamest00,
         {'stepType': 'instant', 'typeOfLevel': 'surface', 'shortName': 't'},
         {'t': 't00'}),
        # Water equivalent of accumulated snow depth - Day accumulated
        (fnamest00,
         {'stepType': 'accum', 'typeOfLevel': 'surface', 'shortName': 'sdwe'},
         {'sdwe': 'sdwea'}),
    ]

    # Merge all fields, then put every requested day on the time axis.
    ds = xr.merge(
        [open_field(*spec) for spec in fields]
    ).reindex({'time': date_range})

    # Attach the projection coordinates so the points can be selected by value.
    ds['x'] = proj_x
    ds['y'] = proj_y

    return ds.sel(x=mid_x, y=mid_y, method="nearest")

In [None]:
# Create the folder for the monthly HRRR chunks (no error if it already exists):
os.makedirs(f"{OUTPUT_DIR}/hrrr", exist_ok=True)

In [None]:
# Extract the HRRR points month by month and store each month as its own file.
for i in tq.trange(months):
    month_start = start_date + relativedelta(months=i)
    # Last day of the month. An explicit inclusive end replaces
    # `closed='left'`, which newer pandas releases no longer accept.
    month_end = start_date + relativedelta(months=i + 1) - timedelta(days=1)

    ds = get_points(pd.date_range(month_start, month_end, freq='1D'))
    # Save to file:
    ds.to_netcdf(
        f"{OUTPUT_DIR}/hrrr/hrrr_{month_start:%Y%m}.nc",
        format="NETCDF4", engine='netcdf4')
    ds.close()
    del ds

## Gathering MODIS data

In [None]:
# Cell bounding boxes in the MODIS sinusoidal projection.
bounds = grid_cells.to_crs(PROJ4MODIS).geometry.bounds

# Data slice size: rx columns by ry rows of pixels per cell.
rx, ry = 5, 3

# Affine transform values: x = a * col + b and y = c * row + d.
# NOTE(review): presumably the transform of the mosaic opened in get_data; confirm.
a, _, b, _, c, d = 463.31271652791725, 0.0, -11119505.196667, 0.0, -463.31271652750013, 5559752.598333

# Fractional pixel position of each cell's upper-left corner.
rowsn = (bounds.maxy.values - d ) / c
colsn = (bounds.minx.values - b ) / a

# First pixel column / row of every cell's window.
first_col = np.floor(colsn).astype(int)
first_row = np.floor(rowsn).astype(int)

# Index tables of shape (n_cells, rx) and (n_cells, ry), built by broadcasting
# instead of one np.arange per cell.
col_idx = first_col[:, None] + np.arange(rx)
row_idx = first_row[:, None] + np.arange(ry)

# Flat point-wise indexers: for every cell, ry rows of rx consecutive columns
# (row-major), so the selection can be reshaped to (cell, ry, rx).
xs = xr.DataArray(np.tile(col_idx, (1, ry)).flatten())

ys = xr.DataArray(np.repeat(row_idx, rx, axis=-1).flatten())

In [None]:
def flatten(outter):
    """Concatenate the items of every inner iterable into one flat list."""
    flat = []
    for sublist in outter:
        flat.extend(sublist)
    return flat

def get_data(day,
             x=xs, y=ys,
             rx=5, ry=3,
             cell_id = grid_cells.cell_id.values,
             product = PRODUCT,
             variable = 'NDSI',
             data_dir = MODIS_DIR):
    """Read one day of a MODIS variable as a small pixel window per grid cell.

    Parameters
    ----------
    day : datetime-like
        Day to read; formatted as ``%Y%j`` in the tile file pattern.
    x, y : xarray.DataArray
        Flat point-wise column / row indexers holding ``rx * ry`` points per
        cell in row-major order.
    rx, ry : int
        Window width (columns) and height (rows) in pixels.
    cell_id : array-like
        Labels of the ``cell_id`` coordinate.
    product, variable : str
        MODIS product name and the variable to read from it.
    data_dir : path-like
        Root folder of the downloaded tiles.

    Returns
    -------
    xarray.Dataset
        ``variable`` with dims ``(cell_id, time, x, y)`` and a single time
        step. When four or fewer tiles are found the values are all NaN.

    Notes
    -----
    The array is reshaped to ``(cell, 1, ry, rx)``, so the axis labelled
    ``"x"`` has length ``ry`` and ``"y"`` has length ``rx``. The labels are
    kept as they are for compatibility with files already written.
    """
    # Tiles (h, v) that make up the mosaic.
    tiles = [(8,4),(8,5),(9,4),(9,5),(10,4),(10,5)]

    # filenames for reading
    filenames = flatten([
        glob(f"{data_dir}/{product}/{h:0>2d}/{v:0>2d}/{day:%Y%j}/{product}.A{day:%Y%j}.*.hdf")
        for h, v in tiles])

    if len(filenames) > 4:
        xds = xr.open_mfdataset(filenames, engine='rasterio', variable=variable)
        # Use the x / y arguments; the previous code silently ignored them
        # and always read the module-level indexers.
        values = xds[variable].isel(x=x, y=y).data.reshape(-1, 1, ry, rx)
    else:
        # Not enough files for reading: placeholder filled with NaN.
        values = np.full((cell_id.shape[0], 1, ry, rx), np.nan, dtype=np.float32)

    return xr.Dataset(
        data_vars = {
            variable : (["cell_id", "time", "x", "y"], values)
        },
        coords = dict(
            cell_id = cell_id,
            time = pd.date_range(day, day)
        ),
    )

In [None]:
# Create the folder for the monthly MODIS chunks (no error if it already exists):
os.makedirs(f"{OUTPUT_DIR}/modis", exist_ok=True)

In [None]:
# Read the MODIS windows day by day and store each month as its own file.
for i in tq.trange(months):
    month_start = start_date + relativedelta(months=i)
    # Last day of the month. An explicit inclusive end replaces
    # `closed='left'`, which newer pandas releases no longer accept.
    month_end = start_date + relativedelta(months=i + 1) - timedelta(days=1)

    ds = xr.concat(
        [get_data(day) for day in pd.date_range(month_start, month_end, freq='1D')],
        dim='time'
    )
    # Save to file:
    ds.to_netcdf(
        f"{OUTPUT_DIR}/modis/{PRODUCT}_{month_start:%Y%m}.nc",
        format="NETCDF4", engine='netcdf4')

# Combine all

In [None]:
# Load the monthly HRRR chunks as one dataset
ds = xr.open_mfdataset(f"{OUTPUT_DIR}/hrrr/hrrr_*.nc", engine='netcdf4')
# Add cell id information
ds = ds.assign_coords(cell_id=grid_cells.cell_id.values)
# Remove coordinates that do not index a dimension.
# `drop_vars` replaces the deprecated `Dataset.drop`.
ds = ds.drop_vars([name for name in ds.coords if name not in ds.dims])

# Load NDSI
ndsi = xr.open_mfdataset(f"{OUTPUT_DIR}/modis/{PRODUCT}_*.nc", engine='netcdf4')
ndsi = ndsi.transpose("time", "cell_id", "x", "y")
# Merge datasets: forward-fill NDSI along time, set what is still missing
# to 0, then average every cell's pixel window.
ds = xr.merge([ds, ndsi.ffill('time').fillna(0).reduce(np.nanmean, ("x", "y"))])

## Add Sunlight Duration (minutes)

In [None]:
# Per-cell latitude terms for the sunlight-duration formula.
# NOTE(review): assumes the grid geometries are in geographic degrees; confirm.
cell_bounds = grid_cells.geometry.bounds

# Centre of each cell's bounding box.
grid_cells['lat'] = (cell_bounds['maxy'] + cell_bounds['miny']) / 2
grid_cells['lon'] = (cell_bounds['maxx'] + cell_bounds['minx']) / 2

lat_rad = np.pi * grid_cells['lat'] / 180
grid_cells['lat_rad'] = lat_rad
grid_cells['tan_lat'] = np.tan(lat_rad)
# cos(90.833 deg) / cos(lat): the latitude-only factor of the first term.
grid_cells['k_cos'] = np.cos(np.pi * 90.833 / 180) / np.cos(lat_rad)

In [None]:
# Load the CSV file with sun declination information, indexed by the parsed
# dates of its first column.
# NOTE(review): this path starts with "../" while GRID_FILE and OUTPUT_DIR
# point at "development/" under the working directory; confirm which is right.
sun_decline = pd.read_csv(
    "../development/sun_decline.csv",
    index_col=[0],
    parse_dates=[0],
)

In [None]:
# Calculate the sunlight duration for every (day, cell) pair.
time_idx = ds.time.values

# Per-day declination terms as a column, shape (time, 1).
# NOTE(review): "cos-1_decl" is presumably 1 / cos(declination); confirm.
inv_cos_decl = sun_decline.loc[time_idx, "cos-1_decl"].values[:, None]
tan_decl = sun_decline.loc[time_idx, "tan_decl"].values[:, None]

# Per-cell latitude terms as a row, shape (1, cell).
k_cos = grid_cells.loc[:, "k_cos"].values[None]
tan_lat = grid_cells.loc[:, "tan_lat"].values[None]

# Cosine of the hour angle, broadcast to (time, cell).
sun_duration = k_cos * inv_cos_decl - tan_lat * tan_decl
# Hour angle in degrees (180 / pi) times 8 gives the duration.
sun_duration = 8 * 180 * np.arccos(sun_duration) / np.pi
sun_duration = sun_duration.astype(np.float32)

In [None]:
# Metadata attached to the sunlight-duration variable.
attrs = dict(
    long_name="Sunlight Duration",
    shortName="sd",
    units="minutes per day",
    reference="https://gml.noaa.gov/grad/solcalc/calcdetails.html",
)

## Add static data: Copernicus DEM and FAO-UNESCO Global Soil Regions Map

In [None]:
# Open the static rasters (elevation and soil regions) for windowed reads.
demtiff = rio.open(DEM_FILE)
soiltif = rio.open(SOIL_FILE)

In [None]:
images_dem = []
images_soil = []
# Bin edges for the raw soil codes; np.digitize(..., right=True) replaces each
# code by the index of the first edge that is >= the code.
bins = np.array([0, 1, 2, 3, 4, 10, 14, 20, 29, 39, 49, 59, 69, 79, 84, 94])

# Only the geometry is needed, so iterate it directly rather than whole rows.
for geometry in grid_cells.geometry:
    cell_bounds = geometry.bounds

    # Elevation window of the cell, resampled to 10 x 10 pixels.
    image_dem = demtiff.read(
        1, window=demtiff.window(*cell_bounds), out_shape=(10, 10))
    images_dem.append(image_dem)

    # Soil window of the cell, resampled to 10 x 10 pixels and binned.
    image_soil = soiltif.read(
        1, window=soiltif.window(*cell_bounds), out_shape=(10, 10))
    image_soil = np.digitize(image_soil, bins, right=True)
    images_soil.append(image_soil)

# All windows are in memory now; release the raster file handles.
demtiff.close()
soiltif.close()

In [None]:
# Stack the per-cell windows into (cell, 10, 10) arrays with fixed dtypes:
# float32 for elevation, int64 for the soil bins.
images_dem, images_soil = (
    np.stack(images).astype(dtype)
    for images, dtype in ((images_dem, np.float32), (images_soil, np.int64))
)

In [None]:
# Attach sunlight duration (per day and cell) and the static per-cell images.
ds = ds.assign(
    sd=(['time', 'cell_id'], sun_duration, attrs),
    dem=(["cell_id", "x", "y"], images_dem),
    soil=(["cell_id", "x", "y"], images_soil),
)

## Save file

In [None]:
# Drop leftover GRIB level / time coordinates; names that are absent are
# skipped. `drop_vars` replaces the deprecated `Dataset.drop`.
ds = ds.drop_vars(
    ['atmosphereSingleLayer', 'heightAboveGround',
     'atmosphere', 'surface', 'valid_time'], errors='ignore')

In [None]:
# Write the combined dataset; sunlight duration is stored as float32.
ds.to_netcdf(
    f"{OUTPUT_DIR}/train_dataset.nc",
    format="NETCDF4",
    engine='netcdf4',
    encoding=dict(sd=dict(dtype="float32")),
)

In [None]:
# Release the files backing the combined dataset.
ds.close()