In [None]:
# These system libraries are needed to build the pygrib library in Colab.
# Note that this step is only needed if you install pygrib using pip.
# If you use conda, the libraries will be installed automatically.
! apt-get install libeccodes-dev libproj-dev

# Install the python packages
! pip install pyproj
! pip install pygrib

# Uninstall existing shapely
# We will re-install shapely in the next step by ignoring the binary
# wheels to make it compatible with other modules that depend on 
# GEOS, such as Cartopy (used here).
!pip uninstall --yes shapely

# To install cartopy in Colab using pip, we need to install the library 
# dependencies first.

!apt-get install -qq libgdal-dev libgeos-dev
!pip install shapely --no-binary shapely
!pip install cfgrib

In [None]:
# Mount Google Drive so that files under /content/drive are readable and writable.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import json
import pandas as pd
import numpy as np
import io
from datetime import date, datetime, timedelta
import tempfile

import xarray as xr
import requests

# Not used directly, but used via xarray
import cfgrib

## Import Base Data Files

In [None]:
# Most recent date (YYYY-MM-DD) to pull HRRR data for; also used in the output file name.
run_date = '2022-06-30'

In [None]:
# Load the base tables from Drive.
ground_measures_metadata = pd.read_csv('/content/drive/MyDrive/snocast/eval/data/ground_measures_metadata.csv')
submission_format = pd.read_csv('/content/drive/MyDrive/snocast/eval/data/submission_format.csv')
# Number of days of HRRR data to pull, counting back from run_date (run_date included).
lookback = 3

In [None]:
# Get the centre latitude/longitude and bounding box of every grid cell.
# The file is opened in a `with` block so the handle is closed as soon as
# the GeoJSON has been parsed.
with open('/content/drive/MyDrive/snocast/eval/data/grid_cells.geojson') as f:
    grid_cells = json.load(f)
print('length grid_cells features: ', len(grid_cells['features']))

ids = []
lats = []
lons = []
bboxes = []

for grid_cell in grid_cells['features']:
    cell_id = grid_cell['properties']['cell_id']
    # First ring of the cell geometry: a sequence of [lon, lat] vertices.
    coordinates = grid_cell['geometry']['coordinates'][0]
    # NOTE(review): if the ring repeats its first vertex to close the polygon,
    # that vertex is counted twice in this mean and the centre is slightly
    # biased towards it -- confirm whether that matters downstream.
    lon, lat = np.mean(coordinates, axis=0)
    northeast_corner = np.max(coordinates, axis=0)
    southwest_corner = np.min(coordinates, axis=0)
    # bbox = [min_lon, min_lat, max_lon, max_lat]
    bbox = np.concatenate([southwest_corner, northeast_corner])
    ids.append(cell_id)
    lats.append(lat)
    lons.append(lon)
    bboxes.append(bbox)

# One row per grid cell; `bbox` holds the 4-element array built above.
grid_cells_pd = pd.DataFrame({'location_id': ids,
                              'latitude': lats,
                              'longitude': lons,
                              'bbox': bboxes})

## Get NOAA HRRR Data
The NOAA HRRR (High-Resolution Rapid Refresh) is a real-time, 3 km resolution, hourly updated, cloud-resolving, convection-allowing atmospheric model, initialized by 3 km grids with 3 km radar assimilation.

In [None]:
# Dates to pull, as YYYY-MM-DD strings: run_date first, then each earlier
# day, for `lookback` days in total.
max_date = datetime.strptime(run_date, '%Y-%m-%d')
date_list = [
    (max_date - timedelta(days=days_back)).strftime('%Y-%m-%d')
    for days_back in range(lookback)
]

In [None]:
# Show the dates that will be pulled (most recent first).
date_list

In [None]:
# Constants for creating the full URL.
# The HRRR archive is read from Azure Blob Storage. An AWS S3 bucket
# (https://noaa-hrrr-bdp-pds.s3.amazonaws.com) presumably hosts the same
# files -- verify the path layout and error responses before switching.
blob_container = "https://noaahrrr.blob.core.windows.net/hrrr"
sector = "conus"
cycle = 12          # model run hour; appears as t{cycle}z in the file name (12:00 UTC run)
forecast_hour = 0   # offset from cycle time
product = "wrfsfcf" # 2D surface levels

# Put it all together
file_path = f"hrrr.t{cycle:02}z.{product}{forecast_hour:02}.grib2"

# Folder names use the date without dashes (YYYYMMDD); preview the most recent date.
hrrr_day = date_list[0].replace("-","")
url = f"{blob_container}/hrrr.{hrrr_day}/{sector}/{file_path}"

print(url)

In [None]:
# Fetch the idx file by appending the .idx file extension to our already formatted URL
r = requests.get(f"{url}.idx")
# An error response still has a text body; flag it so it is not mistaken for
# index content in the preview below. This is only a warning so that the
# rest of the notebook can still run.
if not r.ok:
    print(f"warning: index request failed with HTTP {r.status_code}: {url}.idx")
url_idx = r.text.splitlines()

# Take a peek at the content of the index
print(*url_idx[0:10], sep="\n")

In [None]:
# Inventory of the HRRR surface product:
# https://www.nco.ncep.noaa.gov/pmb/products/hrrr/hrrr.t00z.wrfsfcf02.grib2.shtml
#
# Each entry is (substring that identifies the message in the .idx file,
#                name of the variable read from the opened dataset).
metrics = [
    # temperature [K]
    (":TMP:surface:anl", "t"),
    # snow depth [m]
    (":SNOD:surface:anl", "sde"),
    # water equivalent of accumulated snow depth [kg/m^2]
    (":WEASD:surface:anl", "sdwe"),
    # specific humidity [kg/kg]
    (":SPFH:2 m above ground:anl:", "sh2"),
    # snow cover [%]
    (":SNOWC:surface:anl:", "snowc"),
    # composite reflectivity [dB]
    (":REFC:entire atmosphere:anl:", "refc"),
    # pressure [Pa]
    (":PRES:surface:anl:", "sp"),
    # precipitable water [kg/m^2]
    (":PWAT:entire atmosphere (considered as a single layer):anl:", "pwat"),
]

In [None]:
def get_metric_ds(metric, url_idx, grib_url=None):
  """Download a single GRIB2 message and open it as an xarray Dataset.

  Args:
    metric: Substring identifying the wanted message in the index,
      e.g. ":TMP:surface:anl". The first index line containing it is used.
    url_idx: Lines of the `.idx` file. Each line is colon-separated and its
      second field is the byte offset at which that message starts.
    grib_url: URL of the GRIB2 file the index belongs to. When omitted, the
      notebook-level variable `url` is used (the original behaviour).

  Returns:
    The message as an xarray Dataset, fully loaded into memory.

  Raises:
    IndexError: if no index line contains `metric`.
    requests.HTTPError: if the ranged download does not succeed.
  """
  if grib_url is None:
    # Fall back to the global set by the calling cell.
    grib_url = url

  matches = [i for i, line in enumerate(url_idx) if metric in line]
  if not matches:
    raise IndexError(f"no index line matches metric {metric!r}")
  pos = matches[0]

  # Pluck the byte offset from this line, plus the beginning offset of the next line
  range_start = url_idx[pos].split(":")[1]
  if pos + 1 < len(url_idx):
    # HTTP byte ranges are inclusive, so stop one byte before the next message.
    range_end = str(int(url_idx[pos + 1].split(":")[1]) - 1)
  else:
    # Last message in the file: an empty end means "through end of file".
    range_end = ""

  headers = {"Range": f"bytes={range_start}-{range_end}"}
  resp = requests.get(grib_url, headers=headers, stream=True)
  # Surface HTTP errors here instead of handing an error page to cfgrib.
  resp.raise_for_status()

  # cfgrib reads from a file on disk. Write the message into a temporary
  # directory, load the data into memory, and let the directory (and file)
  # be removed on exit so no temp files are left behind.
  with tempfile.TemporaryDirectory(prefix="tmp_") as tmp_dir:
    grib_path = f"{tmp_dir}/message.grib2"
    with open(grib_path, "wb") as f:
      f.write(resp.content)

    with xr.open_dataset(grib_path, engine='cfgrib',
                         backend_kwargs={'indexpath':''}) as lazy_ds:
      ds = lazy_ds.load()

  return ds

In [None]:
# Only the cell id and bounding box are needed for the spatial lookup in the pull loop.
unique_ids = grid_cells_pd[['location_id','bbox']]

In [None]:
# Accumulator for output rows. The first entry is the header row, later used as
# the DataFrame column names; the metric columns follow the order of `metrics`.
all_data = [['location_id','date','TMP','SNOD','WEASD','SPFH','SNOWC','REFC','PRES','PWAT']]

In [None]:
# 45 min. per day
# For every date: find the latest available HRRR cycle, download each metric,
# and average it over every grid cell's (padded) bounding box.

# Expand the lat lon bounds of the search (in degrees) to ensure we get data
expand_search = 0.025

for pull_date in date_list:
  print(pull_date)
  hrrr_date = pull_date.replace('-','')

  # Walk backwards from the 16z cycle down to 00z until an index file is found.
  url_found = False
  for cycle in range(16, -1, -1):
    file_path = f"hrrr.t{cycle:02}z.{product}{forecast_hour:02}.grib2"
    url = f"{blob_container}/hrrr.{hrrr_date}/{sector}/{file_path}"
    # Fetch the idx file by appending the .idx file extension to our already formatted URL
    r = requests.get(f"{url}.idx")
    url_idx = r.text.splitlines()
    # A missing file shows up as a non-2xx status, an empty body, or an XML
    # error document. Check all three rather than one exact XML prolog
    # string, whose casing/encoding differs between storage services.
    is_missing = (
        not r.ok
        or not url_idx
        or url_idx[0].lstrip('\ufeff').lower().startswith('<?xml')
    )
    if is_missing:
      print(f'bad url: {url}')
      continue
    url_found = True
    break

  if not url_found:
    print(f'no HRRR file found for {pull_date}; skipping')
    continue

  # get_metric_ds reads the notebook-level `url` assigned in the search above.
  ds_list = []
  for m in metrics:
    ds_list.append((get_metric_ds(m[0], url_idx), m[1]))

  # The grids do not change from cell to cell, so pull the arrays out once.
  # noaa hrrr longitude values are stored as degrees east so we need to subtract 360
  grids = [
      (ds[m].latitude.values, ds[m].longitude.values - 360, ds[m].values)
      for ds, m in ds_list
  ]

  for idx, row in unique_ids.iterrows():
    if idx % 5000 == 0:
      print(idx)
    cell_id = row['location_id']
    row_list = [cell_id, pull_date]
    min_lon, min_lat, max_lon, max_lat = row['bbox']
    for lat_grid, lon_grid, values in grids:
      lat_values = (lat_grid < max_lat + expand_search) & (lat_grid > min_lat - expand_search)
      lon_values = (lon_grid < max_lon + expand_search) & (lon_grid > min_lon - expand_search)
      mask = lat_values & lon_values
      # Mean over all grid points inside the padded box (NaN if none fall inside).
      m_value = values[mask].mean()
      row_list.append( m_value )
    all_data.append(row_list)

In [None]:
# Build the result table: the first entry of all_data is the header, the rest are rows.
climate_df = pd.DataFrame(all_data[1:], columns=all_data[0])

In [None]:
# Save the result to Drive as a parquet file named after run_date.
climate_df.to_parquet(f'/content/drive/MyDrive/snocast/eval/data/hrrr/climate_{run_date}.parquet')

In [None]:
# Preview the first rows, ordered by location and date.
climate_df.sort_values(['location_id','date']).head(10)

In [None]:
# (rows, columns) of the result table.
climate_df.shape