In [None]:
# System libraries needed by the pygrib library in Colab.
# Note that this step is only needed if you install pygrib using pip.
# If you use conda, the libraries will be installed automatically.
! apt-get install libeccodes-dev libproj-dev

# Install the python packages
! pip install pyproj
! pip install pygrib

# Uninstall the existing shapely.
# We re-install shapely in the next step while ignoring the binary
# wheels, to make it compatible with other modules that depend on
# GEOS, such as Cartopy.

!pip uninstall --yes shapely

# To install cartopy in Colab using pip, we need to install the library
# dependencies first.

!apt-get install -qq libgdal-dev libgeos-dev
!pip install shapely --no-binary shapely
!pip install cfgrib

In [None]:
# Mount Google Drive at /content/drive (Colab only); the data paths used
# later in this notebook live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os
import json
import pandas as pd
import numpy as np
from collections import defaultdict
import io
from datetime import date, datetime, timedelta
import tempfile

import xarray as xr
import requests
import matplotlib.pyplot as plt


# Not used directly, but used via xarray
import cfgrib

In [None]:
data_dir = '/content/drive/MyDrive/snocast/train/data'


def _load_csv(relative_path):
  """Read a CSV located at `relative_path` below `data_dir`."""
  return pd.read_csv(os.path.join(data_dir, relative_path))


def _to_long(wide, id_col):
  """Melt a wide table (one column per date) into id/date/swe rows, dropping missing swe."""
  long_form = wide.melt(id_vars=[id_col], var_name="date", value_name="swe")
  return long_form.dropna()


# Ground measurement stations: the first CSV column is the station id.
ground_measures_train = _load_csv('static/ground_measures_train_features.csv')
ground_measures_train.columns = ['station_id', *ground_measures_train.columns[1:]]
gm_melt_train = _to_long(ground_measures_train, "station_id")

ground_measures_test = _load_csv('static/ground_measures_test_features.csv')
ground_measures_test.columns = ['station_id', *ground_measures_test.columns[1:]]
gm_melt_test = _to_long(ground_measures_test, "station_id")

# Combine train and test station readings, then attach the station metadata.
ground_measures_metadata = _load_csv('static/ground_measures_metadata.csv')
ground_measures_all = ground_measures_train.merge(ground_measures_test, how='outer', on='station_id')
gm_melt_all = _to_long(ground_measures_all, "station_id")
gm_seq = gm_melt_all.merge(ground_measures_metadata, how='inner', on='station_id')

# Grid-cell labels, one row per (cell_id, date) with a non-missing swe value.
train_labels = _load_csv('static/train_labels.csv')
labels_melt_train = _to_long(train_labels, "cell_id")

test_labels = _load_csv('static/labels_2020_2021.csv')
labels_melt_test = _to_long(test_labels, "cell_id")

## Data Transform

In [None]:
# get latitude longitude for train and test grids
# Use a context manager so the geojson file handle is always closed.
with open(os.path.join(data_dir, 'static/grid_cells.geojson')) as f:
  grid_cells = json.load(f)
print('length grid_cells features: ', len(grid_cells['features']))

# cell_id -> {'coordinates': polygon corners, 'region': region name}
grid_features = defaultdict(dict)
for grid_cell in grid_cells['features']:
  cell_id = grid_cell['properties']['cell_id']
  coordinates = grid_cell['geometry']['coordinates'][0]
  region = grid_cell['properties']['region']
  # The first vertex of the ring is skipped
  # (presumably because a closed ring repeats it as the last vertex -- TODO confirm).
  grid_features[cell_id] = {'coordinates': coordinates[1:],
                            'region': region}


def _describe_cells(cell_ids):
  """Collect centroid, region and bounding box for each labelled grid cell.

  Args:
    cell_ids: iterable of cell ids that must exist in `grid_features`.

  Returns:
    Five parallel lists: ids, latitudes, longitudes, regions and bboxes,
    where each bbox is np.array([min_lon, min_lat, max_lon, max_lat]).
  """
  ids, lats, lons, regions, bboxes = [], [], [], [], []
  for cell_id in cell_ids:
    corners = np.asarray(grid_features[cell_id]['coordinates'])
    lon, lat = corners.mean(axis=0)
    min_lon, min_lat = corners.min(axis=0)
    max_lon, max_lat = corners.max(axis=0)
    ids.append(cell_id)
    lats.append(lat)
    lons.append(lon)
    # Append per cell; assigning here would leave a single scalar that pandas
    # broadcasts to every row of the region column.
    regions.append(grid_features[cell_id]['region'])
    bboxes.append(np.array([min_lon, min_lat, max_lon, max_lat]))
  return ids, lats, lons, regions, bboxes


train_ids, train_lats, train_lons, train_regions, train_bboxes = _describe_cells(
    train_labels['cell_id'].values)
test_ids, test_lats, test_lons, test_regions, test_bboxes = _describe_cells(
    test_labels['cell_id'].values)

# Tag every cell with the label set(s) it belongs to and split accordingly.
# Cells that appear in neither label file get no tag and are skipped.
grid_features_train = defaultdict(dict)
grid_features_test = defaultdict(dict)
train_id_set = set(train_ids)
test_id_set = set(test_ids)
for cell_id, feature in grid_features.items():
  in_train = cell_id in train_id_set
  in_test = cell_id in test_id_set
  if in_train and in_test:
    feature['dataset'] = 'both'
  elif in_train:
    feature['dataset'] = 'train'
  elif in_test:
    feature['dataset'] = 'test'
  if in_test:
    grid_features_test[cell_id] = feature
  if in_train:
    grid_features_train[cell_id] = feature
print("test count: ", len(grid_features_test))
print("train count: ", len(grid_features_train))


train_lat_lon = pd.DataFrame({'cell_id': train_ids,
                              'latitude': train_lats,
                              'longitude': train_lons,
                              'region': train_regions,
                              'bbox': train_bboxes})
test_lat_lon = pd.DataFrame({'cell_id': test_ids,
                             'latitude': test_lats,
                             'longitude': test_lons,
                             'region': test_regions,
                             'bbox': test_bboxes})

In [None]:
# Create sequential dataframes for train and test:
# join each (cell_id, date, swe) label row with the cell's location columns
# and add a parsed `datetime` column next to the raw `date` string.
train_label_seq = (
    labels_melt_train
    .merge(train_lat_lon, how='inner', on='cell_id')
    .assign(datetime=lambda seq: pd.to_datetime(seq['date']))
)

test_label_seq = (
    labels_melt_test
    .merge(test_lat_lon, how='inner', on='cell_id')
    .assign(datetime=lambda seq: pd.to_datetime(seq['date']))
)

In [None]:
def get_bbox(row, expand):
  """Return a square bounding box centred on the row's point.

  Args:
    row: mapping-like object with 'latitude' and 'longitude' entries.
    expand: half-width of the box, in the same units as the coordinates.

  Returns:
    [min_lon, min_lat, max_lon, max_lat] as a list.
  """
  center_lat = row['latitude']
  center_lon = row['longitude']
  lower_corner = [center_lon - expand, center_lat - expand]
  upper_corner = [center_lon + expand, center_lat + expand]
  return lower_corner + upper_corner

In [None]:
# Create sequential swe by latitude and longitude for ground measure stations.
# Each station is a point, so give it a small square bbox around its location.
station_half_width = 0.005
station_coords = gm_seq[['latitude', 'longitude']]
gm_seq['bbox'] = station_coords.apply(get_bbox, axis=1, args=(station_half_width,))
gm_seq['datetime'] = pd.to_datetime(gm_seq['date'])

## Get NOAA HRRR Data
The NOAA HRRR is a real-time 3km resolution, hourly updated, cloud-resolving, convection-allowing atmospheric model, initialized by 3km grids with 3km radar assimilation.

In [None]:
# Constants for creating the full URL
# HRRR archive bucket on AWS. This cell previously assigned the Azure
# container "https://noaahrrr.blob.core.windows.net/hrrr" first and then
# immediately overwrote it; only the value actually used is kept.
blob_container = "https://noaa-hrrr-bdp-pds.s3.amazonaws.com"
sector = "conus"
# Use `datetime` rather than `date.today()` so this cell still runs if the
# name `date` has been rebound elsewhere in the notebook session.
yesterday = datetime.now().date() - timedelta(days=1)
cycle = 16          # time to query
forecast_hour = 0   # offset from cycle time
product = "wrfsfcf" # 2D surface levels

# Put it all together
file_path = f"hrrr.t{cycle:02}z.{product}{forecast_hour:02}.grib2"

url = f"{blob_container}/hrrr.{yesterday:%Y%m%d}/{sector}/{file_path}"

print(url)

In [None]:
# Fetch the idx file by appending the .idx file extension to our already formatted URL
r = requests.get(f"{url}.idx", timeout=60)
# Fail loudly on a missing or forbidden object instead of silently treating
# the server's error body as if it were the index.
r.raise_for_status()
url_idx = r.text.splitlines()

# Take a peek at the content of the index
print(*url_idx[0:10], sep="\n")

In [None]:
# Field inventory reference:
# https://www.nco.ncep.noaa.gov/pmb/products/hrrr/hrrr.t00z.wrfsfcf02.grib2.shtml
#
# Each entry is (substring to look for in an .idx line, variable name used to
# read the field from the opened dataset). The order matches the data columns
# written by the sampling cells below.
metrics = [
    # temperature [K]
    (":TMP:surface:anl", "t"),
    # snow depth [m]
    (":SNOD:surface:anl", "sde"),
    # water equivalent of accumulated snow depth [kg/m^2]
    (":WEASD:surface:anl", "sdwe"),
    # specific humidity [kg/kg]
    (":SPFH:2 m above ground:anl:", "q"),
    # snow cover [%]
    (":SNOWC:surface:anl:", "snowc"),
    # composite reflectivity [dB]
    (":REFC:entire atmosphere:anl:", "refc"),
    # pressure [Pa]
    (":PRES:surface:anl:", "sp"),
    # precipitable water [kg/m^2]
    (":PWAT:entire atmosphere (considered as a single layer):anl:", "pwat"),
]

# Candidate fields that are currently not downloaded:
#   (":ASNOW:surface:0-0 day acc fcst:", "asnow")  total snowfall [m]
#   (":CSNOW:surface:anl:", "csnow")               categorical snow [-]
#   (":ICEC:surface:anl:", "ci")                   ice cover
#   (":TCDC:entire atmosphere:anl:"                cloud cover
#   (":APCP:surface:", "tp")                       total precipitation [kg/m^2]
#   (":PRATE:surface:anl:", "prate")               precipitation rate [kg/m^2/s]
#   (":SSRUN")
#   (":BGRUN")

In [None]:
def get_metric_ds(metric, url_idx, grib_url=None):
  """Download a single GRIB2 message and return it as an in-memory dataset.

  Args:
    metric: substring identifying the wanted line of the index,
      e.g. ":TMP:surface:anl".
    url_idx: lines of the ``.idx`` file that belongs to the GRIB2 file. Each
      line is colon-separated, starting with the 1-based message number and
      the byte offset at which the message starts.
    grib_url: URL of the GRIB2 file. When omitted, the notebook-level variable
      ``url`` is used, which is what existing two-argument callers rely on.

  Returns:
    xarray.Dataset with the message's data loaded into memory.

  Raises:
    IndexError: if no index line contains ``metric``.
    requests.HTTPError: if the server answers the range request with an error.
  """
  if grib_url is None:
    grib_url = url  # notebook-level URL set by the calling cell

  matching_lines = [line for line in url_idx if metric in line]
  if not matching_lines:
    raise IndexError(f"no index entry matching {metric!r}")
  metric_idx = matching_lines[0].split(":")

  # Pluck the byte offset from this line, plus the beginning offset of the next line
  line_num = int(metric_idx[0])
  range_start = int(metric_idx[1])
  # The line number values are 1-indexed, so line_num is already the list index
  # of the next entry; check we're not already reading the last line.
  if line_num < len(url_idx):
    next_start = int(url_idx[line_num].split(":")[1])
    # HTTP byte ranges are inclusive, so stop one byte before the next message.
    byte_range = f"bytes={range_start}-{next_start - 1}"
  else:
    # Last message: open-ended range up to the end of the file.
    byte_range = f"bytes={range_start}-"

  resp = requests.get(grib_url, headers={"Range": byte_range}, timeout=60)
  resp.raise_for_status()

  # cfgrib reads from a path, so stage the bytes in a temporary file.
  with tempfile.NamedTemporaryFile(prefix="tmp_", delete=False) as f:
    f.write(resp.content)
    tmp_path = f.name

  try:
    # load_dataset reads everything into memory and closes the file, which
    # makes it safe to delete the temporary file right afterwards.
    ds = xr.load_dataset(tmp_path, engine='cfgrib',
                         backend_kwargs={'indexpath': ''})
  finally:
    os.remove(tmp_path)

  return ds

In [None]:
# Sample HRRR fields around every ground-measure station, for each measurement
# date and the two preceding days.
df_seq = gm_seq
gm = True

# First entry is the header row; one record per (location, retrieved day) follows.
all_data = [['location_id','date','TMP','SNOD','WEASD','SPFH','SNOWC','REFC','PRES','PWAT']]

unique_dates = df_seq[['date','datetime']].drop_duplicates().sort_values(['date'])

location_col = 'station_id' if gm else 'cell_id'

# No good climate data earlier than '2015-01-01'
earliest_day = pd.Timestamp('2015-01-01')
# Margin (degrees) added around each bbox when selecting HRRR grid points
expand_search = 0.025

for days in range(3):
  for _, date_row in unique_dates.iterrows():
    # Use the parsed `datetime` column for date arithmetic and formatting; the
    # `date` column holds the raw column labels (strings), which have no
    # strftime, and is only used to match rows.
    label_day = date_row['datetime']
    if label_day < earliest_day:
      continue
    print(f'{label_day:%Y-%m-%d}')
    retrieveday = label_day - timedelta(days=days)
    unique_ids = df_seq[df_seq['date'] == date_row['date']][[location_col,'bbox']]

    # Walk back from the 16z cycle until an index file exists for that day.
    url_found = False
    for cycle in range(16, -1, -1):
      file_path = f"hrrr.t{cycle:02}z.{product}{forecast_hour:02}.grib2"
      url = f"{blob_container}/hrrr.{retrieveday:%Y%m%d}/{sector}/{file_path}"
      # Fetch the idx file by appending the .idx file extension to our already formatted URL
      r = requests.get(f"{url}.idx", timeout=60)
      url_idx = r.text.splitlines()
      # A missing object comes back as an XML error document instead of an index.
      if r.ok and url_idx and not url_idx[0].startswith('<?xml'):
        url_found = True
        break
      print(f'bad url: {url}')

    if not url_found:
      continue

    # Download each field once per day and pull out the arrays a single time;
    # every location below samples the same grids.
    fields = []
    for pattern, var_name in metrics:
      field = get_metric_ds(pattern, url_idx)[var_name]
      # noaa hrrr longitude values are stored as degrees east so we need to subtract 360
      fields.append((field.values,
                     field.latitude.values,
                     field.longitude.values - 360))

    for _, loc_row in unique_ids.iterrows():
      location_id = loc_row[location_col]
      row_list = [location_id, f'{retrieveday:%Y-%m-%d}']
      min_lon, min_lat, max_lon, max_lat = loc_row['bbox']
      for values, lats, lons in fields:
        mask = ((lats < max_lat + expand_search) & (lats > min_lat - expand_search)
                & (lons < max_lon + expand_search) & (lons > min_lon - expand_search))
        # No grid point inside the search window -> record a missing value.
        row_list.append(values[mask].mean() if mask.any() else np.nan)
      all_data.append(row_list)

In [None]:
# The first entry of all_data is the header row; the remaining entries are records.
header, *records = all_data
gm_climate_df = pd.DataFrame(records, columns=header)

In [None]:
# Persist the station-level HRRR features (creating the output folder if needed).
gm_climate_path = os.path.join(data_dir, 'hrrr/gm_climate.parquet')
os.makedirs(os.path.dirname(gm_climate_path), exist_ok=True)
gm_climate_df.to_parquet(gm_climate_path)

In [None]:
# Sample HRRR fields around every training grid cell, for each label date and
# the two preceding days.
df_seq = train_label_seq
gm = False

# First entry is the header row; one record per (location, retrieved day) follows.
all_data = [['location_id','date','TMP','SNOD','WEASD','SPFH','SNOWC','REFC','PRES','PWAT']]

unique_dates = df_seq[['date','datetime']].drop_duplicates().sort_values(['date'])

location_col = 'station_id' if gm else 'cell_id'

# No good climate data earlier than '2015-01-01'
earliest_day = pd.Timestamp('2015-01-01')
# Margin (degrees) added around each bbox when selecting HRRR grid points
expand_search = 0.025

for days in range(3):
  for _, date_row in unique_dates.iterrows():
    # Use the parsed `datetime` column for date arithmetic and formatting; the
    # `date` column holds the raw column labels (strings), which have no
    # strftime, and is only used to match rows.
    label_day = date_row['datetime']
    if label_day < earliest_day:
      continue
    print(f'{label_day:%Y-%m-%d}')
    retrieveday = label_day - timedelta(days=days)
    unique_ids = df_seq[df_seq['date'] == date_row['date']][[location_col,'bbox']]

    # Walk back from the 16z cycle until an index file exists for that day.
    url_found = False
    for cycle in range(16, -1, -1):
      file_path = f"hrrr.t{cycle:02}z.{product}{forecast_hour:02}.grib2"
      url = f"{blob_container}/hrrr.{retrieveday:%Y%m%d}/{sector}/{file_path}"
      # Fetch the idx file by appending the .idx file extension to our already formatted URL
      r = requests.get(f"{url}.idx", timeout=60)
      url_idx = r.text.splitlines()
      # A missing object comes back as an XML error document instead of an index.
      if r.ok and url_idx and not url_idx[0].startswith('<?xml'):
        url_found = True
        break
      print(f'bad url: {url}')

    if not url_found:
      continue

    # Download each field once per day and pull out the arrays a single time;
    # every location below samples the same grids.
    fields = []
    for pattern, var_name in metrics:
      field = get_metric_ds(pattern, url_idx)[var_name]
      # noaa hrrr longitude values are stored as degrees east so we need to subtract 360
      fields.append((field.values,
                     field.latitude.values,
                     field.longitude.values - 360))

    for _, loc_row in unique_ids.iterrows():
      location_id = loc_row[location_col]
      row_list = [location_id, f'{retrieveday:%Y-%m-%d}']
      min_lon, min_lat, max_lon, max_lat = loc_row['bbox']
      for values, lats, lons in fields:
        mask = ((lats < max_lat + expand_search) & (lats > min_lat - expand_search)
                & (lons < max_lon + expand_search) & (lons > min_lon - expand_search))
        # No grid point inside the search window -> record a missing value.
        row_list.append(values[mask].mean() if mask.any() else np.nan)
      all_data.append(row_list)

In [None]:
# The first entry of all_data is the header row; the remaining entries are records.
header, *records = all_data
train_climate_df = pd.DataFrame(records, columns=header)

In [None]:
# Persist the training-cell HRRR features (creating the output folder if needed).
train_climate_path = os.path.join(data_dir, 'hrrr/train_climate.parquet')
os.makedirs(os.path.dirname(train_climate_path), exist_ok=True)
train_climate_df.to_parquet(train_climate_path)

In [None]:
# Sample HRRR fields around every test grid cell, for each label date and the
# two preceding days.
df_seq = test_label_seq
gm = False

# First entry is the header row; one record per (location, retrieved day) follows.
all_data = [['location_id','date','TMP','SNOD','WEASD','SPFH','SNOWC','REFC','PRES','PWAT']]

unique_dates = df_seq[['date','datetime']].drop_duplicates().sort_values(['date'])

location_col = 'station_id' if gm else 'cell_id'

# No good climate data earlier than '2015-01-01'
earliest_day = pd.Timestamp('2015-01-01')
# Margin (degrees) added around each bbox when selecting HRRR grid points
expand_search = 0.025

for days in range(3):
  for _, date_row in unique_dates.iterrows():
    # Use the parsed `datetime` column for date arithmetic and formatting; the
    # `date` column holds the raw column labels (strings), which have no
    # strftime, and is only used to match rows.
    label_day = date_row['datetime']
    if label_day < earliest_day:
      continue
    print(f'{label_day:%Y-%m-%d}')
    retrieveday = label_day - timedelta(days=days)
    unique_ids = df_seq[df_seq['date'] == date_row['date']][[location_col,'bbox']]

    # Walk back from the 16z cycle until an index file exists for that day.
    url_found = False
    for cycle in range(16, -1, -1):
      file_path = f"hrrr.t{cycle:02}z.{product}{forecast_hour:02}.grib2"
      url = f"{blob_container}/hrrr.{retrieveday:%Y%m%d}/{sector}/{file_path}"
      # Fetch the idx file by appending the .idx file extension to our already formatted URL
      r = requests.get(f"{url}.idx", timeout=60)
      url_idx = r.text.splitlines()
      # A missing object comes back as an XML error document instead of an index.
      if r.ok and url_idx and not url_idx[0].startswith('<?xml'):
        url_found = True
        break
      print(f'bad url: {url}')

    if not url_found:
      continue

    # Download each field once per day and pull out the arrays a single time;
    # every location below samples the same grids.
    fields = []
    for pattern, var_name in metrics:
      field = get_metric_ds(pattern, url_idx)[var_name]
      # noaa hrrr longitude values are stored as degrees east so we need to subtract 360
      fields.append((field.values,
                     field.latitude.values,
                     field.longitude.values - 360))

    for _, loc_row in unique_ids.iterrows():
      location_id = loc_row[location_col]
      row_list = [location_id, f'{retrieveday:%Y-%m-%d}']
      min_lon, min_lat, max_lon, max_lat = loc_row['bbox']
      for values, lats, lons in fields:
        mask = ((lats < max_lat + expand_search) & (lats > min_lat - expand_search)
                & (lons < max_lon + expand_search) & (lons > min_lon - expand_search))
        # No grid point inside the search window -> record a missing value.
        row_list.append(values[mask].mean() if mask.any() else np.nan)
      all_data.append(row_list)

In [None]:
# The first entry of all_data is the header row; the remaining entries are records.
header, *records = all_data
test_climate_df = pd.DataFrame(records, columns=header)

In [None]:
# Persist the test-cell HRRR features (creating the output folder if needed).
test_climate_path = os.path.join(data_dir, 'hrrr/test_climate.parquet')
os.makedirs(os.path.dirname(test_climate_path), exist_ok=True)
test_climate_df.to_parquet(test_climate_path)