# Get Elevation data from Copernicus Digital Elevation Model (DEM)
Elevation data was provided for the ground measures but not for the test and train grid cells. This notebook computes the mean and variance of elevation for each test and train grid cell from the Copernicus DEM and saves them to the data/static directory.

In [None]:
# Install the third-party packages used by this notebook (shell commands run from the notebook).
!pip install pystac_client
!pip install planetary_computer
!pip install rasterio
# NOTE(review): xarray-spatial is not imported in the cells shown here — confirm it is still needed.
!pip install xarray-spatial

In [None]:
# Mount Google Drive at /content/drive so the data paths used below are reachable.
from google.colab import drive
drive.mount('/content/drive')

In [None]:

import os
import json
import pandas as pd
import numpy as np
import planetary_computer
import xarray
import time
from collections import defaultdict
from pystac_client import Client

### Import Base Data Files

In [None]:
data_dir = '/content/drive/MyDrive/snocast/train/data'


def _read_static_csv(filename):
  """Read one CSV file from the 'static' folder under data_dir."""
  return pd.read_csv(os.path.join(data_dir, 'static/' + filename))


def _melt_swe(wide_df, id_column):
  """Reshape a wide table into long (id, date, swe) rows and drop rows with missing values."""
  long_df = wide_df.melt(id_vars=[id_column], var_name="date", value_name="swe")
  return long_df.dropna()


# Ground measures: the first column is relabeled 'station_id' in both tables.
ground_measures_train = _read_static_csv('ground_measures_train_features.csv')
ground_measures_train.columns = ['station_id', *ground_measures_train.columns[1:]]
gm_melt_train = _melt_swe(ground_measures_train, 'station_id')

ground_measures_test = _read_static_csv('ground_measures_test_features.csv')
ground_measures_test.columns = ['station_id', *ground_measures_test.columns[1:]]
gm_melt_test = _melt_swe(ground_measures_test, 'station_id')

# Combine train and test ground measures and attach the station metadata.
ground_measures_metadata = _read_static_csv('ground_measures_metadata.csv')
ground_measures_all = ground_measures_train.merge(ground_measures_test, how='outer', on='station_id')
gm_melt_all = _melt_swe(ground_measures_all, 'station_id')
gm_seq = gm_melt_all.merge(ground_measures_metadata, how='inner', on='station_id')

# Grid-cell labels, one row per cell_id with one column per date.
train_labels = _read_static_csv('train_labels.csv')
labels_melt_train = _melt_swe(train_labels, 'cell_id')

test_labels = _read_static_csv('labels_2020_2021.csv')
labels_melt_test = _melt_swe(test_labels, 'cell_id')

In [None]:
# Get latitude/longitude, region and bounding box for the train and test grid cells.
# The context manager closes the GeoJSON file handle once it has been parsed.
with open(os.path.join(data_dir, 'static/grid_cells.geojson')) as f:
  grid_cells = json.load(f)
print('length grid_cells features: ', len(grid_cells['features']))

# cell_id -> {'coordinates': ..., 'region': ...}; a 'dataset' tag is added below.
grid_features = defaultdict(dict)
for grid_cell in grid_cells['features']:
  cell_id = grid_cell['properties']['cell_id']
  coordinates = grid_cell['geometry']['coordinates'][0]
  region = grid_cell['properties']['region']
  # The first vertex is skipped; presumably the ring repeats it as its last
  # point, so the remaining vertices are the distinct corners (TODO confirm).
  grid_features[cell_id] = {'coordinates': coordinates[1:],
                            'region': region}


def _summarize_cell(cell):
  """Return (lat, lon, region, bbox) for one grid_features entry.

  lat/lon are the mean of the cell's vertices (read as (lon, lat) pairs) and
  bbox is np.array([min_lon, min_lat, max_lon, max_lat]).
  """
  coords = cell['coordinates']
  lon, lat = np.mean(coords, axis=0)
  max_lon, max_lat = np.max(coords, axis=0)
  min_lon, min_lat = np.min(coords, axis=0)
  bbox = np.array([min_lon, min_lat, max_lon, max_lat])
  return lat, lon, cell['region'], bbox


grid_features_train = defaultdict(dict)
train_ids = []
train_lats = []
train_lons = []
train_regions = []
train_bboxes = []
grid_features_test = defaultdict(dict)
test_ids = []
test_lats = []
test_lons = []
test_regions = []
test_bboxes = []


for cell_id in train_labels['cell_id'].values:
  lat, lon, region, bbox = _summarize_cell(grid_features[cell_id])
  train_ids.append(cell_id)
  train_lats.append(lat)
  train_lons.append(lon)
  # Append per cell: assigning the region here would leave a single string
  # that pandas broadcasts to every row of the DataFrame built below.
  train_regions.append(region)
  train_bboxes.append(bbox)

  grid_features[cell_id]['dataset'] = 'train'

for cell_id in test_labels['cell_id'].values:
  lat, lon, region, bbox = _summarize_cell(grid_features[cell_id])
  test_ids.append(cell_id)
  test_lats.append(lat)
  test_lons.append(lon)
  test_regions.append(region)
  test_bboxes.append(bbox)

  # Only a cell already tagged by the train loop becomes 'both'; a repeated
  # test id stays 'test'.
  in_train = grid_features[cell_id].get('dataset') in ('train', 'both')
  grid_features[cell_id]['dataset'] = 'both' if in_train else 'test'

for cell_id, features in grid_features.items():
  # .get() tolerates grid cells that appear in neither label file.
  dataset = features.get('dataset')
  if dataset in ('test', 'both'):
    grid_features_test[cell_id] = features
  if dataset in ('train', 'both'):
    grid_features_train[cell_id] = features
print("test count: ", len(grid_features_test))
print("train count: ", len(grid_features_train))


train_lat_lon = pd.DataFrame({'cell_id': train_ids,
                              'latitude': train_lats,
                              'longitude': train_lons,
                              'region': train_regions,
                              'bbox': train_bboxes})
test_lat_lon = pd.DataFrame({'cell_id': test_ids,
                             'latitude': test_lats,
                             'longitude': test_lons,
                             'region': test_regions,
                             'bbox': test_bboxes})

## Get Data for Copernicus Digital Elevation Model (DEM)

In [None]:
# Open a STAC client against the Planetary Computer catalog; it is used by
# the elevation lookup further down.
PLANETARY_COMPUTER_STAC_URL = "https://planetarycomputer.microsoft.com/api/stac/v1"
client = Client.open(PLANETARY_COMPUTER_STAC_URL, ignore_conformance=True)

In [None]:
def _load_dem_tiles(items):
  """Open each item's "data" asset, downsample it, and return (y, x, values) arrays per tile."""
  tiles = []
  for item in items:
    signed_asset = planetary_computer.sign(item.assets["data"])
    # NOTE(review): xarray.open_rasterio is deprecated in newer xarray
    # releases — confirm the installed version still provides it.
    data = (
        xarray.open_rasterio(signed_asset.href)
        .squeeze()
        .drop("band")
        .coarsen({"y": 5, "x": 5})
        .mean()
    )
    # Pull the plain arrays out once so the per-cell loop does not repeat
    # the attribute/array extraction for every row.
    tiles.append((data.y.values, data.x.values, data.values))
  return tiles


def _bbox_elevation_samples(tiles, bbox):
  """Return all tile values whose coordinates lie strictly inside bbox, as one float64 array."""
  min_lon, min_lat, max_lon, max_lat = bbox
  # Seeding with an empty float64 array keeps the result float64 and makes
  # the concatenate valid even when no tile overlaps the bbox.
  pieces = [np.array([])]
  for ys, xs, values in tiles:
    lat_mask = (ys < max_lat) & (ys > min_lat)
    if not lat_mask.any():
      continue  # tile does not overlap the cell in latitude
    lon_mask = (xs < max_lon) & (xs > min_lon)
    if not lon_mask.any():
      continue  # tile does not overlap the cell in longitude
    # Row-major selection of the (lat, lon) sub-grid; same elements and
    # order as indexing with the 2-D outer-product boolean mask.
    pieces.append(values[np.ix_(lat_mask, lon_mask)].ravel())
  return np.concatenate(pieces)


def get_elevations(df):
  """Compute the mean and variance of DEM elevation for each row's bounding box.

  Searches the module-level STAC ``client`` for "cop-dem-glo-30" items that
  intersect the bounding box of all latitude/longitude values in ``df``,
  downsamples every tile by averaging 5x5 windows, and pools, per row, the
  downsampled values whose coordinates fall strictly inside the row's bbox.

  Args:
    df: DataFrame with ``latitude`` and ``longitude`` columns and a ``bbox``
      column holding ``[min_lon, min_lat, max_lon, max_lat]``.

  Returns:
    A tuple ``(mean_elevations, var_elevations)`` of lists with one entry per
    row of ``df`` in iteration order. A row whose bbox contains no DEM
    samples gets NaN for both values.
  """
  all_bbox = [df.longitude.min(), df.latitude.min(),
              df.longitude.max(), df.latitude.max()]

  # Get all relevant items within the lat/lon bounds of the df
  search = client.search(
      collections=["cop-dem-glo-30"],
      bbox=all_bbox,
  )

  # NOTE(review): newer pystac_client releases expose search.items();
  # confirm which method the installed version supports.
  items = list(search.get_items())
  if len(items) > 1:
    print(f"Returned {len(items)} items")

  # Ran in 30 min. for 295 items
  tiles = _load_dem_tiles(items)

  mean_elevations = []
  var_elevations = []

  for idx, row in df.iterrows():
    if idx % 100 == 0:
      print(idx)

    sample_elevations = _bbox_elevation_samples(tiles, row['bbox'])
    if sample_elevations.size:
      mean_elevations.append(sample_elevations.mean())
      var_elevations.append(sample_elevations.var())
    else:
      # Same NaN the empty-array mean/var would give, without the warning.
      mean_elevations.append(np.nan)
      var_elevations.append(np.nan)

  return mean_elevations, var_elevations


In [None]:
# Compute the DEM statistics for the test grid cells and attach them as columns.
test_mean_elevations, test_var_elevations = get_elevations(test_lat_lon)
for column_name, column_values in (('elevation_m', test_mean_elevations),
                                   ('elevation_var_m', test_var_elevations)):
  test_lat_lon[column_name] = column_values

In [None]:
# Keep only the columns to persist (the per-cell 'bbox' arrays are dropped)
# and write them to the static data folder. The path is built from data_dir,
# like the reads, so the output location follows any change to data_dir.
test_lat_lon = test_lat_lon[['cell_id', 'latitude', 'longitude', 'region', 'elevation_m', 'elevation_var_m']]
test_lat_lon.to_parquet(os.path.join(data_dir, 'static/test_elevation.parquet'))

In [None]:
# Compute the DEM statistics for the train grid cells and attach them as columns.
train_mean_elevations, train_var_elevations = get_elevations(train_lat_lon)
for column_name, column_values in (('elevation_m', train_mean_elevations),
                                   ('elevation_var_m', train_var_elevations)):
  train_lat_lon[column_name] = column_values

In [None]:
# Keep only the columns to persist (the per-cell 'bbox' arrays are dropped)
# and write them to the static data folder. The path is built from data_dir,
# like the reads, so the output location follows any change to data_dir.
train_lat_lon = train_lat_lon[['cell_id', 'latitude', 'longitude', 'region', 'elevation_m', 'elevation_var_m']]
train_lat_lon.to_parquet(os.path.join(data_dir, 'static/train_elevation.parquet'))

In [None]:
# Display five randomly sampled rows of the train table as a quick visual check.
train_lat_lon.sample(5)