In [None]:
# Mount Google Drive into the Colab runtime at /content/drive; the data paths
# used later in this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os
import json
import pandas as pd
import numpy as np
import datetime
from collections import defaultdict
import ee

## Import Base Data Files

In [None]:
# Root folder (on the mounted Drive) that holds the training inputs.
data_dir = '/content/drive/MyDrive/snocast/train/data'


def _read_ground_measures(relative_path):
  """Read a wide ground-measure CSV under data_dir and name its first column 'station_id'."""
  wide = pd.read_csv(os.path.join(data_dir, relative_path))
  wide.columns = ['station_id'] + list(wide.columns[1:])
  return wide


def _to_long_swe(wide, id_col):
  """Melt a wide table into (id_col, date, swe) rows and drop rows containing NaN.

  Every column other than id_col becomes a value of the 'date' column.
  """
  long_form = wide.melt(id_vars=[id_col], var_name="date", value_name="swe")
  return long_form.dropna()


# Ground-station measurements for the train and test files, wide and long form.
ground_measures_train = _read_ground_measures('static/ground_measures_train_features.csv')
gm_melt_train = _to_long_swe(ground_measures_train, 'station_id')

ground_measures_test = _read_ground_measures('static/ground_measures_test_features.csv')
gm_melt_test = _to_long_swe(ground_measures_test, 'station_id')

# Station metadata, plus train and test measurements joined per station
# (outer join, so stations present in only one file are kept).
ground_measures_metadata = pd.read_csv(os.path.join(data_dir, 'static/ground_measures_metadata.csv'))
ground_measures_all = ground_measures_train.merge(ground_measures_test, how='outer', on='station_id')
gm_melt_all = _to_long_swe(ground_measures_all, 'station_id')
gm_seq = gm_melt_all.merge(ground_measures_metadata, how='inner', on='station_id')

# Grid-cell SWE labels for the two label files, wide and long form.
train_labels = pd.read_csv(os.path.join(data_dir, 'static/train_labels.csv'))
labels_melt_train = _to_long_swe(train_labels, 'cell_id')

test_labels = pd.read_csv(os.path.join(data_dir, 'static/labels_2020_2021.csv'))
labels_melt_test = _to_long_swe(test_labels, 'cell_id')

In [None]:
# get latitude longitude for train and test grids
# The context manager closes the GeoJSON file handle once it has been parsed.
with open(os.path.join(data_dir, 'static/grid_cells.geojson')) as f:
  grid_cells = json.load(f)
print('length grid_cells features: ', len(grid_cells['features']))

# cell_id -> {'coordinates': polygon vertices, 'region': region name}
grid_features = defaultdict(dict)
for grid_cell in grid_cells['features']:
  cell_id = grid_cell['properties']['cell_id']
  coordinates = grid_cell['geometry']['coordinates'][0]
  region = grid_cell['properties']['region']
  # The first vertex is skipped; presumably the ring is closed, so it repeats
  # the last vertex and would bias the centroid -- TODO confirm.
  grid_features[cell_id] = {'coordinates': coordinates[1:],
                            'region': region}


def _summarize_cells(cell_ids):
  """Collect ids, centroid lat/lon, region and bounding box for each grid cell.

  Args:
    cell_ids: iterable of cell ids that must be keys of grid_features.

  Returns:
    Five parallel lists: ids, latitudes, longitudes, regions and bboxes, where
    each bbox is np.array([min_lon, min_lat, max_lon, max_lat]).
  """
  ids, lats, lons, regions, bboxes = [], [], [], [], []
  for cell_id in cell_ids:
    cell = grid_features[cell_id]
    vertices = cell['coordinates']
    lon, lat = np.mean(vertices, axis=0)
    max_lon, max_lat = np.max(vertices, axis=0)
    min_lon, min_lat = np.min(vertices, axis=0)
    ids.append(cell_id)
    lats.append(lat)
    lons.append(lon)
    # Append one region per cell. Assigning the region string to the list
    # variable instead makes pandas broadcast the last cell's region to
    # every row of the summary frame.
    regions.append(cell['region'])
    bboxes.append(np.array([min_lon, min_lat, max_lon, max_lat]))
  return ids, lats, lons, regions, bboxes


train_ids, train_lats, train_lons, train_regions, train_bboxes = _summarize_cells(
    train_labels['cell_id'].values)
test_ids, test_lats, test_lons, test_regions, test_bboxes = _summarize_cells(
    test_labels['cell_id'].values)

# Tag every labelled cell with the dataset(s) it appears in.
train_id_set = set(train_ids)
for cell_id in train_ids:
  grid_features[cell_id]['dataset'] = 'train'
for cell_id in test_ids:
  grid_features[cell_id]['dataset'] = 'both' if cell_id in train_id_set else 'test'

# Split the feature lookup by dataset. Cells that appear in neither label file
# carry no 'dataset' tag and are skipped instead of raising KeyError.
grid_features_train = defaultdict(dict)
grid_features_test = defaultdict(dict)
for cell_id, cell in grid_features.items():
  dataset = cell.get('dataset')
  if dataset in ('test', 'both'):
    grid_features_test[cell_id] = cell
  if dataset in ('train', 'both'):
    grid_features_train[cell_id] = cell
print("test count: ", len(grid_features_test))
print("train count: ", len(grid_features_train))


# One row per labelled cell: centroid, region and bounding box.
train_lat_lon = pd.DataFrame({'cell_id': train_ids,
                              'latitude': train_lats,
                              'longitude': train_lons,
                              'region': train_regions,
                              'bbox': train_bboxes})
test_lat_lon = pd.DataFrame({'cell_id': test_ids,
                             'latitude': test_lats,
                             'longitude': test_lons,
                             'region': test_regions,
                             'bbox': test_bboxes})

In [None]:
# Build the long-form ("sequential") frames: each label row gains its grid
# cell's location attributes, and each ground measurement gains its station
# metadata. Inner joins drop rows whose id is missing on either side.
train_label_seq = labels_melt_train.merge(train_lat_lon, on='cell_id', how='inner')
test_pred_seq = labels_melt_test.merge(test_lat_lon, on='cell_id', how='inner')
gm_seq = gm_melt_all.merge(ground_measures_metadata, on='station_id', how='inner')

In [None]:
# Trigger the authentication flow.
# NOTE(review): this presumably prompts for Google credentials interactively,
# so the cell cannot run unattended -- confirm for your runtime.
ee.Authenticate()

# Initialize the library.
# The Earth Engine requests issued in the cells below come after this call.
ee.Initialize()

In [None]:
# Import the MODIS Terra Snow Cover Daily Global 500m collection.
# `terra` and `aqua` are module-level names that get_modis_data reads directly.
# NOTE(review): these ids pin collection version 006; check the Earth Engine
# catalog for a newer version (e.g. 061) before re-running -- TODO confirm.
terra = ee.ImageCollection('MODIS/006/MOD10A1')

# Import the MODIS Aqua Snow Cover Daily Global 500m collection.
aqua = ee.ImageCollection('MODIS/006/MYD10A1')

In [None]:
def get_modis_data(df, df_seq, gm=False, lookback_days=15, buffer_m=500, scale_m=500):
  """Download MODIS Terra and Aqua NDSI snow-cover samples around each location.

  Relies on the module-level `terra` and `aqua` image collections and on the
  `ee` module being available in the notebook namespace.

  Args:
    df: DataFrame with one row per location. Must contain 'latitude',
      'longitude' and the id column ('station_id' if gm else 'cell_id').
    df_seq: DataFrame whose 'date' column holds '%Y-%m-%d' strings; its min
      and max define the date window that is requested.
    gm: True when df describes ground-measure stations (id column
      'station_id'); False for grid cells (id column 'cell_id').
    lookback_days: days subtracted from the earliest date so the window
      starts before the first date in df_seq.
    buffer_m: buffer distance passed to Point.buffer around each location.
    scale_m: scale passed to getRegion.

  Returns:
    (terra_df, aqua_df): DataFrames with columns 'date', 'longitude',
    'latitude', 'time', 'NDSI_Snow_Cover' and 'location_id', one row per
    record returned by getRegion.

  Raises:
    KeyError: if df lacks the id, 'latitude' or 'longitude' column.
  """
  date_format = '%Y-%m-%d'
  # Series.min/max give the same result as min/max over the unique values.
  first_date = datetime.datetime.strptime(df_seq['date'].min(), date_format)
  last_date = datetime.datetime.strptime(df_seq['date'].max(), date_format)
  min_date = (first_date - datetime.timedelta(days=lookback_days)).strftime(date_format)
  # One extra day so the last date falls inside the window (filterDate's end
  # bound is documented by Earth Engine as exclusive).
  max_date = (last_date + datetime.timedelta(days=1)).strftime(date_format)
  print(min_date, max_date)

  location_col = 'station_id' if gm else 'cell_id'
  modis_cols = [location_col, 'latitude', 'longitude']
  # Selecting the columns up front fails fast on a wrong `gm` flag, before any
  # slow remote call is made.
  unique_ids = df[modis_cols]
  print(unique_ids.shape)

  terra_snow_cover = terra.select('NDSI_Snow_Cover').filterDate(min_date, max_date)
  aqua_snow_cover = aqua.select('NDSI_Snow_Cover').filterDate(min_date, max_date)
  terra_info = terra_snow_cover.getInfo()['features']
  aqua_info = aqua_snow_cover.getInfo()['features']
  for label, info in (('Terra', terra_info), ('Aqua', aqua_info)):
    if not info:
      # An empty window would otherwise raise IndexError on info[0].
      print('{}: no images between {} and {}'.format(label, min_date, max_date))
      continue
    print('{} min date: {}'.format(label, info[0]['properties']['system:index']))
    print('{} max date: {}'.format(label, info[-1]['properties']['system:index']))

  output_cols = ['date',
                 'longitude',
                 'latitude',
                 'time',
                 'NDSI_Snow_Cover']

  terra_list = []
  aqua_list = []
  terra_ids = []
  aqua_ids = []
  sources = ((terra_snow_cover, terra_list, terra_ids),
             (aqua_snow_cover, aqua_list, aqua_ids))

  # Original author's note: runs in 4 hours.
  # Progress is reported by row position, so it also works when df has a
  # non-integer or non-sequential index.
  for position, (_, row) in enumerate(df.iterrows()):
    if position % 250 == 0:
      print(position)

    # Region of interest: the location buffered by buffer_m.
    poi = ee.Geometry.Point(row['longitude'], row['latitude'])
    roi = poi.buffer(buffer_m)

    for snow_cover, records, ids in sources:
      # The first element returned by getRegion is a header row, so skip it.
      samples = snow_cover.getRegion(roi, scale=scale_m).getInfo()[1:]
      ids.extend([row[location_col]] * len(samples))
      records.extend(samples)

  terra_df = pd.DataFrame(terra_list, columns=output_cols)
  terra_df['location_id'] = terra_ids

  aqua_df = pd.DataFrame(aqua_list, columns=output_cols)
  aqua_df['location_id'] = aqua_ids

  return terra_df, aqua_df

In [None]:
# Fetch Terra/Aqua snow-cover samples for every training grid cell; with the
# default gm=False the locations are keyed by 'cell_id'.
train_terra_df, train_aqua_df = get_modis_data(train_lat_lon, train_label_seq)

In [None]:
# Persist the training-cell MODIS samples. The output folder is created first
# so the write cannot fail on a missing directory after the long download.
modis_train_dir = '/content/drive/MyDrive/snocast/train/data/modis'
os.makedirs(modis_train_dir, exist_ok=True)
train_terra_df.to_parquet(os.path.join(modis_train_dir, 'modis_terra_train.parquet'))
train_aqua_df.to_parquet(os.path.join(modis_train_dir, 'modis_aqua_train.parquet'))

In [None]:
# Fetch Terra/Aqua snow-cover samples for every test grid cell. test_lat_lon is
# keyed by 'cell_id' and has no 'station_id' column, so gm must stay False;
# passing gm=True makes get_modis_data raise KeyError on the missing column.
test_terra_df, test_aqua_df = get_modis_data(test_lat_lon, test_pred_seq)

In [None]:
# Persist the test-cell MODIS samples. The output folder is created first so
# the write cannot fail on a missing directory after the long download.
modis_test_dir = '/content/drive/MyDrive/snocast/test/data/modis'
os.makedirs(modis_test_dir, exist_ok=True)
test_terra_df.to_parquet(os.path.join(modis_test_dir, 'modis_terra_test.parquet'))
test_aqua_df.to_parquet(os.path.join(modis_test_dir, 'modis_aqua_test.parquet'))

In [None]:
# Fetch Terra/Aqua snow-cover samples for the ground-measurement stations;
# gm=True makes get_modis_data use 'station_id' as the location column.
gm_terra_df, gm_aqua_df = get_modis_data(ground_measures_metadata, gm_seq, gm=True)

In [None]:
# Persist the ground-station MODIS samples. The output folder is created first
# so the write cannot fail on a missing directory after the long download.
modis_gm_dir = '/content/drive/MyDrive/snocast/train/data/modis'
os.makedirs(modis_gm_dir, exist_ok=True)
gm_terra_df.to_parquet(os.path.join(modis_gm_dir, 'modis_terra_gm.parquet'))
gm_aqua_df.to_parquet(os.path.join(modis_gm_dir, 'modis_aqua_gm.parquet'))