In [None]:
!pip install boto3

In [None]:
from google.colab import drive
# Mount Google Drive (Colab-only) so the /content/drive/MyDrive/... paths
# read and written by the cells below resolve.
drive.mount('/content/drive')

In [None]:
import json
import netCDF4
import pandas as pd
import numpy as np
from collections import defaultdict

In [None]:
# Land-cover NetCDF file; it is downloaded by the get_lccs_train_test.ipynb notebook.
# NOTE(review): the file name suggests the C3S LCCS 300 m yearly map for 2020
# (v2.1.1) — confirm against the download notebook.
fp='/content/drive/MyDrive/snocast/train/data/static/C3S-LC-L4-LCCS-Map-300m-P1Y-2020-v2.1.1.nc' # path to the NetCDF file on the mounted Drive
# Open the dataset. `nc` is read from again in the next cell, so it is left open here.
# NOTE(review): no nc.close() is visible anywhere in this notebook.
nc = netCDF4.Dataset(fp) # reading the nc file and creating Dataset

In [None]:
# Class-code legend for the land-cover product:
# http://maps.elie.ucl.ac.be/CCI/viewer/download/ESACCI-LC-QuickUserGuide-LC-Maps_v2-0-7.pdf
#
# The coordinate axes are copied into plain NumPy arrays; `lccs_class` is kept
# as the netCDF variable object and is only sliced later, once the region of
# interest is known.
lccs_lat, lccs_lon = (
    np.array(nc.variables[axis_name]) for axis_name in ('lat', 'lon')
)
lccs_class = nc.variables['lccs_class']

### Import Base Data Files

In [None]:
# Station metadata table. The cells below read its `station_id`, `latitude`
# and `longitude` columns.
ground_measures_metadata = pd.read_csv('/content/drive/MyDrive/snocast/train/data/static/ground_measures_metadata.csv')

In [None]:
# Bounding box that encloses every ground-measure station; used afterwards to
# cut the land-cover grid down to the region that contains stations.
all_min_lon, all_max_lon = (
    ground_measures_metadata['longitude'].min(),
    ground_measures_metadata['longitude'].max(),
)
all_min_lat, all_max_lat = (
    ground_measures_metadata['latitude'].min(),
    ground_measures_metadata['latitude'].max(),
)
# Printed in (min_lon, min_lat, max_lon, max_lat) order.
print(all_min_lon, all_min_lat, all_max_lon, all_max_lat)

In [None]:
# Trim the land-cover grid to the stations' bounding box so the per-station
# lookups only have to search the relevant region.
#
# The box is padded on every side. With a strict, unpadded comparison the
# northern-/southern-/eastern-/western-most stations lose the pixels on the
# outer side of their sampling window (the lookup loop samples +/-0.0014 deg
# around each station) and can end up with no land-cover pixels at all.
# 0.01 deg is several times that half-window; a larger box never changes which
# pixels fall inside an individual station's window.
LCCS_BBOX_PAD_DEG = 0.01

lccs_lat_values = (
    (lccs_lat < all_max_lat + LCCS_BBOX_PAD_DEG)
    & (lccs_lat > all_min_lat - LCCS_BBOX_PAD_DEG)
)
lccs_lon_values = (
    (lccs_lon < all_max_lon + LCCS_BBOX_PAD_DEG)
    & (lccs_lon > all_min_lon - LCCS_BBOX_PAD_DEG)
)

# Read only the selected rows/columns from the netCDF variable.
# NOTE(review): the leading `:` axis is assumed to be a length-1 (time)
# dimension that np.squeeze removes — confirm against the file's dimensions.
reduced_lccs = np.squeeze(lccs_class[:, lccs_lat_values, lccs_lon_values])
reduced_lat = lccs_lat[lccs_lat_values]
reduced_lon = lccs_lon[lccs_lon_values]

In [None]:
# For every station, sample the land-cover pixels in a small square window
# around it and record the most frequent classes and their share of the window.
lccs_arr = []

# Half-width (degrees) of the sampling window around each station.
# NOTE(review): presumably chosen from the grid's lat/lon resolution — confirm.
LCCS_HALF_WINDOW_DEG = 0.0014
# Number of ranked classes stored per station (keys lccs_0 .. lccs_2).
LCCS_TOP_N = 3

for idx, row in ground_measures_metadata.iterrows():
  # Lightweight progress indicator.
  if idx % 100 == 0:
    print(idx)
  lat, lon = row[['latitude','longitude']].values
  # account for lat and lon resolution
  max_lat = lat + LCCS_HALF_WINDOW_DEG
  min_lat = lat - LCCS_HALF_WINDOW_DEG
  max_lon = lon + LCCS_HALF_WINDOW_DEG
  min_lon = lon - LCCS_HALF_WINDOW_DEG

  # Row/column indices of the grid cells inside the window. Selecting with
  # np.ix_ touches only those few cells instead of building a boolean mask the
  # size of the whole reduced grid for every station; the selected values and
  # their (row-major) order are the same.
  lat_idx = np.flatnonzero((reduced_lat < max_lat) & (reduced_lat > min_lat))
  lon_idx = np.flatnonzero((reduced_lon < max_lon) & (reduced_lon > min_lon))

  arr = reduced_lccs[np.ix_(lat_idx, lon_idx)].ravel()
  lccs_cat, lccs_count = np.unique(arr, return_counts=True)
  lccs_len = len(arr)

  land_cover = {}
  land_cover['station_id'] = row['station_id']
  # Class positions ordered from most to least frequent.
  lccs_order = np.flip(np.argsort(lccs_count))
  for i in range(LCCS_TOP_N):
    if i < len(lccs_order):
      land_cover[f'lccs_{i}'] = lccs_cat[lccs_order[i]]
      land_cover[f'lccs_pct_{i}'] = lccs_count[lccs_order[i]]/lccs_len
    else:
      # Fewer distinct classes than ranks (or an empty window): fill with
      # class 0 and an undefined share.
      land_cover[f'lccs_{i}'] = 0
      land_cover[f'lccs_pct_{i}'] = np.nan

  lccs_arr.append(land_cover)

In [None]:
# Sanity check: last index value visited by the loop and the number of
# per-station records collected.
print(idx)
print(len(lccs_arr))

In [None]:
# One row per station: station_id plus the lccs_{i} / lccs_pct_{i} columns
# built in the loop above.
lccs_df = pd.DataFrame(lccs_arr)

In [None]:
# Keep just the station key and its most frequent land-cover class.
lccs_df = lccs_df.loc[:, ['station_id', 'lccs_0']]

In [None]:
# Number of stations per dominant land-cover class (displayed as the cell output).
lccs_df.groupby('lccs_0').count()

In [None]:
# Persist the station -> dominant land-cover class table next to the other static inputs.
lccs_df.to_parquet('/content/drive/MyDrive/snocast/train/data/static/gm_lccs.parquet')