In [None]:
# Install boto3, which the import cell and the S3 download cells below rely on.
!pip install boto3

In [None]:
# Mount Google Drive at /content/drive; later cells read from and write to
# /content/drive/MyDrive/snocast/train/data.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os
import json
import pandas as pd
import numpy as np
from collections import defaultdict
import boto3
import netCDF4

In [None]:
# Bucket that holds the land cover archive downloaded in the next cell.
BUCKET_NAME = 'drivendata-public-assets'

# Credentials come from the environment instead of being typed into the
# notebook, so they are never saved alongside it. Set AWS_ACCESS_KEY_ID and
# AWS_SECRET_ACCESS_KEY before running this cell; when they are unset, None is
# passed and boto3 falls back to its default credential lookup.
s3 = boto3.resource('s3',
                    aws_access_key_id=os.environ.get('AWS_ACCESS_KEY_ID'),
                    aws_secret_access_key=os.environ.get('AWS_SECRET_ACCESS_KEY'))

In [None]:
# Imported here because the import cell only brings in boto3, while the
# except clause below needs botocore's ClientError to be resolvable.
from botocore.exceptions import ClientError

# Object key of the land cover archive inside BUCKET_NAME.
KEY = 'land_cover_map.tar.gz'

try:
  # The archive is saved under the same name as its key, relative to the
  # current working directory.
  s3.Bucket(BUCKET_NAME).download_file(KEY, KEY)
except ClientError as e:
  # A missing object gets a readable message; any other client error
  # (credentials, permissions, ...) is re-raised unchanged.
  if e.response['Error']['Code'] == "404":
    print("The object does not exist.")
  else:
    raise

In [None]:
# Unpack the downloaded archive. The download cell saves it relative to the
# working directory, so this path assumes that directory is /content.
!tar -xf /content/land_cover_map.tar.gz

In [None]:
# Copy the NetCDF file (presumably produced by the tar extraction above) into
# the project's static data folder on Drive, where the next cell opens it.
!cp /content/C3S-LC-L4-LCCS-Map-300m-P1Y-2020-v2.1.1.nc /content/drive/MyDrive/snocast/train/data/static/C3S-LC-L4-LCCS-Map-300m-P1Y-2020-v2.1.1.nc

In [None]:
# Path of the land cover NetCDF file copied to Drive in the previous cell.
fp='/content/drive/MyDrive/snocast/train/data/static/C3S-LC-L4-LCCS-Map-300m-P1Y-2020-v2.1.1.nc'
# Open the file as a netCDF4 Dataset. It is deliberately left open: later
# cells keep reading from `nc.variables` (get_lccs slices `lccs_class`).
nc = netCDF4.Dataset(fp)

In [None]:
# User guide for the land cover maps (linked here as the reference for the class values):
# http://maps.elie.ucl.ac.be/CCI/viewer/download/ESACCI-LC-QuickUserGuide-LC-Maps_v2-0-7.pdf
# `lccs_class` stays a netCDF4 variable and is only sliced inside get_lccs;
# the latitude/longitude coordinates are copied into NumPy arrays up front.
lccs_class = nc.variables['lccs_class']
lccs_lat = np.array(nc.variables['lat'])
lccs_lon = np.array(nc.variables['lon'])

### Import Base Data Files

In [None]:
# Root folder of the project's data on the mounted Drive.
data_dir = '/content/drive/MyDrive/snocast/train/data'


def _read_csv(relative_path):
  """Read a CSV located at `relative_path` under `data_dir`."""
  return pd.read_csv(os.path.join(data_dir, relative_path))


def _read_ground_measures(relative_path):
  """Read a ground-measure CSV and name its first column `station_id`."""
  wide = _read_csv(relative_path)
  wide.columns = ['station_id', *wide.columns[1:]]
  return wide


def _to_long(wide, id_col):
  """Melt `wide` into (id_col, date, swe) rows and drop rows with missing values.

  Every column other than `id_col` becomes a value of the `date` column.
  """
  long_form = wide.melt(id_vars=[id_col], var_name="date", value_name="swe")
  return long_form.dropna()


# Ground measurements: wide frames plus their long-form counterparts.
ground_measures_train = _read_ground_measures('static/ground_measures_train_features.csv')
gm_melt_train = _to_long(ground_measures_train, "station_id")

ground_measures_test = _read_ground_measures('static/ground_measures_test_features.csv')
gm_melt_test = _to_long(ground_measures_test, "station_id")

# Train and test measurements side by side per station, then joined in long
# form with the station metadata.
ground_measures_metadata = _read_csv('static/ground_measures_metadata.csv')
ground_measures_all = pd.merge(ground_measures_train, ground_measures_test,
                               how='outer', on='station_id')
gm_melt_all = _to_long(ground_measures_all, "station_id")
gm_seq = pd.merge(gm_melt_all, ground_measures_metadata, how='inner', on='station_id')

# Grid-cell labels for the train and test periods.
train_labels = _read_csv('static/train_labels.csv')
labels_melt_train = _to_long(train_labels, "cell_id")

test_labels = _read_csv('static/labels_2020_2021.csv')
labels_melt_test = _to_long(test_labels, "cell_id")

In [None]:
# get latitude longitude for train and test grids
# The context manager closes the GeoJSON file as soon as it has been parsed.
with open(os.path.join(data_dir, 'static/grid_cells.geojson')) as f:
  grid_cells = json.load(f)
print('length grid_cells features: ', len(grid_cells['features']))

# cell_id -> first polygon ring without its first vertex, region, raw geometry
grid_features = defaultdict(dict)
for grid_cell in grid_cells['features']:
  cell_id = grid_cell['properties']['cell_id']
  coordinates = grid_cell['geometry']['coordinates'][0]
  region = grid_cell['properties']['region']
  grid_features[cell_id] = {'coordinates': coordinates[1:],
                            'region': region,
                            'geometry': grid_cell['geometry']}


def _describe_cells(cell_ids):
  """Collect per-cell location data for `cell_ids`, in the given order.

  Args:
    cell_ids: Iterable of cell ids that are keys of `grid_features`.

  Returns:
    Five parallel lists: ids, mean latitudes, mean longitudes, regions and
    bounding boxes (`np.array([min_lon, min_lat, max_lon, max_lat])`).

  Raises:
    KeyError: If a cell id has no geometry in `grid_features`.
  """
  ids, lats, lons, regions, bboxes = [], [], [], [], []
  for cell_id in cell_ids:
    features = grid_features[cell_id]
    ring = features['geometry']['coordinates'][0]
    # NOTE(review): the mean is taken over the full ring. If the ring repeats
    # its first vertex at the end (as closed GeoJSON rings do), that corner is
    # weighted twice. Kept as in the original; confirm whether the
    # 'coordinates' entry (first vertex dropped) was meant to be used instead.
    lon, lat = np.mean(ring, axis=0)
    max_lon, max_lat = np.max(ring, axis=0)
    min_lon, min_lat = np.min(ring, axis=0)
    ids.append(cell_id)
    lats.append(lat)
    lons.append(lon)
    # One region per cell, so the list lines up with the other columns.
    regions.append(features['region'])
    bboxes.append(np.array([min_lon, min_lat, max_lon, max_lat]))
  return ids, lats, lons, regions, bboxes


(train_ids, train_lats, train_lons,
 train_regions, train_bboxes) = _describe_cells(train_labels['cell_id'].values)
(test_ids, test_lats, test_lons,
 test_regions, test_bboxes) = _describe_cells(test_labels['cell_id'].values)

# Tag each labelled cell with the label file(s) it appears in.
for cell_id in train_ids:
  grid_features[cell_id]['dataset'] = 'train'

for cell_id in test_ids:
  # Only a cell already tagged from the train labels becomes 'both'; a test id
  # that merely occurs twice stays 'test'.
  if grid_features[cell_id].get('dataset') in ('train', 'both'):
    grid_features[cell_id]['dataset'] = 'both'
  else:
    grid_features[cell_id]['dataset'] = 'test'

grid_features_train = defaultdict(dict)
grid_features_test = defaultdict(dict)
for cell_id, features in grid_features.items():
  # Cells that appear in neither label file carry no 'dataset' tag; .get()
  # skips them instead of raising KeyError.
  dataset = features.get('dataset')
  if dataset in ('test', 'both'):
    grid_features_test[cell_id] = features
  if dataset in ('train', 'both'):
    grid_features_train[cell_id] = features
print("test count: ", len(grid_features_test))
print("train count: ", len(grid_features_train))


train_lat_lon = pd.DataFrame({'cell_id': train_ids,
                              'latitude': train_lats,
                              'longitude': train_lons,
                              'region': train_regions,
                              'bbox': train_bboxes})
test_lat_lon = pd.DataFrame({'cell_id': test_ids,
                             'latitude': test_lats,
                             'longitude': test_lons,
                             'region': test_regions,
                             'bbox': test_bboxes})

In [None]:
def get_lccs(df, class_grid=None, grid_lat=None, grid_lon=None):
  """Summarise the land cover classes found inside each cell's bounding box.

  For every row of `df`, the grid points whose coordinates lie strictly inside
  the row's `bbox` are collected, and the three most frequent class values are
  reported together with the fraction of those points each one covers.

  Args:
    df: DataFrame with a `cell_id` column and a `bbox` column holding
      `[min_lon, min_lat, max_lon, max_lat]` for each row.
    class_grid: Class values indexed as `[time, lat, lon]` with exactly one
      time step. Defaults to the notebook-level `lccs_class`.
    grid_lat: 1-D latitude coordinate of `class_grid`. Defaults to `lccs_lat`.
    grid_lon: 1-D longitude coordinate of `class_grid`. Defaults to `lccs_lon`.

  Returns:
    A list with one dict per row of `df`, holding `location_id` plus
    `lccs_{i}` and `lccs_pct_{i}` for i in 0..2. Ranks for which no class
    exists are filled with 0 and NaN respectively.

  Raises:
    ValueError: If the first axis of `class_grid` does not have length 1.
  """
  if class_grid is None:
    class_grid = lccs_class
  if grid_lat is None:
    grid_lat = lccs_lat
  if grid_lon is None:
    grid_lon = lccs_lon
  grid_lat = np.asarray(grid_lat)
  grid_lon = np.asarray(grid_lon)

  if len(df) == 0:
    return []

  # Overall extent taken from the bounding boxes themselves. Using centroid
  # extremes here would cut off the outer part of the cells on the edge of the
  # extent before they are ever inspected.
  bboxes = np.array(df['bbox'].tolist(), dtype=float)
  all_min_lon, all_min_lat = bboxes[:, :2].min(axis=0)
  all_max_lon, all_max_lat = bboxes[:, 2:].max(axis=0)

  # Trim to only relevant lat lon. A contiguous window is read via slices,
  # which behave the same for netCDF4 variables and NumPy arrays.
  lat_idx = np.flatnonzero((grid_lat >= all_min_lat) & (grid_lat <= all_max_lat))
  lon_idx = np.flatnonzero((grid_lon >= all_min_lon) & (grid_lon <= all_max_lon))
  lat_window = slice(int(lat_idx[0]), int(lat_idx[-1]) + 1) if lat_idx.size else slice(0, 0)
  lon_window = slice(int(lon_idx[0]), int(lon_idx[-1]) + 1) if lon_idx.size else slice(0, 0)

  # Drop only the time axis so a window that is a single row or column of
  # grid points keeps its 2-D shape.
  reduced_lccs = np.squeeze(class_grid[:, lat_window, lon_window], axis=0)
  reduced_lat = grid_lat[lat_window]
  reduced_lon = grid_lon[lon_window]

  lccs_arr = []

  for position, (_, row) in enumerate(df.iterrows()):
    # Progress output; counts rows so it does not depend on the index type.
    if position % 100 == 0:
      print(position)
    min_lon, min_lat, max_lon, max_lat = row['bbox']

    lat_values = (reduced_lat < max_lat) & (reduced_lat > min_lat)
    lon_values = (reduced_lon < max_lon) & (reduced_lon > min_lon)
    # Outer AND of the two 1-D selections gives the 2-D mask of the cell.
    mask = lon_values[np.newaxis, :] & lat_values[:, np.newaxis]

    arr = reduced_lccs[mask]
    lccs_cat, lccs_count = np.unique(arr, return_counts=True)
    lccs_len = len(arr)

    land_cover = {}
    land_cover['location_id'] = row['cell_id']
    # Class indices ordered from most to least frequent.
    lccs_order = np.flip(np.argsort(lccs_count))
    for i in range(3):
      if i+1 <= len(lccs_order):
        land_cover[f'lccs_{i}'] = lccs_cat[lccs_order[i]]
        land_cover[f'lccs_pct_{i}'] = lccs_count[lccs_order[i]]/lccs_len
      else:
        land_cover[f'lccs_{i}'] = 0
        land_cover[f'lccs_pct_{i}'] = np.nan

    lccs_arr.append(land_cover)

  return lccs_arr

In [None]:
# Land cover summary for every test cell (get_lccs prints progress every 100 rows).
test_lccs_arr = get_lccs(test_lat_lon)

In [None]:
# One row per test cell, one column per key of the dicts returned by get_lccs.
test_lccs_df = pd.DataFrame(test_lccs_arr)

In [None]:
# Quick check: display (rows, columns) of the test table.
test_lccs_df.shape

In [None]:
# Persist the test land cover table as Parquet in the static data folder on Drive.
test_lccs_df.to_parquet('/content/drive/MyDrive/snocast/train/data/static/test_lccs.parquet')

In [None]:
# Land cover summary for every train cell (get_lccs prints progress every 100 rows).
train_lccs_arr = get_lccs(train_lat_lon)

In [None]:
# One row per train cell, one column per key of the dicts returned by get_lccs.
train_lccs_df = pd.DataFrame(train_lccs_arr)

In [None]:
# Persist the train land cover table as Parquet in the static data folder on Drive.
train_lccs_df.to_parquet('/content/drive/MyDrive/snocast/train/data/static/train_lccs.parquet')