In [1]:
## Script to generate train_label_data.csv (from Snowcast and other ASO, and SNOTEL/CDEC data)

In [2]:
## Setup Python Environment

import sys
import os
import geojson
import numpy as np
import scipy.io as sio
from osgeo import gdal
import subprocess
from datetime import datetime, timedelta
from osgeo import ogr

# To make gdal ulility programs (which are used below) work properly, you need to have them 
# on the system path and set the PROJ_LIB and GDAL_DATA environment variables. If they are not, 
# then you can do this here.  This code works for Anaconda (Windows), but is not needed if GDAL 
# is already set up.

# pypath = os.path.dirname(sys.executable)
# sys.path.append(pypath + '/Library/bin')
# os.environ['PROJ_LIB'] = pypath + '/Library/share/proj'
# os.environ['GDAL_DATA'] = pypath + '/Library/share'

In [3]:
## Get Metadata for the Evaluation Stage Grid Cells

with open('Data/Snowcast Evaluation/grid_cells.geojson') as f:
    gj = geojson.load(f)
features = gj['features']
evaluation_cell_ids = []
evaluation_coordinates = []
for feature in features:
    evaluation_cell_ids.append(feature['properties']['cell_id'])
    evaluation_coordinates.append(feature['geometry']['coordinates'])

evaluation_cell_ids = np.array(evaluation_cell_ids)  # So we can do numpy stuff to it

In [4]:
## Get Metadata for the Development Stage Grid Cells

with open('Data/Snowcast Development/grid_cells.geojson') as f:
    gj = geojson.load(f)
features = gj['features']
development_cell_ids = []
development_coordinates = []
for feature in features:
    development_cell_ids.append(feature['properties']['cell_id'])
    development_coordinates.append(feature['geometry']['coordinates'])

development_cell_ids = np.array(development_cell_ids)  # So we can do numpy stuff to it

In [5]:
## Get the Training Label Data for Development Stage (Training Dataset)

# Read the file header (to get timetamp information)
with open('Data/Snowcast Development/train_labels.csv') as f:
    tline = f.readline().replace('\n','')
    header = tline.split(',')

times = np.array(header[1:])
num_times = len(times)
num_lines = len(development_cell_ids)

# Read in the data
development_SWE = np.ones([num_lines, num_times]) * np.nan
with open('Data/Snowcast Development/train_labels.csv') as f:
    tline = f.readline()
    tline = f.readline()
    while not (tline == ''):
        fields = tline.split(',')
        id = fields[0]
        loc = development_cell_ids == id
        for d in range(len(fields)-1):
            if not (fields[d+1] == '') and len(fields[d+1]) > 1:
                development_SWE[loc,d] = float(fields[d+1])
                
        tline = f.readline()


In [6]:
## Get the Training Label Data for Development Stage (2020-2021 Dataset)

# Read the file header (to get timetamp information)
with open('Data/Snowcast Evaluation/labels_2020_2021.csv') as f:
    tline = f.readline().replace('\n','')
    header = tline.split(',')
    
times_2021 = np.array(header[1:])
num_times_2021 = len(times_2021)
num_lines = len(development_cell_ids)

# Read in the data
development_SWE_2021 = np.ones([num_lines, num_times_2021]) * np.nan
with open('Data/Snowcast Evaluation/labels_2020_2021.csv') as f:
    tline = f.readline()
    tline = f.readline()
    while not (tline == ''):
        fields = tline.split(',')
        id = fields[0]
        loc = development_cell_ids == id
        for d in range(len(fields)-1):
            if not (fields[d+1] == '') and len(fields[d+1]) > 1:
                development_SWE_2021[loc,d] = float(fields[d+1])
                
        tline = f.readline()
        

In [7]:
## Arrange data from the development stage into the new grid cells

# Concatenate the times and SWE data together
times_all = np.concatenate((times, times_2021))
development_SWE_all = np.concatenate((development_SWE, development_SWE_2021), axis=1)

# Use the ids to match the grid cells together
SWE_snowcast = np.ones([len(evaluation_cell_ids), len(times_all)]) * np.nan

c = 0
for evaluation_cell_id in evaluation_cell_ids:
    loc = development_cell_ids == evaluation_cell_id
    if np.any(loc):
        SWE_snowcast[c,:] = development_SWE_all[loc,:]
    c = c+1
    

In [8]:
## Create a database showing rasterized locations of all grid cells within 'ASO' domains for CO and CA
## This step can take up to an hour, but results are saved to file so it only needs to be done once
## Note that file is already saved

CA_box_te = '-120.5 36 -118 38.5'
CO_box_te = '-108.55 37 -105.75 39.7'

if not os.path.exists('Data/ASO/ProcessedLocations.mat'):
    print('Creating Locaton Maps')

    # Write Modified geojson with list of file locations for each cell (so we can burn them on a raster (next step))
    with open('Data/Snowcast Evaluation/grid_cells.geojson') as f:
        gj = geojson.load(f)

    features = gj['features']
    development_cell_ids = []
    development_coordinates = []
    for c in range(len(features)):
        features[c]['properties']['file_loc'] = c

    gj['features'] = features
    with open('tmp.geojson', 'w') as outfile:
        geojson.dump(gj, outfile)

    cmd = 'gdal_rasterize -a file_loc -a_srs "EPSG:4326" -te ' + CA_box_te + ' -tr 0.00042 0.00042 tmp.geojson tmp_CA_aso_locs.tif'  
    print(cmd)
    subprocess.run(cmd)
    src = gdal.Open('tmp_CA_aso_locs.tif')
    CA_box_locs = src.ReadAsArray()
    src = None

    cmd = 'gdal_rasterize -a file_loc -a_srs "EPSG:4326" -te ' + CO_box_te + ' -tr 0.00042 0.00042 tmp.geojson tmp_CO_aso_locs.tif'  
    subprocess.run(cmd)
    src = gdal.Open('tmp_CO_aso_locs.tif')
    CO_box_locs = src.ReadAsArray()
    src = None

    print('Reading in locations of each grid cell on these grids (so we can reference them later)')
    ca_locs = []
    co_locs = []
    for i in range(len(evaluation_cell_ids)):
        ca_locs.append(np.where(CA_box_locs == i))
        co_locs.append(np.where(CO_box_locs == i))
       
    # Write to file (so we only need to do this once)
    mdict = {}
    mdict['ca_locs'] = ca_locs
    mdict['co_locs'] = co_locs
    
    os.remove('tmp.geojson')
    os.remove('tmp_CA_aso_locs.tif')
    os.remove('tmp_CO_aso_locs.tif')
    
    sio.savemat('Data/ASO/ProcessedLocations.mat', mdict)
else:
    mdict = sio.loadmat('Data/ASO/ProcessedLocations.mat',simplify_cells=True)
    ca_locs = mdict['ca_locs']
    co_locs = mdict['co_locs']
    

In [13]:
## Extract the ASO data using the lookup structure created above

SWE_aso = np.ones(SWE_snowcast.shape) * np.nan

# Get a listing of all ASO files (which are referenced using a list of virtual rasters)
dir_list = os.listdir('Data/ASO/vrt')
# Loop through all of the times
for t in range(len(times_all)):
    yyyy, mm, dd = times_all[t].split('-')
    found = 0;
    fname = yyyy + '_' + mm + '_' + dd + '.vrt'
    for file in dir_list:
        if file == fname:
            found = 1
            
    # If a vrt file is found for that date, try to extract pixels for both the CA and CO domains

    if found == 1:
        print('Getting ASO Data for ' + times_all[t])
        # Expand to cover the entire CA domain
        cmd = 'gdalwarp -overwrite -t_srs "EPSG:4326" -te ' + CA_box_te + ' -tr 0.00042 0.00042 "Data/ASO/vrt/' + fname + '" tmp_CA.tif'
        subprocess.run(cmd)
        src = gdal.Open('tmp_CA.tif')
        data = src.ReadAsArray()
        src = None
        os.remove('tmp_CA.tif')
        # Use the lookup structure to query relavent cells, and if they are at least half covered 
        # by actual data, then get data for those cells
        for i in range(len(evaluation_cell_ids)):
            if len(ca_locs[i][0]) > 0:
                data_sub = data[tuple(ca_locs[i])]
                data_sub[data_sub < 0] = np.nan
                if np.sum(~np.isnan(data_sub)) > data_sub.size/2:
                    SWE_aso[i,t] = np.nanmean(data_sub) * 39.37    # Convert to inches   
       
        # Expand to cover the entire CO domain
        cmd = 'gdalwarp -overwrite -t_srs "EPSG:4326" -te ' + CO_box_te + ' -tr 0.00042 0.00042 "Data/ASO/vrt/' + fname + '" tmp_CO.tif'
        subprocess.run(cmd)
        src = gdal.Open('tmp_CO.tif')
        data = src.ReadAsArray()
        src = None
        os.remove('tmp_CO.tif')
        # Use the lookup structure to query relavent cells, and if they are at least half covered 
        # by actual data, then get data for those cells
        for i in range(len(evaluation_cell_ids)):
            if len(co_locs[i][0]) > 0:
                data_sub = data[tuple(co_locs[i])]
                data_sub[data_sub < 0] = np.nan
                if np.sum(~np.isnan(data_sub)) > data_sub.size/2:
                    SWE_aso[i,t] = np.nanmean(data_sub) * 39.37    # Convert to inches 


Getting ASO Data for 2013-04-03
Getting ASO Data for 2013-04-29
Getting ASO Data for 2013-05-03
Getting ASO Data for 2013-05-25
Getting ASO Data for 2013-06-01
Getting ASO Data for 2013-06-08
Getting ASO Data for 2014-03-25
Getting ASO Data for 2014-04-08
Getting ASO Data for 2014-04-15
Getting ASO Data for 2014-04-22
Getting ASO Data for 2014-04-29
Getting ASO Data for 2014-05-06
Getting ASO Data for 2015-02-17
Getting ASO Data for 2015-03-03
Getting ASO Data for 2015-03-24
Getting ASO Data for 2015-03-31
Getting ASO Data for 2015-04-07
Getting ASO Data for 2016-02-08
Getting ASO Data for 2016-03-26
Getting ASO Data for 2016-03-29
Getting ASO Data for 2016-04-01
Getting ASO Data for 2016-04-03
Getting ASO Data for 2016-04-04
Getting ASO Data for 2016-04-07
Getting ASO Data for 2016-04-16
Getting ASO Data for 2016-04-26
Getting ASO Data for 2016-05-09
Getting ASO Data for 2016-05-27
Getting ASO Data for 2016-06-07
Getting ASO Data for 2016-06-14
Getting ASO Data for 2016-06-21
Getting 

In [10]:
## Extract the SNOTEL/CDEC Data

SWE_snotel = np.ones(SWE_snowcast.shape) * np.nan

## Define date vector for SNOTEL data

# Function to generate date vector between two dates
def date_range(start, end):
    delta = end - start  # as timedelta
    days = [start + timedelta(days=i) for i in range(delta.days + 1)]
    return days

# All snotel data is organized into tables that start on 10/1/2010 and end on 9/30/2019 
start_date = datetime(2010, 10, 1)
end_date = datetime(2021, 9, 30)
snotel_ts = date_range(start_date, end_date)

# Convert date vecotr into a vector of formatted dates (to match the csv files)
snotel_dates = []
for date in snotel_ts:
    snotel_dates.append(date.strftime('%Y-%m-%d'))
snotel_dates = np.array(snotel_dates)
    
## Get locations and names of all available SNOTEL data

# Get information about SNOTELs from a shapefile
all_snotel_lats = []
all_snotel_lons = []
all_snotel_names = []
src = ogr.Open('Data/SNOTEL/Locations.shp')
layer = src.GetLayer(0)
for feature in layer:
    all_snotel_lats.append(feature.GetField('Latitude'))
    all_snotel_lons.append(feature.GetField('Longitude'))
    all_snotel_names.append(feature.GetField('Name'))
src = None
all_snotel_lats = np.array(all_snotel_lats)
all_snotel_lons = np.array(all_snotel_lons)

# List of cells that are probable snotels (> 50 measurements)
snotel_locs = np.sum(~np.isnan(SWE_snowcast),axis=1)>50
dir_list = os.listdir('Data/SNOTEL/Data')
# Loop through all grid cells
for i in range(len(evaluation_cell_ids)):
    
    # But only consider those that are snotel grids
    if snotel_locs[i] == True:
       
        # Get the centroid of each cells, and look for closest snotel
        X_center = (evaluation_coordinates[i][0][0][0] + evaluation_coordinates[i][0][2][0]) / 2
        Y_center = (evaluation_coordinates[i][0][0][1] + evaluation_coordinates[i][0][1][1]) / 2
        dist = np.sqrt((all_snotel_lats-Y_center)**2 + (all_snotel_lons-X_center)**2)
        
        # If a very close one is found, then read the data
        if np.min(dist) < 0.01:
            snotel_name = all_snotel_names[np.where(dist == np.min(dist))[0][0]]
            for fname in dir_list:
                if snotel_name in fname:
                    print('Getting SNOTEL data from ' + fname)
                    swe = np.genfromtxt('Data/SNOTEL/Data/' + fname, delimiter=',',skip_header=5)[:,3]
                    
                    # Extract data for the relavent times
                    for d in range(len(times_all)):
                        tloc = snotel_dates == times_all[d]
                        SWE_snotel[i,d] = swe[tloc]/25.4

                        

Getting SNOTEL data from STM-Stouts Meadow.csv
Getting SNOTEL data from GOL-Gold Lake.csv
Getting SNOTEL data from 574-LEAVITT LAKE.csv
Getting SNOTEL data from 931-BIG GOOSE.csv
Getting SNOTEL data from 408-COLUMBINE.csv
Getting SNOTEL data from 409-COLUMBINE PASS.csv
Getting SNOTEL data from CBT-Crabtree Meadow.csv
Getting SNOTEL data from 753-SHORT CREEK.csv
Getting SNOTEL data from 843-VALLECITO.csv
Getting SNOTEL data from 538-IDARADO.csv
Getting SNOTEL data from 1137-VACARRO SPRING.csv
Getting SNOTEL data from 517-HAYDEN FORK.csv
Getting SNOTEL data from 599-LOST HORSE.csv
Getting SNOTEL data from 800-SUMMER RIM.csv
Getting SNOTEL data from 400-CLEAR CREEK #2.csv
Getting SNOTEL data from 944-GUNSIGHT PASS.csv
Getting SNOTEL data from 705-PROMONTORY.csv
Getting SNOTEL data from 932-POORMAN CREEK.csv
Getting SNOTEL data from 382-CAMAS CREEK DIVIDE.csv
Getting SNOTEL data from 774-SOUTH MTN..csv
Getting SNOTEL data from 323-BEAR MOUNTAIN.csv
Getting SNOTEL data from 527-HOLE-IN-MOUN

Getting SNOTEL data from 776-SPENCER MEADOW.csv
Getting SNOTEL data from 695-PINE CREEK PASS.csv
Getting SNOTEL data from 836-TWIN LAKES.csv
Getting SNOTEL data from 435-DANIELS-STRAWBERRY.csv
Getting SNOTEL data from 359-BOSTETTER R.S..csv
Getting SNOTEL data from 469-EMERY CREEK.csv
Getting SNOTEL data from KIB-Lower Kibbie Ridge.csv
Getting SNOTEL data from 378-BURRO MOUNTAIN.csv
Getting SNOTEL data from 518-HEAVENLY VALLEY.csv
Getting SNOTEL data from 979-VAN WYCK.csv
Getting SNOTEL data from 1151-GEORGE CREEK.csv
Getting SNOTEL data from 1056-LIGHTNING RIDGE.csv
Getting SNOTEL data from 515-HARTS PASS.csv
Getting SNOTEL data from 307-BADGER PASS.csv
Getting SNOTEL data from 343-BIGELOW CAMP.csv
Getting SNOTEL data from 405-COLD SPRINGS.csv
Getting SNOTEL data from 406-COLD SPRINGS CAMP.csv
Getting SNOTEL data from FOR-Four Trees.csv
Getting SNOTEL data from 351-BLAZED ALDER.csv
Getting SNOTEL data from 362-BOWMAN SPRINGS.csv
Getting SNOTEL data from 934-TOLBY.csv
Getting SNOTEL da

In [14]:
## Put All of the data together

SWE_final = np.ones(SWE_snowcast.shape) * np.nan

## For non-SNOTEL grid cells, primarily rely on external ASO data

non_snotel_locs = np.sum(~np.isnan(SWE_snowcast),axis=1) < 50
SWE_final[non_snotel_locs,:] = SWE_aso[non_snotel_locs,:]

# This date has bad data though, so disregard
SWE_final[non_snotel_locs,253] = np.nan

# Where there isn't any other ASO data, but the Snowcast files have non-zero SWE, use the Snowcast data
non_snotel_locs = np.tile(np.reshape(non_snotel_locs, (-1, len(non_snotel_locs))).T,[1, 324])
locs = non_snotel_locs * np.isnan(SWE_final) * (SWE_snowcast > 0)
SWE_final[locs] = SWE_snowcast[locs]

## For SNOTEL grid cells, primarily rely on external SNOTEL data

snotel_locs = np.where(np.sum(~np.isnan(SWE_snowcast),axis=1) > 50)
SWE_final[snotel_locs,:] = SWE_snotel[snotel_locs,:]

# From manual inspection, it was determined that the Snowcast data was better for the following locations
# (either more complete record, does not show any artifacts, or where external SNOTEL data is missing)
SWE_final[snotel_locs[0][4],:] = SWE_snowcast[snotel_locs[0][4],:]
SWE_final[snotel_locs[0][43],:] = SWE_snowcast[snotel_locs[0][43],:]
SWE_final[snotel_locs[0][52],:] = SWE_snowcast[snotel_locs[0][52],:]
SWE_final[snotel_locs[0][59],:] = SWE_snowcast[snotel_locs[0][59],:]
SWE_final[snotel_locs[0][76],:] = SWE_snowcast[snotel_locs[0][76],:]
SWE_final[snotel_locs[0][80],:] = SWE_snowcast[snotel_locs[0][80],:]
SWE_final[snotel_locs[0][116],:] = SWE_snowcast[snotel_locs[0][116],:]
SWE_final[snotel_locs[0][126],:] = SWE_snowcast[snotel_locs[0][126],:]
SWE_final[snotel_locs[0][129],:] = SWE_snowcast[snotel_locs[0][129],:]
SWE_final[snotel_locs[0][137],:] = SWE_snowcast[snotel_locs[0][137],:]
SWE_final[snotel_locs[0][142],:] = SWE_snowcast[snotel_locs[0][142],:]
SWE_final[snotel_locs[0][161],:] = SWE_snowcast[snotel_locs[0][161],:]
SWE_final[snotel_locs[0][165],:] = SWE_snowcast[snotel_locs[0][165],:]
SWE_final[snotel_locs[0][184],:] = SWE_snowcast[snotel_locs[0][184],:]
SWE_final[snotel_locs[0][186],:] = SWE_snowcast[snotel_locs[0][186],:]
SWE_final[snotel_locs[0][207],:] = SWE_snowcast[snotel_locs[0][207],:]
SWE_final[snotel_locs[0][209],:] = SWE_snowcast[snotel_locs[0][209],:]

SWE_final[SWE_final<0] = 0


In [15]:
## Write Output File

f = open('Training Tables/train_label_data.csv', 'w')

# Write the first line
f.write('cell_id')
for time in times:
    f.write(',' + time)
f.write('\n')

# For subsequent lines, write the cell id and then the data for each date
i = 0
for evaluation_cell_id in evaluation_cell_ids:
    f.write(evaluation_cell_id)
    for d in range(len(times_all)):
        f.write(',{:.2f}'.format(SWE_final[i, d]))
    f.write('\n')
    i = i+1
    
f.close()