# Run script - Google Earth Engine 

In this script we will gather catchment data from satellite products (e.g. tree cover, NDVI, elevation) using Google Earth Engine (GEE). GEE allows us to directly use satellite data, avoiding the struggle of downloading them. Before using it, you need to create an account: https://signup.earthengine.google.com/#!/

This scripts only works in the conda environment **sr_env**. In this environment all required packages are available. If you have **not** installed and activated this environment before opening this script, you should check the installation section in the *README* file. 


### 1. Getting started
First, import all the required packages.

In [1]:
# import packages
import ee
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import geopandas as gpd
import os
import glob
from pathlib import Path
from pathos.threading import ThreadPool as Pool
import datetime
from dateutil.relativedelta import relativedelta

Before using the Earth Engine API or earthengine command line tool, you must perform a one-time authentication that authorizes access to Earth Engine on behalf of your Google account. Below you run the authentication command. A URL will be provided that generates an authorization code upon agreement. Copy the authorization code and enter it in the box below.

In [2]:
# Trigger the authentication flow.
ee.Authenticate()

# Initialize the library.
ee.Initialize()

Enter verification code:  4/1AfJohXkKb7bV4TG6P5rYMx4lTUQ3SqoPT6j9FxwT2-XXWlnZEkcnaooJFoI



Successfully saved authorization token.


After authentication we can import all the python functions defined in the scripts *f_earth_engine.py*.

In [3]:
from f_earth_engine import *

### 2. Define working and data directories
Here we define the working directory, where all the scripts and output are saved.

We also define the data directory where you have the following subdirectories:

/data/forcing/*netcdf forcing files*\
/data/shapes/*catchment shapefiles*\
/data/gsim_discharge/*gsim discharge timeseries*

In [4]:
# Check current working directory (helpful when filling in work_dir below)
os.getcwd()

'/home/fvanoorschot/global_sr_module'

In [5]:
# define your script working directory
# work_dir=Path("/home/fransjevanoors/global_sr_module")
work_dir=Path("/mnt/u/LSM root zone/global_sr/")
# work_dir=Path("/work/users/vanoorschot/fransje/scripts/GLOBAL_SR/global_sr_module")
# work_dir=Path("/scratch/fransjevanoors/global_sr")
# define your data directory
# data_dir=Path("/work/users/vanoorschot/fransje/scripts/GLOBAL_SR/global_sr_module/data")

### 3. Load your list of catchment IDs
Here we load the list of catchments IDs that was generated in the *run_script_main*.

In [6]:
catch_id_list = np.genfromtxt(f'{work_dir}/output/gsim_aus_catch_id_list_lo_sel.txt',dtype='str')

### 4. Earth Engine treecover
We are interested in the treecover in a catchment. For this we use the MODIS treecover data (https://modis.gsfc.nasa.gov/data/dataprod/mod44.php). This product includes the percentage tree cover, non tree cover, and bare soil on a 250x250 m grid. Here we regrid the tree cover to a 1x1 km grid (to reduce computational costs), average the values over the time period of interest and extract the catchment statistics (mean, max, min and std).

First we create the output directory:

In [8]:
# make output directory
if not os.path.exists(f'{work_dir}/output/earth_engine_timeseries/treecover'):
    os.makedirs(f'{work_dir}/output/earth_engine_timeseries/treecover')

Now we run the *preprocess_treecover_data* and *catchment_treecover* functions from the *f_earth_engine.py* script. The output is a dataframe with the treecover statistics for each catchment.

In [9]:
# define your time period
start_date = '2000-01-01'
end_date = '2020-12-31'

# define your directories
shape_dir = Path(f'{data_dir}/shapes/')
out_dir = Path(f'{work_dir}/output/earth_engine_timeseries/treecover')

# preprocess your modis satellite data for your time period (interpolation and averaging)
(MOD44B_tree_res, MOD44B_nontree_res) = preprocess_treecover_data(start_date,end_date)

# loop over catch ids
for catch_id in catch_id_list:
    # extract catchment values and store in dataframe
    catchment_treecover(MOD44B_tree_res, MOD44B_nontree_res, catch_id, shape_dir, out_dir)

In [10]:
# print treecover statistics for catchment [0] in catch_id_list
catch_id = catch_id_list[0]
c = pd.read_csv(f'{out_dir}/{catch_id}.csv',index_col=0)
c.head()

Unnamed: 0,max_tc,mean_tc,min_tc,std_tc,max_ntc,mean_ntc,min_ntc,std_ntc,mean_nonveg
br_0000495,40.438792,40.389229,40.329825,0.027467,47.847697,47.804183,47.755285,0.024777,11.806588


In [16]:
# update conda earth engine -> update environment.yml

### 5. Earth Engine DEM

In [6]:
# run elevation stats
catch_list = np.genfromtxt(f'{work_dir}/output/gsim_aus_catch_id_list_lo_sel.txt',dtype='str')[:]
work_dir_list = [work_dir]*len(catch_list)
run_function_parallel_elevation_stats(catch_list,work_dir_list)

In [7]:
# check which catchments are missing
catch_list = np.genfromtxt(f'{work_dir}/output/gsim_aus_catch_id_list_lo_sel.txt',dtype='str')[:]
el_id_list=[]
for filepath in glob.iglob(f'{work_dir}/output/elevation/stats/*.csv'):
    f = os.path.split(filepath)[1] # remove full path
    f = f[:-4] # remove .year extension
    el_id_list.append(f)
dif = list(set(catch_list) - set(el_id_list))
len(dif)

523

In [9]:
catch_list = dif[:]
work_dir_list = [work_dir]*len(catch_list)
run_function_parallel_elevation_stats(catch_list,work_dir_list)

In [6]:
# run elevation zones
catch_list = np.genfromtxt(f'{work_dir}/output/snow/catch_id_list_snow_t_and_p.txt',dtype='str')[:]
work_dir_list = [work_dir]*len(catch_list)
run_function_parallel_elevation_zones(catch_list,work_dir_list)

In [7]:
#combine all files with elevation stats 
files = glob.glob(f"{work_dir}/output/elevation/stats/*")
li=[] #empty list
for filename in files:
    df = pd.read_csv(filename, index_col=0) #read file as dataframe
    li.append(df) #append file to list
f = pd.concat(li, axis=0) #concatenate lists
f.to_csv(f'{work_dir}/output/elevation/all_catchment_elevation_stats.csv')

In [8]:
f

Unnamed: 0,max_ele,mean_ele,min_ele,std_ele
us_0005238,343.0,312.484613,269.0,13.725366
us_0007161,1658.0,575.471371,257.0,203.173182
ca_0000031,455.0,228.055524,90.0,71.531958
dk_0000031,131.0,58.761104,12.0,23.506379
br_0000991,1031.0,916.331157,786.0,60.193428
...,...,...,...,...
401012,2144.0,1099.375385,319.0,312.248318
us_0002055,199.0,88.046234,8.0,39.553167
ca_0001655,270.0,104.886951,19.0,52.654263
br_0000180,632.0,351.924966,176.0,113.036885


In [9]:
#combine all files with elevation stats 
files = glob.glob(f"{work_dir}/output/elevation/stats_hydrosheds/ele*")
li=[] #empty list
for filename in files:
    df = pd.read_csv(filename, index_col=0) #read file as dataframe
    li.append(df) #append file to list
f = pd.concat(li, axis=0) #concatenate lists
f.to_csv(f'{work_dir}/output/elevation/all_catchment_elevation_stats_hydrosheds.csv')

In [10]:
f

Unnamed: 0,max_ele,mean_ele,min_ele,std_ele
gb_0000095,196.000000,84.922715,17.000000,41.090700
fi_0000119,209.766769,114.088654,30.877821,34.458885
at_0000051,2283.000000,1485.988113,795.000000,315.558690
za_0000167,3346.000000,1704.004750,934.000000,424.675172
ch_0000092,860.000000,802.074109,743.000000,31.723866
...,...,...,...,...
us_0004962,4270.000000,1764.777514,562.000000,599.067547
at_0000121,1084.000000,700.612439,305.000000,114.303929
us_0006140,131.000000,87.955566,66.000000,11.464721
se_0000081,1520.459229,474.060638,12.350017,201.520522


In [11]:
#combine all files with elevation stats 
files = glob.glob(f"{work_dir}/output/elevation/stats_hydrosheds/slope*")
li=[] #empty list
for filename in files:
    df = pd.read_csv(filename, index_col=0) #read file as dataframe
    li.append(df) #append file to list
f = pd.concat(li, axis=0) #concatenate lists
f.to_csv(f'{work_dir}/output/elevation/all_catchment_slope_stats_hydrosheds.csv')

In [12]:
f

Unnamed: 0,max_slope,mean_slope,min_slope,std_slope
us_0006320,4.539580,1.455199,0.245985,0.464403
za_0000288,23.141903,9.458607,0.821960,3.999931
de_0000073,14.819275,5.093054,0.398146,2.442230
ca_0004232,42.247360,16.009535,0.000000,8.808050
ca_0005395,29.980062,15.597303,3.155189,5.547729
...,...,...,...,...
ca_0004158,48.630215,18.262962,0.000000,9.552706
at_0000022,35.759460,18.120913,1.688931,6.874375
us_0005114,2.032722,0.552257,0.025803,0.270654
fr_0001345,13.407174,3.476060,0.091281,1.882886


## 6. soil types
See code editor 'soil_types_catchments' for getting catchment soil characteristics. Here postprocess.

In [20]:
def process_soil(s):
    files = glob.glob(f"{work_dir}/output/soil_types/ee_output/{s}*")[:]
    li=[] #empty list
    for filename in files:
        df = pd.read_csv(filename) #read shapefile as geopandas dataframe
        if ('FILENAME' in df.columns):
            df.index = df.FILENAME
        if ('Code' in df.columns):
            df.index = df.Code
        if ('CatchID' in df.columns):
            df.index = df.CatchID
        df = df[['max','mean','median','min']]
        li.append(df) #append shapefile to list
    f = pd.concat(li, axis=0) #concatenate lists
    f.to_csv(f'{work_dir}/output/soil_types/processed/{s}.csv')

In [21]:
process_soil('clay')
process_soil('sand')
process_soil('bulk')
process_soil('carb')
process_soil('text')

In [39]:
textures = ['Cl','SiCl','SaCl','ClLo','SiClLo','SaClLo','Lo','SiLo','SaLo','Si','LoSa','Sa']
t = pd.read_csv(f'{work_dir}/output/soil_types/processed/text.csv',index_col=0)
t = t.astype(int)
t['med_text'] = np.zeros(len(t.index))
for i in range(len(textures)):
    ix = t[t['median']==i].index
    t.loc[ix,'med_text'] = textures[i]
t.to_csv(f'{work_dir}/output/soil_types/processed/text.csv')

## Earth Engine HAND
see code editor 'hand_catchments' for getting catchment hand values. https://presentations.copernicus.org/EGU2016/EGU2016-17445_presentation.pdf
https://gee-community-catalog.org/projects/hand/

In [7]:
files = glob.glob(f"{work_dir}/output/HAND/ee_output/*")[:]
li=[] #empty list
for filename in files:
    df = pd.read_csv(filename) #read shapefile as geopandas dataframe
    if ('FILENAME' in df.columns):
        df.index = df.FILENAME
    if ('Code' in df.columns):
        df.index = df.Code
    if ('CatchID' in df.columns):
        df.index = df.CatchID
    df = df[['max','mean','median','min']]
    li.append(df) #append shapefile to list
f = pd.concat(li, axis=0) #concatenate lists
f.to_csv(f'{work_dir}/output/HAND/hand_all_catchments.csv')

In [6]:
f

Unnamed: 0,max,mean,median,min
es_0000991,1112.468750,321.912613,299.649537,19.187500
us_0003601,316.765625,7.771120,4.900978,0.000000
at_0000053,1328.937500,315.746512,276.699170,10.625000
br_0000155,4883.328125,172.393988,43.369052,0.000000
br_0000270,339.203125,45.647960,33.107872,2.781250
...,...,...,...,...
ca_0005777,127.375000,65.948861,73.515625,14.328125
fr_0001269,124.937500,37.335819,32.171875,6.562500
ar_0000009,3348.531250,316.327435,247.688682,3.953125
no_0000199,922.296875,158.300688,114.022795,0.000000


## GPP/NPP

In [22]:
import geemap

In [25]:
js_snippet = """
//Use a function to iterate through months and calculate average NDVI
var months = ee.List.sequence(1, 12);
print("months",months);
var years = ee.List.sequence(2001, 2010);
print("years",years);

// Map filtering and reducing across year-month combinations and convert to ImageCollection
var yrMo = ee.ImageCollection.fromImages(
  years.map(function (y) {
        return months.map(function (m) {
            return collection
              .filter(ee.Filter.calendarRange(y, y, 'year'))
              .filter(ee.Filter.calendarRange(m, m, 'month'))
              .mean()
              .set('year',y)
              .set('month',m);
        });
    }).flatten());
print("yrMo",yrMo);
"""

In [26]:

geemap.js_snippet_to_py(
    js_snippet, add_new_cell=True, import_ee=True, import_geemap=True, show_map=True
)
   

In [None]:
import ee
import geemap

Map = geemap.Map()
import geemap

Map = geemap.Map()

#Use a function to iterate through months and calculate average NDVI
months = ee.List.sequence(1, 12)
print("months",months)
years = ee.List.sequence(2001, 2010)
print("years",years)

# Map filtering and reducing across year-month combinations and convert to ImageCollection
yrMo = ee.ImageCollection.fromImages(

def func_yku (y):
        return months.map(function (m) {
            return collection \
              .filter(ee.Filter.calendarRange(y, y, 'year')) \
              .filter(ee.Filter.calendarRange(m, m, 'month')) \
              .mean() \
              .set('year',y) \
              .set('month',m)
        })

  years.map(func_yku
).flatten())

).flatten())
print("yrMo",yrMo)
Map

In [None]:
import ee
import geemap

Map = geemap.Map()
import geemap

Map = geemap.Map()

# Load an image.
image = ee.Image('LANDSAT/LC08/C01/T1_TOA/LC08_044034_20140318')

# Define the visualization parameters.
vizParams = {
  'bands': ['B5', 'B4', 'B3'],
  'min': 0,
  'max': 0.5,
  'gamma': [0.95, 1.1, 1]
}

# Center the map and display the image.
Map.setCenter(-122.1899, 37.5010, 10); # San Francisco Bay
Map.addLayer(image, vizParams, 'False color composite')

Map

In [8]:
shape_dir = Path(f'{work_dir}/output/selected_shapes/')
shapefile_list = glob.glob(f'{shape_dir}/*.shp')[:]
shapefile_list[0:3]

['/mnt/u/LSM root zone/global_sr/output/selected_shapes/ar_0000001.shp',
 '/mnt/u/LSM root zone/global_sr/output/selected_shapes/ar_0000005.shp',
 '/mnt/u/LSM root zone/global_sr/output/selected_shapes/ar_0000006.shp']

In [20]:
#%% reducer function
def reducer_function(geometry,reducer=ee.Reducer.mean(), scale=10,crs='EPSG:4326', maxPixels=2e10,):
    def reduce_region_function(image):
        stats = image.reduceRegion(reducer=reducer,geometry=geometry,scale=scale,crs=crs,maxPixels=maxPixels)
        return ee.Feature(geometry, stats).set({'millis': image.date().millis()})
    return reduce_region_function

# function to write python dictionary
def write_dict(fc):
    names = fc.first().propertyNames()
    lists = fc.reduceColumns(
    reducer=ee.Reducer.toList().repeat(names.size()),
    selectors=names).get('list')

    return ee.Dictionary.fromLists(names, lists)

In [35]:
# collection = ee.ImageCollection('MODIS/006/MOD10A1').filterDate(startDate, endDate).select('NDSI_Snow_Cover')
collection=ee.ImageCollection("MODIS/061/MOD17A3HGF").select('Npp')

#Use a function to iterate through months and calculate average NDVI
months = ee.List.sequence(1, 12)
print("months",months)
years = ee.List.sequence(2001, 2010)
print("years",years)

# Map filtering and reducing across year-month combinations and convert to ImageCollection
yrMo = ee.ImageCollection.fromImages(def func_yku (y):
        return months.map(function (m) {
            return collection \
              .filter(ee.Filter.calendarRange(y, y, 'year')) \
              .filter(ee.Filter.calendarRange(m, m, 'month')) \
              .mean() \
              .set('year',y) \
              .set('month',m)
        })

    years.map(func_yku
).flatten())

# ).flatten())
print("yrMo",yrMo)

# # Map filtering and reducing across year-month combinations and convert to ImageCollection
# yrMo = ee.ImageCollection.fromImages(
#     def func_yku(y):
#         return months.map(function (m) {return collection.filter(ee.Filter.calendarRange(y, y, 'year')).filter(ee.Filter.calendarRange(m, m, 'month')).mean().set('year',y).set('month',m)})
#         years.map(func_yku).flatten())

# ).flatten())
# print("yrMo",yrMo)



SyntaxError: invalid syntax (1334758014.py, line 11)

In [None]:

for i in range(len(shapefile_list[0:3])):
    print(i)
    shapefile = gpd.read_file(f'{paths}/{shapefile_list[i]}')
    features = []
    for j in range(shapefile.shape[0]):
        geom = shapefile.iloc[j:j+1,:] 
        jsonDict = eval(geom.to_json()) 
        geojsonDict = jsonDict['features'][0] 
        features.append(ee.Feature(geojsonDict))

    aoi = ee.FeatureCollection(features).geometry()

    
    reduce_snow = reducer_function(geometry=aoi, reducer=ee.Reducer.mean(), scale=10, crs='EPSG:4326')
    snow_stat_fc = ee.FeatureCollection(snow.map(reduce_snow)).filter(ee.Filter.notNull(snow.first().bandNames()))
    
    snow_dict = write_dict(snow_stat_fc).getInfo()
    snow_df = pd.DataFrame(snow_dict)
    
    snow_df['NDSI_Snow_Cover'] = snow_df['NDSI_Snow_Cover']
    
    # add dates to dataframe
    snow_df['Timestamp'] = pd.to_datetime(snow_df['millis'], unit='ms')
    snow_df['Year'] = pd.DatetimeIndex(snow_df['Timestamp']).year
    snow_df['Month'] = pd.DatetimeIndex(snow_df['Timestamp']).month
    snow_df['Day'] = pd.DatetimeIndex(snow_df['Timestamp']).day
    snow_df['DOY'] = pd.DatetimeIndex(snow_df['Timestamp']).dayofyear
    
    snow_df.index = snow_df['Timestamp']
    snow_df = snow_df.drop(columns={'millis','system:index','Timestamp','Year','Month','Day','DOY'})
    snow_df_monthly = snow_df.groupby(pd.Grouper(freq='M')).mean()
    print(snow_df)
    
    f = files_list[i][:-4]
    snow_df.to_csv(f'{paths}/ndsi_snowcover/ndsi_snowcover_{startDate}_{endDate}_{f}.csv')

In [None]:
var landsat_resample = function(image){
  return image.reproject(landsat_pro, null, 500) // insert here the desired scale in meters

    // Force the next reprojection to aggregate instead of resampling.
    .reduceResolution({
      reducer: ee.Reducer.mean(),
      maxPixels: 1024
    })
  .copyProperties(image)
}

var ndviResample = ndvi.map(landsat_resample)

In [21]:
# #%% get snow cover timeseries for 1 catchment
paths = f'{work_dir}/output/selected_shapes/'
all_files = os.listdir(paths)           
files_list = []                             
for files in all_files:                    
    if '.shp' in files:
        files_list.append(files)
        
files_list = files_list[0:3] #only test 3 catchments here -> for all catchments run on cluster

# paths = f'{work_dir}/output/selected_shapes/'
# shape_dir = Path(f'{work_dir}/output/selected_shapes/')
# shapefile_list = glob.glob(f'{shape_dir}/*.shp')[:]
# files_list = shapefile_list[0:3]

for i in range(len(files_list)):
    print(i)
    shapefile = gpd.read_file(f'{paths}/{files_list[i]}')
    features = []
    for j in range(shapefile.shape[0]):
        geom = shapefile.iloc[j:j+1,:] 
        jsonDict = eval(geom.to_json()) 
        geojsonDict = jsonDict['features'][0] 
        features.append(ee.Feature(geojsonDict))

    aoi = ee.FeatureCollection(features).geometry()
    startDate = '2005-01-01'
    endDate = '2005-12-31'
    snow = ee.ImageCollection('MODIS/006/MOD10A1').filterDate(startDate, endDate).select('NDSI_Snow_Cover')
    # snow = snow.resample('bilinear').reproject(crs=clay.projection().crs(), scale=1110);
    
    reduce_snow = reducer_function(geometry=aoi, reducer=ee.Reducer.mean(), scale=10, crs='EPSG:4326')
    snow_stat_fc = ee.FeatureCollection(snow.map(reduce_snow)).filter(ee.Filter.notNull(snow.first().bandNames()))
    
    snow_dict = write_dict(snow_stat_fc).getInfo()
    snow_df = pd.DataFrame(snow_dict)
    
    snow_df['NDSI_Snow_Cover'] = snow_df['NDSI_Snow_Cover']
    
    # add dates to dataframe
    snow_df['Timestamp'] = pd.to_datetime(snow_df['millis'], unit='ms')
    snow_df['Year'] = pd.DatetimeIndex(snow_df['Timestamp']).year
    snow_df['Month'] = pd.DatetimeIndex(snow_df['Timestamp']).month
    snow_df['Day'] = pd.DatetimeIndex(snow_df['Timestamp']).day
    snow_df['DOY'] = pd.DatetimeIndex(snow_df['Timestamp']).dayofyear
    
    snow_df.index = snow_df['Timestamp']
    snow_df = snow_df.drop(columns={'millis','system:index','Timestamp','Year','Month','Day','DOY'})
    snow_df_monthly = snow_df.groupby(pd.Grouper(freq='M')).mean()
    print(snow_df)
    
    f = files_list[i][:-4]
    # snow_df.to_csv(f'{paths}/ndsi_snowcover/ndsi_snowcover_{startDate}_{endDate}_{f}.csv')


0


EEException: Computation timed out.

In [14]:
snow_df

NameError: name 'snow_df' is not defined

## 6. Earth Engine CHIRPS

In [6]:
catch_list = np.genfromtxt(f'{work_dir}/output/gsim_aus_catch_id_list_lo_sel.txt',dtype='str')[0:3]
start_date='1990-01-01'
work_dir_list = [work_dir]*len(catch_list)
start_date_list = [start_date]*len(catch_list)

ee_chirps_parallel(catch_list,start_date_list,work_dir_list)

EEException: Computation timed out.

In [6]:
catch_list = np.genfromtxt(f'{work_dir}/output/gsim_aus_catch_id_list_lo_sel.txt',dtype='str')[0:3]
start_date='1993-01-01'
work_dir_list = [work_dir]*len(catch_list)
start_date_list = [start_date]*len(catch_list)

ee_chirps_parallel(catch_list,start_date_list,work_dir_list)

EEException: Computation timed out.

In [None]:
catch_list = np.genfromtxt(f'{work_dir}/output/gsim_aus_catch_id_list_lo_sel.txt',dtype='str')[0:3]
start_date='1996-01-01'
work_dir_list = [work_dir]*len(catch_list)
start_date_list = [start_date]*len(catch_list)

ee_chirps_parallel(catch_list,start_date_list,work_dir_list)