In [1]:
import os
import glob

import numpy as np
import geopandas as gpd
import pandas as pd
import ee

from functools import partial
from tqdm.notebook import tqdm
from concurrent.futures import as_completed, ThreadPoolExecutor, ProcessPoolExecutor

## Get Earth Engine Running
To access Google Earth Engine (GEE), we need to authenticate with our service account and then initialize a connection to the Earth Engine servers.

In [2]:
# Service account identity used to authenticate against Earth Engine.
SERVICE_ACCOUNT = "refit-fvs@refit-fvs.iam.gserviceaccount.com"
# Build credentials from the service account's key file.
# NOTE(review): the key path is relative, so it presumably resolves against the
# notebook's working directory -- confirm before running from elsewhere.
credentials = ee.ServiceAccountCredentials(SERVICE_ACCOUNT, "../../gee_key.json")
# Initialize the Earth Engine client with those credentials.
ee.Initialize(credentials)

# Retrieve GRIDMET Climatic Data
For each plot in our DataFrame of plot locations, we will filter the GRIDMET downscaled climate data collection from GEE to the plot's location. We want to get a daily time series of precipitation and reference evapotranspiration (along with minimum temperature, maximum temperature, and vapor pressure deficit) for each point. We will use these data together with soil moisture estimates to build a leaky bucket model for estimating soil moisture availability based on precipitation and reference evapotranspiration.

In [3]:
DATA_DIR = "../../data/"
PLOTS = os.path.join(DATA_DIR, "interim", "plot_info_for_climatena.csv")
# Load the plot table, expose the ID1 column as PLOT_ID and discard ID2.
plots = (
    pd.read_csv(PLOTS)
    .rename(columns={"ID1": "PLOT_ID"})
    .drop(columns=["ID2"])
)
plots.head()

Unnamed: 0,PLOT_ID,lat,lon,el
0,60101550679,41.806228,-123.788726,761
1,60101551744,41.980638,-124.193526,91
2,60101551969,41.681432,-123.803842,701
3,60101552953,41.938125,-123.870868,640
4,60101553315,41.738938,-123.783382,1432


In [4]:
# Check column dtypes and non-null counts of the plot table.
plots.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12156 entries, 0 to 12155
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   PLOT_ID  12156 non-null  int64  
 1   lat      12156 non-null  float64
 2   lon      12156 non-null  float64
 3   el       12156 non-null  int64  
dtypes: float64(2), int64(2)
memory usage: 380.0 KB


In [5]:
# Handle to the GRIDMET image collection on Earth Engine; get_timeseries
# filters this shared module-level collection for each point.
collection = ee.ImageCollection("IDAHO_EPSCOR/GRIDMET")


def get_timeseries(x, y, epsg=4326, start_date="2010-01-01", end_date="2022-08-02"):
    """Fetch the daily GRIDMET record for a single point from Earth Engine.

    Parameters
    ----------
    x, y : float
        Coordinates of the point, expressed in the CRS identified by ``epsg``.
    epsg : int, optional
        EPSG code of the coordinates (default 4326).
    start_date, end_date : str, optional
        Date bounds handed to ``ImageCollection.filterDate``. The default
        start extends five years before NASA SMAP.

    Returns
    -------
    list
        The client-side result of ``getInfo()``: one
        ``[date, ppt, eref, tmin, tmax, vpd]`` row per image, taken from the
        GRIDMET bands ``pr``, ``eto``, ``tmmn``, ``tmmx`` and ``vpd``.
    """
    bands = ["pr", "eto", "tmmn", "tmmx", "vpd"]
    crs = f"EPSG:{epsg}"
    aoi = ee.Geometry.Point((x, y), proj=crs)
    # Keep only the bands that are read below so reduceRegion does not have to
    # reduce every GRIDMET band for every image.
    coll = (
        collection.select(bands)
        .filterBounds(aoi)
        .filterDate(start_date, end_date)
    )

    def get_point(img):
        """Copy the point values of one image onto it as properties."""
        # NOTE(review): scale=1 requests a 1 m sampling scale -- confirm this
        # is intended rather than the dataset's native resolution.
        result = img.reduceRegion(
            reducer=ee.Reducer.mean(), geometry=aoi, crs=crs, scale=1
        )
        return (
            img.set("date", img.date().format())
            .set("ppt", result.get("pr"))
            .set("eref", result.get("eto"))
            .set("tmin", result.get("tmmn"))
            .set("tmax", result.get("tmmx"))
            .set("vpd", result.get("vpd"))
        )

    # Gather the six properties of every image into a single list of rows.
    values = (
        coll.map(get_point)
        .reduceColumns(
            ee.Reducer.toList(6), ["date", "ppt", "eref", "tmin", "tmax", "vpd"]
        )
        .values()
        .get(0)
    )

    # getInfo() blocks until the server-side computation has been evaluated.
    return values.getInfo()


def get_dataframe(point_id, x, y, epsg=4326):
    """Return the daily GRIDMET table for one plot, or None on failure.

    The rows come from ``get_timeseries(x, y, epsg=epsg)``. The resulting
    frame has a leading integer ``PLOT_ID`` column and a ``DATE`` column
    converted with ``pd.to_datetime``. Any exception is reported with
    ``print`` and swallowed, in which case the function returns None.
    """
    column_names = ["DATE", "PPT", "EREF_GRASS", "TMIN", "TMAX", "VPD"]
    try:
        records = get_timeseries(x, y, epsg=epsg)
        frame = pd.DataFrame(records, columns=column_names)
        frame.insert(0, "PLOT_ID", int(point_id))
        frame["DATE"] = pd.to_datetime(frame["DATE"])
    except Exception as err:
        # Best effort: report the failing plot and let the caller carry on.
        print("Failed on", point_id, err)
        return None
    return frame

In [6]:
# Inspect the first plot record; selecting a single row yields a Series, so
# the integer columns are displayed upcast to float64.
plots.iloc[0]

PLOT_ID    6.010155e+10
lat        4.180623e+01
lon       -1.237887e+02
el         7.610000e+02
Name: 0, dtype: float64

In [7]:
# Try the download on the first plot before launching the bulk run.
A_PLOT = plots.iloc[0]
# .values yields the row in column order: PLOT_ID, lat, lon, el.
PLOT_ID, LAT, LON, EL = A_PLOT.values
# get_dataframe takes (point_id, x, y), so longitude is passed before latitude.
get_dataframe(PLOT_ID, LON, LAT)

Unnamed: 0,PLOT_ID,DATE,PPT,EREF_GRASS,TMIN,TMAX,VPD
0,60101550679,2010-01-01 06:00:00,10.055064,1.075119,274.932709,283.147430,0.195173
1,60101550679,2010-01-02 06:00:00,0.000000,0.561059,273.832794,281.466553,0.186048
2,60101550679,2010-01-03 06:00:00,0.000000,1.036437,275.853516,286.187286,0.421681
3,60101550679,2010-01-04 06:00:00,6.711240,1.320106,277.623016,285.826660,0.364689
4,60101550679,2010-01-05 06:00:00,20.346174,1.078480,279.303528,286.206665,0.336908
...,...,...,...,...,...,...,...
4591,60101550679,2022-07-28 06:00:00,0.000000,6.400000,290.399994,308.700012,2.630000
4592,60101550679,2022-07-29 06:00:00,0.000000,6.500000,291.000000,309.500000,2.770000
4593,60101550679,2022-07-30 06:00:00,0.000000,5.000000,290.600006,306.000000,2.210000
4594,60101550679,2022-07-31 06:00:00,0.000000,4.500000,288.899994,302.500000,1.660000


In [8]:
results = []
OVERWRITE = True  # set to False to resume from a previous partial download

ALREADY_DONE = os.path.join(DATA_DIR, "raw", "GRIDMET_daily_for_plots.csv")
if os.path.exists(ALREADY_DONE) and not OVERWRITE:
    # Parse DATE on load: without it the cached rows carry string dates, the
    # concatenated DATE column becomes object dtype, and the later
    # `.dt.to_period("D")` conversion fails.
    already_done = pd.read_csv(ALREADY_DONE, parse_dates=["DATE"])
    results.append(already_done)
    # A set gives constant-time membership checks in the comprehension below.
    already_done_plots = set(already_done["PLOT_ID"].unique().tolist())
else:
    already_done_plots = set()

# Iterate the columns directly so PLOT_ID keeps its integer type
# (iterrows would coerce every value in the row to float).
pending_plots = [
    (plot_id, lon, lat)
    for plot_id, lon, lat in zip(plots["PLOT_ID"], plots["lon"], plots["lat"])
    if plot_id not in already_done_plots
]

with ProcessPoolExecutor(40) as executor:
    print("Starting to get data from Google Earth Engine.")
    jobs = [
        executor.submit(get_dataframe, plot_id, lon, lat)
        for plot_id, lon, lat in pending_plots
    ]

    for job in tqdm(as_completed(jobs), total=len(jobs)):
        frame = job.result()
        # get_dataframe returns None for a plot whose download failed;
        # keep only the frames that were actually retrieved.
        if frame is not None:
            results.append(frame)

Starting to get data from Google Earth Engine.


  0%|          | 0/12156 [00:00<?, ?it/s]

In [9]:
# Stack the per-plot frames, reduce DATE to a daily period and index the
# table by (PLOT_ID, DATE).
result_df = (
    pd.concat(results, axis=0, ignore_index=True)
    .assign(DATE=lambda frame: frame["DATE"].dt.to_period("D"))
    .set_index(["PLOT_ID", "DATE"])
)
result_df.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 55868976 entries, (60101551969, Period('2010-01-01', 'D')) to (530906599117, Period('2022-08-01', 'D'))
Data columns (total 5 columns):
 #   Column      Dtype  
---  ------      -----  
 0   PPT         float64
 1   EREF_GRASS  float64
 2   TMIN        float64
 3   TMAX        float64
 4   VPD         float64
dtypes: float64(5)
memory usage: 2.3 GB


In [10]:
# Preview the first rows of the (PLOT_ID, DATE)-indexed daily table.
result_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,PPT,EREF_GRASS,TMIN,TMAX,VPD
PLOT_ID,DATE,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
60101551969,2010-01-01,9.362696,1.028153,275.038483,282.326965,0.181254
60101551969,2010-01-02,0.0,0.479725,273.718597,280.197144,0.135297
60101551969,2010-01-03,0.0,1.065584,276.508667,286.147339,0.449029
60101551969,2010-01-04,7.671114,1.479384,278.268524,286.086853,0.419944
60101551969,2010-01-05,20.771038,1.116175,279.539154,285.577393,0.349697


In [11]:
# Output path; this is the same file the download cell checks as ALREADY_DONE.
OUT_RAW = os.path.join(DATA_DIR, "raw", "GRIDMET_daily_for_plots.csv")
# index=True writes the PLOT_ID and DATE index levels as the leading columns.
result_df.to_csv(OUT_RAW, index=True, header=True)