In [1]:
import os
import glob

import numpy as np
import geopandas as gpd
import pandas as pd
import ee

from functools import partial
from tqdm.notebook import tqdm
from concurrent.futures import as_completed, ProcessPoolExecutor

## Get Earth Engine Running
To access Google Earth Engine (GEE), we need to authenticate our account and then initialize a connection to the server.

In [2]:
# Service-account credentials for Earth Engine. The account e-mail and the
# key-file location can be overridden through environment variables so the
# notebook runs on other machines without editing this cell; when the
# variables are unset, the original hard-coded values are used.
SERVICE_ACCOUNT = os.environ.get(
    "GEE_SERVICE_ACCOUNT", "refit-fvs@refit-fvs.iam.gserviceaccount.com"
)
GEE_KEY_FILE = os.environ.get("GEE_KEY_FILE", "../../gee_key.json")
credentials = ee.ServiceAccountCredentials(SERVICE_ACCOUNT, GEE_KEY_FILE)
ee.Initialize(credentials)

# Retrieve GRIDMET Climatic Data
For each of the plots in our plot table (a pandas DataFrame of plot IDs and coordinates), we will filter the GRIDMET down-scaled climate data collection from GEE to the plot location. We want to get a daily time series of precipitation and reference evapotranspiration for each point; the code below also retrieves daily minimum and maximum temperature and vapor pressure deficit. We will use these data together with soil moisture estimates to build a leaky bucket model for estimating soil moisture availability based on precipitation and reference evapotranspiration.

In [3]:
# Root of the project data folder and the CSV holding the plot table.
DATA_DIR = "../../data/"
PLOTS = os.path.join(DATA_DIR, "interim", "plot_info_for_climatena.csv")

# Read the table, rename ID1 to PLOT_ID and discard ID2.
plots = pd.read_csv(PLOTS)
plots = plots.rename(columns={"ID1": "PLOT_ID"})
plots = plots.drop(columns=["ID2"])
plots.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12156 entries, 0 to 12155
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   PLOT_ID  12156 non-null  int64  
 1   lat      12156 non-null  float64
 2   lon      12156 non-null  float64
 3   el       12156 non-null  int64  
dtypes: float64(2), int64(2)
memory usage: 380.0 KB


In [4]:
# Preview the first rows of the plot table (PLOT_ID, lat, lon, el).
plots.head()

Unnamed: 0,PLOT_ID,lat,lon,el
0,60101550679,41.806228,-123.788726,761
1,60101551744,41.980638,-124.193526,91
2,60101551969,41.681432,-123.803842,701
3,60101552953,41.938125,-123.870868,640
4,60101553315,41.738938,-123.783382,1432


In [5]:
# Tree-level FIA records; only the plot id and the year/month/day of
# measurement are read. The path is built from DATA_DIR so it stays in sync
# with the other inputs.
FIA = os.path.join(DATA_DIR, "interim", "FIA_remeasured_trees_for_training.csv")
fia_trees = pd.read_csv(
    FIA,
    usecols=["PLOT_ID", "MEASYEAR", "MEASMON", "MEASDAY"],
    dtype={"PLOT_ID": int},
)

# Assemble the measurement date directly from its integer components. This
# avoids overwriting the numeric columns with strings and does not depend on
# date-format inference. `astype(int)` still raises on missing values, as the
# original conversion did.
fia_trees["MEASDATE"] = pd.to_datetime(
    {
        "year": fia_trees["MEASYEAR"].astype(int),
        "month": fia_trees["MEASMON"].astype(int),
        "day": fia_trees["MEASDAY"].astype(int),
    }
)

# One row per plot: the earliest measurement date, and a start date five
# years before it (used as the first day of the climate time series).
fia = fia_trees.groupby(by=["PLOT_ID"])["MEASDATE"].min().to_frame()
fia["START_DATE"] = fia["MEASDATE"] - pd.DateOffset(years=5)
fia.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12156 entries, 60101550679 to 530907572668
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   MEASDATE    12156 non-null  datetime64[ns]
 1   START_DATE  12156 non-null  datetime64[ns]
dtypes: datetime64[ns](2)
memory usage: 284.9 KB


In [6]:
# Preview the per-plot earliest measurement date and the derived start date.
fia.head()

Unnamed: 0_level_0,MEASDATE,START_DATE
PLOT_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
60101550679,2001-06-06,1996-06-06
60101551744,2008-09-23,2003-09-23
60101551969,2008-09-30,2003-09-30
60101552953,2001-05-29,1996-05-29
60101553315,2009-09-17,2004-09-17


In [7]:
# GRIDMET image collection on Earth Engine; get_timeseries() filters it per point.
collection = ee.ImageCollection("IDAHO_EPSCOR/GRIDMET")


def get_timeseries(x, y, start_date, epsg=4326, end_date="2023-01-01"):
    """Return the daily GRIDMET time series for a single point.

    Parameters
    ----------
    x, y : float
        Point coordinates in the CRS identified by ``epsg``. Callers in this
        notebook pass longitude as ``x`` and latitude as ``y``.
    start_date : str
        Start of the date range handed to ``filterDate``.
    epsg : int, optional
        EPSG code of the coordinates; also used as the CRS of the reduction.
    end_date : str, optional
        End of the date range handed to ``filterDate``. Defaults to
        "2023-01-01", the value that used to be hard-coded here.
        NOTE(review): Earth Engine appears to treat the end date as
        exclusive -- confirm against the ``filterDate`` documentation.

    Returns
    -------
    list
        The client-side result of ``getInfo()``: one entry per image, each
        holding the six values collected below in the order
        ``[date, ppt, eref, tmin, tmax, vpd]``.
    """
    crs = f"EPSG:{epsg}"
    aoi = ee.Geometry.Point((x, y), proj=crs)
    coll = collection.filterBounds(aoi).filterDate(start_date, end_date)

    def get_point(img):
        # Reduce the image at the point and copy the band values onto the
        # image as properties so they can be gathered with reduceColumns.
        result = img.reduceRegion(
            reducer=ee.Reducer.mean(), geometry=aoi, crs=crs, scale=1
        )
        ppt = result.get("pr")
        eref_grass = result.get("eto")
        tmin = result.get("tmmn")
        tmax = result.get("tmmx")
        vpd = result.get("vpd")

        return (
            img.set("date", img.date().format("YYYY-MM-dd"))
            .set("ppt", ppt)
            .set("eref", eref_grass)
            .set("tmin", tmin)
            .set("tmax", tmax)
            .set("vpd", vpd)
        )

    # Collect the six properties of every image into a single list of rows.
    values = (
        coll.map(get_point)
        .reduceColumns(
            ee.Reducer.toList(6), ["date", "ppt", "eref", "tmin", "tmax", "vpd"]
        )
        .values()
        .get(0)
    )

    # getInfo() triggers the server-side computation and fetches the result.
    return values.getInfo()


def get_dataframe(point_id, x, y, epsg=4326):
    """Fetch the GRIDMET time series of one plot as a DataFrame.

    The series starts at the plot's ``START_DATE`` taken from the ``fia``
    table. The returned frame has the columns PLOT_ID, DATE, PPT, EREF_GRASS,
    TMIN, TMAX and VPD. If retrieving or tabulating the data raises, the
    error is printed and ``None`` is returned instead.
    """
    first_day = fia.loc[point_id, "START_DATE"]
    start_date = first_day.strftime("%Y-%m-%d")
    column_names = ["DATE", "PPT", "EREF_GRASS", "TMIN", "TMAX", "VPD"]

    try:
        rows = get_timeseries(x, y, start_date, epsg=epsg)
        table = pd.DataFrame(rows, columns=column_names)
        # Put the plot identifier in front of the climate columns.
        table.insert(0, "PLOT_ID", int(point_id))
    except Exception as e:
        print("Failed on", point_id, e)
        return None

    return table
    
def download_dataframe(point_id, x, y, epsg=4326, overwrite=False):
    """Download the GRIDMET time series of one plot to a parquet file.

    The file is written to ``<DATA_DIR>/raw/gridmet/<point_id>.parquet``.
    Nothing is done when the file already exists, unless ``overwrite`` is
    true. When ``get_dataframe`` fails (it returns ``None`` after printing
    the error), no file is written, so the plot is retried on the next run.

    Returns
    -------
    None
    """
    outfile = os.path.join(DATA_DIR, 'raw', 'gridmet', f"{point_id}.parquet")
    if os.path.exists(outfile) and not overwrite:
        return

    df = get_dataframe(point_id, x, y, epsg=epsg)
    if df is None:
        # The download failed and was already reported; calling to_parquet
        # on None would only raise an unrelated AttributeError.
        return

    # Write to a temporary name first and rename afterwards, so that an
    # interrupted write never leaves a partial file that the existence check
    # above would later mistake for a finished download.
    tmp_file = outfile + ".tmp"
    df.to_parquet(tmp_file, index=False)
    os.replace(tmp_file, outfile)

    return

In [8]:
# Make sure the folder that receives the per-plot parquet files exists.
os.makedirs(os.path.join(DATA_DIR, 'raw', 'gridmet'), exist_ok=True)

In [9]:
# Try the retrieval on a single plot before launching the full batch.
A_PLOT = plots.iloc[0]
# Look the fields up by name instead of unpacking `.values` positionally, so
# a change in column order cannot silently swap latitude, longitude and
# elevation. The plot id is cast back to int because the mixed-dtype row
# stores it as a float.
PLOT_ID = int(A_PLOT["PLOT_ID"])
LAT, LON, EL = A_PLOT["lat"], A_PLOT["lon"], A_PLOT["el"]
get_dataframe(PLOT_ID, LON, LAT)

Unnamed: 0,PLOT_ID,DATE,PPT,EREF_GRASS,TMIN,TMAX,VPD
0,60101550679,1996-06-06,0.000000,6.428544,283.857422,299.521545,1.345665
1,60101550679,1996-06-07,0.000000,6.216956,282.368103,298.901764,1.129196
2,60101550679,1996-06-08,0.000000,5.839959,282.207367,297.251495,1.056247
3,60101550679,1996-06-09,0.000000,5.617691,279.947021,295.522156,0.916728
4,60101550679,1996-06-10,0.000000,5.676290,279.817688,296.251648,0.974138
...,...,...,...,...,...,...,...
9700,60101550679,2022-12-27,10.600000,1.000000,273.600006,282.000000,0.140000
9701,60101550679,2022-12-28,13.700000,1.000000,273.200012,281.200012,0.250000
9702,60101550679,2022-12-29,44.900002,0.900000,275.000000,282.600006,0.150000
9703,60101550679,2022-12-30,35.700001,0.900000,278.399994,284.200012,0.170000


In [10]:
OVERWRITE = False
N_WORKERS = 48  # number of worker processes issuing requests in parallel

with ProcessPoolExecutor(N_WORKERS) as executor:
    print("Starting to get data from Google Earth Engine.")
    to_run = partial(download_dataframe, overwrite=OVERWRITE)

    # Remember which plot each future belongs to so failures can be named.
    job_to_plot = {
        executor.submit(to_run, int(row.PLOT_ID), row.lon, row.lat): int(row.PLOT_ID)
        for row in plots.itertuples(index=False)
    }
    jobs = list(job_to_plot)

    failed = []
    for job in tqdm(as_completed(jobs), total=len(jobs)):
        # An exception raised inside a worker is only re-raised when the
        # result is requested; without this call it would be lost silently.
        try:
            job.result()
        except Exception as e:
            failed.append(job_to_plot[job])
            print("Failed on", job_to_plot[job], e)

if failed:
    print(f"{len(failed)} of {len(jobs)} downloads raised an error.")

Starting to get data from Google Earth Engine.


  0%|          | 0/12156 [00:00<?, ?it/s]