In [1]:
import os
import glob

import numpy as np
import geopandas as gpd
import pandas as pd
import ee

from functools import partial
from tqdm.notebook import tqdm
from concurrent.futures import as_completed, ThreadPoolExecutor, ProcessPoolExecutor

## Get Earth Engine Running
To access Google Earth Engine (GEE), we first authenticate with a service account and then initialize a connection to the Earth Engine servers.

In [2]:
# Authenticate with a service account key stored outside the notebook
# directory, then open the Earth Engine session used by every later cell.
SERVICE_ACCOUNT = "refit-fvs@refit-fvs.iam.gserviceaccount.com"
GEE_KEY_FILE = "../../gee_key.json"

credentials = ee.ServiceAccountCredentials(SERVICE_ACCOUNT, key_file=GEE_KEY_FILE)
ee.Initialize(credentials=credentials)

# Retrieve Soil Moisture Data
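For each plot in the `plots` DataFrame, we filter the NASA Soil Moisture Active Passive (SMAP) collection from GEE to the plot's location and retrieve every image available at that point. The goal is a monthly time series of soil profile moisture for each point; note that the collection is queried at its native cadence (about one observation every three days in the sample output below), so the monthly aggregation is not done in this notebook section.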

In [3]:
# Load the plot table prepared for ClimateNA: keep ID1 as the plot
# identifier (renamed to PLOT_ID) and discard the unused ID2 column.
DATA_DIR = "../../data/"
PLOTS = os.path.join(DATA_DIR, "interim", "plot_info_for_climatena.csv")

plots = (
    pd.read_csv(PLOTS)
    .rename(columns={"ID1": "PLOT_ID"})
    .drop(columns=["ID2"])
)
plots.head()

Unnamed: 0,PLOT_ID,lat,lon,el
0,60101550679,41.806228,-123.788726,761
1,60101551744,41.980638,-124.193526,91
2,60101551969,41.681432,-123.803842,701
3,60101552953,41.938125,-123.870868,640
4,60101553315,41.738938,-123.783382,1432


In [4]:
# Show the column dtypes and non-null counts of the plot table.
plots.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12156 entries, 0 to 12155
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   PLOT_ID  12156 non-null  int64  
 1   lat      12156 non-null  float64
 2   lon      12156 non-null  float64
 3   el       12156 non-null  int64  
dtypes: float64(2), int64(2)
memory usage: 380.0 KB


In [5]:
# Earth Engine image collection queried by get_timeseries().
SMAP_ASSET_ID = "NASA_USDA/HSL/SMAP10KM_soil_moisture"
collection = ee.ImageCollection(SMAP_ASSET_ID)


def get_timeseries(x, y, epsg=4326):
    """Fetch the soil moisture time series at one point from the SMAP collection.

    Parameters
    ----------
    x, y : float
        Coordinates of the point, expressed in the CRS given by ``epsg``.
    epsg : int, optional
        EPSG code used both for the point geometry and for the region
        reduction (default 4326).

    Returns
    -------
    list
        The client-side value returned by ``getInfo()``: a list of
        ``[date, smp, ssm, susm]`` entries, where ``date`` is formatted as
        ``YYYY-MM-dd``.
    """
    crs = f"EPSG:{epsg}"
    point = ee.Geometry.Point((x, y), proj=crs)
    bands = ("smp", "ssm", "susm")

    def annotate(image):
        # Reduce the image at the point, then copy the date and each band
        # value onto the image as properties so reduceColumns can gather them.
        # NOTE(review): scale=1 is passed through unchanged; the asset name
        # suggests a much coarser native resolution - confirm this is intended.
        stats = image.reduceRegion(
            reducer=ee.Reducer.mean(), geometry=point, crs=crs, scale=1
        )
        tagged = image.set("date", image.date().format("YYYY-MM-dd"))
        for band in bands:
            tagged = tagged.set(band, stats.get(band))
        return tagged

    images_at_point = collection.filterBounds(point)
    annotated = images_at_point.map(annotate)
    columns = annotated.reduceColumns(ee.Reducer.toList(4), ["date", *bands])
    rows = columns.values().get(0)

    return rows.getInfo()


def get_dataframe(point_id, x, y, epsg=4326):
    """Build a DataFrame of the soil moisture time series for one plot.

    Parameters
    ----------
    point_id : int-like
        Plot identifier; converted with ``int()`` and stored in ``PLOT_ID``.
    x, y : float
        Point coordinates forwarded to ``get_timeseries``.
    epsg : int, optional
        EPSG code forwarded to ``get_timeseries`` (default 4326).

    Returns
    -------
    pandas.DataFrame or None
        Columns ``PLOT_ID, DATE, SMP, SSM, SUSM, SM_MM`` where ``SM_MM`` is
        ``SSM + SUSM``. On any exception the error is printed and ``None``
        is returned instead of raising.
    """
    try:
        rows = get_timeseries(x, y, epsg=epsg)
        frame = pd.DataFrame(rows, columns=["DATE", "SMP", "SSM", "SUSM"])
        frame = frame.assign(SM_MM=frame["SSM"] + frame["SUSM"])
        frame.insert(0, "PLOT_ID", int(point_id))
    except Exception as e:
        # Best-effort: report the failing plot and let the caller carry on.
        print("Failed on", point_id, e)
        return None
    return frame

def download_dataframe(point_id, x, y, epsg=4326, overwrite=False):
    """Download the soil moisture time series for one plot to a parquet file.

    The file is written to ``<DATA_DIR>/raw/nasa_smap/<point_id>.parquet``.

    Parameters
    ----------
    point_id : int-like
        Plot identifier; also used as the output file name.
    x, y : float
        Point coordinates forwarded to ``get_dataframe``.
    epsg : int, optional
        EPSG code forwarded to ``get_dataframe`` (default 4326).
    overwrite : bool, optional
        When False (default), an existing output file is left untouched and
        no request is made.

    Returns
    -------
    None
    """
    outfile = os.path.join(DATA_DIR, "raw", "nasa_smap", f"{point_id}.parquet")
    if os.path.exists(outfile) and not overwrite:
        return

    df = get_dataframe(point_id, x, y, epsg=epsg)
    if df is None:
        # get_dataframe returns None (after printing the error) when the
        # request fails. Write nothing so a later run retries this plot
        # instead of crashing here on `None.to_parquet`.
        return

    # Make the function usable on its own, even if the output folder has not
    # been created yet.
    os.makedirs(os.path.dirname(outfile), exist_ok=True)
    df.to_parquet(outfile, index=False)

    return

In [6]:
# Make sure the folder that receives the per-plot parquet files exists.
SMAP_RAW_DIR = os.path.join(DATA_DIR, "raw", "nasa_smap")
os.makedirs(SMAP_RAW_DIR, exist_ok=True)

In [7]:
# Smoke test: fetch the time series for the first plot before launching the
# full download. The unpacking relies on the column order PLOT_ID, lat, lon, el.
A_PLOT = plots.iloc[0]
PLOT_ID, LAT, LON, EL = A_PLOT.to_numpy()
get_dataframe(point_id=PLOT_ID, x=LON, y=LAT)

Unnamed: 0,PLOT_ID,DATE,SMP,SSM,SUSM,SM_MM
0,60101550679,2015-04-02,0.873135,21.037519,68.895363,89.932882
1,60101550679,2015-04-05,0.958604,25.334164,73.402039,98.736202
2,60101550679,2015-04-08,0.980448,24.243017,76.743172,100.986189
3,60101550679,2015-04-11,0.957361,23.351637,75.256546,98.608183
4,60101550679,2015-04-14,0.980566,23.926762,77.071548,100.998310
...,...,...,...,...,...,...
889,60101550679,2022-07-21,0.235841,5.250250,19.041393,24.291643
890,60101550679,2022-07-24,0.199953,4.437977,16.157156,20.595133
891,60101550679,2022-07-27,0.179267,4.194283,14.270215,18.464498
892,60101550679,2022-07-30,0.161411,3.726025,12.899270,16.625295


In [8]:
# Download every plot in parallel. Each worker writes its own parquet file,
# so the parent only has to track which jobs finished cleanly.
results = []  # one (plot_id, exception-or-None) entry per finished job
OVERWRITE = False
N_WORKERS = 48  # size of the process pool

with ProcessPoolExecutor(N_WORKERS) as executor:
    print("Starting to get data from Google Earth Engine.")
    to_run = partial(download_dataframe, overwrite=OVERWRITE)

    jobs = []
    job_plot_ids = {}
    for row in plots.itertuples(index=False):
        plot_id = int(row.PLOT_ID)
        job = executor.submit(to_run, plot_id, row.lon, row.lat)
        jobs.append(job)
        job_plot_ids[job] = plot_id

    for job in tqdm(as_completed(jobs), total=len(jobs)):
        # Without asking the future for its outcome, an exception raised in a
        # worker is silently discarded and the progress bar still reaches 100%.
        results.append((job_plot_ids[job], job.exception()))

failures = [(plot_id, error) for plot_id, error in results if error is not None]
if failures:
    print(
        f"{len(failures)} of {len(results)} downloads raised an exception; "
        "inspect `failures` for details."
    )

Starting to get data from Google Earth Engine.


  0%|          | 0/12156 [00:00<?, ?it/s]