In [1]:
import os
import glob

import numpy as np
import geopandas as gpd
import pandas as pd
import ee

from functools import partial
from tqdm.notebook import tqdm
from concurrent.futures import as_completed, ThreadPoolExecutor, ProcessPoolExecutor

## Get Earth Engine Running
To access Google Earth Engine (GEE), we need to authenticate with our service account and then initialize a connection to the server.

In [2]:
# Authenticate as the project's service account and open the Earth Engine
# session used by every request made later in the notebook.
SERVICE_ACCOUNT = "refit-fvs@refit-fvs.iam.gserviceaccount.com"
KEY_FILE = "../gee_key.json"  # private key file for SERVICE_ACCOUNT
credentials = ee.ServiceAccountCredentials(SERVICE_ACCOUNT, KEY_FILE)
ee.Initialize(credentials)

# Retrieve Soil Moisture Data
For each of the plots in a DataFrame, and for each year in which the imagery is available, we will filter the NASA Soil Moisture Active Passive (SMAP) collection from GEE to our Area of Interest (the plot location). We want to get a monthly time-series of soil profile moisture for each point.

In [3]:
# Read the plot table, renaming the ID1 column to PLOT_ID and discarding ID2.
DATA_DIR = "../data/"
PLOTS = os.path.join(DATA_DIR, "interim", "plot_info_for_climatena.csv")
plots = (
    pd.read_csv(PLOTS)
    .rename(columns={"ID1": "PLOT_ID"})
    .drop(columns=["ID2"])
)
plots.head()

Unnamed: 0,PLOT_ID,lat,lon,el
0,60101550679,41.806228,-123.788726,761
1,60101551744,41.980638,-124.193526,91
2,60101551969,41.681432,-123.803842,701
3,60101552953,41.938125,-123.870868,640
4,60101553315,41.738938,-123.783382,1432


In [4]:
# Summarize the plot table: row count, column dtypes and non-null counts.
plots.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12156 entries, 0 to 12155
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   PLOT_ID  12156 non-null  int64  
 1   lat      12156 non-null  float64
 2   lon      12156 non-null  float64
 3   el       12156 non-null  int64  
dtypes: float64(2), int64(2)
memory usage: 380.0 KB


In [6]:
# Earth Engine image collection queried by get_timeseries(); the "smp", "ssm"
# and "susm" values are read from each of its images.
collection = ee.ImageCollection("NASA_USDA/HSL/SMAP10KM_soil_moisture")


def get_timeseries(x, y, epsg=4326):
    """Fetch the soil moisture time series of the SMAP collection at one point.

    Parameters
    ----------
    x, y : float
        Point coordinates, expressed in the coordinate system given by `epsg`.
    epsg : int, optional
        EPSG code of the coordinates, also used as the CRS of the reduction.

    Returns
    -------
    list
        One ``[date, smp, ssm, susm]`` entry per image intersecting the point,
        as returned by Earth Engine (this call blocks on the server request).
    """
    crs = f"EPSG:{epsg}"
    point = ee.Geometry.Point((x, y), proj=crs)
    band_names = ["smp", "ssm", "susm"]
    selectors = ["date"] + band_names

    def attach_point_values(image):
        # Reduce the image at the point and store the band values, together
        # with the formatted acquisition date, as properties of the image.
        stats = image.reduceRegion(
            reducer=ee.Reducer.mean(), geometry=point, crs=crs, scale=1
        )
        tagged = image.set("date", image.date().format())
        for band in band_names:
            tagged = tagged.set(band, stats.get(band))
        return tagged

    images_at_point = collection.filterBounds(point).map(attach_point_values)
    # Collect the four properties of every image into a single list of rows.
    table = images_at_point.reduceColumns(
        ee.Reducer.toList(len(selectors)), selectors
    )
    rows = table.values().get(0)

    return rows.getInfo()


def get_dataframe(point_id, x, y, epsg=4326):
    """Download the soil moisture time series for one plot as a DataFrame.

    Parameters
    ----------
    point_id : number
        Plot identifier; converted with ``int()`` and stored in ``PLOT_ID``.
    x, y : float
        Point coordinates forwarded to ``get_timeseries``.
    epsg : int, optional
        EPSG code of the coordinates.

    Returns
    -------
    pandas.DataFrame or None
        Columns ``PLOT_ID, DATE, SMP, SSM, SUSM`` with ``DATE`` parsed to
        datetimes. Any failure is reported with ``print`` and yields ``None``
        instead of raising, so one bad plot does not abort a batch.
    """
    column_names = ["DATE", "SMP", "SSM", "SUSM"]
    try:
        records = get_timeseries(x, y, epsg=epsg)
        frame = pd.DataFrame(records, columns=column_names)
        frame["DATE"] = pd.to_datetime(frame["DATE"])
        frame.insert(0, "PLOT_ID", int(point_id))
    except Exception as err:
        print("Failed on", point_id, err)
        return None
    return frame

In [7]:
# Inspect the first plot; selected as a single row, it is returned as a Series.
plots.iloc[0]

PLOT_ID    6.010155e+10
lat        4.180623e+01
lon       -1.237887e+02
el         7.610000e+02
Name: 0, dtype: float64

In [8]:
# Try the download on a single plot before launching the full batch.
A_PLOT = plots.iloc[0]
# Unpacking relies on the column order PLOT_ID, lat, lon, el.
PLOT_ID, LAT, LON, EL = A_PLOT.to_numpy()
# get_dataframe takes (x, y), i.e. longitude first, then latitude.
get_dataframe(PLOT_ID, LON, LAT)

Unnamed: 0,PLOT_ID,DATE,SMP,SSM,SUSM
0,60101550679,2015-04-02 12:00:00,0.873135,21.037519,68.895363
1,60101550679,2015-04-05 12:00:00,0.958604,25.334164,73.402039
2,60101550679,2015-04-08 12:00:00,0.980448,24.243017,76.743172
3,60101550679,2015-04-11 12:00:00,0.957361,23.351637,75.256546
4,60101550679,2015-04-14 12:00:00,0.980566,23.926762,77.071548
...,...,...,...,...,...
889,60101550679,2022-07-21 12:00:00,0.235841,5.250250,19.041393
890,60101550679,2022-07-24 12:00:00,0.199953,4.437977,16.157156
891,60101550679,2022-07-27 12:00:00,0.179267,4.194283,14.270215
892,60101550679,2022-07-30 12:00:00,0.161411,3.726025,12.899270


In [9]:
# Download the SMAP time series of every plot in parallel. When OVERWRITE is
# False and a raw CSV from an earlier run exists, its rows are reused and only
# the plots missing from it are requested again.
results = []
OVERWRITE = True  # set to False to resume from the CSV at ALREADY_DONE

ALREADY_DONE = os.path.join(DATA_DIR, "raw", "NASA_SMAP_soil_moisture_for_plots.csv")
if os.path.exists(ALREADY_DONE) and not OVERWRITE:
    # Parse DATE while reading: the CSV stores it as text, and concatenating
    # text dates with the datetime64 DATE of fresh downloads would produce an
    # object column on which the later `.dt.to_period("D")` call fails.
    already_done = pd.read_csv(ALREADY_DONE, parse_dates=["DATE"])
    results.append(already_done)
    # A set of plain ints gives constant-time membership tests below.
    already_done_plots = set(already_done["PLOT_ID"].tolist())
else:
    already_done_plots = set()

with ProcessPoolExecutor(40) as executor:
    print("Starting to get data from Google Earth Engine.")
    # itertuples keeps each column's own dtype, so PLOT_ID stays an integer
    # (iterrows would upcast the mixed int/float row to float64).
    jobs = [
        executor.submit(get_dataframe, plot.PLOT_ID, plot.lon, plot.lat)
        for plot in plots.itertuples(index=False)
        if plot.PLOT_ID not in already_done_plots
    ]

    for job in tqdm(as_completed(jobs), total=len(jobs)):
        plot_df = job.result()
        # get_dataframe returns None for plots whose download failed; keep
        # only real frames so `results` holds nothing but DataFrames.
        if plot_df is not None:
            results.append(plot_df)

Starting to get data from Google Earth Engine.


  0%|          | 0/12156 [00:00<?, ?it/s]

In [12]:
# Stack the per-plot frames into one table keyed by plot and by calendar day
# (the timestamps are converted to daily periods).
result_df = (
    pd.concat(results, axis=0, ignore_index=True)
    .assign(DATE=lambda df: df["DATE"].dt.to_period("D"))
    .set_index(["PLOT_ID", "DATE"])
)
result_df.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 10747328 entries, (60101551969, Period('2015-04-02', 'D')) to (530906595968, Period('2022-08-02', 'D'))
Data columns (total 3 columns):
 #   Column  Dtype  
---  ------  -----  
 0   SMP     float64
 1   SSM     float64
 2   SUSM    float64
dtypes: float64(3)
memory usage: 287.4 MB


In [13]:
# Preview the first rows of the combined (PLOT_ID, DATE)-indexed table.
result_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,SMP,SSM,SUSM
PLOT_ID,DATE,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
60101551969,2015-04-02,0.794608,18.617226,63.227345
60101551969,2015-04-05,0.896759,24.619665,67.746483
60101551969,2015-04-08,0.96527,23.360319,76.062454
60101551969,2015-04-11,0.920033,22.047394,72.715996
60101551969,2015-04-14,0.939503,22.633663,74.135155


In [14]:
# Persist the raw observations; to_csv writes the (PLOT_ID, DATE) index and
# the header row by default.
OUT_RAW = os.path.join(DATA_DIR, "raw", "NASA_SMAP_soil_moisture_for_plots.csv")
result_df.to_csv(OUT_RAW)

In [36]:
# Build a gap-free daily series per plot: resample each plot's observations to
# one row per day and fill the new rows with interpolate() (linear by default).
moisture_by_date = result_df.reset_index().set_index("DATE")
daily_moisture = (
    moisture_by_date.groupby("PLOT_ID")
    .resample("D")
    .interpolate()
    .drop(columns=["PLOT_ID"])  # PLOT_ID is already a level of the index
)

In [24]:
# Save the interpolated daily series; to_csv writes the index and the header
# row by default.
OUT_DAILY = os.path.join(DATA_DIR, "interim", "NASA_SMAP_daily_soil_moisture.csv")
daily_moisture.to_csv(OUT_DAILY)

In [37]:
# Display the daily table (pandas truncates the rendering of long frames).
daily_moisture

Unnamed: 0_level_0,Unnamed: 1_level_0,SMP,SSM,SUSM
PLOT_ID,DATE,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
60101550679,2015-04-02,0.873135,21.037519,68.895363
60101550679,2015-04-03,0.901625,22.469734,70.397588
60101550679,2015-04-04,0.930114,23.901949,71.899813
60101550679,2015-04-05,0.958604,25.334164,73.402039
60101550679,2015-04-06,0.965885,24.970448,74.515750
...,...,...,...,...
530907572668,2022-07-29,0.120627,2.478284,16.339570
530907572668,2022-07-30,0.112623,2.428088,15.141101
530907572668,2022-07-31,0.109870,2.465419,14.674336
530907572668,2022-08-01,0.107117,2.502749,14.207572


In [43]:
# Collapse the daily series to one row per plot and calendar month. `last()`
# keeps, for every column, the final non-null daily value within the month.
month_of_day = daily_moisture.index.get_level_values(1).to_timestamp().to_period("M")
monthly_moisture = (
    daily_moisture.assign(PERIOD=month_of_day)
    .reset_index()
    .drop(columns="DATE")
    .groupby(["PLOT_ID", "PERIOD"])
    .last()
)
monthly_moisture

Unnamed: 0_level_0,Unnamed: 1_level_0,SMP,SSM,SUSM
PLOT_ID,PERIOD,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
60101550679,2015-04,0.654284,14.791111,52.600174
60101550679,2015-05,0.496460,14.380551,36.754831
60101550679,2015-06,0.095009,2.199256,7.586694
60101550679,2015-07,0.113418,3.091108,8.590922
60101550679,2015-08,0.119196,3.102284,9.174930
...,...,...,...,...
530907572668,2022-04,0.897382,21.765985,118.225644
530907572668,2022-05,0.735883,16.013319,98.784462
530907572668,2022-06,0.536729,9.811569,73.918190
530907572668,2022-07,0.109870,2.465419,14.674336


In [44]:
# Save the monthly table; to_csv writes the (PLOT_ID, PERIOD) index and the
# header row by default.
OUT_MONTHLY = os.path.join(DATA_DIR, "interim", "NASA_SMAP_monthly_soil_moisture.csv")
monthly_moisture.to_csv(OUT_MONTHLY)