In [1]:
import os
import glob

import numpy as np
import geopandas as gpd
import pandas as pd
import ee

from tqdm.notebook import tqdm
from concurrent.futures import as_completed, ThreadPoolExecutor, ProcessPoolExecutor

## Get Earth Engine Running
To access GEE, we will need to authenticate our account, and then initialize a connection to a server. 

In [2]:
SERVICE_ACCOUNT = 'refit-fvs@refit-fvs.iam.gserviceaccount.com'
credentials = ee.ServiceAccountCredentials(SERVICE_ACCOUNT, '../gee_key.json')
ee.Initialize(credentials)

# Retrieve Soil Moisture Data
For each of the plots in a GeoDataFrame, and each year the imagery are available, we will filter the NASA Soil Moisture Active Passive (SMAP) collection from GEE to our Area of Interest. We want to get a monthly time-series of soil profile moisture for each point.

In [3]:
DATA_DIR = '../data/'
PLOTS = os.path.join(DATA_DIR, 'interim', 'plot_info_for_climatena.csv')
plots = pd.read_csv(PLOTS).rename({'ID1': 'PLOT_ID'}, axis=1).drop(['ID2'], axis=1)
plots.head()

Unnamed: 0,PLOT_ID,lat,lon,el
0,60101550679,41.806228,-123.788726,761
1,60101551744,41.980638,-124.193526,91
2,60101551969,41.681432,-123.803842,701
3,60101552953,41.938125,-123.870868,640
4,60101553315,41.738938,-123.783382,1432


In [4]:
plots.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12156 entries, 0 to 12155
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   PLOT_ID  12156 non-null  int64  
 1   lat      12156 non-null  float64
 2   lon      12156 non-null  float64
 3   el       12156 non-null  int64  
dtypes: float64(2), int64(2)
memory usage: 380.0 KB


## Collection Processing Functions
The following functions work on Google Earth Engine ImageCollections. 

In [5]:
smap = ee.ImageCollection('NASA_USDA/HSL/SMAP10KM_soil_moisture')

def last_obs_of_month(coll, month, year):
    start = ee.Date(f'{year}-{month}-01')
    end = start.advance(1, 'month')
    return coll.filterDate(start, end).sort('system:time_start', False).first()

def get_smap_values(x, y, epsg=4326):
    """Returns soil moisture profile from the NASA SMAP collection filtered 
    to intersect with a specific point."""
    aoi = ee.Geometry.Point((x,y), proj=f'EPSG:{epsg}')
    coll = smap.filterBounds(aoi)
    last_obs = ee.ImageCollection([last_obs_of_month(coll, month, year) 
                                    for year in range(2015, 2022)
                                    for month in range(1, 13)]).select(['smp'])
    

    def get_point(img):
        result = img.reduceRegion(reducer=ee.Reducer.mean(), 
                                  geometry=aoi, crs=f'EPSG:{epsg}', 
                                  scale=1).get('smp')
    
        return img.set('date', img.date().format()).set('result', result)    
    
    values = last_obs.map(get_point).reduceColumns(ee.Reducer.toList(2), ['date', 'result']).values().get(0)

    return values.getInfo()

def get_smap_df(point_id, x, y, epsg=4326):
    values = get_smap_values(x, y, epsg=epsg)
    df = pd.DataFrame(values, columns=['DATE', 'SMP'])
    df.insert(0, 'PLOT_ID', point_id)
    return df

In [6]:
results = []

ALREADY_DONE = os.path.join(DATA_DIR, 'interim', 'NASA_SMAP_soil_moisture.csv')
if os.path.exists(ALREADY_DONE):
    already_done = pd.read_csv(ALREADY_DONE)
    results.append(already_done)
    already_done_plots = np.unique(already_done['PLOT_ID'].values)
else:
    already_done_plots = []

# with ThreadPoolExecutor(1) as executor:
with ProcessPoolExecutor(40) as executor:
    print('Starting to get data from Google Earth Engine.')
    jobs = [executor.submit(get_smap_df, *[row['PLOT_ID'], row['lon'], row['lat']]) for _, row in plots.iterrows() if row['PLOT_ID'] not in already_done_plots]
        
    for job in tqdm(as_completed(jobs), total=len(jobs)):
        results.append(job.result())

Starting to get data from Google Earth Engine.


  0%|          | 0/12156 [00:00<?, ?it/s]

In [7]:
result_df = pd.concat(results, axis=0, ignore_index=True)
result_df['DATE'] = pd.to_datetime(result_df['DATE'])
result_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 973926 entries, 0 to 973925
Data columns (total 3 columns):
 #   Column   Non-Null Count   Dtype         
---  ------   --------------   -----         
 0   PLOT_ID  973926 non-null  float64       
 1   DATE     973926 non-null  datetime64[ns]
 2   SMP      973926 non-null  float64       
dtypes: datetime64[ns](1), float64(2)
memory usage: 22.3 MB


In [8]:
result_df.head()

Unnamed: 0,PLOT_ID,DATE,SMP
0,60101560000.0,2015-04-29 12:00:00,0.580573
1,60101560000.0,2015-05-29 12:00:00,0.335329
2,60101560000.0,2015-06-28 12:00:00,0.09758
3,60101560000.0,2015-07-31 12:00:00,0.083542
4,60101560000.0,2015-08-30 12:00:00,0.097807


In [9]:
out_csv = os.path.join(DATA_DIR, 'interim', 'NASA_SMAP_soil_moisture.csv')
result_df.to_csv(out_csv, index=False, header=True)