In [1]:
import os
import glob

import numpy as np
import geopandas as gpd
import pandas as pd
import ee

from tqdm.notebook import tqdm
from concurrent.futures import as_completed, ThreadPoolExecutor, ProcessPoolExecutor

## Get Earth Engine Running
To access GEE, we will need to authenticate our account, and then initialize a connection to a server. 

In [2]:
SERVICE_ACCOUNT = 'refit-fvs@refit-fvs.iam.gserviceaccount.com'
credentials = ee.ServiceAccountCredentials(SERVICE_ACCOUNT, '../gee_key.json')
ee.Initialize(credentials)

# Retrieve Soil Moisture Data
For each of the plots in a GeoDataFrame, and each year the imagery are available, we will filter the NASA Soil Moisture Active Passive (SMAP) collection from GEE to our Area of Interest. We want to get a monthly time-series of soil profile moisture for each point.

In [3]:
DATA_DIR = '../data/'
PLOTS = os.path.join(DATA_DIR, 'interim', 'plot_info_for_climatena.csv')
plots = pd.read_csv(PLOTS).rename({'ID1': 'PLOT_ID'}, axis=1).drop(['ID2'], axis=1)
plots.head()

Unnamed: 0,PLOT_ID,lat,lon,el
0,60101550679,41.806228,-123.788726,761
1,60101551744,41.980638,-124.193526,91
2,60101551969,41.681432,-123.803842,701
3,60101552953,41.938125,-123.870868,640
4,60101553315,41.738938,-123.783382,1432


## Collection Processing Functions
The following functions work on Google Earth Engine ImageCollections. 

In [7]:
CLOUDS, FRACTION_BAND = ee.ImageCollection("MODIS/061/MOD08_M3"), 'Cloud_Fraction_Nadir_Day_Mean_Mean'

def get_month(coll, month, year):
    start = ee.Date(f'{year}-{month}-01')
    end = start.advance(1, 'month').advance(-1, 'day')
    return coll.filterDate(start, end).first()

def get_cloud_fraction_values(x, y, epsg=4326):
    """Returns cloud fraction image collection filtered 
    to intersect with a specific point."""
    aoi = ee.Geometry.Point((x,y), proj=f'EPSG:{epsg}')
    cloud_coll = CLOUDS.filterBounds(aoi)
    cloud_img = ee.ImageCollection([get_month(cloud_coll, month, year) 
                                    for year in range(2000, 2022)
                                    for month in range(1, 13)]).select([FRACTION_BAND])
    

    def get_point(img):
        result = img.reduceRegion(reducer=ee.Reducer.mean(), 
                                  geometry=aoi, crs=f'EPSG:{epsg}', 
                                  scale=1).get(FRACTION_BAND)
    
        return img.set('date', img.date().format()).set('result', result)    
    
    values = cloud_img.map(get_point).reduceColumns(ee.Reducer.toList(2), ['date', 'result']).values().get(0)

    return values.getInfo()

def get_clouds_df(point_id, x, y, epsg=4326):
    values = get_cloud_fraction_values(x, y, epsg=epsg)
    df = pd.DataFrame(values, columns=['DATE', 'CLOUDS'])
    df.insert(0, 'PLOT_ID', point_id)
    df['DATE'] = pd.to_datetime(df['DATE'])
    df['CLOUDS'] = df['CLOUDS'] / 10000 # to scale cloudiness to be from 0-1
    return df

In [8]:
results = []

ALREADY_DONE = os.path.join(DATA_DIR, 'interim', 'MODIS_monthly_cloud_fraction.csv')
if os.path.exists(ALREADY_DONE):
    already_done = pd.read_csv(ALREADY_DONE)
    results.append(already_done)
    already_done_plots = np.unique(already_done['PLOT_ID'].values)
else:
    already_done_plots = []

# with ThreadPoolExecutor(1) as executor:
with ProcessPoolExecutor(40) as executor:
    print('Starting to get data from Google Earth Engine.')
    jobs = [executor.submit(get_clouds_df, *[row['PLOT_ID'], row['lon'], row['lat']]) for _, row in plots.iterrows() if row['PLOT_ID'] not in already_done_plots]
        
    for job in tqdm(as_completed(jobs), total=len(jobs)):
        results.append(job.result())

Starting to get data from Google Earth Engine.


  0%|          | 0/12156 [00:00<?, ?it/s]

In [9]:
results[0]

Unnamed: 0,PLOT_ID,DATE,CLOUDS
0,6.010157e+10,2000-02-01,0.9913
1,6.010157e+10,2000-03-01,0.5010
2,6.010157e+10,2000-04-01,0.5461
3,6.010157e+10,2000-05-01,0.6189
4,6.010157e+10,2000-06-01,0.3208
...,...,...,...
256,6.010157e+10,2021-08-01,0.0651
257,6.010157e+10,2021-09-01,0.1769
258,6.010157e+10,2021-10-01,0.4170
259,6.010157e+10,2021-11-01,0.5734


In [10]:
result_df = pd.concat(results, axis=0, ignore_index=True)
result_df['DATE'] = pd.to_datetime(result_df['DATE'])
result_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3172716 entries, 0 to 3172715
Data columns (total 3 columns):
 #   Column   Dtype         
---  ------   -----         
 0   PLOT_ID  float64       
 1   DATE     datetime64[ns]
 2   CLOUDS   float64       
dtypes: datetime64[ns](1), float64(2)
memory usage: 72.6 MB


In [11]:
result_df.head()

Unnamed: 0,PLOT_ID,DATE,CLOUDS
0,60101570000.0,2000-02-01,0.9913
1,60101570000.0,2000-03-01,0.501
2,60101570000.0,2000-04-01,0.5461
3,60101570000.0,2000-05-01,0.6189
4,60101570000.0,2000-06-01,0.3208


In [12]:
out_csv = os.path.join(DATA_DIR, 'interim', 'MODIS_monthly_cloud_fraction.csv')
result_df.to_csv(out_csv, index=False, header=True)