In [1]:
import os
import glob

import numpy as np
import geopandas as gpd
import pandas as pd
import ee

from tqdm.notebook import tqdm
from concurrent.futures import as_completed, ThreadPoolExecutor, ProcessPoolExecutor

## Get Earth Engine Running
To access GEE, we will need to authenticate our account, and then initialize a connection to a server. 

In [2]:
SERVICE_ACCOUNT = "refit-fvs@refit-fvs.iam.gserviceaccount.com"
credentials = ee.ServiceAccountCredentials(SERVICE_ACCOUNT, "../../gee_key.json")
ee.Initialize(credentials)

# Retrieve Soil Moisture Data
For each of the plots in a GeoDataFrame, and each year the imagery are available, we will filter the NASA Soil Moisture Active Passive (SMAP) collection from GEE to our Area of Interest. We want to get a monthly time-series of soil profile moisture for each point.

In [3]:
DATA_DIR = "../../data/"
PLOTS = os.path.join(DATA_DIR, "interim", "plot_info_for_climatena.csv")
plots = pd.read_csv(PLOTS).rename({"ID1": "PLOT_ID"}, axis=1).drop(["ID2"], axis=1)
plots.head()

Unnamed: 0,PLOT_ID,lat,lon,el
0,60101550679,41.806228,-123.788726,761
1,60101551744,41.980638,-124.193526,91
2,60101551969,41.681432,-123.803842,701
3,60101552953,41.938125,-123.870868,640
4,60101553315,41.738938,-123.783382,1432


## Collection Processing Functions
The following functions work on Google Earth Engine ImageCollections. 

In [4]:
CLOUDS, FRACTION_BAND = (
    ee.ImageCollection("MODIS/061/MOD08_M3"),
    "Cloud_Fraction_Nadir_Day_Mean_Mean",
)


def get_month(coll, month, year):
    start = ee.Date(f"{year}-{month}-01")
    end = start.advance(1, "month").advance(-1, "day")
    return coll.filterDate(start, end).first()


def get_cloud_fraction_values(x, y, epsg=4326):
    """Returns cloud fraction image collection filtered
    to intersect with a specific point."""
    aoi = ee.Geometry.Point((x, y), proj=f"EPSG:{epsg}")
    cloud_coll = CLOUDS.filterBounds(aoi)
    cloud_img = ee.ImageCollection(
        [
            get_month(cloud_coll, month, year)
            for year in range(2000, 2022)
            for month in range(1, 13)
        ]
    ).select([FRACTION_BAND])

    def get_point(img):
        result = img.reduceRegion(
            reducer=ee.Reducer.mean(), geometry=aoi, crs=f"EPSG:{epsg}", scale=1
        ).get(FRACTION_BAND)

        return img.set("date", img.date().format()).set("result", result)

    values = (
        cloud_img.map(get_point)
        .reduceColumns(ee.Reducer.toList(2), ["date", "result"])
        .values()
        .get(0)
    )

    return values.getInfo()


def get_clouds_df(point_id, x, y, epsg=4326):
    values = get_cloud_fraction_values(x, y, epsg=epsg)
    df = pd.DataFrame(values, columns=["DATE", "CLOUDS"])
    df.insert(0, "PLOT_ID", point_id)
    df["DATE"] = pd.to_datetime(df["DATE"])
    df["PERIOD"] = df.apply(
        lambda x: pd.Period(year=x.DATE.year, month=x.DATE.month, freq="M"), axis=1
    )
    df["CLOUDS"] = df["CLOUDS"] / 10000  # to scale cloudiness to be from 0-1
    return df.set_index(["PLOT_ID", "PERIOD"])["CLOUDS"]

In [6]:
results = []

ALREADY_DONE = os.path.join(DATA_DIR, "interim", "MODIS_monthly_cloud_fraction.csv")
if os.path.exists(ALREADY_DONE):
    already_done = pd.read_csv(ALREADY_DONE)
    results.append(already_done)
    already_done_plots = np.unique(already_done["PLOT_ID"].values)
else:
    already_done_plots = []

with ProcessPoolExecutor(40) as executor:
    print("Starting to get data from Google Earth Engine.")
    jobs = [
        executor.submit(get_clouds_df, *[row["PLOT_ID"], row["lon"], row["lat"]])
        for _, row in plots.iterrows()
        if row["PLOT_ID"] not in already_done_plots
    ]

    for job in tqdm(as_completed(jobs), total=len(jobs)):
        results.append(job.result())

Starting to get data from Google Earth Engine.


  0%|          | 0/12156 [00:00<?, ?it/s]

In [7]:
results[0]

PLOT_ID       PERIOD 
6.010155e+10  2000-02    0.9913
              2000-03    0.5010
              2000-04    0.5461
              2000-05    0.6189
              2000-06    0.3208
                          ...  
              2021-08    0.0651
              2021-09    0.1769
              2021-10    0.4170
              2021-11    0.5734
              2021-12    0.7065
Name: CLOUDS, Length: 261, dtype: float64

In [8]:
result_df = pd.concat(results, axis=0)
result_df.info()

<class 'pandas.core.series.Series'>
MultiIndex: 3172716 entries, (60101550679.0, Period('2000-02', 'M')) to (530906597320.0, Period('2021-12', 'M'))
Series name: CLOUDS
Non-Null Count    Dtype  
--------------    -----  
3172716 non-null  float64
dtypes: float64(1)
memory usage: 36.7 MB


In [9]:
result_df.head()

PLOT_ID       PERIOD 
6.010155e+10  2000-02    0.9913
              2000-03    0.5010
              2000-04    0.5461
              2000-05    0.6189
              2000-06    0.3208
Name: CLOUDS, dtype: float64

In [10]:
out_csv = os.path.join(DATA_DIR, "interim", "MODIS_monthly_cloud_fraction.csv")
result_df.to_csv(out_csv, index=True, header=True)