# Description

This notebook is used to run an pre-trained ML model to predict whether irrigation is in use in a geographic zone for a date range.  It generates images which can then be visualized in Google Earth Engine.

The SEBAL calculations for ET are leveraged from https://github.com/gee-hydro/geeSEBAL


# Setup Notebook

In [1]:
#!pip install geemap
#!pip install geopandas

In [2]:
import ee
ee.Authenticate()
ee.Initialize(project='second-impact-342800')
import geemap

import numpy as np
import pandas as pd
import datetime
import json
from timeit import default_timer as timer
from datetime import timedelta
import ipyleaflet
import ipywidgets as widgets
from tqdm.notebook import tqdm
import calendar

Enter verification code: 4/1AfJohXl4J6sy9nlHCPhCKndEJFfIsf8fKKlQyYe2fhjvhb-0wzHVwb-em5c

Successfully saved authorization token.


*** Earth Engine *** Share your feedback by taking our Annual Developer Satisfaction Survey: https://google.qualtrics.com/jfe/form/SV_doiqkQG3NJ1t8IS?source=API


In [3]:
# Run in a cloud environment
#%cd /content
#!git clone https://github.com/gee-hydro/geeSEBAL.git
#!git clone https://github.com/earthdataanalytics/irrigation.git
#!cp -r geeSEBAL/etbrasil/ irrigation/
#%cd irrigation/

#### Custom libraries

In [4]:
from etbrasil.geesebal import Image
from pipeline import cropmasks as msk
from pipeline import utils

# Train GEE Random Forest classifier

##### Set training parameters

In [5]:
# To customize for each run
project_name = 'ssebop_1'
startDate = '2016-08-01'
endDate = '2022-10-31'
model_name = '20231112_ssebop_1' # This retrieves the best model for the area.  Was previously f'{project_name}'

# Defaults
et_var = 'ET_24h' # could also use 'ET_24h_R' for the GEESEBAL model
label_var = 'loc_type'

##### Retrieve training data

In [6]:
model_path = f'./runs/{model_name}/best/'
with open(model_path+'summary_stats.json', 'r', encoding='utf-8') as f:
    local_best = json.load(f)

best_run = local_best['exp_ref']
feature_path = f'./runs/{model_name}/{best_run}/'

In [16]:
df_features = pd.read_pickle(feature_path+'features.pkl')
df_features.columns

Index(['ET0_24h', 'ET_24h', 'ET_fraction', 'NDVI', 'TMAX', 'TMIN',
       'actual_vapor_pressure', 'date', 'loc_type', 'precip',
       'solar_radiation', 'wind_speed', 'longitude', 'latitude', 'yyyy', 'mm',
       'loc_idx', 'type', 'sum_precip_priorX', 'last_rain'],
      dtype='object')

In [17]:
# cols should be aligned with he definition in pipelines/ET_Train_RF.py
# cols = [et_var, 'NDVI', 'LandT_G', 'last_rain', 'sum_precip_priorX', 'mm', label_var, 'latitude', 'longitude']
#generic_cols = [et_var, 'NDVI', 'LandT_G', 'sum_precip_priorX', 'last_rain', 'mm', 'yyyy', 'loc_idx', 'date']
#num_cols_rf = 2 # changed from 5 to 2 on 2023.11.12 to test SSEBOP # changed from 6 on 2022.08.02 to remove yyyy from predictors
#cols = generic_cols[:num_cols_rf]

cols = ['ET_24h', 'ET_fraction', 'NDVI', 'actual_vapor_pressure', 'wind_speed']

In [18]:
#with ee.profilePrinting():
if True:
    ee.data.setDefaultWorkloadTag('hydracarta_backend')
    ee.data.setWorkloadTag('hydracarta_train_model')

    # set all columns to numeric
    for col in cols:
        df_features[col] = df_features[col].astype(float)

    # convert from celsius to kelvin
    # Removed on 2023.11.12 by BCJ because the initial conversion was removed in utils.py
    # df_features.LandT_G = df_features.LandT_G + 273.15

    fc_data = geemap.pandas_to_ee(df_features, latitude="latitude", longitude="longitude")
    print('Total number of samples:   ', fc_data.size().getInfo())

    fc_split = utils.train_test_split(fc_data)
    print('Number of training samples:', fc_split['train_partition'].size().getInfo())

Total number of samples:    214
Number of training samples: 157


##### Create and train GEE RandomForest Classifier

In [19]:
#with ee.profilePrinting():
if True:
    ee.data.setWorkloadTag('hydracarta_train_model')

    num_trees = 1000
    bag_fraction = 0.63
    #variables_per_split = 10
    feature_list = cols[:6]

    classifier = ee.Classifier.smileRandomForest(
        numberOfTrees=num_trees,
    #    variablesPerSplit=variables_per_split,
        bagFraction=bag_fraction,
        seed=10
    )
    classifier = classifier.train(
        features=fc_split['train_partition'],
        classProperty=label_var,
        inputProperties=feature_list,
        subsamplingSeed=10
    )

#### Score GEE classifier

In [20]:
%%time
#with ee.profilePrinting():
if True:
    ee.data.setWorkloadTag('hydracarta_score_model')

    validated = fc_split['test_partition'].classify(classifier)

    # Get a confusion matrix representing expected accuracy.
    if classifier.mode() != 'PROBABILITY':
        validation_matrix = validated.errorMatrix(label_var, 'classification')
        cm = validation_matrix.getInfo()
        gee_acc = validation_matrix.accuracy().getInfo()
        gee_kappa = validation_matrix.kappa().getInfo()

        print(f'Validation error matrix: {cm}')
        print(f'Validation accuracy:     {gee_acc:.3f}')
        print(f'Validation kappa:        {gee_kappa:.3f}')

Validation error matrix: [[15, 12], [7, 23]]
Validation accuracy:     0.667
Validation kappa:        0.325
CPU times: user 416 ms, sys: 23.2 ms, total: 439 ms
Wall time: 6.04 s


In [27]:
ee.Geometry(geo_json)

NameError: name 'geo_json' is not defined

In [33]:
#TO COMPLETE:  Save model to GEE

trees = ee.List(ee.Dictionary(classifier.explain()).get('trees'))
dummy = ee.Feature(ee.Geometry.Point([0, 0]))
fc_classifier = ee.FeatureCollection(trees.map(lambda x: dummy.set('tree',x)))
task = ee.batch.Export.table.toAsset(fc_classifier,'save_classifier','projects/eda-bjonesneu-proto/assets/irrigation/test_RF_model')
task.start()

#### Compare GEE and local classfiers

In [22]:
print(f'GEE RF Classifier Accuracy   {gee_acc:.2f}')
print(f'Local RF Classifier Accuracy {local_best["rf_accuracy"]:.2f}')

GEE RF Classifier Accuracy   0.67
Local RF Classifier Accuracy 0.70


# Infer classes for AOI

##### Define Area of Interest (AOI)

In [15]:
Map = geemap.Map()
Map.add_basemap('HYBRID')
Map.centerObject(bnd.central_ca)
Map

Map(center=[20, 0], controls=(WidgetControl(options=['position', 'transparent_bg'], widget=HBox(children=(Togg…

In [22]:
# In general it has been best to select an AOI which is less than (<) 3,000 hectares in area
# otherwise the export step tends to crash when processing takes too long, which occurs
# frequently when area > 3k hectares.

if Map and (len(Map.draw_features) > 0):
    aoi = ee.FeatureCollection(Map.draw_features)
else:
    print("Please select an Area of Interest on the map")

##### Retrieve data for inference

In [36]:
def retrieveRGBandClassPredictions(startDate=None, endDate=None, applyCropMask=False):
    if (startDate==None) or (endDate==None):
        print("Please specify start and end date")
        return None, None

    etcol = Image(window_start=startDate, window_end=endDate, 
                      aoi=aoi, cloud_max=30, et_var=et_var) \
                .ETandMeteo

    if applyCropMask: # added on 2023-02-07
        etcol = etcol.map(lambda x: x.updateMask(msk.createGFSADmask(aoi)))

    num_images = etcol.size().getInfo()
    print('Number of ET sample images retrieved: ', num_images)

    rgbCol = ['R', 'G', 'B']
    # 2022.08 - in versions before the monthly data labels version Aug-2022, 
    # the rgb_mosaic was calculated as a mosaic, however this started producing
    # too many bad images.
    rgb_mosaic = (
        etcol
            .first()
            .clip(aoi)
            .select(['R', 'GR', 'B'])
            .rename(rgbCol)
            .set('custom:date', startDate+'_'+endDate)
    )

    pred_col = etcol.map(lambda x: x.classify(classifier))

    # create a map which takes the mean over all predictions in the time window
    # to represent the probability of the pixel being irrigated or not during the window
    pred_prob = (
        pred_col
            .mean()
            .clip(aoi)
            .set('custom:date', startDate+'_'+endDate)
            .set('custom:num_input_images', num_images)
    )
            
    return rgb_mosaic, pred_prob, etcol, num_images

In [None]:
%%time
#with ee.profilePrinting():
if True:
    ee.data.setWorkloadTag('hydracarta_infer_aoi')

    rgb_mosaic, pred_prob, et_data, cnt = retrieveRGBandClassPredictions(startDate, endDate, applyCropMask=False)


##### Visualize results


In [None]:
Map.centerObject(aoi)
Map.addLayer(rgb_mosaic, rgb_viz, 'rgb', True, 1.0)
Map

In [None]:
rgbCol = ['R', 'G', 'B']
rgb_viz = {'bands': rgbCol, 'min': 0.0, 'max': 3000, 'gamma': 1.4} # LS-Col-1-SR
pred_viz = {'min': 0.0, 'max': 1.0, 'palette': ['red', 'blue']}
prob_viz = {'min': 0.0, 'max': 1.0, 'palette': ['red', 'magenta', 'pink', 'yellow', 'aqua', 'blue', 'darkblue']}

#with ee.profilePrinting():
if True: # skip this cell 
    ee.data.setWorkloadTag('hydracarta_view_infer_results')

    Map.centerObject(aoi)
    Map.addLayer(aoi, {}, 'aoi boundary', True, 0.55)
    Map.addLayer(rgb_mosaic, rgb_viz, 'rgb', True, 1.0)
    Map.addLayer(pred_prob, prob_viz, 'probabilities', True, 1.0)
    Map

# Export Data

In [31]:
def exportImages(rgb_mosaic, pred_prob, startDate, endDate, aoi=None):
    #print("RGB export start")
    rgb_filename = f'../outputmaps/monthly_rgb_{startDate}_{endDate}.tif'
    geemap.ee_export_image(rgb_mosaic, filename=rgb_filename, scale=30, 
                            region=aoi, file_per_band=False,
                            timeout=300)
    #print("RGB export end")
    
    #print("Class prediction export start")
    class_filename = f'../outputmaps/monthly_classes_{startDate}_{endDate}.tif'
    geemap.ee_export_image(pred_prob, filename=class_filename, scale=30, 
                       region=aoi, file_per_band=False,
                       timeout=1200)
    #print("Class prediction export end")

In [None]:
%%time
from dateutil.relativedelta import relativedelta
# The output from this cell is saved to the local instance under the folder /outputmaps
# No tasks are created on GEE

retrieval_dates = pd.date_range(startDate,endDate, 
              freq='MS').strftime("%Y-%m-%d").tolist()

#with ee.profilePrinting():
if True:
    ee.data.setWorkloadTag('hydracarta_export_infer_results')

    for start_date in tqdm(retrieval_dates):
        start_date_ts = datetime.datetime.strptime(start_date, '%Y-%m-%d')
        end_date = start_date_ts + relativedelta(months=1) + timedelta(days=-1)
        end_date = end_date.strftime('%Y-%m-%d')

        print('Processing', start_date, end_date)
        rgb_mosaic, pred_prob, et_data, img_cnt = retrieveRGBandClassPredictions(start_date, end_date)
        if img_cnt > 1:
            exportImages(rgb_mosaic, pred_prob, start_date, end_date, aoi=aoi.geometry()) # or aoi=Map.user_roi
        else:
            print('\tWarning:  Less than two(2) ET images retrieved for period', start_date, end_date)

In [42]:
project_name

'Restart'

In [41]:
import subprocess
#!gsutil mv *.tif gs://staging_demo/CA_monthly_01
run_string = f'gsutil mv ../outputmaps/*.tif gs://staging_demo/{project_name}'
subprocess.call(run_string, shell=True)

1

Create image collections for each type

In [43]:
run_string = f'earthengine create collection projects/eda-bjonesneu-proto/assets/irrigation/monthly_classes_{project_name}'
subprocess.call(run_string, shell=True)

0

In [44]:
run_string = f'earthengine create collection projects/eda-bjonesneu-proto/assets/irrigation/monthly_rgb_{project_name}'
subprocess.call(run_string, shell=True)

0

In [45]:
!earthengine ls projects/eda-bjonesneu-proto/assets/irrigation

projects/eda-bjonesneu-proto/assets/irrigation/classes_col
projects/eda-bjonesneu-proto/assets/irrigation/labels
projects/eda-bjonesneu-proto/assets/irrigation/labels_new
projects/eda-bjonesneu-proto/assets/irrigation/labels_old
projects/eda-bjonesneu-proto/assets/irrigation/monthly_classes_Clark_Ranch
projects/eda-bjonesneu-proto/assets/irrigation/monthly_classes_Restart
projects/eda-bjonesneu-proto/assets/irrigation/monthly_classes_Westwind
projects/eda-bjonesneu-proto/assets/irrigation/monthly_classes_col_monthly
projects/eda-bjonesneu-proto/assets/irrigation/monthly_classes_col_monthly_02
projects/eda-bjonesneu-proto/assets/irrigation/monthly_rgb_Clark_Ranch
projects/eda-bjonesneu-proto/assets/irrigation/monthly_rgb_Restart
projects/eda-bjonesneu-proto/assets/irrigation/monthly_rgb_Westwind
projects/eda-bjonesneu-proto/assets/irrigation/monthly_rgb_col_monthly
projects/eda-bjonesneu-proto/assets/irrigation/monthly_rgb_col_monthly_02
projects/eda-bjonesneu-proto/assets

Upload the individual images to each image collection

In [52]:
from google.cloud import storage
import subprocess

bucket_name = "staging_demo"
storage_client = storage.Client.from_service_account_json("second-impact-342800-51af159903ca.json")
blobs = storage_client.list_blobs(bucket_name)

cnt=0
for blob in blobs:
    if project_name in blob.name:
        filename = blob.name
        print(blob.name)
        if '.tif' in filename:
            tmp_name = filename.split('/')[-1]
            project_type, asset_type, start, end = tmp_name[:-4].split('_')
            asset_name = f'projects/eda-bjonesneu-proto/assets/irrigation/{project_type}_{asset_type}_{project_name}/{tmp_name[:-4]}'
            gs_name = f'gs://{bucket_name}/{filename}'
            run_string = f"earthengine upload image --asset_id={asset_name} {gs_name}"
            #print(run_string)
            out = subprocess.call(run_string, shell=True)

<Blob: staging_demo, CA_monthly_01/monthly_classes_2016-03-01_2016-03-31.tif, 1660007854784470>
<Blob: staging_demo, CA_monthly_01/monthly_classes_2016-04-01_2016-04-30.tif, 1660007854936408>
<Blob: staging_demo, CA_monthly_01/monthly_classes_2016-05-01_2016-05-31.tif, 1660007855100679>
<Blob: staging_demo, CA_monthly_01/monthly_classes_2016-06-01_2016-06-30.tif, 1660007855265766>
<Blob: staging_demo, CA_monthly_01/monthly_classes_2016-07-01_2016-07-31.tif, 1660007855441392>
<Blob: staging_demo, CA_monthly_01/monthly_classes_2016-08-01_2016-08-31.tif, 1660007855635241>
<Blob: staging_demo, CA_monthly_01/monthly_classes_2016-09-01_2016-09-30.tif, 1660007855770685>
<Blob: staging_demo, CA_monthly_01/monthly_classes_2016-10-01_2016-10-31.tif, 1660007855948929>
<Blob: staging_demo, CA_monthly_01/monthly_classes_2016-11-01_2016-11-30.tif, 1660007856116652>
<Blob: staging_demo, CA_monthly_01/monthly_classes_2016-12-01_2016-12-31.tif, 1660007856301638>
<Blob: staging_demo, CA_monthly_01/month

<Blob: staging_demo, Restart/monthly_classes_2021-09-01_2021-09-30.tif, 1695735201789861>
processing Restart/monthly_classes_2021-09-01_2021-09-30.tif
Restart/monthly_classes_2021-09-01_2021-09-30.tif
processing Restart/monthly_classes_2021-09-01_2021-09-30.tif
<Blob: staging_demo, Restart/monthly_classes_2021-10-01_2021-10-31.tif, 1695735211143185>
processing Restart/monthly_classes_2021-10-01_2021-10-31.tif
Restart/monthly_classes_2021-10-01_2021-10-31.tif
processing Restart/monthly_classes_2021-10-01_2021-10-31.tif
<Blob: staging_demo, Restart/monthly_classes_2021-11-01_2021-11-30.tif, 1695735220389022>
processing Restart/monthly_classes_2021-11-01_2021-11-30.tif
Restart/monthly_classes_2021-11-01_2021-11-30.tif
processing Restart/monthly_classes_2021-11-01_2021-11-30.tif
<Blob: staging_demo, Restart/monthly_rgb_2021-08-01_2021-08-31.tif, 1695735229507121>
processing Restart/monthly_rgb_2021-08-01_2021-08-31.tif
Restart/monthly_rgb_2021-08-01_2021-08-31.tif
processing Restart/monthl

In [None]:
STOP

##### Utility to delete unneeded files in folders

In [None]:
#!earthengine ls projects/eda-bjonesneu-proto/assets/irrigation/rgb > todelete.txt

In [None]:
if False:
    with open('todelete.txt') as todelete:
        for item in todelete:
            run_string = f"earthengine rm {item}"
            out = subprocess.call(run_string, shell=True)

In [None]:
#!earthengine ls projects/eda-bjonesneu-proto/assets/irrigation/rgb

# Retrieve job run costs

In [38]:
# Since no jobs are launched, retrieve metrics directly from https://console.cloud.google.com/monitoring/