# Description

This notebook is used to run a pre-trained ML model to predict whether irrigation is in use in a geographic zone for a date range.  It generates images which can then be visualized in Google Earth Engine.

The SEBAL calculations for ET are leveraged from https://github.com/gee-hydro/geeSEBAL


# Setup Notebook

In [None]:
#!pip install geemap
#!pip install geopandas

In [3]:
import ee
ee.Authenticate()
ee.Initialize(project='second-impact-342800')
import geemap

import numpy as np
import pandas as pd
import datetime
import json
from timeit import default_timer as timer
from datetime import timedelta
import ipyleaflet
import ipywidgets as widgets
from tqdm.notebook import tqdm
import calendar

Enter verification code: <redacted>

Successfully saved authorization token.


In [4]:
# Run in a cloud environment
#%cd /content
#!git clone https://github.com/gee-hydro/geeSEBAL.git
#!git clone https://github.com/earthdataanalytics/irrigation.git
#!cp -r geeSEBAL/etbrasil/ irrigation/
#%cd irrigation/

#### Custom libraries

In [5]:
from etbrasil.geesebal import Image
from etbrasil.geesebal import etretrieval as etr
from pipeline import boundaries as bnd
from pipeline import cropmasks as msk
from pipeline import labeledsamples as ls
from pipeline import dataextract as dx
from pipeline import utils

# Train GEE Random Forest classifier

##### Set training parameters

In [6]:
# ---- Per-run settings: edit these for each run ----
project_name = 'Restart'
startDate, endDate = '2021-08-01', '2022-10-31'
# Run folder whose best model is retrieved for the area (was previously f'{project_name}').
model_name = '20230925_gee_cost_est'

# ---- Defaults ----
et_var = 'ET_24h_R'
label_var = 'loc_type'
# Columns kept from the feature table. Order matters: the training cell takes
# the first six entries (cols[:6]) as the classifier inputs.
cols = [
    et_var,
    'NDVI',
    'LandT_G',
    'last_rain',
    'sum_precip_priorX',
    'mm',
    label_var,
    'latitude',
    'longitude',
]

##### Retrieve training data

In [7]:
# Read the summary statistics stored next to the "best" model of this run.
model_path = f'../runs/{model_name}/best/'
with open(model_path+'summary_stats.json', mode='r', encoding='utf-8') as summary_file:
    local_best = json.loads(summary_file.read())

# 'exp_ref' identifies the run folder from which the features are loaded.
best_run = local_best['exp_ref']
feature_path = f'../runs/{model_name}/{best_run}/'

In [10]:
with ee.profilePrinting():
    ee.data.setDefaultWorkloadTag('hydracarta_backend')
    ee.data.setWorkloadTag('hydracarta_train_model')

    # Load the feature table, keep only the modelling columns and cast them all
    # to float in a single vectorised step. Building new frames with
    # .astype()/.assign() replaces the previous column-by-column writes into a
    # slice of the loaded frame, which raise pandas' SettingWithCopyWarning.
    # NOTE: pd.read_pickle can execute arbitrary code; only load trusted files.
    df_features = (
        pd.read_pickle(feature_path+'features.pkl')[cols]
        .astype(float)
        # convert from celsius to kelvin
        .assign(LandT_G=lambda frame: frame['LandT_G'] + 273.15)
    )

    # Upload the samples to Earth Engine as a FeatureCollection of points.
    fc_data = geemap.pandas_to_ee(df_features, latitude="latitude", longitude="longitude")
    print('Total number of samples:   ', fc_data.size().getInfo())

    # utils.train_test_split returns a mapping with 'train_partition' and
    # 'test_partition' entries (both are used by later cells).
    fc_split = utils.train_test_split(fc_data)
    print('Number of training samples:', fc_split['train_partition'].size().getInfo())

Total number of samples:    145
Number of training samples: 114


 EECU·s PeakMem Count  Description
  0.066    162k   643  (plumbing)
  0.022    240k   146  no description available
  0.001    1.6k     2  Algorithm Collection.reduceColumns with reducer Reducer.count
  0.000    2.5k     2  Listing collection
   -       286k     6  Algorithm Collection
   -       201k     2  Algorithm List.(construct from elements)
   -        11k     4  Algorithm Collection.reduceColumns
   -       3.8k     3  Algorithm Collection.filter
   -       3.6k   870  Algorithm Feature
   -       3.5k     3  Algorithm Collection.randomColumn
   -       3.4k     4  Algorithm AggregateFeatureCollection.count
   -       3.3k     6  Algorithm Collection.size
   -       3.3k     3  Algorithm Filter.lt
   -       2.0k   182  Algorithm GeometryConstructors.Point


##### Create and train GEE RandomForest Classifier

In [11]:
with ee.profilePrinting():
    ee.data.setWorkloadTag('hydracarta_train_model')

    # Random forest hyper-parameters.
    num_trees, bag_fraction = 1000, 0.63
    # variablesPerSplit is not passed (a commented-out variables_per_split = 10 used to live here).
    # The first six entries of cols are the classifier inputs.
    feature_list = cols[:6]

    rf_settings = dict(numberOfTrees=num_trees, bagFraction=bag_fraction, seed=10)

    # Build the classifier and train it on the training partition in one chain.
    classifier = ee.Classifier.smileRandomForest(**rf_settings).train(
        features=fc_split['train_partition'],
        classProperty=label_var,
        inputProperties=feature_list,
        subsamplingSeed=10,
    )

 EECU·s PeakMem Count  Description


#### Score GEE classifier

In [13]:
%%time
with ee.profilePrinting():
    ee.data.setWorkloadTag('hydracarta_score_model')

    # Classify the held-out partition with the trained classifier.
    validated = fc_split['test_partition'].classify(classifier)

    # classifier.mode() is a server-side ee.String. Comparing that object with a
    # Python str is always "not equal", so fetch the actual value first.
    classifier_mode = classifier.mode().getInfo()

    # Get a confusion matrix representing expected accuracy.
    if classifier_mode != 'PROBABILITY':
        validation_matrix = validated.errorMatrix(label_var, 'classification')
        cm = validation_matrix.getInfo()
        gee_acc = validation_matrix.accuracy().getInfo()
        gee_kappa = validation_matrix.kappa().getInfo()

        print(f'Validation error matrix: {cm}')
        print(f'Validation accuracy:     {gee_acc:.3f}')
        print(f'Validation kappa:        {gee_kappa:.3f}')
    else:
        # gee_acc / gee_kappa are not computed on this path.
        print('Classifier is in PROBABILITY mode; skipping the error matrix.')

Validation error matrix: [[9, 5], [2, 15]]
Validation accuracy:     0.774
Validation kappa:        0.535
CPU times: user 137 ms, sys: 8.88 ms, total: 146 ms
Wall time: 4.85 s


 EECU·s PeakMem Count  Description
  0.466    355k    18  Training a SmileRandomForest classifier
  0.466     15M  2585  (plumbing)
  0.131    242k   882  no description available
  0.002    2.5k     9  Listing collection
  0.001    3.4k    15  Algorithm Classifier.TrainingContainer
  0.001     15k    12  Algorithm Collection.errorMatrix
   -       286k    18  Algorithm Collection
   -       201k     6  Algorithm List.(construct from elements)
   -       7.0k   435  Algorithm Dictionary.(construct from elements)
   -       6.5k     3  Classification using a SMILE classifier
   -       6.1k     9  Algorithm FeatureCollection.classify
   -       5.1k     6  Algorithm Classifier.train
   -       3.9k    27  Algorithm Collection.filter
   -       3.7k  2610  Algorithm Feature
   -       3.5k    18  Algorithm Collection.randomColumn
   -       3.3k     9  Algorithm Filter.gt
   -       3.3k     9  Algorithm Filter.lt
   -       3.3k     3  Algorithm ConfusionMatrix
   -       3.2k     9  Al

#### Compare GEE and local classifiers

In [14]:
# Accuracy of the Earth Engine classifier (gee_acc, computed in the scoring cell)
# next to the 'rf_accuracy' value stored in the best run's summary statistics.
print(f'GEE RF Classifier Accuracy   {gee_acc:.2f}')
print(f'Local RF Classifier Accuracy {local_best["rf_accuracy"]:.2f}')

GEE RF Classifier Accuracy   0.77
Local RF Classifier Accuracy 0.93


# Infer classes for AOI

##### Define Area of Interest (AOI)

In [15]:
# Interactive map; the shapes drawn on it (Map.draw_features) define the AOI
# in the next cell.
Map = geemap.Map()
Map.add_basemap('HYBRID')
# Start centred on the central_ca object from pipeline.boundaries.
Map.centerObject(bnd.central_ca)
# Last expression of the cell: renders the map widget.
Map

Map(center=[20, 0], controls=(WidgetControl(options=['position', 'transparent_bg'], widget=HBox(children=(Togg…

In [22]:
# AOI size guidance: selecting an area below 3,000 hectares has worked best.
# Larger areas make processing take long enough that the export step
# frequently crashes.

# Without a map or any drawn shape only a reminder is printed and `aoi` is not
# (re)assigned; otherwise the drawn features become the AOI.
if not Map or len(Map.draw_features) == 0:
    print("Please select an Area of Interest on the map")
else:
    aoi = ee.FeatureCollection(Map.draw_features)

##### Retrieve data for inference

In [36]:
def retrieveRGBandClassPredictions(startDate=None, endDate=None, applyCropMask=False):
    """Build the RGB preview and the mean class prediction for a date window.

    Relies on the notebook-level names ``aoi``, ``et_var``, ``classifier``,
    ``Image`` and ``msk``.

    Args:
        startDate: start of the window, as a date string (the notebook passes
            'YYYY-MM-DD' values).
        endDate: end of the window, as a date string.
        applyCropMask: when True, every image is masked with
            ``msk.createGFSADmask(aoi)`` before classification.

    Returns:
        A 4-tuple ``(rgb_mosaic, pred_prob, etcol, num_images)``:
        the RGB bands of the first image clipped to the AOI, the per-pixel mean
        of all classified images clipped to the AOI, the image collection
        itself, and the number of images in it.
        When either date is missing, ``(None, None, None, 0)`` is returned.
    """
    if startDate is None or endDate is None:
        print("Please specify start and end date")
        # Every caller unpacks four values, so the early exit must return four
        # as well (it used to return two, which raised on unpacking).
        return None, None, None, 0

    etcol = Image(window_start=startDate, window_end=endDate,
                  aoi=aoi, cloud_max=30, et_var=et_var).ETandMeteo

    if applyCropMask: # added on 2023-02-07
        # Build the mask once and reuse it for every image.
        crop_mask = msk.createGFSADmask(aoi)
        etcol = etcol.map(lambda x: x.updateMask(crop_mask))

    num_images = etcol.size().getInfo()
    print('Number of ET sample images retrieved: ', num_images)

    rgbCol = ['R', 'G', 'B']
    # 2022.08 - in versions before the monthly data labels version Aug-2022,
    # the rgb_mosaic was calculated as a mosaic, however this started producing
    # too many bad images. The first image of the collection is used instead.
    rgb_mosaic = (
        etcol
            .first()
            .clip(aoi)
            .select(['R', 'GR', 'B'])
            .rename(rgbCol)
            .set('custom:date', startDate+'_'+endDate)
    )

    pred_col = etcol.map(lambda x: x.classify(classifier))

    # create a map which takes the mean over all predictions in the time window
    # to represent the probability of the pixel being irrigated or not during the window
    pred_prob = (
        pred_col
            .mean()
            .clip(aoi)
            .set('custom:date', startDate+'_'+endDate)
            .set('custom:num_input_images', num_images)
    )

    return rgb_mosaic, pred_prob, etcol, num_images

In [28]:
%%time
with ee.profilePrinting():
    ee.data.setWorkloadTag('hydracarta_infer_aoi')

    # Run inference once over the whole configured window (startDate..endDate),
    # without the crop mask.
    rgb_mosaic, pred_prob, et_data, cnt = retrieveRGBandClassPredictions(startDate, endDate, applyCropMask=False)

Number of ET sample images retrieved:  77


 EECU·s PeakMem Count  Description
  0.520     85k  4623  (plumbing)
  0.017    138k   224  Algorithm (user-defined function)
  0.012     448    80  Listing collection
  0.011     94k    78  Algorithm Collection.reduceColumns with reducer Reducer.count
  0.007    160k   500  Algorithm Image.constant
  0.006    164k   631  Loading assets: (...)/(...)
  0.001     13k   105  Algorithm Image.divide
  0.000     12k    37  Algorithm Image.log
  0.000    7.5k   248  Algorithm Image.multiply
  0.000     17k    76  Algorithm Image.addBands
  0.000    5.6k    77  Algorithm Image.subtract
  0.000    7.6k     4  Algorithm Collection.geometry
  0.000    8.6k   247  Algorithm Image.rename
  0.000    3.3k    12  Algorithm Image.sin
  0.000    3.4k    87  Algorithm Image.select
  0.000    3.3k     6  Algorithm Image.tan
  0.000    3.3k    10  Algorithm Image.cos
  0.000    2.3k     3  Algorithm Image.pixelLonLat
  0.000    6.9k    88  Algorithm Image.add
   -       1.7M    62  Algorithm Collection.map

AttributeError: 'ImageCollection' object has no attribute 'clip'


##### Visualize results


In [34]:
# rgb_viz is otherwise only defined in a later cell, so the visualisation
# parameters are defined here as well; this keeps the cell runnable on a fresh
# top-to-bottom run instead of failing with a NameError.
rgbCol = ['R', 'G', 'B']
rgb_viz = {'bands': rgbCol, 'min': 0.0, 'max': 3000, 'gamma': 1.4} # LS-Col-1-SR

Map.centerObject(aoi)
Map.addLayer(rgb_mosaic, rgb_viz, 'rgb', True, 1.0)
Map

EEException: Too many concurrent aggregations.

In [29]:
# Visualisation parameters for the RGB preview and the prediction layers.
rgbCol = ['R', 'G', 'B']
rgb_viz = {'bands': rgbCol, 'min': 0.0, 'max': 3000, 'gamma': 1.4} # LS-Col-1-SR
pred_viz = {'min': 0.0, 'max': 1.0, 'palette': ['red', 'blue']}
prob_viz = {'min': 0.0, 'max': 1.0, 'palette': ['red', 'magenta', 'pink', 'yellow', 'aqua', 'blue', 'darkblue']}

# Set the guard to False to skip adding the layers. (The former
# "skip this cell" comment contradicted the always-true guard.)
if True:
    with ee.profilePrinting():
        ee.data.setWorkloadTag('hydracarta_view_infer_results')

        Map.centerObject(aoi)
        Map.addLayer(aoi, {}, 'aoi boundary', True, 0.55)
        Map.addLayer(rgb_mosaic, rgb_viz, 'rgb', True, 1.0)
        Map.addLayer(pred_prob, prob_viz, 'probabilities', True, 1.0)
        # Jupyter only renders the last top-level expression of a cell; a bare
        # `Map` nested inside if/with is never shown, so display it explicitly.
        display(Map)

 EECU·s PeakMem Count  Description
562.433    5.0M  1792  Algorithm Image.normalizedDifference computing pixels
539.502    1.0M  2630  Algorithm Image.reduceRegion
296.686     27M 20496  Algorithm Image.multiply computing pixels
241.742     18M  1008  Algorithm Image.exp computing pixels
196.401     82M  1176  Algorithm Image.gradient computing pixels
179.848     32M   336  Reprojection precalculation between EPSG:4326 and EPSG:32610
125.313     18M  9296  Algorithm Image.updateMask computing pixels
112.716     36M  1344  Algorithm Image.sin computing pixels
111.953     36M  1344  Algorithm Image.cos computing pixels
108.434     18M   672  Algorithm Image.resample computing pixels
 70.269     19M 561812  (plumbing)
 58.524     11M 111026  Loading assets: (...)/(...)
 43.002    3.9M  1344  Algorithm Image.clip computing pixels
 42.372     32M   840  Algorithm Image.pixelLonLat computing pixels
 28.658    2.0M  2912  Algorithm Image.eq computing pixels
  7.318    3.5M 69539  Algorithm Im

EEException: Too many concurrent aggregations.

# Export Data

In [31]:
def exportImages(rgb_mosaic, pred_prob, startDate, endDate, aoi=None, out_dir='../outputmaps'):
    """Download the RGB preview and the class-probability image as GeoTIFFs.

    Two files are written into ``out_dir``:
    ``monthly_rgb_<startDate>_<endDate>.tif`` and
    ``monthly_classes_<startDate>_<endDate>.tif``.

    Args:
        rgb_mosaic: image exported as the RGB file.
        pred_prob: image exported as the classes file.
        startDate: window start, used only in the file names.
        endDate: window end, used only in the file names.
        aoi: passed to geemap as the export ``region``.
        out_dir: destination folder; created when it does not exist yet.
            Defaults to the folder that was previously hard-coded.
    """
    import os  # local import: the notebook's import cell does not import os

    # geemap writes straight to the given path, so make sure the folder exists.
    os.makedirs(out_dir, exist_ok=True)

    # (image, file-name prefix, timeout passed to geemap)
    exports = (
        (rgb_mosaic, 'monthly_rgb', 300),
        (pred_prob, 'monthly_classes', 1200),
    )
    for image, prefix, timeout in exports:
        filename = os.path.join(out_dir, f'{prefix}_{startDate}_{endDate}.tif')
        geemap.ee_export_image(image, filename=filename, scale=30,
                               region=aoi, file_per_band=False,
                               timeout=timeout)

In [37]:
%%time
from dateutil.relativedelta import relativedelta
# Results of this cell are downloaded to the local instance, into the
# /outputmaps folder. No tasks are created on GEE.

# First day of every month inside the configured window, as 'YYYY-MM-DD' strings.
month_starts = pd.date_range(startDate, endDate, freq='MS')
retrieval_dates = [stamp.strftime("%Y-%m-%d") for stamp in month_starts]

with ee.profilePrinting():
    ee.data.setWorkloadTag('hydracarta_export_infer_results')

    for start_date in tqdm(retrieval_dates):
        # Last day of the month = first day of the following month minus one day.
        month_first = datetime.datetime.strptime(start_date, '%Y-%m-%d')
        month_last = month_first + relativedelta(months=1) - timedelta(days=1)
        end_date = month_last.strftime('%Y-%m-%d')

        print('Processing', start_date, end_date)
        rgb_mosaic, pred_prob, et_data, img_cnt = retrieveRGBandClassPredictions(start_date, end_date)

        # Months with fewer than two images are reported and not exported.
        if img_cnt <= 1:
            print('\tWarning:  Less than two(2) ET images retrieved for period', start_date, end_date)
            continue

        exportImages(rgb_mosaic, pred_prob, start_date, end_date, aoi=aoi.geometry()) # or aoi=Map.user_roi

  0%|          | 0/15 [00:00<?, ?it/s]

Processing 2021-08-01 2021-08-31
Number of ET sample images retrieved:  7
Generating URL ...
Downloading data from https://earthengine.googleapis.com/v1/projects/second-impact-342800/thumbnails/c46f09f3b018b9cfd1b0ac20c243afa9-5a27dacdf92607dd008d5602b4f5ded4:getPixels
Please wait ...
Data downloaded to /Users/blairjones/Desktop/earthdataanalytics/outputmaps/monthly_rgb_2021-08-01_2021-08-31.tif
Generating URL ...
Downloading data from https://earthengine.googleapis.com/v1/projects/second-impact-342800/thumbnails/f53efb62a74066b37df380648b9ae3a4-a41b911d33afeca28b400d7ef342599d:getPixels
Please wait ...
Data downloaded to /Users/blairjones/Desktop/earthdataanalytics/outputmaps/monthly_classes_2021-08-01_2021-08-31.tif
Processing 2021-09-01 2021-09-30
Number of ET sample images retrieved:  7
Generating URL ...
Downloading data from https://earthengine.googleapis.com/v1/projects/second-impact-342800/thumbnails/09f0b810ff62112cdc12109ba7846a42-39f2dc0541af82d67e5023bef7a27edd:getPixels
Pl

 EECU·s PeakMem Count  Description
9539.544     70M 537330  Algorithm Image.updateMask computing pixels
8679.093     35M 925124  Algorithm Image.multiply computing pixels
8617.008    134k 43881  Algorithm reduce.sum
6878.056     18M 40308  Algorithm Image.exp computing pixels
6794.477     32M 15374  Reprojection precalculation between (...) and (...)
6134.572     82M 46368  Algorithm Image.gradient computing pixels
5355.487     18M 340412  Algorithm Image.add computing pixels
4916.345     63M 65520  Algorithm Image.sin computing pixels
4667.771     54M 61344  Algorithm Image.cos computing pixels
4484.610     45M 39604  Algorithm Image.resample computing pixels
3608.106    1.0M 31047  Algorithm Image.reduceRegion
2660.209    5.0M 27504  Algorithm Image.normalizedDifference computing pixels
2301.472    3.0M 32224  Algorithm Image.lte computing pixels
2071.151    9.0M  4176  Algorithm Image.reduce computing pixels
1636.251     27M 37296  Algorithm Image.where computing pixels
1444.455    

KeyboardInterrupt: 

In [42]:
project_name

'Restart'

In [41]:
import subprocess
#!gsutil mv *.tif gs://staging_demo/CA_monthly_01
# Move every GeoTIFF from ../outputmaps into the staging bucket under the
# project name. shell=True lets the shell expand the *.tif wildcard.
run_string = f'gsutil mv ../outputmaps/*.tif gs://staging_demo/{project_name}'
# The return value (the command's exit status) is shown as the cell output;
# anything other than 0 means the command failed.
subprocess.call(run_string, shell=True)

1

Create image collections for each type

In [43]:
# Create the Earth Engine image collection that receives the monthly class images
# of this project. The exit status of the command is shown as the cell output.
run_string = f'earthengine create collection projects/eda-bjonesneu-proto/assets/irrigation/monthly_classes_{project_name}'
subprocess.call(run_string, shell=True)

0

In [44]:
# Create the Earth Engine image collection that receives the monthly RGB images
# of this project. The exit status of the command is shown as the cell output.
run_string = f'earthengine create collection projects/eda-bjonesneu-proto/assets/irrigation/monthly_rgb_{project_name}'
subprocess.call(run_string, shell=True)

0

In [45]:
!earthengine ls projects/eda-bjonesneu-proto/assets/irrigation

projects/eda-bjonesneu-proto/assets/irrigation/classes_col
projects/eda-bjonesneu-proto/assets/irrigation/labels
projects/eda-bjonesneu-proto/assets/irrigation/labels_new
projects/eda-bjonesneu-proto/assets/irrigation/labels_old
projects/eda-bjonesneu-proto/assets/irrigation/monthly_classes_Clark_Ranch
projects/eda-bjonesneu-proto/assets/irrigation/monthly_classes_Restart
projects/eda-bjonesneu-proto/assets/irrigation/monthly_classes_Westwind
projects/eda-bjonesneu-proto/assets/irrigation/monthly_classes_col_monthly
projects/eda-bjonesneu-proto/assets/irrigation/monthly_classes_col_monthly_02
projects/eda-bjonesneu-proto/assets/irrigation/monthly_rgb_Clark_Ranch
projects/eda-bjonesneu-proto/assets/irrigation/monthly_rgb_Restart
projects/eda-bjonesneu-proto/assets/irrigation/monthly_rgb_Westwind
projects/eda-bjonesneu-proto/assets/irrigation/monthly_rgb_col_monthly
projects/eda-bjonesneu-proto/assets/irrigation/monthly_rgb_col_monthly_02
projects/eda-bjonesneu-proto/assets

Upload the individual images to each image collection

In [52]:
from google.cloud import storage
import subprocess

bucket_name = "staging_demo"
# NOTE(review): the service-account key file is referenced by a hard-coded
# relative path; keep it out of version control.
storage_client = storage.Client.from_service_account_json("second-impact-342800-51af159903ca.json")

# List only the objects inside this project's folder. The previous substring
# test (`project_name in blob.name`) walked the whole bucket and also matched
# any other object whose path merely contained the project name.
blobs = storage_client.list_blobs(bucket_name, prefix=f'{project_name}/')

for blob in blobs:
    filename = blob.name
    print(filename)

    # Only GeoTIFFs are uploaded; test the extension, not a substring.
    if not filename.endswith('.tif'):
        continue

    tmp_name = filename.split('/')[-1]
    stem = tmp_name[:-4]

    # Expected file name: <type>_<asset>_<start>_<end>.tif, e.g.
    # monthly_classes_2021-09-01_2021-09-30.tif. Skip anything else instead of
    # failing on the unpacking below.
    parts = stem.split('_')
    if len(parts) != 4:
        print(f'\tSkipping {filename}: unexpected file name')
        continue
    project_type, asset_type, start, end = parts

    asset_name = f'projects/eda-bjonesneu-proto/assets/irrigation/{project_type}_{asset_type}_{project_name}/{stem}'
    gs_name = f'gs://{bucket_name}/{filename}'

    # Pass an argument list (no shell), so names are never interpreted by a shell.
    out = subprocess.call(['earthengine', 'upload', 'image', f'--asset_id={asset_name}', gs_name])
    if out != 0:
        print(f'\tUpload command failed for {gs_name} (exit status {out})')

<Blob: staging_demo, CA_monthly_01/monthly_classes_2016-03-01_2016-03-31.tif, 1660007854784470>
<Blob: staging_demo, CA_monthly_01/monthly_classes_2016-04-01_2016-04-30.tif, 1660007854936408>
<Blob: staging_demo, CA_monthly_01/monthly_classes_2016-05-01_2016-05-31.tif, 1660007855100679>
<Blob: staging_demo, CA_monthly_01/monthly_classes_2016-06-01_2016-06-30.tif, 1660007855265766>
<Blob: staging_demo, CA_monthly_01/monthly_classes_2016-07-01_2016-07-31.tif, 1660007855441392>
<Blob: staging_demo, CA_monthly_01/monthly_classes_2016-08-01_2016-08-31.tif, 1660007855635241>
<Blob: staging_demo, CA_monthly_01/monthly_classes_2016-09-01_2016-09-30.tif, 1660007855770685>
<Blob: staging_demo, CA_monthly_01/monthly_classes_2016-10-01_2016-10-31.tif, 1660007855948929>
<Blob: staging_demo, CA_monthly_01/monthly_classes_2016-11-01_2016-11-30.tif, 1660007856116652>
<Blob: staging_demo, CA_monthly_01/monthly_classes_2016-12-01_2016-12-31.tif, 1660007856301638>
<Blob: staging_demo, CA_monthly_01/month

<Blob: staging_demo, Restart/monthly_classes_2021-09-01_2021-09-30.tif, 1695735201789861>
processing Restart/monthly_classes_2021-09-01_2021-09-30.tif
Restart/monthly_classes_2021-09-01_2021-09-30.tif
processing Restart/monthly_classes_2021-09-01_2021-09-30.tif
<Blob: staging_demo, Restart/monthly_classes_2021-10-01_2021-10-31.tif, 1695735211143185>
processing Restart/monthly_classes_2021-10-01_2021-10-31.tif
Restart/monthly_classes_2021-10-01_2021-10-31.tif
processing Restart/monthly_classes_2021-10-01_2021-10-31.tif
<Blob: staging_demo, Restart/monthly_classes_2021-11-01_2021-11-30.tif, 1695735220389022>
processing Restart/monthly_classes_2021-11-01_2021-11-30.tif
Restart/monthly_classes_2021-11-01_2021-11-30.tif
processing Restart/monthly_classes_2021-11-01_2021-11-30.tif
<Blob: staging_demo, Restart/monthly_rgb_2021-08-01_2021-08-31.tif, 1695735229507121>
processing Restart/monthly_rgb_2021-08-01_2021-08-31.tif
Restart/monthly_rgb_2021-08-01_2021-08-31.tif
processing Restart/monthl

In [None]:
STOP

##### Utility to delete unneeded files in folders

In [None]:
#!earthengine ls projects/eda-bjonesneu-proto/assets/irrigation/rgb > todelete.txt

In [None]:
# Flip the guard to True to delete every asset listed in todelete.txt
# (one asset id per line).
if False:
    with open('todelete.txt') as todelete:
        for item in todelete:
            # Lines read from the file keep their trailing newline; strip it
            # and skip blank lines so `earthengine rm` is never run without an
            # asset id.
            asset_id = item.strip()
            if not asset_id:
                continue
            # Argument list (no shell): the asset id is passed through verbatim.
            out = subprocess.call(['earthengine', 'rm', asset_id])
            if out != 0:
                print(f'Failed to remove {asset_id} (exit status {out})')

In [None]:
#!earthengine ls projects/eda-bjonesneu-proto/assets/irrigation/rgb

# Retrieve job run costs

In [38]:
# Since no jobs are launched, retrieve metrics directly from https://console.cloud.google.com/monitoring/