In [None]:
# The following notebook will get two different datasets of explanatory variables: temporal an non-temporal
# related. 
# In order to improve the speed time, this notebook will create the respective datasets and it will send a task to 
# EarthEngine with a ReduceByRegion operation, we have proved that this method is faster than using the individual
# calls to the API.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point

The objective would be to loop over the points or the dates...<br>
After testing this script https://code.earthengine.google.com/b18e876cca44266be704924b7354ddff <br>
I found out that the best way to do it is to loop over the dates, and then pass the reduceregions. <br>


In [3]:
df = pd.read_csv('data/field_data_unique_coords.csv')
high_corr_ids = pd.read_csv('data/ids_high_corr.csv')
len(df)

274266

In [4]:
unique_coords = df[df.id.isin(high_corr_ids.id)][["id", "lon", "lat"]].drop_duplicates()
unique_coords.head()
len(unique_coords)

# Convert them as a geodataframe and save them

geometry = [Point(xy) for xy in zip(unique_coords.lon, unique_coords.lat)]
gdf = gpd.GeoDataFrame(unique_coords, geometry=geometry)
# gdf.crs = {'init': 'epsg:4326'}
# gdf.to_file("data/0_shp/high_corr_stations.shp")

In [5]:
# Read regions shapefile


# I have two regions, first is to only the high correlated stations and the second is all the stations
gdf_regions = gpd.GeoDataFrame.from_file("data/0_shp/regions_to_request_explanatory.gpkg")

gdf_regions = gpd.GeoDataFrame.from_file("data/0_shp/regions_to_request_explanatory_all.gpkg")
# Add a sequential id to the regions
gdf_regions["id"] = gdf_regions.index + 1
# resave it
gdf_regions.to_file("data/0_shp/regions_to_request_explanatory_all.gpkg", driver="GPKG")
gdf_regions

Unnamed: 0,id,geometry
0,1,"MULTIPOLYGON (((96.37854 4.01317, 96.76923 3.9..."
1,2,"MULTIPOLYGON (((102.96446 -0.63790, 104.82488 ..."
2,3,"MULTIPOLYGON (((140.00836 -7.80760, 140.75163 ..."
3,4,"MULTIPOLYGON (((105.23245 -2.56075, 105.62785 ..."
4,5,"MULTIPOLYGON (((101.59551 1.61281, 101.45686 0..."
5,6,"MULTIPOLYGON (((100.69365 2.01094, 100.81080 2..."
6,7,"MULTIPOLYGON (((108.80424 1.60848, 109.83126 1..."
7,8,"MULTIPOLYGON (((110.98152 -2.86934, 114.00610 ..."
8,9,"MULTIPOLYGON (((132.99060 -0.68691, 133.43736 ..."
9,10,"MULTIPOLYGON (((116.84967 3.98347, 117.30926 3..."


In [6]:
df = pd.read_csv('data/field_data_unique_coords.csv')

# To only get the high correlated stations, uncomment the following line
# high_corr_ids = pd.read_csv('data/ids_high_corr.csv')

# df = df[df['id'].isin(high_corr_ids['id'])]

# Remove those date where the gwl measure is out of reasonable range
upper_thres = 20
lower_thres = -100

df = df[(df.gwl_cm < upper_thres) & (df.gwl_cm > lower_thres)]

# save the final points

df.to_csv('field_data_high_corr.csv', index=False)

# Get the coordinates of the individual points

unique_coords = df[["id", 'lon', 'lat']].drop_duplicates()
len(unique_coords)

2074

In [9]:
import ee
from gee_scripts.get_sources import get_s1_image, get_gldas, get_gpm
ee.Initialize()

In [12]:
# Create geodataframe from x y coordinates

gdf_unique_coords = gpd.GeoDataFrame(unique_coords, geometry=gpd.points_from_xy(unique_coords.lon, unique_coords.lat), crs="EPSG:4326")


# Add the region id to each point

gdf_unique_coords = gpd.sjoin(gdf_unique_coords, gdf_regions[["id", "geometry"]], how="left", predicate="within")


# I need to extract all the dates from the first group of points
# first get the ids of the first group of points

## Get temporal explanatory variables

In [32]:
base_selectors = ["system:index", "lat", "lon", "id"]
s1_selectors = ["LIA", "VH", "VV", "VVVH_ratio", "angle"]
gldas_selectors = ['gldas_mean', 'gldas_stddev', 'sm_1', 'sm_3', 'sm_7', 'sm_30']
gpm_selectors = ['precipitation', 'prec_3', 'prec_7', 'prec_30']


def get_temporal_explanatory(region_id):
   """Get the explanatory temporal based variables"""

   region = gdf_regions[gdf_regions.id == region_id].to_crs("EPSG:4326")[:]
   dates = df[df.id.isin(gdf_unique_coords[gdf_unique_coords.id_right == region_id].id_left.unique())].date.unique()
   points = gdf_unique_coords[gdf_unique_coords.id_right == region_id][["id_left", "geometry"]].rename(columns={"id_left": "id"}).to_crs("EPSG:4326")

   print(len(dates), len(points))
   # Convert to ee elements

   ee_dates = ee.FeatureCollection(ee.List([ ee.Feature(None, {"date": date}) for date in dates]))
   ee_points = ee.FeatureCollection(points.__geo_interface__)
   ee_region = ee.FeatureCollection(region.__geo_interface__)

   def get_sources(date_feature):
      
      date_range = ee.Date(date_feature.get("date")).getRange('day')

      s1_composite = get_s1_image(date_range, ee_region)

      return s1_composite.set({
         "numberOfBands" : s1_composite.bandNames().size(),
         "date" : ee.Date(date_feature.get("date"))
         })
      
   def reduce_composite(composite):
      
      # Filter the extra data with the matching date
      date = composite.get("date")
      date_range = ee.Date(date).getRange('day')

      gldas_composite = get_gldas(date_range, ee_region)
      gpm_composite = get_gpm(date_range, ee_region)

      composite = (ee.Image(composite)
            .addBands(gldas_composite)
            .addBands(gpm_composite)
      )
      
      return composite.reduceRegions(**{
         "collection" : ee_points,
         "reducer" : ee.Reducer.first(),
         "scale" : 10,
         "tileScale" : 16
      }).filter(ee.Filter.notNull(['VH']))


   task = (ee_dates
         .map(get_sources)
         .filter(ee.Filter.gt('numberOfBands', 0))
         .map(reduce_composite).flatten()
   )

   ee_task = ee.batch.Export.table.toDrive(**{
      "collection": task, 
      "folder" : "INDONESIA_GWL",
      "description": f"All_temporal_non_resample_region_{region_id}_dates_{len(dates)}_points_{len(points)}",
      "selectors": base_selectors + s1_selectors + gldas_selectors + gpm_selectors
   })

   # ee_task.start()

   print(f"All_temporal_non_resample_at_all_region_{region_id}_dates_{len(dates)}_points_{len(points)}",)


[get_temporal_explanatory(region_id) for region_id in gdf_regions.id.unique()]

520 24
All_temporal_non_resample_at_all_region_1_dates_520_points_24
1773 149
All_temporal_non_resample_at_all_region_2_dates_1773_points_149
479 1
All_temporal_non_resample_at_all_region_3_dates_479_points_1
988 348
All_temporal_non_resample_at_all_region_4_dates_988_points_348
1796 717
All_temporal_non_resample_at_all_region_5_dates_1796_points_717
489 43
All_temporal_non_resample_at_all_region_6_dates_489_points_43
1274 477
All_temporal_non_resample_at_all_region_7_dates_1274_points_477
1671 221
All_temporal_non_resample_at_all_region_8_dates_1671_points_221
379 17
All_temporal_non_resample_at_all_region_9_dates_379_points_17
846 77
All_temporal_non_resample_at_all_region_10_dates_846_points_77


[None, None, None, None, None, None, None, None, None, None]

## Get non temporal explanatory variables

In [13]:
from gee_scripts.get_sources import get_srtm, get_globcover, get_gedi, get_gldas_stats

In [33]:
# We'll try to get all the points at once, not by region (so we won't filter by region)
region = gdf_regions.to_crs("EPSG:4326")[:]
ee_region = ee.FeatureCollection(region.__geo_interface__)
points = gdf_unique_coords[["id_left", "geometry"]].rename(columns={"id_left": "id"}).to_crs("EPSG:4326")
ee_points = ee.FeatureCollection(points.__geo_interface__)
len(points)

2074

In [34]:
composite = (
    get_srtm()
        .addBands(get_globcover())
        .addBands(get_gedi(ee_region))
        .addBands(get_gldas_stats(ee_region))
)
composite.bandNames().getInfo()

result = composite.reduceRegions(**{
    "collection" : ee_points,
    "reducer" : ee.Reducer.first(),
    "scale" : 10,
    "tileScale" : 16
}).filter(ee.Filter.notNull(['canopy_height']))

ee_task = ee.batch.Export.table.toDrive(**{
    "collection": result, 
    "folder" : "INDONESIA_GWL",
    "description": f"All_Non_temporal_points_{len(points)}",
    "selectors": base_selectors + ['elevation', 'aspect', 'slope', 'land_cov', 'canopy_height', "gldas_mean", "gldas_stddev"]
})

ee_task.start()

# Get Hansen data

In [None]:
# We need to get the 

# Merge explanatory variables

In [26]:
from pathlib import Path

In [27]:
explanatory_path = Path("data/7_training_data/")
temporal_file_names = [
    "All_temporal_non_resample_region_1_dates_485_points_2.csv",
    "All_temporal_non_resample_region_2_dates_626_points_11.csv",
    "All_temporal_non_resample_region_3_dates_1737_points_13.csv",
    "All_temporal_non_resample_region_4_dates_653_points_12.csv",
    "All_temporal_non_resample_region_5_dates_1542_points_21.csv",
    "All_temporal_non_resample_region_6_dates_479_points_1.csv",
]

In [42]:
temporal_cols = [
    'id', 'lat', 'lon', "date", 'LIA', 'VH', 'VV', 'VVVH_ratio', 'angle',
    'sm_1', 'sm_3', 'sm_7', 'sm_30', 'precipitation',
    'prec_3', 'prec_7', 'prec_30'
]

def add_date_to_explanatory_df(region_id, explain_df):
    """Add the corresponding date to the explanatory dataframe.

    As the result from GEE didn't come with the date, we need to add it manually.
    For each of the .csv results, we have to use the "dates" list that was used to get the data,
    and by its index, we can merge the date to the dataframe.
    
    For each region we will have different dates.
    """

    dates = pd.DataFrame(
        df[df.id.isin(gdf_unique_coords[gdf_unique_coords.id_right == region_id].id_left.unique())].date.unique(),
        columns=["date"]
    ).reset_index()

    # Get the date from the measurement based on the "system:index" col
    explain_df["date_idx"] = explain_df["system:index"].apply(lambda x: int(x.split("_")[0]))

    return explain_df.merge(dates, left_on="date_idx", right_on="index")[temporal_cols]


# get and concatenate all the dataframes
temp_explanatory_dfs = pd.concat([
    add_date_to_explanatory_df(idx, explain_df) 
    for idx, explain_df 
    in enumerate(
        [
            pd.read_csv(explanatory_path/file_name) 
            for file_name 
            in temporal_file_names
        ], 
        start=1
        )
    ], axis=0)

temp_explanatory_dfs

Unnamed: 0,id,lat,lon,date,LIA,VH,VV,VVVH_ratio,angle,sm_1,sm_3,sm_7,sm_30,precipitation,prec_3,prec_7,prec_30
0,121_APC_D15,,,2020-12-02,37.158626,-13.377886,-6.305714,0.188173,39.002327,36.214001,36.373625,35.832196,36.176533,0.264000,0.140833,0.205122,0.152134
1,121_APC_H09,,,2020-12-02,36.953262,-15.453578,-5.003018,0.287521,38.996140,36.214001,36.373625,35.832196,36.176533,0.370000,0.200833,0.263049,0.191682
2,121_APC_D15,,,2021-09-14,37.852636,-12.313376,-4.441834,0.300894,36.006908,41.771999,37.499333,37.388982,36.215733,0.168889,0.210000,0.673171,0.283413
3,121_APC_H09,,,2021-09-14,37.631559,-13.279904,-5.471688,0.236691,35.960140,41.771999,37.499333,37.388982,36.215733,0.211111,0.329706,0.745000,0.331018
4,121_APC_D15,,,2021-12-08,37.155728,-14.935464,-7.701659,0.137663,38.999432,38.425999,38.001083,38.607357,36.762179,0.480000,0.349211,0.675584,0.423739
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72,BRG_910111_01,,,2023-02-04,41.714628,-13.912240,-10.221497,0.054404,42.762909,32.912998,32.184917,32.222143,33.007162,0.046364,0.026667,0.030580,0.145016
73,BRG_910111_01,,,2023-02-16,39.653029,-11.491989,-6.076784,0.175861,38.837509,38.230999,36.179000,35.612732,33.761704,0.590000,0.400000,0.388310,0.203987
74,BRG_910111_01,,,2023-02-19,41.714839,-12.510206,-7.792358,0.110149,42.763123,34.505001,34.895708,35.436232,33.991917,0.034545,0.309000,0.323286,0.184466
75,BRG_910111_01,,,2023-03-01,39.657848,-14.591243,-7.201469,0.155738,38.842331,35.603001,33.867458,33.315286,33.766904,0.433000,0.214516,0.161333,0.172443


## Merge non temporal explanatory

In [43]:
# As the non-temporal variables are the same for all the points, we just need to duplicate 
# their results into each of the dates of the points.
# i.e. 1 point with 10 dates will have the same non-temporal variables for each of the 10 dates.

non_temporal_file_name = "All_Non_temporal_points_2074.csv"

non_temporal_df = pd.read_csv(explanatory_path/non_temporal_file_name)
non_temporal_df

Unnamed: 0,system:index,lat,lon,id,elevation,aspect,slope,land_cov,canopy_height,gldas_mean,gldas_stddev
0,0,,,BRG_140301_01,12,26.573112,2.073173,40,15,32.506657,1.989591
1,1799,,,BRG_140301_02,13,270.000000,0.927708,40,6,32.530491,1.982611
2,2530,,,BRG_140302_01,13,270.000000,1.854979,30,4,32.526196,1.994038
3,4174,,,BRG_140302_02,13,165.959034,3.818558,40,13,32.566113,1.973969
4,5970,,,BRG_140802_02,20,239.038901,5.392593,160,0,32.519711,1.973292
...,...,...,...,...,...,...,...,...,...,...,...
2069,272775,,,brg6,7,314.990916,1.311649,40,0,31.976336,2.483077
2070,273254,,,jambi1,17,-0.000000,1.854334,30,10,31.922771,2.548600
2071,273497,,,kalbar1,8,243.435103,2.073039,12,3,32.688313,2.437983
2072,273890,,,kalteng1,8,116.546267,2.074387,40,10,31.701403,3.461483


## Create final explanatory variables dataset

In [45]:
# Merge the non-temporal variables with the temporal ones

explanatory_df = temp_explanatory_dfs.merge(non_temporal_df, on="id")
explanatory_df

Unnamed: 0,id,lat_x,lon_x,date,LIA,VH,VV,VVVH_ratio,angle,sm_1,...,system:index,lat_y,lon_y,elevation,aspect,slope,land_cov,canopy_height,gldas_mean,gldas_stddev
0,121_APC_D15,,,2020-12-02,37.158626,-13.377886,-6.305714,0.188173,39.002327,36.214001,...,37909,,,11,270.000000,1.858527,12,11,35.598789,3.224160
1,121_APC_D15,,,2021-09-14,37.852636,-12.313376,-4.441834,0.300894,36.006908,41.771999,...,37909,,,11,270.000000,1.858527,12,11,35.598789,3.224160
2,121_APC_D15,,,2021-12-08,37.155728,-14.935464,-7.701659,0.137663,38.999432,38.425999,...,37909,,,11,270.000000,1.858527,12,11,35.598789,3.224160
3,121_APC_D15,,,2022-06-10,37.159135,-14.816424,-6.980119,0.167454,39.002838,35.361000,...,37909,,,11,270.000000,1.858527,12,11,35.598789,3.224160
4,121_APC_D15,,,2021-09-23,37.158385,-13.969044,-5.805131,0.222621,39.002090,35.499001,...,37909,,,11,270.000000,1.858527,12,11,35.598789,3.224160
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5817,BRG_910111_01,,,2023-02-04,41.714628,-13.912240,-10.221497,0.054404,42.762909,32.912998,...,267642,,,8,45.294772,1.318238,150,0,31.034153,4.342498
5818,BRG_910111_01,,,2023-02-16,39.653029,-11.491989,-6.076784,0.175861,38.837509,38.230999,...,267642,,,8,45.294772,1.318238,150,0,31.034153,4.342498
5819,BRG_910111_01,,,2023-02-19,41.714839,-12.510206,-7.792358,0.110149,42.763123,34.505001,...,267642,,,8,45.294772,1.318238,150,0,31.034153,4.342498
5820,BRG_910111_01,,,2023-03-01,39.657848,-14.591243,-7.201469,0.155738,38.842331,35.603001,...,267642,,,8,45.294772,1.318238,150,0,31.034153,4.342498
