In [None]:
# This notebook builds two datasets of explanatory variables: temporal and non-temporal.
# To speed up processing, it prepares the respective datasets and submits export tasks to
# Earth Engine that use a reduceRegions operation; we found this method to be faster than
# making individual calls to the API.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import re

import geopandas as gpd
import pandas as pd
from shapely.geometry import Point

The objective is to loop over either the points or the dates.<br>
After testing this script https://code.earthengine.google.com/b18e876cca44266be704924b7354ddff <br>
I found that the best approach is to loop over the dates and run `reduceRegions` for each date. <br>


In [3]:
# Field measurements, with the `date` column parsed as datetime
df = pd.read_csv('data/field_data_unique_coords.csv').assign(
    date=lambda measurements: pd.to_datetime(measurements["date"])
)

# Ids of the highly correlated stations
high_corr_ids = pd.read_csv('data/ids_high_corr.csv')

df

Unnamed: 0,source,id,lon,lat,date,gwl_cm
0,ach,BRG_140301_01,102.099167,1.519444,2018-10-15,-14.400
1,ach,BRG_140301_01,102.099167,1.519444,2018-10-16,-17.900
2,ach,BRG_140301_01,102.099167,1.519444,2018-10-17,-20.600
3,ach,BRG_140301_01,102.099167,1.519444,2018-10-18,-18.100
4,ach,BRG_140301_01,102.099167,1.519444,2018-10-19,-23.100
...,...,...,...,...,...,...
274261,old_brg,kecil1,113.805611,-2.856089,2019-10-26,-3.021
274262,old_brg,kecil1,113.805611,-2.856089,2019-10-27,-3.023
274263,old_brg,kecil1,113.805611,-2.856089,2019-10-31,-3.023
274264,old_brg,kecil1,113.805611,-2.856089,2019-11-02,-3.023


In [4]:
# Coordinates (one row per station) of the highly correlated stations only
unique_coords = df.loc[df.id.isin(high_corr_ids.id), ["id", "lon", "lat"]].drop_duplicates()

# Convert them to a GeoDataFrame. The CRS is declared at construction time (as done for
# `gdf_unique_coords` further down) so the layer carries its projection if it is saved.
geometry = gpd.points_from_xy(unique_coords.lon, unique_coords.lat)
gdf = gpd.GeoDataFrame(unique_coords, geometry=geometry, crs="EPSG:4326")

# Uncomment to save the stations as a shapefile
# gdf.to_file("data/0_shp/high_corr_stations.shp")

In [5]:
# Load the region polygons used to group the requests.
# Two layers are available: one built for the highly correlated stations only and one
# built for all the stations. The second one is the one loaded here.
# gdf_regions = gpd.read_file("data/0_shp/regions_to_request_explanatory.gpkg")
gdf_regions = gpd.read_file("data/0_shp/regions_to_request_explanatory_all.gpkg")
gdf_regions

Unnamed: 0,id,geometry
0,1,"MULTIPOLYGON (((96.37854 4.01317, 96.76923 3.9..."
1,2,"MULTIPOLYGON (((102.96446 -0.63790, 104.82488 ..."
2,3,"MULTIPOLYGON (((140.00836 -7.80760, 140.75163 ..."
3,4,"MULTIPOLYGON (((105.23245 -2.56075, 105.62785 ..."
4,5,"MULTIPOLYGON (((101.59551 1.61281, 101.45686 0..."
5,6,"MULTIPOLYGON (((100.69365 2.01094, 100.81080 2..."
6,7,"MULTIPOLYGON (((108.80424 1.60848, 109.83126 1..."
7,8,"MULTIPOLYGON (((110.98152 -2.86934, 114.00610 ..."
8,9,"MULTIPOLYGON (((132.99060 -0.68691, 133.43736 ..."
9,10,"MULTIPOLYGON (((116.84967 3.98347, 117.30926 3..."


In [6]:
# Exclusive bounds applied to `gwl_cm`: values outside them are treated as unreasonable
upper_thres = 20
lower_thres = -100

# Reload the field data and parse the measurement dates
df = pd.read_csv('data/field_data_unique_coords.csv')
df["date"] = pd.to_datetime(df.date)

# To keep only the highly correlated stations, uncomment the following lines
# high_corr_ids = pd.read_csv('data/ids_high_corr.csv')
# df = df[df['id'].isin(high_corr_ids['id'])]

# Drop the measurements whose groundwater level is out of the accepted range
df = df.loc[df.gwl_cm.lt(upper_thres) & df.gwl_cm.gt(lower_thres)]

# Uncomment to save the filtered measurements
# df.to_csv('field_data_high_corr.csv', index=False)

# One row per station with its coordinates
unique_coords = df.loc[:, ["id", "lon", "lat"]].drop_duplicates()

len(df), len(unique_coords)

(260942, 2074)

In [7]:
import ee
from gee_scripts.get_sources import get_s1_image, get_gldas, get_gpm
ee.Initialize()

In [8]:
# Turn the station coordinates into point geometries (lon/lat, EPSG:4326)
station_points = gpd.points_from_xy(unique_coords.lon, unique_coords.lat)
gdf_unique_coords = gpd.GeoDataFrame(unique_coords, geometry=station_points, crs="EPSG:4326")

# Tag every station with the region that contains it. Both tables have an "id" column,
# so after the join the station id is `id_left` and the region id is `id_right`.
region_polygons = gdf_regions[["id", "geometry"]]
gdf_unique_coords = gpd.sjoin(gdf_unique_coords, region_polygons, how="left", predicate="within")

# Quick check: show the joined row of one known station
gdf_unique_coords.loc[gdf_unique_coords.id_left == "15_RAPP_TP-I-53"]

Unnamed: 0,id_left,lon,lat,geometry,index_right,id_right
93142,15_RAPP_TP-I-53,102.015869,0.563137,POINT (102.01587 0.56314),4,5


## Get temporal explanatory variables

In [9]:
# Columns requested in every table export
base_selectors = ["system:index", "lat", "lon", "id", "date"]
# Columns requested from each temporal source
# (presumably the band names produced by the `gee_scripts.get_sources` helpers - verify there)
s1_selectors = ["LIA", "VH", "VV", "VVVH_ratio", "angle"]
gldas_selectors = ['sm_1', 'sm_3', 'sm_7', 'sm_30']
gpm_selectors = ['precipitation', 'prec_3', 'prec_7', 'prec_30']


def get_temporal_explanatory(region_id, start=True):
    """Export the temporal explanatory variables of one region to Google Drive.

    For every measurement date of the stations that fall in the region, the image
    returned by `get_s1_image` is stacked with the `get_gldas` and `get_gpm` images
    of the same day and sampled at the station points with `reduceRegions`. Dates
    whose `get_s1_image` result has no bands are skipped, and sampled points without
    a `VH` value are dropped. All the results are flattened into a single table that
    is exported to the "INDONESIA_GWL" Drive folder.

    Relies on the notebook globals `gdf_regions`, `gdf_unique_coords` and `df`.

    Args:
        region_id: value of `gdf_regions.id` identifying the region to process.
        start: when True (default) the export task is submitted to Earth Engine;
            pass False to only build the task and print its name.

    Returns:
        None. The name of the task is printed.
    """

    region = gdf_regions[gdf_regions.id == region_id].to_crs("EPSG:4326")

    # Stations of this region (computed once, used for both the dates and the points)
    region_stations = gdf_unique_coords[gdf_unique_coords.id_right == region_id]
    dates = df[df.id.isin(region_stations.id_left.unique())].date.unique()
    points = (
        region_stations[["id_left", "geometry"]]
        .rename(columns={"id_left": "id"})
        .to_crs("EPSG:4326")
    )

    # Convert to ee elements.
    # Dates are sent as ISO 8601 strings: depending on the pandas version `unique()`
    # yields numpy datetime64 or pandas Timestamp values, and only the latter can be
    # serialised by the ee client. `ee.Date` parses the string form (as UTC).
    ee_dates = ee.FeatureCollection(ee.List([
        ee.Feature(None, {"date": pd.Timestamp(date).isoformat()}) for date in dates
    ]))
    ee_points = ee.FeatureCollection(points.__geo_interface__)
    ee_region = ee.FeatureCollection(region.__geo_interface__)

    def get_sources(date_feature):
        """Return the `get_s1_image` result of the feature's day, tagged with band count and date."""

        date_range = ee.Date(date_feature.get("date")).getRange('day')

        s1_composite = get_s1_image(date_range, ee_region)

        return s1_composite.set({
            # Used below to discard the dates for which no bands are available
            "numberOfBands": s1_composite.bandNames().size(),
            "date": ee.Date(date_feature.get("date"))
        })

    def reduce_composite(composite):
        """Stack the same-day GLDAS/GPM bands on the composite and sample it at the points."""

        # Request the extra data with the matching date
        date = composite.get("date")
        date_range = ee.Date(date).getRange('day')

        gldas_composite = get_gldas(date_range, ee_region)
        gpm_composite = get_gpm(date_range, ee_region)

        composite = (
            ee.Image(composite)
            .addBands(gldas_composite)
            .addBands(gpm_composite)
        )

        sampled = composite.reduceRegions(
            collection=ee_points,
            reducer=ee.Reducer.first(),
            scale=10,
            tileScale=16,
        )

        # Keep only the points that have a VH value and stamp each row with the date
        return (
            sampled
            .filter(ee.Filter.notNull(['VH']))
            .map(lambda feature: feature.set({"date": date}))
        )

    task = (
        ee_dates
        .map(get_sources)
        .filter(ee.Filter.gt('numberOfBands', 0))
        .map(reduce_composite)
        .flatten()
    )

    task_name = f"All_temporal_non_resample_at_all_region_{region_id}_dates_{len(dates)}_points_{len(points)}_with_date"

    ee_task = ee.batch.Export.table.toDrive(
        collection=task,
        folder="INDONESIA_GWL",
        description=task_name,
        selectors=base_selectors + s1_selectors + gldas_selectors + gpm_selectors,
    )

    # The task is submitted by default; call with start=False for a dry run
    if start:
        ee_task.start()

    print(task_name)

# Request the temporal variables of every region. A plain loop is used because the
# function is called for its side effects and returns nothing worth collecting.
for region_id in gdf_regions.id.unique():
    get_temporal_explanatory(region_id)

All_temporal_non_resample_at_all_region_1_dates_520_points_24_with_date
All_temporal_non_resample_at_all_region_2_dates_1773_points_149_with_date
All_temporal_non_resample_at_all_region_3_dates_479_points_1_with_date
All_temporal_non_resample_at_all_region_4_dates_988_points_348_with_date
All_temporal_non_resample_at_all_region_5_dates_1796_points_717_with_date
All_temporal_non_resample_at_all_region_6_dates_489_points_43_with_date
All_temporal_non_resample_at_all_region_7_dates_1274_points_477_with_date
All_temporal_non_resample_at_all_region_8_dates_1671_points_221_with_date
All_temporal_non_resample_at_all_region_9_dates_379_points_17_with_date
All_temporal_non_resample_at_all_region_10_dates_846_points_77_with_date


[None, None, None, None, None, None, None, None, None, None]

## Get "yearly" temporal explanatory variables (Hansen)

In [None]:
from gee_scripts.get_sources import get_hansen

In [None]:
hansen_selectors = ["year", "B3","B4","B5","B7","ndvi","ndmi","ndbri"]

# Get all the years from the field data (2013 is deliberately skipped)
years = sorted(y for y in df.date.dt.year.unique() if y != 2013)

# The stations do not depend on the year, so the point collection is built only once
points = df[["id", "lon", "lat"]].drop_duplicates()
points = gpd.GeoDataFrame(points, geometry=gpd.points_from_xy(points.lon, points.lat), crs="EPSG:4326")
ee_points = ee.FeatureCollection(points.__geo_interface__)

for year in years:

    task_name = f"Hansen_year_{year}_points_{len(points)}_f"

    image = get_hansen(year)

    # Sample the yearly image at every station and tag each row with the year
    result = image.reduceRegions(**{
        "collection" : ee_points,
        "reducer" : ee.Reducer.first(),
        "scale" : 30,
        "tileScale" : 16
    }).map(lambda feature: feature.set("year", str(year)))

    ee_task = ee.batch.Export.table.toDrive(**{
        "collection": result,
        "folder" : "INDONESIA_GWL",
        "description": task_name,
        "selectors": base_selectors + hansen_selectors
    })

    print(task_name)

    # Uncomment to start the task
    # ee_task.start()

## Get non temporal explanatory variables (others)

In [None]:
from gee_scripts.get_sources import get_srtm, get_globcover, get_gedi, get_gldas_stats

In [None]:
# All the stations are requested in a single call here, so neither the regions nor
# the points are filtered by region.
region = gdf_regions.to_crs("EPSG:4326")
ee_region = ee.FeatureCollection(region.__geo_interface__)

points = (
    gdf_unique_coords[["id_left", "geometry"]]
    .rename(columns={"id_left": "id"})
    .to_crs("EPSG:4326")
)
ee_points = ee.FeatureCollection(points.__geo_interface__)

len(points)

In [None]:
# Stack every non-temporal source into a single multi-band image
composite = get_srtm()
composite = composite.addBands(get_globcover())
composite = composite.addBands(get_gedi(ee_region))
composite = composite.addBands(get_gldas_stats(ee_region))

# Ask the server for the band names (the value is not displayed because this is
# not the last expression of the cell)
composite.bandNames().getInfo()

# Sample the image at every station and keep the points that have a canopy height
sampled_points = composite.reduceRegions(
    collection=ee_points,
    reducer=ee.Reducer.first(),
    scale=10,
    tileScale=16,
)
result = sampled_points.filter(ee.Filter.notNull(['canopy_height']))

ee_task = ee.batch.Export.table.toDrive(
    collection=result,
    folder="INDONESIA_GWL",
    description=f"All_Non_temporal_points_{len(points)}",
    selectors=base_selectors + ['elevation', 'aspect', 'slope', 'land_cov', 'canopy_height', "gldas_mean", "gldas_stddev"],
)

# Uncomment to start the task
# ee_task.start()

# Merge explanatory variables

## - Read temporal variables

In [None]:
from pathlib import Path

In [None]:
# Folder from which the exported tables are read
explanatory_path = Path("data/7_training_data/")

# (region id, number of dates, number of points) of each export made for the highly
# correlated stations; the three numbers are part of the exported file name.
high_corr_exports = [
    (1, 485, 2),
    (2, 626, 11),
    (3, 1737, 13),
    (4, 653, 12),
    (5, 1542, 21),
    (6, 479, 1),
]
temporal_file_names = [
    f"High_corr_All_temporal_non_resample_region_{region}_dates_{n_dates}_points_{n_points}.csv"
    for region, n_dates, n_points in high_corr_exports
]

# Exports made for all the stations (swap with the list above to use them)
# temporal_file_names = [
#     "All_temporal_non_resample_region_10_dates_846_points_77.csv",
#     "All_temporal_non_resample_region_1_dates_520_points_24.csv",
#     "All_temporal_non_resample_region_2_dates_1773_points_149.csv",
#     "All_temporal_non_resample_region_3_dates_479_points_1.csv",
#     "All_temporal_non_resample_region_4_dates_988_points_348.csv",
#     "All_temporal_non_resample_region_5_dates_1796_points_717.csv",
#     "All_temporal_non_resample_region_6_dates_489_points_43.csv",
#     "All_temporal_non_resample_region_7_dates_1274_points_477.csv",
#     "All_temporal_non_resample_region_8_dates_1671_points_221.csv",
#     "All_temporal_non_resample_region_9_dates_379_points_17.csv",
# ]

In [None]:
# Columns kept from each temporal export once the date has been attached
temporal_cols = [
    'id', 'lat', 'lon', "date", 'LIA', 'VH', 'VV', 'VVVH_ratio', 'angle',
    'sm_1', 'sm_3', 'sm_7', 'sm_30', 'precipitation',
    'prec_3', 'prec_7', 'prec_30'
]

def add_date_to_explanatory_df(region_id, explain_df):
    """Attach the measurement date to each row of a temporal export.

    The leading number of the "system:index" column (the part before the first "_")
    is used as a position in the list of unique dates of the region's stations, and
    the date found at that position is attached to the row.

    NOTE(review): the list of dates is rebuilt here from the notebook globals `df`
    and `gdf_unique_coords`; it must be identical (same values, same order) to the
    one used when the export was requested - confirm before trusting the result.

    Args:
        region_id: region id (as found in `gdf_unique_coords.id_right`) the export belongs to.
        explain_df: table read from the export. It is left untouched; a "date"
            column already present in it is ignored and replaced.

    Returns:
        A new DataFrame restricted to `temporal_cols`, holding the rows whose date
        position exists in the region's list of dates.
    """

    # Unique dates of the stations located in the region, numbered by position
    region_station_ids = gdf_unique_coords.loc[
        gdf_unique_coords.id_right == region_id, "id_left"
    ].unique()
    region_dates = df.loc[df.id.isin(region_station_ids), "date"].unique()
    dates = pd.DataFrame({"date": region_dates}).rename_axis("date_idx").reset_index()

    # Work on a copy so the caller's frame is not modified. A pre-existing "date"
    # column is dropped: merging it would yield date_x/date_y and break the selection.
    explain_indexed = explain_df.drop(columns=["date"], errors="ignore").assign(
        date_idx=explain_df["system:index"].str.split("_").str[0].astype(int)
    )

    return explain_indexed.merge(dates, on="date_idx")[temporal_cols]



# get and concatenate all the dataframes
def _region_id_from_file_name(file_name):
    """Return the integer region id embedded in an export file name (`..._region_<id>_...`)."""
    match = re.search(r"region_(\d+)_", file_name)
    if match is None:
        raise ValueError(f"Cannot find the region id in file name: {file_name!r}")
    return int(match.group(1))


# Read every temporal export, attach its dates and stack all the regions together.
# The region id is taken from the file name rather than from the position of the file
# in the list, so reordering `temporal_file_names` cannot pair a file with the dates
# of another region.
temp_explanatory_dfs = pd.concat(
    [
        add_date_to_explanatory_df(
            _region_id_from_file_name(file_name),
            pd.read_csv(explanatory_path / file_name),
        )
        for file_name in temporal_file_names
    ],
    axis=0,
)

In [None]:
# Spot check: this LIA value was compared with the one from the latest request,
# in which the date was assigned directly
spot_check_lia = temp_explanatory_dfs.loc[
    (temp_explanatory_dfs.id == "15_RAPP_TP-I-53") & (temp_explanatory_dfs.date == "2022-11-22"),
    "LIA",
]
assert spot_check_lia.values[0] == 33.80583090268845

In [None]:
# Check the date assignment: the years found for this station must be exactly these
station_years = temp_explanatory_dfs.loc[temp_explanatory_dfs.id == "BRG_910111_01", "date"].dt.year
assert station_years.unique().tolist() == [2018, 2019, 2020]

## Read Hansen yearly variables

In [None]:
from pathlib import Path

In [None]:
explanatory_path = Path("data/7_training_data/")

# One Hansen export per year, from 2018 to 2023
hansen_file_names = [f"Hansen_year_{year}_points_2075_f.csv" for year in range(2018, 2024)]

# Stack the yearly tables into a single one
hansen_df = pd.concat(
    [pd.read_csv(explanatory_path.joinpath(name)) for name in hansen_file_names],
    axis=0,
)

## - Read non temporal explanatory

In [None]:
# The non-temporal variables do not change with the date: each point has a single set
# of values, which simply has to be repeated for every date of that point
# (e.g. a point with 10 dates gets the same non-temporal values on its 10 rows).

non_temporal_file_name = "All_Non_temporal_points_2074.csv"
non_temporal_df = pd.read_csv(explanatory_path.joinpath(non_temporal_file_name))

# Create final explanatory variables dataset

In [None]:
# Display the concatenated Hansen table that is merged with the other variables below
hansen_df

In [None]:
# Merge the non-temporal variables with the temporal ones.
# Only the station id and the variable columns are taken from the non-temporal table:
# any other column it shares with the temporal table (e.g. lat, lon or date) would be
# duplicated with _x/_y suffixes by the merge and break the `date` access below.
non_temporal_vars = ['elevation', 'aspect', 'slope', 'land_cov', 'canopy_height', "gldas_mean", "gldas_stddev"]
explanatory_df = temp_explanatory_dfs.merge(non_temporal_df[["id"] + non_temporal_vars], on="id")

# Merge the Hansen data on station id and measurement year
explanatory_df["year"] = explanatory_df.date.dt.year
hansen_df["year"] = hansen_df["year"].astype(int)
explanatory_df = explanatory_df.merge(hansen_df[["id"] + hansen_selectors], on=["id", "year"], how="left")

# Note: Hansen was requested for all the years, so `hansen_df` holds more
# (id, year) pairs than the ones matched here
explanatory_df

In [None]:
# Columns of the explanatory table written to the final dataset, grouped by origin
export_vars = (
    # keys used to match the response variable
    ['id', 'date']
    # temporal variables (same names as the s1 / gldas / gpm selectors)
    + ['LIA', 'VH', 'VV', 'VVVH_ratio', 'angle']
    + ['sm_1', 'sm_3', 'sm_7', 'sm_30']
    + ['precipitation', 'prec_3', 'prec_7', 'prec_30']
    # non-temporal variables
    + ['elevation', 'aspect', 'slope', 'land_cov', 'canopy_height', 'gldas_mean', 'gldas_stddev']
    # yearly Hansen variables
    + ['B3', 'B4', 'B5', 'B7', 'ndvi', 'ndmi', 'ndbri']
)

# FINAL STEP: Merge explanatory variables with response variable

In [None]:
# Join the response variable (gwl_cm) with the explanatory variables of the same
# station and date (default inner merge: only rows found in both tables are kept),
# then add the day of the year as an extra variable.
response_cols = ["id", "date", "gwl_cm", "lat", "lon"]
explanatory_with_response_var = (
    df[response_cols]
    .merge(explanatory_df[export_vars], on=["id", "date"])
    .assign(doy=lambda merged: merged.date.dt.dayofyear)
)

explanatory_with_response_var.to_csv("data/7_training_data/explanatory_with_response_var.csv", index=False)

In [None]:
# List the columns of the final table (response variable plus explanatory variables)
explanatory_with_response_var.columns