In [1]:
# The following notebook will get two different datasets of explanatory variables: temporal an non-temporal
# related. 
# In order to improve the speed time, this notebook will create the respective datasets and it will send a task to 
# EarthEngine with a ReduceByRegion operation, we have proved that this method is faster than using the individual
# calls to the API.

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
from pathlib import Path
import ee
from gee_scripts.get_sources import get_s1_image, get_gldas, get_gpm, get_hansen
from gee_scripts.get_sources import get_srtm, get_globcover, get_gedi, get_gldas_stats
ee.Initialize()

The objective would be to loop over the points or the dates...<br>
After testing this script https://code.earthengine.google.com/b18e876cca44266be704924b7354ddff <br>
I found out that the best way to do it is to loop over the dates, and then pass the reduceregions. <br>


In [4]:
df = pd.read_csv('data/field_data_unique_coords.csv')
df["date"] = pd.to_datetime(df["date"])
high_corr_ids = pd.read_csv('data/ids_high_corr.csv')
len(df)
df

Unnamed: 0,id,date,source,lon,lat,gwl_cm
0,02_AHL_SBG-B076,2020-11-05,wal,117.007750,3.937760,-37.00
1,02_AHL_SBG-B076,2020-11-17,wal,117.007750,3.937760,-39.00
2,02_AHL_SBG-B076,2020-12-05,wal,117.007750,3.937760,-39.00
3,02_AHL_SBG-B076,2020-12-16,wal,117.007750,3.937760,-35.00
4,02_AHL_SBG-B076,2021-01-02,wal,117.007750,3.937760,-34.00
...,...,...,...,...,...,...
267198,kecil1,2019-10-26,old_brg,113.805611,-2.856089,-30.21
267199,kecil1,2019-10-27,old_brg,113.805611,-2.856089,-30.23
267200,kecil1,2019-10-31,old_brg,113.805611,-2.856089,-30.23
267201,kecil1,2019-11-02,old_brg,113.805611,-2.856089,-30.23


##########################
## Set type of output
##########################

In [5]:
# dataset can be either "all" or "high_corr"
# The difference here is just the number of points that we'll order to GEE
# As we did a test with only "high_corr" datasets, we'll use everything
dataset = "all"

# This notebook can be run entirely, if we want to task the orders to GEE we'll set this variable to True
send_task = False

##################################3

In [6]:
unique_coords = df[df.id.isin(high_corr_ids.id)][["id", "lon", "lat"]].drop_duplicates()
unique_coords.head()
len(unique_coords)

# Convert them as a geodataframe and save them
geometry = [Point(xy) for xy in zip(unique_coords.lon, unique_coords.lat)]
gdf = gpd.GeoDataFrame(unique_coords, geometry=geometry)

In [7]:
# Read regions shapefile


# I have two regions, first is to only the high correlated stations and the second is all the stations
# I will use either depending on the dataset we have selected above

shp_path = Path("data/0_shp/")
region_path = "regions_to_request_explanatory_all.gpkg" if dataset == "all" else "regions_to_request_explanatory.gpkg"

gdf_regions = gpd.GeoDataFrame.from_file(shp_path/region_path)
gdf_regions

Unnamed: 0,region_id,geometry
0,1,"MULTIPOLYGON (((96.37854 4.01317, 96.76923 3.9..."
1,2,"MULTIPOLYGON (((102.96446 -0.63790, 104.82488 ..."
2,3,"MULTIPOLYGON (((140.00836 -7.80760, 140.75163 ..."
3,4,"MULTIPOLYGON (((105.23245 -2.56075, 105.62785 ..."
4,5,"MULTIPOLYGON (((101.59551 1.61281, 101.45686 0..."
5,6,"MULTIPOLYGON (((100.69365 2.01094, 100.81080 2..."
6,7,"MULTIPOLYGON (((108.80424 1.60848, 109.83126 1..."
7,8,"MULTIPOLYGON (((110.98152 -2.86934, 114.00610 ..."
8,9,"MULTIPOLYGON (((132.99060 -0.68691, 133.43736 ..."
9,10,"MULTIPOLYGON (((116.84967 3.98347, 117.30926 3..."


In [8]:
df = pd.read_csv('data/field_data_unique_coords.csv')
df["date"] = pd.to_datetime(df.date)

if dataset == "high_corr":
    high_corr_ids = pd.read_csv('data/ids_high_corr.csv')
    df = df[df['id'].isin(high_corr_ids['id'])]

# Remove those date where the gwl measure is out of reasonable range
upper_thres = 20
lower_thres = -100

df = df[(df.gwl_cm < upper_thres) & (df.gwl_cm > lower_thres)]

# Get the coordinates of the individual points

unique_coords = df[["id", 'lon', 'lat']].drop_duplicates()
len(df), len(unique_coords)

(254906, 2072)

In [9]:
# Create geodataframe from x y coordinates
gdf_unique_coords = gpd.GeoDataFrame(unique_coords, geometry=gpd.points_from_xy(unique_coords.lon, unique_coords.lat), crs="EPSG:4326")


# Add the region id to each point
gdf_unique_coords = gpd.sjoin(gdf_unique_coords, gdf_regions[["region_id", "geometry"]], how="left", predicate="within")

In [10]:
# Failing datasets array(['BRG_150710_01', 'BRG_621107_03'], dtype=object)
gdf_unique_coords[gdf_unique_coords.id.isin(["BRG_621107_03", "BRG_150710_01"])]

Unnamed: 0,id,lon,lat,geometry,index_right,region_id
222630,BRG_150710_01,103.900168,-1.274317,POINT (103.90017 -1.27432),1,2
245933,BRG_621107_03,114.2206,-2.654022,POINT (114.22060 -2.65402),7,8


## Get temporal explanatory variables

In [11]:
gdf_regions

Unnamed: 0,region_id,geometry
0,1,"MULTIPOLYGON (((96.37854 4.01317, 96.76923 3.9..."
1,2,"MULTIPOLYGON (((102.96446 -0.63790, 104.82488 ..."
2,3,"MULTIPOLYGON (((140.00836 -7.80760, 140.75163 ..."
3,4,"MULTIPOLYGON (((105.23245 -2.56075, 105.62785 ..."
4,5,"MULTIPOLYGON (((101.59551 1.61281, 101.45686 0..."
5,6,"MULTIPOLYGON (((100.69365 2.01094, 100.81080 2..."
6,7,"MULTIPOLYGON (((108.80424 1.60848, 109.83126 1..."
7,8,"MULTIPOLYGON (((110.98152 -2.86934, 114.00610 ..."
8,9,"MULTIPOLYGON (((132.99060 -0.68691, 133.43736 ..."
9,10,"MULTIPOLYGON (((116.84967 3.98347, 117.30926 3..."


In [12]:
base_selectors = ["system:index", "lat", "lon", "id", "date"]
s1_selectors = ["LIA", "VH", "VV", "VVVH_ratio", "angle"]
gldas_selectors = ['sm_1', 'sm_3', 'sm_7', 'sm_30']
gpm_selectors = ['precipitation', 'prec_3', 'prec_7', 'prec_30']

def get_temporal_explanatory(region_id):
   """Get the explanatory temporal based variables"""

   region = gdf_regions[gdf_regions.region_id == region_id].to_crs("EPSG:4326")[:]
   dates = df[df.id.isin(gdf_unique_coords[gdf_unique_coords.region_id == region_id].id.unique())].date.unique()
   points = gdf_unique_coords[gdf_unique_coords.region_id == region_id][["region_id", "geometry", "lat", "lon"]].rename(columns={"region_id": "id"}).to_crs("EPSG:4326")

   # print(len(dates), len(points))
   # Convert to ee elements

   ee_dates = ee.FeatureCollection(ee.List([ ee.Feature(None, {"date": date}) for date in dates]))
   ee_points = ee.FeatureCollection(points.__geo_interface__)
   ee_region = ee.FeatureCollection(region.__geo_interface__)

   def get_sources(date_feature):
      
      date_range = ee.Date(date_feature.get("date")).getRange('day')

      s1_composite = get_s1_image(date_range, ee_region)

      return s1_composite.set({
         "numberOfBands" : s1_composite.bandNames().size(),
         "date" : ee.Date(date_feature.get("date"))
         })
      
   def reduce_composite(composite):
      
      # Filter the extra data with the matching date
      date = composite.get("date")
      date_range = ee.Date(date).getRange('day')

      gldas_composite = get_gldas(date_range, ee_region)
      gpm_composite = get_gpm(date_range, ee_region)

      composite = (ee.Image(composite)
            .addBands(gldas_composite)
            .addBands(gpm_composite)
      )
      
      return composite.reduceRegions(**{
         "collection" : ee_points,
         "reducer" : ee.Reducer.first(),
         "scale" : 10,
         "tileScale" : 16
      }).filter(ee.Filter.notNull(['VH'])).map(lambda feature: feature.set({
         "date" : date
      }))


   task = (ee_dates
         .map(get_sources)
         .filter(ee.Filter.gt('numberOfBands', 0))
         .map(reduce_composite).flatten()
   )

   task_name = f"All_temporal_non_resample_at_all_region_{region_id}_dates_{len(dates)}_points_{len(points)}_with_date_lon_lat"

   ee_task = ee.batch.Export.table.toDrive(**{
      "collection": task, 
      "folder" : "INDONESIA_GWL",
      "description": task_name,
      "selectors": base_selectors + s1_selectors + gldas_selectors + gpm_selectors
   })

   # Uncoment to start the task
   not send_task or ee_task.start()
   not send_task or print(task_name)

[get_temporal_explanatory(region_id) for region_id in gdf_regions.region_id.unique() if region_id == 5]

[None]

## Get "yearly" temporal explanatory variables (Hansen)

In [13]:
hansen_selectors = ["year", "B3","B4","B5","B7","ndvi","ndmi","ndbri"]

# get all the years from the field data
years = sorted([y for y in df.date.dt.year.unique() if y != 2013] )

for year in years:

    points = df[["id", "lon", "lat"]].drop_duplicates()
    points = gpd.GeoDataFrame(points, geometry=gpd.points_from_xy(points.lon, points.lat), crs="EPSG:4326")
    ee_points = ee.FeatureCollection(points.__geo_interface__)

    image = get_hansen(year)

    result = image.reduceRegions(**{
        "collection" : ee_points,
        "reducer" : ee.Reducer.first(),
        "scale" : 30,
        "tileScale" : 16
    }).map(lambda feature: feature.set("year", str(year)))

    ee_task = ee.batch.Export.table.toDrive(**{
        "collection": result, 
        "folder" : "INDONESIA_GWL",
        "description": f"Hansen_year_{year}_points_{len(points)}_f",
        "selectors": base_selectors + hansen_selectors
    })

    not send_task or ee_task.start()
    not send_task or print(f"Hansen_year_{year}_points_{len(points)}_f")

## Get non temporal explanatory variables (others)

In [14]:
# This dataset is not too computational expensive, so we are not forced to chunk it
# We'll try to get all the points at once, not by region (so we won't filter by region)
region = gdf_regions.to_crs("EPSG:4326")[:]
ee_region = ee.FeatureCollection(region.__geo_interface__)
points = gdf_unique_coords[["region_id", "geometry"]].rename(columns={"region_id": "id"}).to_crs("EPSG:4326")
ee_points = ee.FeatureCollection(points.__geo_interface__)
len(points)

2072

In [15]:
composite = (
    get_srtm()
        .addBands(get_globcover())
        .addBands(get_gedi(ee_region))
        .addBands(get_gldas_stats(ee_region))
)
composite.bandNames().getInfo()

result = composite.reduceRegions(**{
    "collection" : ee_points,
    "reducer" : ee.Reducer.first(),
    "scale" : 10,
    "tileScale" : 16
}).filter(ee.Filter.notNull(['canopy_height']))

ee_task = ee.batch.Export.table.toDrive(**{
    "collection": result, 
    "folder" : "INDONESIA_GWL",
    "description": f"All_Non_temporal_points_{len(points)}",
    "selectors": base_selectors + ['elevation', 'aspect', 'slope', 'land_cov', 'canopy_height', "gldas_mean", "gldas_stddev"]
})

# Uncoment to start the task
not send_task or ee_task.start()

True

# Merge explanatory variables

## - Read temporal variables

In [16]:
from pathlib import Path
import pandas as pd

In [140]:
explanatory_path = Path("data/7_training_data/")
dataset = "all"
temporal_file_names_groups = {
    "high_corr" : [
        "High_corr_All_temporal_non_resample_region_1_dates_485_points_2.csv",
        "High_corr_All_temporal_non_resample_region_2_dates_626_points_11.csv",
        "High_corr_All_temporal_non_resample_region_3_dates_1737_points_13.csv",
        "High_corr_All_temporal_non_resample_region_4_dates_653_points_12.csv",
        "High_corr_All_temporal_non_resample_region_5_dates_1542_points_21.csv",
        "High_corr_All_temporal_non_resample_region_6_dates_479_points_1.csv",
    ],
    "all" : [
        "All_temporal_non_resample_at_all_region_1_dates_520_points_24_with_date_lon_lat.csv",
        "All_temporal_non_resample_at_all_region_2_dates_1773_points_148_with_date_lon_lat.csv",
        "All_temporal_non_resample_at_all_region_3_dates_479_points_1_with_date_lon_lat.csv",
        "All_temporal_non_resample_at_all_region_4_dates_988_points_348_with_date_lon_lat.csv",
        "All_temporal_non_resample_at_all_region_5_dates_1796_points_717_with_date.csv",
        "All_temporal_non_resample_at_all_region_6_dates_489_points_43_with_date_lon_lat.csv",
        "All_temporal_non_resample_at_all_region_7_dates_1274_points_477_with_date_lon_lat.csv",
        "All_temporal_non_resample_at_all_region_8_dates_1671_points_220_with_date_lon_lat.csv",
        "All_temporal_non_resample_at_all_region_9_dates_379_points_17_with_date_lon_lat.csv",
        "All_temporal_non_resample_at_all_region_10_dates_846_points_77_with_date_lon_lat.csv",
    ]
}

In [141]:
# Dataset is the name of the type of data we're using (high_corr or all) (it's assigned at the beginning of the notebook)
temporal_file_names = temporal_file_names_groups[dataset]

# get and concatenate all the dataframes
temp_explanatory_dfs = pd.concat([
            pd.read_csv(explanatory_path/file_name) 
            for file_name 
            in temporal_file_names
        ], 
)

temp_explanatory_dfs["date"] = pd.to_datetime(temp_explanatory_dfs["date"])
temp_explanatory_dfs

Unnamed: 0,system:index,lat,lon,id,date,LIA,VH,VV,VVVH_ratio,angle,sm_1,sm_3,sm_7,sm_30,precipitation,prec_3,prec_7,prec_30
0,13_40018,3.809687,96.451939,121_APC_A21,2021-11-18,41.518192,-12.389068,-4.963172,0.261232,36.802202,38.418999,38.516500,37.875393,38.052337,0.567273,0.296923,0.136061,0.480418
1,13_40056,3.811906,96.468786,121_APC_B09,2021-11-18,37.709442,-11.907431,-5.864731,0.194680,35.960575,38.418999,38.516500,37.875393,38.052337,0.567273,0.296923,0.136061,0.480418
2,13_40094,3.806685,96.459393,121_APC_B14,2021-11-18,35.779500,-12.803118,-5.593195,0.223412,35.960575,38.418999,38.516500,37.875393,38.052337,0.567273,0.296923,0.136061,0.480418
3,13_40132,3.855798,96.523820,121_APC_D11,2021-11-18,36.007309,-13.151474,-5.039535,0.264961,36.007309,38.687000,38.470375,37.954553,38.176337,0.480000,0.242308,0.110308,0.418293
4,13_40170,3.850565,96.515498,121_APC_D15,2021-11-18,37.853025,-14.973237,-4.171970,0.350833,36.007309,38.687000,38.470375,37.954553,38.176337,0.480000,0.242308,0.110308,0.418293
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5190,840_5023,3.562450,117.137710,02_AHL_SSP-F153,2023-01-22,35.852239,-16.009284,-7.527672,0.151633,35.958145,42.348999,37.115084,37.660375,38.239854,0.022500,0.007826,0.448148,0.610195
5191,840_5088,3.542370,117.130950,02_AHL_SSP-F173,2023-01-22,35.036420,-12.622485,-7.525078,0.122134,35.958125,42.348999,37.115084,37.660375,38.239854,0.022500,0.007826,0.448148,0.610195
5192,840_5153,3.598300,117.201030,02_AHL_SSP-G032,2023-01-22,39.640818,-13.096971,-8.278228,0.099642,35.132320,42.348999,37.115084,37.660375,38.239854,0.030000,0.013750,0.303273,0.510766
5193,840_5218,3.601620,117.193570,02_AHL_SSP-G044,2023-01-22,35.764655,-11.508965,-8.861900,0.059312,35.132320,42.348999,37.115084,37.660375,38.239854,0.012500,0.004348,0.052407,0.567561


In [142]:
# Dataset is the name of the type of data we're using (high_corr or all) (it's assigned at the beginning of the notebook)
temporal_file_names = temporal_file_names_groups[dataset]

# get and concatenate all the dataframes
temp_explanatory_dfs = pd.concat([
            pd.read_csv(explanatory_path/file_name) 
            for file_name 
            in temporal_file_names
        ], 
)

temp_explanatory_dfs["date"] = pd.to_datetime(temp_explanatory_dfs["date"])
temp_explanatory_dfs

Unnamed: 0,system:index,lat,lon,id,date,LIA,VH,VV,VVVH_ratio,angle,sm_1,sm_3,sm_7,sm_30,precipitation,prec_3,prec_7,prec_30
0,13_40018,3.809687,96.451939,121_APC_A21,2021-11-18,41.518192,-12.389068,-4.963172,0.261232,36.802202,38.418999,38.516500,37.875393,38.052337,0.567273,0.296923,0.136061,0.480418
1,13_40056,3.811906,96.468786,121_APC_B09,2021-11-18,37.709442,-11.907431,-5.864731,0.194680,35.960575,38.418999,38.516500,37.875393,38.052337,0.567273,0.296923,0.136061,0.480418
2,13_40094,3.806685,96.459393,121_APC_B14,2021-11-18,35.779500,-12.803118,-5.593195,0.223412,35.960575,38.418999,38.516500,37.875393,38.052337,0.567273,0.296923,0.136061,0.480418
3,13_40132,3.855798,96.523820,121_APC_D11,2021-11-18,36.007309,-13.151474,-5.039535,0.264961,36.007309,38.687000,38.470375,37.954553,38.176337,0.480000,0.242308,0.110308,0.418293
4,13_40170,3.850565,96.515498,121_APC_D15,2021-11-18,37.853025,-14.973237,-4.171970,0.350833,36.007309,38.687000,38.470375,37.954553,38.176337,0.480000,0.242308,0.110308,0.418293
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5190,840_5023,3.562450,117.137710,02_AHL_SSP-F153,2023-01-22,35.852239,-16.009284,-7.527672,0.151633,35.958145,42.348999,37.115084,37.660375,38.239854,0.022500,0.007826,0.448148,0.610195
5191,840_5088,3.542370,117.130950,02_AHL_SSP-F173,2023-01-22,35.036420,-12.622485,-7.525078,0.122134,35.958125,42.348999,37.115084,37.660375,38.239854,0.022500,0.007826,0.448148,0.610195
5192,840_5153,3.598300,117.201030,02_AHL_SSP-G032,2023-01-22,39.640818,-13.096971,-8.278228,0.099642,35.132320,42.348999,37.115084,37.660375,38.239854,0.030000,0.013750,0.303273,0.510766
5193,840_5218,3.601620,117.193570,02_AHL_SSP-G044,2023-01-22,35.764655,-11.508965,-8.861900,0.059312,35.132320,42.348999,37.115084,37.660375,38.239854,0.012500,0.004348,0.052407,0.567561


In [143]:
# drop duplicate id and date
temp_explanatory_dfs.drop_duplicates(["id", "date"])

Unnamed: 0,system:index,lat,lon,id,date,LIA,VH,VV,VVVH_ratio,angle,sm_1,sm_3,sm_7,sm_30,precipitation,prec_3,prec_7,prec_30
0,13_40018,3.809687,96.451939,121_APC_A21,2021-11-18,41.518192,-12.389068,-4.963172,0.261232,36.802202,38.418999,38.516500,37.875393,38.052337,0.567273,0.296923,0.136061,0.480418
1,13_40056,3.811906,96.468786,121_APC_B09,2021-11-18,37.709442,-11.907431,-5.864731,0.194680,35.960575,38.418999,38.516500,37.875393,38.052337,0.567273,0.296923,0.136061,0.480418
2,13_40094,3.806685,96.459393,121_APC_B14,2021-11-18,35.779500,-12.803118,-5.593195,0.223412,35.960575,38.418999,38.516500,37.875393,38.052337,0.567273,0.296923,0.136061,0.480418
3,13_40132,3.855798,96.523820,121_APC_D11,2021-11-18,36.007309,-13.151474,-5.039535,0.264961,36.007309,38.687000,38.470375,37.954553,38.176337,0.480000,0.242308,0.110308,0.418293
4,13_40170,3.850565,96.515498,121_APC_D15,2021-11-18,37.853025,-14.973237,-4.171970,0.350833,36.007309,38.687000,38.470375,37.954553,38.176337,0.480000,0.242308,0.110308,0.418293
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5190,840_5023,3.562450,117.137710,02_AHL_SSP-F153,2023-01-22,35.852239,-16.009284,-7.527672,0.151633,35.958145,42.348999,37.115084,37.660375,38.239854,0.022500,0.007826,0.448148,0.610195
5191,840_5088,3.542370,117.130950,02_AHL_SSP-F173,2023-01-22,35.036420,-12.622485,-7.525078,0.122134,35.958125,42.348999,37.115084,37.660375,38.239854,0.022500,0.007826,0.448148,0.610195
5192,840_5153,3.598300,117.201030,02_AHL_SSP-G032,2023-01-22,39.640818,-13.096971,-8.278228,0.099642,35.132320,42.348999,37.115084,37.660375,38.239854,0.030000,0.013750,0.303273,0.510766
5193,840_5218,3.601620,117.193570,02_AHL_SSP-G044,2023-01-22,35.764655,-11.508965,-8.861900,0.059312,35.132320,42.348999,37.115084,37.660375,38.239854,0.012500,0.004348,0.052407,0.567561


## Read Hansen yearly variables

In [144]:
explanatory_path = Path("data/7_training_data/")
hansen_file_names = [
    "Hansen_year_2018_points_2075_f.csv",
    "Hansen_year_2019_points_2075_f.csv",
    "Hansen_year_2020_points_2075_f.csv",
    "Hansen_year_2021_points_2075_f.csv",
    "Hansen_year_2022_points_2075_f.csv",
    "Hansen_year_2023_points_2075_f.csv"
]

hansen_df = pd.concat([
    pd.read_csv(explanatory_path/file_name) 
    for file_name 
    in hansen_file_names
], axis=0)
hansen_df

Unnamed: 0,system:index,lat,lon,id,date,year,B3,B4,B5,B7,ndvi,ndmi,ndbri
0,0,1.519444,102.099167,BRG_140301_01,,2018,16,91,66,27,0.700935,0.159236,0.542373
1,1797,1.451944,102.181944,BRG_140301_02,,2018,19,91,66,28,0.654545,0.159236,0.529412
2,2376,1.511780,102.158660,BRG_140302_01,,2018,25,73,75,39,0.489796,-0.013514,0.303571
3,4173,1.516389,102.433056,BRG_140302_02,,2018,18,89,74,32,0.663551,0.092025,0.471074
4,5970,0.830883,102.354165,BRG_140802_02,,2018,25,58,73,46,0.397590,-0.114504,0.115385
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2070,273175,-2.573375,114.022576,ij2,,2023,18,80,51,20,0.632653,0.221374,0.600000
2071,273254,-1.238478,103.589975,jambi1,,2023,16,78,64,26,0.659574,0.098592,0.500000
2072,273497,-0.210225,109.394853,kalbar1,,2023,30,87,86,43,0.487179,0.005780,0.338462
2073,273890,-2.319728,114.058131,kalteng1,,2023,16,79,50,19,0.663158,0.224806,0.612245


## - Read non temporal explanatory

In [145]:
# As the non-temporal variables are the same for all the points, we just need to duplicate 
# their results into each of the dates of the points.
# i.e. 1 point with 10 dates will have the same non-temporal variables for each of the 10 dates.

non_temporal_file_name = "All_Non_temporal_points_2074.csv"
non_temporal_df = pd.read_csv(explanatory_path/non_temporal_file_name)
# drop lat and lon
non_temporal_df = non_temporal_df.drop(columns=["lat", "lon"])
non_temporal_df

Unnamed: 0,system:index,id,elevation,aspect,slope,land_cov,canopy_height,gldas_mean,gldas_stddev
0,0,BRG_140301_01,12,26.573112,2.073173,40,15,32.506657,1.989591
1,1799,BRG_140301_02,13,270.000000,0.927708,40,6,32.530491,1.982611
2,2530,BRG_140302_01,13,270.000000,1.854979,30,4,32.526196,1.994038
3,4174,BRG_140302_02,13,165.959034,3.818558,40,13,32.566113,1.973969
4,5970,BRG_140802_02,20,239.038901,5.392593,160,0,32.519711,1.973292
...,...,...,...,...,...,...,...,...,...
2069,272775,brg6,7,314.990916,1.311649,40,0,31.976336,2.483077
2070,273254,jambi1,17,-0.000000,1.854334,30,10,31.922771,2.548600
2071,273497,kalbar1,8,243.435103,2.073039,12,3,32.688313,2.437983
2072,273890,kalteng1,8,116.546267,2.074387,40,10,31.701403,3.461483


# Create final explanatory variables dataset

In [146]:
len(temp_explanatory_dfs)

383290

In [147]:
explanatory_df = temp_explanatory_dfs.merge(non_temporal_df, on="id")
len(explanatory_df)

383668

In [148]:
# Merge the non-temporal variables with the temporal ones

explanatory_df = temp_explanatory_dfs.merge(non_temporal_df, on="id")
len(explanatory_df)

# Merge hansen data with year and id
explanatory_df["year"] = explanatory_df.date.dt.year
hansen_df["year"] = hansen_df["year"].astype(int)
explanatory_df = explanatory_df.merge(hansen_df[["id"] + hansen_selectors], on=["id", "year"], how="left")

# I get more values here because I have requested Hansen for all the years
explanatory_df

Unnamed: 0,system:index_x,lat,lon,id,date,LIA,VH,VV,VVVH_ratio,angle,...,gldas_mean,gldas_stddev,year,B3,B4,B5,B7,ndvi,ndmi,ndbri
0,13_40018,3.809687,96.451939,121_APC_A21,2021-11-18,41.518192,-12.389068,-4.963172,0.261232,36.802202,...,36.231850,2.540359,2021,18,93,69,28,0.675676,0.148148,0.537190
1,26_40018,3.809687,96.451939,121_APC_A21,2022-06-10,41.228146,-12.835561,-5.448382,0.233155,36.512148,...,36.231850,2.540359,2022,18,93,69,28,0.675676,0.148148,0.537190
2,30_40018,3.809687,96.451939,121_APC_A21,2023-02-11,33.772167,-12.627489,-6.298591,0.179892,38.231369,...,36.231850,2.540359,2023,18,93,69,28,0.675676,0.148148,0.537190
3,31_40018,3.809687,96.451939,121_APC_A21,2023-02-23,33.771343,-14.834075,-5.877339,0.225530,38.230583,...,36.231850,2.540359,2023,18,93,69,28,0.675676,0.148148,0.537190
4,48_40018,3.809687,96.451939,121_APC_A21,2021-08-26,41.515096,-12.292229,-4.994940,0.257607,36.799110,...,36.231850,2.540359,2021,18,93,69,28,0.675676,0.148148,0.537190
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
384419,810_5283,3.601800,117.175400,02_AHL_SSP-G047,2023-03-23,31.246828,-13.393182,-10.248247,0.048664,35.960678,...,36.032196,1.873592,2023,28,51,80,43,0.291139,-0.221374,0.085106
384420,811_5283,3.601800,117.175400,02_AHL_SSP-G047,2023-05-22,31.245850,-14.819223,-9.667181,0.074998,35.959713,...,36.032196,1.873592,2023,28,51,80,43,0.291139,-0.221374,0.085106
384421,831_5283,3.601800,117.175400,02_AHL_SSP-G047,2021-10-11,30.421554,-13.218312,-7.836919,0.116892,35.135782,...,36.032196,1.873592,2021,14,77,46,18,0.692308,0.252033,0.621053
384422,833_5283,3.601800,117.175400,02_AHL_SSP-G047,2022-07-14,30.419886,-13.768306,-7.838126,0.122516,35.134108,...,36.032196,1.873592,2022,28,51,80,43,0.291139,-0.221374,0.085106


In [149]:
export_vars = [
    'id', 'date', 'LIA', 'VH', 'VV', 'VVVH_ratio',
    'angle', 'sm_1', 'sm_3', 'sm_7', 'sm_30', 'precipitation', 'prec_3',
    'prec_7', 'prec_30', 'elevation',
    'aspect', 'slope', 'land_cov', 'canopy_height', 'gldas_mean',
    'gldas_stddev', 'B3', 'B4',
    'B5', 'B7', 'ndvi', 'ndmi', 'ndbri'
]

# FINAL STEP: Merge explanatory variables with response variable

In [150]:
explanatory_with_response_var = df[["source", "id", "date", "gwl_cm", "lat", "lon"]].merge(explanatory_df[export_vars], on=["id", "date"])

# Add day of the year as a variable
explanatory_with_response_var["doy"] = explanatory_with_response_var.date.dt.dayofyear
explanatory_with_response_var.to_csv("data/7_training_data/explanatory_with_response_var_and_source.csv", index=False)

In [151]:
explanatory_with_response_var

Unnamed: 0,source,id,date,gwl_cm,lat,lon,LIA,VH,VV,VVVH_ratio,...,gldas_mean,gldas_stddev,B3,B4,B5,B7,ndvi,ndmi,ndbri,doy
0,wal,02_AHL_SBG-B076,2021-06-01,-41.00,3.937760,117.007750,32.785855,-11.481278,-5.556430,0.207099,...,35.463871,1.817817,21,97,61,25,0.644068,0.227848,0.590164,152
1,wal,02_AHL_SBG-B076,2021-08-24,-38.00,3.937760,117.007750,32.785295,-12.812067,-5.960235,0.201164,...,35.463871,1.817817,21,97,61,25,0.644068,0.227848,0.590164,236
2,wal,02_AHL_SBG-B076,2022-04-09,-22.00,3.937760,117.007750,32.795191,-18.437775,-10.636812,0.072032,...,35.463871,1.817817,33,70,96,52,0.359223,-0.156627,0.147541,99
3,wal,02_AHL_SBG-B076,2023-02-15,-31.00,3.937760,117.007750,32.793740,-13.051827,-9.007584,0.076149,...,35.463871,1.817817,33,70,96,52,0.359223,-0.156627,0.147541,46
4,wal,02_AHL_SBG-B101,2021-06-01,-66.00,3.931860,117.010120,39.267563,-9.778736,-6.648683,0.111111,...,35.655701,1.811218,21,75,57,25,0.562500,0.136364,0.500000,152
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34214,old_brg,kalteng1,2020-03-31,-2.71,-2.319728,114.058131,32.480674,-13.179518,-7.523130,0.128794,...,31.701403,3.461483,19,81,53,21,0.620000,0.208955,0.588235,91
34215,old_brg,kalteng1,2020-04-12,-1.14,-2.319728,114.058131,32.474638,-12.679995,-7.348031,0.130210,...,31.701403,3.461483,19,81,53,21,0.620000,0.208955,0.588235,103
34216,old_brg,kecil1,2019-03-01,-29.54,-2.856089,113.805611,35.143679,-10.922965,-7.574065,0.093967,...,31.040922,3.362376,24,76,69,32,0.520000,0.048276,0.407407,60
34217,old_brg,kecil1,2019-06-29,-29.22,-2.856089,113.805611,35.139013,-11.637249,-7.424729,0.112345,...,31.040922,3.362376,24,76,69,32,0.520000,0.048276,0.407407,180
