In [None]:
# This notebook builds two datasets of explanatory variables: temporal and non-temporal.
# To reduce the processing time, it prepares the inputs for each dataset and submits an export
# task to Earth Engine based on a reduceRegions operation; we found this to be faster than
# making individual calls to the API.

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
from pathlib import Path
import ee
from gee_scripts.get_sources import get_s1_image, get_gldas, get_gpm, get_hansen
from gee_scripts.get_sources import get_srtm, get_globcover, get_gedi, get_gldas_stats
# Initialise the Earth Engine client library for this session.
ee.Initialize()

The extraction could loop over either the points or the dates.<br>
After testing this script https://code.earthengine.google.com/b18e876cca44266be704924b7354ddff <br>
I found that the best approach is to loop over the dates and run `reduceRegions` on each one. <br>


# 1. Read data

In [None]:
# Load the field measurements and parse the `date` column into datetimes
field_data_path = Path("data") / "field_data_unique_coords.csv"
df = pd.read_csv(field_data_path)
df["date"] = pd.to_datetime(df.date)
len(df)
df

##########################
## Set type of output
##########################

In [None]:
# The notebook can be run end to end. The export tasks are always built, but they are only
# started on Earth Engine when this flag is True.
send_task = False

##########################

In [None]:
# One row per distinct station id / lon / lat combination
unique_coords = df[["id", "lon", "lat"]].drop_duplicates()

# Build a point GeoDataFrame from the lon/lat columns. The CRS is declared explicitly
# (EPSG:4326, like the other point tables built in this notebook) so the geometries are
# not left without a coordinate reference system.
gdf = gpd.GeoDataFrame(
    unique_coords,
    geometry=gpd.points_from_xy(unique_coords.lon, unique_coords.lat),
    crs="EPSG:4326",
)

In [None]:
# Load the region polygons used to split the data requests.
# Of the two region sets mentioned by the author (highly correlated stations only /
# all stations), this cell reads the file whose name ends in `_all`.
shp_path = Path("data/0_shp/")
region_path = "regions_to_request_explanatory_all.gpkg"

regions_file = shp_path / region_path
gdf_regions = gpd.read_file(regions_file)
gdf_regions

In [None]:
# Keep only the rows whose `gwl_cm` value lies strictly between the two thresholds
upper_thres = 20
lower_thres = -100

gwl_in_range = (df.gwl_cm > lower_thres) & (df.gwl_cm < upper_thres)
df = df[gwl_in_range]

# Distinct station coordinates that remain after the filter
coord_columns = ["id", "lon", "lat"]
unique_coords = df[coord_columns].drop_duplicates()
len(df), len(unique_coords)

In [None]:
# Turn the station coordinates into point geometries (lon/lat, EPSG:4326)
station_geometry = gpd.points_from_xy(unique_coords.lon, unique_coords.lat)
gdf_unique_coords = gpd.GeoDataFrame(unique_coords, geometry=station_geometry, crs="EPSG:4326")

# Attach to each point the `region_id` of the region polygon it lies within;
# the left join keeps the points that fall in no region.
region_polygons = gdf_regions[["region_id", "geometry"]]
gdf_unique_coords = gpd.sjoin(
    left_df=gdf_unique_coords,
    right_df=region_polygons,
    how="left",
    predicate="within",
)

In [None]:
# Look up the two stations noted as failing: 'BRG_150710_01' and 'BRG_621107_03'
failing_ids = ["BRG_621107_03", "BRG_150710_01"]
gdf_unique_coords.loc[gdf_unique_coords["id"].isin(failing_ids)]

## 1.1 Inspect the regions and the station points

In [None]:
# Display the regions table read from the GeoPackage
gdf_regions

In [None]:
# Display the station points together with the `region_id` added by the spatial join
gdf_unique_coords

# 2. Get temporal explanatory variables

In [None]:
# Column lists passed as `selectors` to the Earth Engine table exports in this notebook.
# `base_selectors` is prepended to every export; the other three are added for the temporal export.
# NOTE(review): the names suggest Sentinel-1, GLDAS soil moisture and GPM precipitation
# bands — confirm against gee_scripts.get_sources.
base_selectors = ["system:index", "lat", "lon", "id", "date"]
s1_selectors = ["LIA", "VH", "VV", "VVVH_ratio", "angle"]
gldas_selectors = ['sm_1', 'sm_3', 'sm_7', 'sm_30']
gpm_selectors = ['precipitation', 'prec_3', 'prec_7', 'prec_30']

def get_temporal_explanatory(region_id):
    """Build the temporal-variables export task for one region.

    For every measurement date of the stations in ``region_id`` an image is requested
    with ``get_s1_image``; dates whose image has no bands are discarded. The remaining
    images get the ``get_gldas`` and ``get_gpm`` bands added and are read at the station
    points. The flattened result is handed to a Drive table export, which is started
    only when the notebook-level ``send_task`` flag is truthy.

    Args:
        region_id: value of ``region_id`` used to select the region in ``gdf_regions``
            and the stations in ``gdf_unique_coords``.

    Returns:
        None. A status line containing the task name is printed.
    """
    target_crs = "EPSG:4326"

    # Client-side selection of the region, its stations and their measurement dates
    stations_in_region = gdf_unique_coords[gdf_unique_coords.region_id == region_id]
    region = gdf_regions[gdf_regions.region_id == region_id].to_crs(target_crs)
    dates = df[df.id.isin(stations_in_region.id.unique())].date.unique()
    points = stations_in_region[["id", "geometry", "lat", "lon"]].to_crs(target_crs)

    # Convert the client-side tables into Earth Engine collections
    date_features = [ee.Feature(None, {"date": day}) for day in dates]
    ee_dates = ee.FeatureCollection(ee.List(date_features))
    ee_points = ee.FeatureCollection(points.__geo_interface__)
    ee_region = ee.FeatureCollection(region.__geo_interface__)

    def build_s1_image(date_feature):
        """Request the S1 image for the feature's day and tag it with band count and date."""
        day = ee.Date(date_feature.get("date"))
        s1_image = get_s1_image(day.getRange('day'), ee_region)
        return s1_image.set({
            "numberOfBands": s1_image.bandNames().size(),
            "date": day,
        })

    def sample_at_points(composite):
        """Add the GLDAS and GPM bands of the image's day and read all bands at the points."""
        date = composite.get("date")
        day_range = ee.Date(date).getRange('day')

        stacked = (
            ee.Image(composite)
            .addBands(get_gldas(day_range, ee_region))
            .addBands(get_gpm(day_range, ee_region))
        )
        sampled = stacked.reduceRegions(
            collection=ee_points,
            reducer=ee.Reducer.first(),
            scale=10,
            tileScale=16,
        )
        # Drop the points without a VH value and stamp the remaining rows with the date
        return (
            sampled
            .filter(ee.Filter.notNull(['VH']))
            .map(lambda feature: feature.set({"date": date}))
        )

    task = (
        ee_dates
        .map(build_s1_image)
        .filter(ee.Filter.gt('numberOfBands', 0))
        .map(sample_at_points)
        .flatten()
    )

    task_name = f"All_temporal_non_resample_at_all_region_{region_id}_dates_{len(dates)}_points_{len(points)}_with_date_lon_lat"

    ee_task = ee.batch.Export.table.toDrive(
        collection=task,
        folder="INDONESIA_GWL",
        description=task_name,
        selectors=base_selectors + s1_selectors + gldas_selectors + gpm_selectors,
    )

    # Submit the export only when the notebook-level flag asks for it
    if send_task:
        ee_task.start()
    print("Exported" if send_task else "Not exported", task_name)

# Build (and, when `send_task` is set, start) one export per region. A plain loop is used
# because the function is called only for its side effects and returns nothing, so a list
# comprehension would just collect and display a list of None values.
for region_id in gdf_regions.region_id.unique():
    get_temporal_explanatory(region_id)

# 2.2 Get "yearly" temporal explanatory variables (Hansen)

In [None]:
hansen_selectors = ["year", "B3","B4","B5","B7","ndvi","ndmi","ndbri"]

# Years present in the field data, in ascending order. 2013 is excluded.
# NOTE(review): presumably no Hansen image is available for 2013 — confirm.
years = sorted(y for y in df.date.dt.year.unique() if y != 2013)

# The sampling points do not depend on the year, so they are built once, outside the loop.
points = df[["id", "lon", "lat"]].drop_duplicates()
points = gpd.GeoDataFrame(points, geometry=gpd.points_from_xy(points.lon, points.lat), crs="EPSG:4326")
ee_points = ee.FeatureCollection(points.__geo_interface__)

for year in years:

    image = get_hansen(year)

    # Read the image at every point and tag each row with the year (stored as a string)
    result = image.reduceRegions(**{
        "collection" : ee_points,
        "reducer" : ee.Reducer.first(),
        "scale" : 30,
        "tileScale" : 16
    }).map(lambda feature: feature.set("year", str(year)))

    task_name = f"Hansen_year_{year}_points_{len(points)}_f"

    # The description reuses `task_name` so the printed name and the task always match
    ee_task = ee.batch.Export.table.toDrive(**{
        "collection": result,
        "folder" : "INDONESIA_GWL",
        "description": task_name,
        "selectors": base_selectors + hansen_selectors
    })

    # Submit the export only when the notebook-level flag is set
    if send_task:
        ee_task.start()
    print("Exported" if send_task else "Not exported", task_name)


# 3. Get non-temporal explanatory variables (others)

In [None]:
# This dataset is not too computationally expensive, so there is no need to chunk it:
# all the points are requested at once instead of region by region.
region = gdf_regions.to_crs("EPSG:4326")
ee_region = ee.FeatureCollection(region.__geo_interface__)

# Keep the station `id` (plus lat/lon) on every point. The exported table is merged with the
# temporal table on `id` later in this notebook, so each point must carry its station id;
# selecting `region_id` and renaming it to `id` would export the region identifier under
# that column instead. lat/lon are kept because they are part of `base_selectors`.
points = gdf_unique_coords[["id", "geometry", "lat", "lon"]].to_crs("EPSG:4326")
ee_points = ee.FeatureCollection(points.__geo_interface__)
len(points)

In [None]:
# Stack every non-temporal layer into one multi-band image
srtm = get_srtm()
globcover = get_globcover()
gedi = get_gedi(ee_region)
gldas_stats = get_gldas_stats(ee_region)
composite = (
    srtm
    .addBands(globcover)
    .addBands(gedi)
    .addBands(gldas_stats)
)
# Resolve the band names on the server; the returned list is not stored
composite.bandNames().getInfo()

# Read the image at every point and keep only the points that have a `canopy_height` value
sampled_points = composite.reduceRegions(
    collection=ee_points,
    reducer=ee.Reducer.first(),
    scale=10,
    tileScale=16,
)
result = sampled_points.filter(ee.Filter.notNull(['canopy_height']))

task_name = f"All_Non_temporal_points_{len(points)}"

non_temporal_selectors = ['elevation', 'aspect', 'slope', 'land_cov', 'canopy_height', "gldas_mean", "gldas_stddev"]
ee_task = ee.batch.Export.table.toDrive(
    collection=result,
    folder="INDONESIA_GWL",
    description=task_name,
    selectors=base_selectors + non_temporal_selectors,
)

# Submit the export only when the notebook-level flag is set
if send_task:
    ee_task.start()
print("Exported" if send_task else "Not exported", task_name)

# 4. Merge explanatory variables

## 4.1 Read temporal variables

In [None]:
from pathlib import Path
import pandas as pd

In [None]:
# Folder the exported CSV files are read from
# (presumably the Drive exports produced above, copied locally — confirm).
explanatory_path = Path("data/7_training_data/")
# Key into `temporal_file_names_groups` selecting which group of files is loaded
dataset = "all"
# File names follow the `task_name` pattern built in `get_temporal_explanatory`.
# NOTE(review): the region 5 file ends in `_with_date.csv` while every other file ends in
# `_with_date_lon_lat.csv` — confirm it carries the same columns.
temporal_file_names_groups = {
    "all" : [
        "All_temporal_non_resample_at_all_region_1_dates_520_points_24_with_date_lon_lat.csv",
        "All_temporal_non_resample_at_all_region_2_dates_1773_points_148_with_date_lon_lat.csv",
        "All_temporal_non_resample_at_all_region_3_dates_479_points_1_with_date_lon_lat.csv",
        "All_temporal_non_resample_at_all_region_4_dates_988_points_348_with_date_lon_lat.csv",
        "All_temporal_non_resample_at_all_region_5_dates_1796_points_717_with_date.csv",
        "All_temporal_non_resample_at_all_region_6_dates_489_points_43_with_date_lon_lat.csv",
        "All_temporal_non_resample_at_all_region_7_dates_1274_points_477_with_date_lon_lat.csv",
        "All_temporal_non_resample_at_all_region_8_dates_1671_points_220_with_date_lon_lat.csv",
        "All_temporal_non_resample_at_all_region_9_dates_379_points_17_with_date_lon_lat.csv",
        "All_temporal_non_resample_at_all_region_10_dates_846_points_77_with_date_lon_lat.csv",
    ]
}

In [None]:
# `dataset` (set together with the file-name table) picks the group of temporal CSVs to load
temporal_file_names = temporal_file_names_groups[dataset]

# Read every file of the group and stack them into a single table
temporal_frames = [pd.read_csv(explanatory_path / name) for name in temporal_file_names]
temp_explanatory_dfs = pd.concat(temporal_frames)

temp_explanatory_dfs["date"] = pd.to_datetime(temp_explanatory_dfs["date"])
temp_explanatory_dfs

In [None]:
# `temporal_file_names` and `temp_explanatory_dfs` are already built by the previous cell.
# This cell used to repeat the same load, re-reading every CSV a second time; it now only
# displays the table.
temp_explanatory_dfs

In [None]:
# Keep a single row per (id, date) pair. `drop_duplicates` returns a new frame, so the
# result is assigned back; otherwise the duplicated rows would still reach the merges below.
temp_explanatory_dfs = temp_explanatory_dfs.drop_duplicates(["id", "date"])
temp_explanatory_dfs

## 4.2 Read Hansen yearly variables

In [None]:
explanatory_path = Path("data/7_training_data/")

# One exported CSV per year, from 2018 to 2023 inclusive
hansen_file_names = [
    f"Hansen_year_{hansen_year}_points_2075_f.csv"
    for hansen_year in range(2018, 2024)
]

# Read the yearly files and stack them row-wise
hansen_frames = [pd.read_csv(explanatory_path / name) for name in hansen_file_names]
hansen_df = pd.concat(hansen_frames, axis=0)
hansen_df

## 4.3 Read non temporal explanatory

In [None]:
# The non-temporal variables do not change over time, so each point has a single row whose
# values are repeated for every date of that point when merging
# (e.g. 1 point with 10 dates gets the same non-temporal values on each of the 10 dates).

non_temporal_file_name = "All_Non_temporal_points_2074.csv"
non_temporal_df = pd.read_csv(explanatory_path/non_temporal_file_name)

# Drop the columns that would clash with the temporal table when merging on `id`:
# lat/lon, plus `date`. The export requests `date` through `base_selectors`, so the CSV
# may contain that column; if it did, the merge would produce `date_x`/`date_y` and the
# later `.date` accesses would fail. `errors="ignore"` makes this a no-op when it is absent.
non_temporal_df = (
    non_temporal_df
    .drop(columns=["lat", "lon"])
    .drop(columns=["date"], errors="ignore")
)
non_temporal_df

## 4.4 Create final explanatory variables dataset

In [None]:
# Number of temporal rows before merging, for comparison with the row counts after the merges
len(temp_explanatory_dfs)

In [None]:
# Join the non-temporal variables onto the temporal rows by station id and count the result
merge_key = "id"
explanatory_df = temp_explanatory_dfs.merge(non_temporal_df, on=merge_key)
len(explanatory_df)

In [None]:
# Attach the non-temporal variables to every temporal row of the same station
explanatory_df = temp_explanatory_dfs.merge(right=non_temporal_df, on="id")
len(explanatory_df)

# Hansen values are yearly: join them on the station id and the year of each row.
# The year column of the Hansen table is cast to int so both keys have a numeric type.
explanatory_df["year"] = explanatory_df["date"].dt.year
hansen_df["year"] = hansen_df["year"].astype(int)
hansen_columns = ["id"] + hansen_selectors
explanatory_df = explanatory_df.merge(
    hansen_df[hansen_columns],
    on=["id", "year"],
    how="left",
)

# Author's note: more values appear here because Hansen was requested for all the years
explanatory_df

In [None]:
# Columns kept in the final explanatory table, grouped by the table they come from
key_vars = ["id", "date"]
temporal_vars = [
    "LIA", "VH", "VV", "VVVH_ratio", "angle",
    "sm_1", "sm_3", "sm_7", "sm_30",
    "precipitation", "prec_3", "prec_7", "prec_30",
]
non_temporal_vars = [
    "elevation", "aspect", "slope", "land_cov",
    "canopy_height", "gldas_mean", "gldas_stddev",
]
hansen_vars = ["B3", "B4", "B5", "B7", "ndvi", "ndmi", "ndbri"]

export_vars = key_vars + temporal_vars + non_temporal_vars + hansen_vars

## 4.5 Final step: Merge explanatory variables with response variable

In [None]:
# Join the field measurements (response variable `gwl_cm`) with the explanatory variables
# on station id and date
response_columns = ["source", "id", "date", "gwl_cm", "lat", "lon"]
response_df = df[response_columns]
explanatory_with_response_var = response_df.merge(explanatory_df[export_vars], on=["id", "date"])

# Day of the year as an additional variable
explanatory_with_response_var["doy"] = explanatory_with_response_var["date"].dt.dayofyear
# Writing the result to disk is left disabled:
# explanatory_with_response_var.to_csv("data/7_training_data/explanatory_with_response_var_and_source.csv", index=False)

In [None]:
import pandas as pd
# Read back the final training table. The `to_csv` call that writes this file is commented
# out in the previous cell, so the file must already exist on disk for this cell to run.
pd.read_csv("data/7_training_data/explanatory_with_response_var_and_source.csv")