The following notebook builds two different datasets of explanatory variables: temporal and non-temporal.
To speed up processing, the notebook creates the respective datasets and submits each of them as a task to
Earth Engine using a `reduceRegions` operation; we have found this method to be faster than making individual
calls to the API.

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from typing import Literal

import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
from pathlib import Path
import ee
from gee_scripts.get_sources import get_s1_collection, get_gldas, get_gpm, get_hansen, get_gpm_sum
from gee_scripts.get_sources import get_srtm, get_globcover, get_gedi, get_gldas_stats, get_extra_non_temporal
from gee_scripts import init_ee
init_ee()

The objective is to loop either over the points or over the dates.<br>
After testing this script https://code.earthengine.google.com/b18e876cca44266be704924b7354ddff <br>
I found that the best approach is to loop over the dates and, for each date, call `reduceRegions` over all the points. <br>


# 1. Read data

In [None]:
# Load the field measurements; "date" is parsed because it is used later both
# as a merge key and through the `.dt` accessor.
df = pd.read_csv('data/field_data_unique_coords_plus_bosf.csv', parse_dates=["date"])

# Fail early if the column was not parsed. Any datetime64 dtype is accepted
# rather than the exact "datetime64[ns]" string, because the unit pandas picks
# is not guaranteed, and an explicit raise is not stripped under `python -O`.
if not pd.api.types.is_datetime64_any_dtype(df["date"]):
    raise TypeError(f"'date' column was not parsed as datetime, got {df['date'].dtype}")
len(df)

In [None]:
# ##########33 THIS IS TEMPORARY #############

# # For the second iteration I will only use the missing data from bosf

# missing_bosf = pd.read_csv("data/7_training_data/bosf/missing_points_dates.csv", parse_dates=["date"])

# # concatenate date and id to get unique id
# df["date_id"] = df["date"].dt.strftime("%Y-%m-%d") + "_" + df["id"].astype(str)

# # Filter in the missing bosf data
# df = df[df["date_id"].isin(missing_bosf["date_id"])]
# len(df)

# df

In [None]:
# Rows whose source is "bosf_NASA", keeping one row per (id, date) pair
df.loc[df["source"] == "bosf_NASA"].drop_duplicates(subset=["id", "date"])

In [None]:
# Summary statistics of gwl_cm, restricted to values below 400
df.loc[df["gwl_cm"] < 400, "gwl_cm"].describe()

##########################
## Set type of output
##########################

In [None]:
# Global switch read by the export cells of this notebook.
# Leave it False to run the whole notebook without submitting anything;
# set it to True when the export tasks should be sent to GEE.
send_task = False

##########################

In [None]:
# One row per distinct (id, lon, lat) combination found in the field data.
unique_coords = df[["id", "lon", "lat"]].drop_duplicates()
len(unique_coords)

In [None]:
# Convert the unique coordinates into a point GeoDataFrame.
# The CRS is declared explicitly so this frame agrees with `gdf_unique_coords`,
# which is built from the same lon/lat columns with crs="EPSG:4326".
gdf = gpd.GeoDataFrame(
    unique_coords,
    geometry=gpd.points_from_xy(unique_coords.lon, unique_coords.lat),
    crs="EPSG:4326",
)

In [None]:
# Load the polygons of the regions used to group the stations.
#
# Two region layers exist: one restricted to the highly correlated stations
# and one covering all the stations. `region_path` selects the layer that
# matches the dataset being processed.
shp_path = Path("data/0_shp/")
region_path = "bosf_region.shp"

gdf_regions = gpd.read_file(shp_path / region_path)
gdf_regions

In [None]:
# # Remove those date where the gwl measure is out of reasonable range
# upper_thres = 20
# lower_thres = -100

# df = df[(df.gwl_cm < upper_thres) & (df.gwl_cm > lower_thres)]

# # Get the coordinates of the individual points

# unique_coords = df[["id", 'lon', 'lat']].drop_duplicates()
# len(df), len(unique_coords)

In [None]:
# Create a point GeoDataFrame from the lon/lat coordinates
gdf_unique_coords = gpd.GeoDataFrame(
    unique_coords,
    geometry=gpd.points_from_xy(unique_coords.lon, unique_coords.lat),
    crs="EPSG:4326",
)

# Add the region id to each point. The regions are first brought to the CRS of
# the points so the "within" test compares geometries in the same coordinate
# system (a no-op when the shapefile is already in EPSG:4326).
regions_for_join = gdf_regions[["region_id", "geometry"]].to_crs(gdf_unique_coords.crs)
gdf_unique_coords = gpd.sjoin(gdf_unique_coords, regions_for_join, how="left", predicate="within")

In [None]:
# Display the points together with the region_id assigned by the spatial join
gdf_unique_coords

# 2. Get explanatory variables

## 2.1 Get temporal explanatory variables

In [None]:
# Column names requested from the GEE table exports, grouped by the
# collection they come from.

# Columns common to every export
base_selectors = ["system:index", "lat", "lon", "id", "date"]

# Columns read from the S1 image
s1_selectors = ["LIA", "VH", "VV", "VVVH_ratio", "angle"]

# GLDAS / GPM columns: one plain value plus the "_3", "_7" and "_30" variants
gldas_selectors = [f"sm_{suffix}" for suffix in (1, 3, 7, 30)]
gpm_selectors = ["precipitation"] + [f"prec_{suffix}" for suffix in (3, 7, 30)]
gpm_selectors_sum = [f"prec_{suffix}_sum" for suffix in (3, 7, 30)]

# Columns read from the yearly Hansen image
hansen_selectors = [
    "year",
    "B3",
    "B4",
    "B5",
    "B7",
    "ndvi",
    "ndmi",
    "ndbri",
]

In [None]:
def get_temporal_explanatory(region_id, mode: Literal["strict", "extra"]="strict", position=1):
    """Build (and optionally start) the GEE export of the temporal explanatory variables.

    For every distinct measurement date of the stations that fall in
    ``region_id``, an S1 image is obtained through ``get_s1_collection``, the
    GLDAS and GPM bands for that date are added to it, and the stack is sampled
    at the station points with ``reduceRegions``. The per-date results are
    flattened into a single table that is exported to the "INDONESIA_GWL"
    Drive folder.

    The function reads the notebook globals ``df``, ``gdf_regions``,
    ``gdf_unique_coords``, ``send_task`` and the ``*_selectors`` lists.

    Parameters:
    region_id: int
        The region id to get the data for
    mode: Literal["strict", "extra"]
        strict: Only get the data for the exact date
        extra: Get the closest image to the date (searched in a +/- 30 day
            window) and adds the time difference as a variable
    position: int
        The position of the image selected in the collection once it is sorted
        by time difference, starts at 1. Only used in "extra" mode.

    Returns:
        The ``ee.batch`` export task. It is started only when the global
        ``send_task`` is true.

    Raises:
        ValueError: if ``mode`` is not "strict"/"extra" or ``position`` is
            lower than 1.
    """

    # Without this check an unknown mode makes `get_sources` return None and
    # the failure only shows up later inside the mapped collection.
    if mode not in ("strict", "extra"):
        raise ValueError(f"mode must be 'strict' or 'extra', got {mode!r}")
    if position < 1:
        raise ValueError(f"position starts at 1, got {position!r}")

    region = gdf_regions[gdf_regions.region_id == region_id].to_crs("EPSG:4326")
    region_points = gdf_unique_coords[gdf_unique_coords.region_id == region_id]
    dates = df[df.id.isin(region_points.id.unique())].date.unique()
    points = region_points[["id", "geometry", "lat", "lon"]].to_crs("EPSG:4326")

    # Columns of the exported table. "time_difference" is appended here for
    # "extra" mode so the function does not depend on the global list having
    # been extended beforehand; it is not duplicated if it is already there.
    selectors = list(base_selectors)
    if mode == "extra" and "time_difference" not in selectors:
        selectors.append("time_difference")
    selectors += s1_selectors + gldas_selectors + gpm_selectors + gpm_selectors_sum

    # Convert to ee elements
    ee_dates = ee.FeatureCollection(ee.List([ee.Feature(None, {"date": date}) for date in dates]))
    ee_points = ee.FeatureCollection(points.__geo_interface__)
    ee_region = ee.FeatureCollection(region.__geo_interface__)

    def get_sources(date_feature):
        """Return the S1 image for one date, tagged with its band count and date."""

        requested_date = ee.Date(date_feature.get("date"))

        if mode == "strict":
            # NOTE(review): here the result of get_s1_collection is used as an
            # image (bandNames), while the "extra" branch uses it as a
            # collection (map/sort) - confirm what the helper returns.
            s1_composite = get_s1_collection(requested_date.getRange('day'), ee_region)

            return s1_composite.set({
                "numberOfBands": s1_composite.bandNames().size(),
                "date": requested_date,
            })

        # mode == "extra": search for images in a +/- 30 day window
        start_date = requested_date.advance(-30, 'day')
        end_date = requested_date.advance(30, 'day')

        # Get all S1 images in the time window
        s1_collection = get_s1_collection(ee.DateRange(start_date, end_date), ee_region)

        def compute_abs_difference(image):
            """Store the absolute distance in days to the requested date."""
            diff = ee.Number(image.date().difference(requested_date, 'day')).abs()
            return image.set('time_difference', diff)

        s1_collection = s1_collection.map(compute_abs_difference)

        # Closest images first
        sorted_collection = s1_collection.sort('time_difference')

        # `position` is 1-based, list indices are 0-based.
        # NOTE(review): presumably this fails server-side when the window holds
        # fewer than `position` images - confirm.
        selected_image = ee.Image(sorted_collection.toList(sorted_collection.size()).get(position-1))

        selected_image_date = selected_image.date()
        selected_image_time_difference = selected_image.get('time_difference')

        # Keep the images acquired within one day from the selected image date
        images_same_date = s1_collection.filterDate(selected_image_date, selected_image_date.advance(1, 'day'))

        # Combine them into a single image (per-band median)
        s1_image = images_same_date.median()

        return s1_image.set({
            "numberOfBands": s1_image.bandNames().size(),
            "date": requested_date,
            "time_difference": selected_image_time_difference
        })

    def reduce_composite(composite):
        """Add the GLDAS/GPM bands for the composite date and sample the points."""

        # The extra sources are requested for the day stored on the composite
        date = composite.get("date")
        date_range = ee.Date(date).getRange('day')
        # Not set by the "strict" branch of get_sources
        time_difference = composite.get("time_difference")

        gldas_composite = get_gldas(date_range, ee_region)
        gpm_composite = get_gpm(date_range, ee_region)
        gpm_sum_composite = get_gpm_sum(date_range, ee_region)

        composite = (ee.Image(composite)
            .addBands(gldas_composite)
            .addBands(gpm_composite)
            .addBands(gpm_sum_composite)
        )

        # Points without a VH value are dropped; date and time difference are
        # copied onto every remaining feature so they end up in the table.
        return composite.reduceRegions(**{
            "collection": ee_points,
            "reducer": ee.Reducer.first(),
            "scale": 10,
            "tileScale": 16
        }).filter(ee.Filter.notNull(['VH'])).map(lambda feature: feature.set({
            "date": date,
            "time_difference": time_difference,
        }))

    # One feature collection per date with at least one band, flattened into one table
    task = (ee_dates
        .map(get_sources)
        .filter(ee.Filter.gt('numberOfBands', 0))
        .map(reduce_composite).flatten()
    )

    # task_name = f"All_temporal_non_resample_at_all_region_{region_id}_dates_{len(dates)}_points_{len(points)}_with_date_lon_lat"
    task_name = f"1_Precipitation_sum_non_resample_at_all_region_{region_id}_dates_{len(dates)}_points_{len(points)}bosf_1ND_missing"

    ee_task = ee.batch.Export.table.toDrive(**{
        "collection": task,
        "folder": "INDONESIA_GWL",
        "description": task_name,
        "selectors": selectors,
    })

    # The task is only submitted when the notebook-level switch is on
    if send_task:
        ee_task.start()
    print("Exported" if send_task else "Not exported", task_name)

    return ee_task

mode = "extra"

# In "extra" mode the exports also carry the "time_difference" column.
# It is appended only once and without going through a set: a set does not keep
# the list order, so the column order of every export built from
# `base_selectors` would otherwise be arbitrary.
if mode == "extra" and "time_difference" not in base_selectors:
    base_selectors = base_selectors + ["time_difference"]

# Region_id 20 is the bosf region, use mode="extra" to get the closest image to the date
# [get_temporal_explanatory(region_id, mode=mode, position=2) for region_id in gdf_regions.region_id.unique() if region_id in [20]] 

# 2.2 Get "yearly" temporal explanatory variables (Hansen)

In [None]:
# Years present in the field data (2013 is left out)
years = sorted(y for y in df.date.dt.year.unique() if y != 2013)

# Only the points of the bosf region (region_id 20) are sampled
points = gdf_unique_coords[gdf_unique_coords.region_id == 20][["id", "lon", "lat"]].drop_duplicates()
points = gpd.GeoDataFrame(points, geometry=gpd.points_from_xy(points.lon, points.lat), crs="EPSG:4326")
ee_points = ee.FeatureCollection(points.__geo_interface__)

no_points = ee_points.size().getInfo()

print(f"Processing points {no_points}")

for year in [2020]:

    # The dataset for 2020 is not available, try with get_landsat_mosaic
    image = get_hansen(year)

    # Sample the image at every point and tag each feature with the year
    result = image.reduceRegions(**{
        "collection": ee_points,
        "reducer": ee.Reducer.first(),
        "scale": 30,
        "tileScale": 16
    }).map(lambda feature: feature.set("year", str(year)))

    task_name = f"Hansen_year_{year}_points_{no_points}_f_bosf"

    ee_task = ee.batch.Export.table.toDrive(**{
        "collection": result,
        "folder": "INDONESIA_GWL",
        "description": task_name,
        "selectors": base_selectors + hansen_selectors
    })

    # The start call is deliberately left commented out; uncomment the three
    # lines below to submit the task. The message reports what really happened
    # instead of echoing `send_task`.
    task_started = False
    # if send_task:
    #     ee_task.start()
    #     task_started = True
    print("Exported" if task_started else "Not exported", task_name)

## 3. Get non temporal explanatory variables (others)

In [None]:
# This dataset is not too computationally expensive, so it does not need to be
# chunked: all the points are requested at once.
#
# Only the bosf region (region_id 20) is used here. To process every region,
# replace the filter with `gdf_regions.to_crs("EPSG:4326")`.
region = gdf_regions[gdf_regions.region_id == 20].to_crs("EPSG:4326")

ee_region = ee.FeatureCollection(region.__geo_interface__)
points = gdf_unique_coords[["id", "geometry", "lat", "lon"]].to_crs("EPSG:4326")

# Keep only the points that intersect the selected region
ee_points = ee.FeatureCollection(points.__geo_interface__).filterBounds(ee_region)

In [None]:
# Stack the non-temporal sources into a single multi-band image
composite = (
    get_srtm()
        .addBands(get_globcover())
        .addBands(get_gedi(ee_region))
        .addBands(get_gldas_stats(ee_region))
)

# Show the band names; the value was previously requested and then discarded
print(composite.bandNames().getInfo())

# Sample the image at every point and drop the points without canopy height
result = composite.reduceRegions(**{
    "collection": ee_points,
    "reducer": ee.Reducer.first(),
    "scale": 10,
    "tileScale": 16
}).filter(ee.Filter.notNull(['canopy_height']))

no_points = ee_points.size().getInfo()
task_name = f"All_Non_temporal_points_{no_points}_bosf"

ee_task = ee.batch.Export.table.toDrive(**{
    "collection": result,
    "folder": "INDONESIA_GWL",
    "description": task_name,
    "selectors": base_selectors + ['elevation', 'aspect', 'slope', 'land_cov', 'canopy_height', "gldas_mean", "gldas_stddev"]
})

# The start call is deliberately left commented out; uncomment the three lines
# below to submit the task. The message reports what really happened instead of
# echoing `send_task`.
task_started = False
# if send_task:
#     ee_task.start()
#     task_started = True
print("Exported" if task_started else "Not exported", task_name)

## 4. Get Extra Non temporal explanatory variables (others)

This data comes from https://code.earthengine.google.com/6c3eeb929a5ee8a42f55234b58796c0a


In [None]:
no_points = ee_points.size().getInfo()
task_name = f"1_All_Non_temporal_extra_points_latlon_{no_points}_bosf"

# NOTE(review): `composite` is not defined in this cell, so it is whatever an
# earlier cell left bound (the SRTM / globcover / GEDI / GLDAS stack of the
# previous section). Nothing visible adds 'distance', 'dir' or 'acc' bands to
# it, and `get_extra_non_temporal` is imported but never called - presumably
# the composite should be built from that helper here. Confirm before running.
result = composite.reduceRegions(**{
    "collection": ee_points,
    "reducer": ee.Reducer.first(),
    "scale": 10,
    "tileScale": 16
}).filter(ee.Filter.notNull(['distance']))

ee_task = ee.batch.Export.table.toDrive(**{
    "collection": result,
    "folder": "INDONESIA_GWL",
    "description": task_name,
    "selectors": base_selectors + ['distance', 'dir', 'acc']
})

# The start call is deliberately left commented out; uncomment the three lines
# below to submit the task. The message reports what really happened instead of
# echoing `send_task`.
task_started = False
# if send_task:
#     ee_task.start()
#     task_started = True
print("Exported" if task_started else "Not exported", task_name)

# 4. Merge explanatory variables

## 4.1 Read temporal variables

In [None]:
from pathlib import Path

In [None]:
# explanatory_path = Path("data/7_training_data/")
# dataset = "all"
# temporal_file_names_groups = {
#     "all" : [
#         "All_temporal_non_resample_at_all_region_1_dates_520_points_24_with_date_lon_lat.csv",
#         "All_temporal_non_resample_at_all_region_2_dates_1773_points_148_with_date_lon_lat.csv",
#         "All_temporal_non_resample_at_all_region_3_dates_479_points_1_with_date_lon_lat.csv",
#         "All_temporal_non_resample_at_all_region_4_dates_988_points_348_with_date_lon_lat.csv",
#         "All_temporal_non_resample_at_all_region_5_dates_1796_points_717_with_date.csv",
#         "All_temporal_non_resample_at_all_region_6_dates_489_points_43_with_date_lon_lat.csv",
#         "All_temporal_non_resample_at_all_region_7_dates_1274_points_477_with_date_lon_lat.csv",
#         "All_temporal_non_resample_at_all_region_8_dates_1671_points_220_with_date_lon_lat.csv",
#         "All_temporal_non_resample_at_all_region_9_dates_379_points_17_with_date_lon_lat.csv",
#         "All_temporal_non_resample_at_all_region_10_dates_846_points_77_with_date_lon_lat.csv",
#     ]
# }

# # Dataset is the name of the type of data we're using (high_corr or all) (it's assigned at the beginning of the notebook)
# temporal_file_names = temporal_file_names_groups[dataset]

# # get and concatenate all the dataframes
# temp_explanatory_dfs = pd.concat([
#             pd.read_csv(explanatory_path/file_name, parse_dates=["date"])
#             for file_name 
#             in temporal_file_names
#         ], 
# )

# temp_explanatory_dfs["date"] = pd.to_datetime(temp_explanatory_dfs["date"])
# temp_explanatory_dfs

In [None]:
explanatory_path = Path("data/7_training_data/")

# Key of the file group to read from `temporal_precip_sum` ("all" or "bosf")
dataset = "bosf"

# I modified the notebook on the 31/05/2024 to include the sum of the precipitation
temporal_precip_sum = {
    "all" : [
        "0_Precipitation_sum_non_resample_at_all_region_1_dates_520_points_24_with_date_lon_lat.csv",
        "0_Precipitation_sum_non_resample_at_all_region_2_dates_1773_points_148_with_date_lon_lat.csv",
        "0_Precipitation_sum_non_resample_at_all_region_3_dates_362_points_1_with_date_lon_lat.csv", # Using this there's only 362 where the other has 479
        "0_Precipitation_sum_non_resample_at_all_region_4_dates_988_points_348_with_date_lon_lat.csv",
        "0_Precipitation_sum_non_resample_at_all_region_5_dates_1796_points_718_with_date_lon_lat.csv",
        "0_Precipitation_sum_non_resample_at_all_region_6_dates_489_points_43_with_date_lon_lat.csv",
        "0_Precipitation_sum_non_resample_at_all_region_7_dates_1273_points_477_with_date_lon_lat.csv",
        "0_Precipitation_sum_non_resample_at_all_region_8_dates_1671_points_219_with_date_lon_lat.csv",
        "0_Precipitation_sum_non_resample_at_all_region_9_dates_379_points_17_with_date_lon_lat.csv",
        "0_Precipitation_sum_non_resample_at_all_region_10_dates_846_points_77_with_date_lon_lat.csv",
    ],
    "bosf" : [
        "bosf/1_Precipitation_sum_non_resample_at_all_region_20_dates_644_points_381_with_date_lon_lat_bosf.csv",
        "bosf/1_Precipitation_sum_non_resample_at_all_region_20_dates_115_points_341bosf_1ND_missing.csv"
    ]
}

temporal_file_names = temporal_precip_sum[dataset]

# Read every export and stack them. `ignore_index=True` gives the result a
# unique index instead of repeating 0..n-1 once per file.
temp_explanatory_dfs = pd.concat(
    [pd.read_csv(explanatory_path/file_name) for file_name in temporal_file_names],
    ignore_index=True,
)

temp_explanatory_dfs["date"] = pd.to_datetime(temp_explanatory_dfs["date"])

print(len(temp_explanatory_dfs))

## 4.2 Read Hansen yearly variables

In [None]:
explanatory_path = Path("data/7_training_data/bosf/")

# Exports for the complete set of points (kept for reference, not read below)
hansen_file_names_all = [
    f"Hansen_year_{year}_points_2075_f.csv" for year in range(2018, 2024)
]

# Exports for bosf: exactly one file per year. The 2018 file used to be listed
# twice, which duplicated every 2018 row in `hansen_df`.
hansen_file_names = [
    f"Hansen_year_{year}_points_381_f_bosf.csv" for year in range(2018, 2024)
]

hansen_df = pd.concat(
    [pd.read_csv(explanatory_path/file_name) for file_name in hansen_file_names],
    axis=0,
    ignore_index=True,
)

## 4.3 Read non temporal explanatory

In [None]:
# As the non-temporal variables are the same for all the points, we just need to duplicate 
# their results into each of the dates of the points.
# i.e. 1 point with 10 dates will have the same non-temporal variables for each of the 10 dates.

# non_temporal_file_name = "All_Non_temporal_points_2074.csv"

# for bosf
non_temporal_file_name = "All_Non_temporal_points_381_bosf.csv"

non_temporal_df = pd.read_csv(explanatory_path/non_temporal_file_name)

# Drop the columns that also come with the temporal table so they do not clash
# when merging on "id". `errors="ignore"` because some of them are only in the
# file when they were part of the export selectors (e.g. "time_difference").
non_temporal_df = non_temporal_df.drop(
    columns=["lat", "lon", "date", "time_difference"], errors="ignore"
)
non_temporal_df


## 4.4 Read extra non temporal explanatory (accumulation, distance to rivers/canals)

In [None]:
# As the non-temporal variables are the same for all the points, we just need to duplicate 
# their results into each of the dates of the points.
# i.e. 1 point with 10 dates will have the same non-temporal variables for each of the 10 dates.

# non_temporal_extra_file_name = "All_Non_temporal_extra_points_latlon_2072.csv"

# for bosf data
non_temporal_extra_file_name = "1_All_Non_temporal_extra_points_latlon_381_bosf.csv"

non_temporal_extra_df = pd.read_csv(explanatory_path/non_temporal_extra_file_name)

# Drop the columns that also come with the temporal table. "time_difference" is
# included because, if it were left in, the merge on "id" would rename both
# copies with _x/_y suffixes and the plain column would no longer exist.
# `errors="ignore"` because not every export carries all of these columns.
non_temporal_extra_df = non_temporal_extra_df.drop(
    columns=["lat", "lon", "date", "time_difference"], errors="ignore"
)
non_temporal_extra_df.head()

## 4.4 Create final explanatory variables dataset

In [None]:
# Attach the non-temporal variables of each point to all of its dated rows
explanatory_df = pd.merge(temp_explanatory_dfs, non_temporal_df, on="id")
len(explanatory_df)

In [None]:
# Attach the extra non-temporal variables, again matching on the point id
explanatory_df = pd.merge(explanatory_df, non_temporal_extra_df, on="id")
len(explanatory_df)

In [None]:
# Merge hansen data with year and id
explanatory_df["year"] = explanatory_df.date.dt.year
hansen_df["year"] = hansen_df["year"].astype(int)

# Keep a single Hansen row per (id, year). In a left merge every extra row with
# the same key on the right side repeats the matching left rows, so duplicated
# Hansen records would inflate the number of observations.
hansen_yearly = hansen_df[["id"] + hansen_selectors].drop_duplicates(subset=["id", "year"])
explanatory_df = explanatory_df.merge(hansen_yearly, on=["id", "year"], how="left")

explanatory_df

In [None]:
# Columns kept from the explanatory table, in the order they are exported
export_vars = [
    # merge keys
    "id", "date",
    # S1 columns
    "LIA", "VH", "VV", "VVVH_ratio", "angle",
    # GLDAS columns
    "sm_1", "sm_3", "sm_7", "sm_30",
    # GPM columns
    "precipitation", "prec_3", "prec_7", "prec_30",
    # non-temporal columns
    "elevation", "aspect", "slope", "land_cov", "canopy_height",
    "gldas_mean", "gldas_stddev",
    # Hansen columns
    "B3", "B4", "B5", "B7", "ndvi", "ndmi", "ndbri",
    # extra non-temporal columns
    "distance", "dir", "acc",
    # time difference to the requested date
    "time_difference",
    # GPM sum columns
    "prec_3_sum", "prec_7_sum", "prec_30_sum",
]

# 4.5 Final step: Merge explanatory variables with response variable

In [None]:
# Join the response variable (gwl_cm) and its metadata with the explanatory
# variables; rows are matched on point id and date.
explanatory_with_response_var = pd.merge(
    df[["source", "id", "date", "gwl_cm", "lat", "lon"]],
    explanatory_df[export_vars],
    on=["id", "date"],
)

# Day of the year as an additional variable
explanatory_with_response_var["doy"] = explanatory_with_response_var["date"].dt.dayofyear
# explanatory_with_response_var.to_csv("data/7_training_data/explanatory_with_response_var_and_source_extra.csv", index=False)
len(explanatory_with_response_var)

In [None]:
print("lenght of base_df: ", len(explanatory_with_response_var))

# Keep only the first row found for every (id, date) pair
is_repeated = explanatory_with_response_var.duplicated(subset=["id", "date"])
explanatory_with_response_var = explanatory_with_response_var.loc[~is_repeated]

print("lenght of explanatory_with_response_var after dropping duplicates: ", len(explanatory_with_response_var))

In [None]:
# Export the data
explanatory_with_response_var.to_csv("data/7_training_data/bosf/explanatory_with_response_var_and_source_extra_sum_prec_bosf.csv", index=False)

# 4.6 Final step: Add the extra "accumulated precipitation" variable

In [None]:
# I dont' have to do this for BOSF because I requested them at the same time

In [None]:
# # merge explanatory_with_response_var with the one that caomes with the sum of the 
# # accumulated precipitation
# explanatory_with_response_var = pd.read_csv("data/7_training_data/explanatory_with_response_var_and_source_extra.csv", parse_dates=["date"])
# len(explanatory_with_response_var)

In [None]:
# explanatory_with_response_plus_precip = explanatory_with_response_var.merge(
#     temp_precip_sum[['id', 'date', "prec_3_sum","prec_7_sum","prec_30_sum"]], 
#     on=["id", "date"]
# )
# len(explanatory_with_response_plus_precip)

In [None]:
# # explanatory_with_response_plus_precip.to_csv("data/7_training_data/explanatory_with_response_var_and_source_extra_sum_prec.csv", index=False)

# explanatory_with_response_var_and_source_extra_sum_prec = pd.read_csv("data/7_training_data/explanatory_with_response_var_and_source_extra_sum_prec.csv", parse_dates=["date"])
# len(explanatory_with_response_var_and_source_extra_sum_prec)

# 5. Plot the data

In [None]:
# Number of distinct point ids in the field data
df["id"].nunique()

In [None]:
# Rows with gwl_cm below 400 and a date after 2019-01-01
df.loc[(df["gwl_cm"] < 400) & (df["date"] > "2019-01-01")]

In [None]:
from gee_scripts.plots import plot_ts

# From here on `df` only holds rows with gwl_cm below 500 dated after 2019-01-01
df = df.loc[(df["gwl_cm"] < 500) & (df["date"] > "2019-01-01")]

In [None]:
# Plot the gwl_cm column of the filtered field data.
# NOTE(review): the title is the same one used for the "plus precip sum"
# table, although `df` is the raw field data - confirm it is intended.
plot_ts(df, "gwl_cm", title="explain df plus precip sum")

In [None]:
# `explanatory_with_response_var_and_source_extra_sum_prec` is only assigned in
# a commented-out cell, so this call used to fail with a NameError. Load the
# table here from the same CSV that cell reads.
explanatory_with_response_var_and_source_extra_sum_prec = pd.read_csv(
    "data/7_training_data/explanatory_with_response_var_and_source_extra_sum_prec.csv",
    parse_dates=["date"],
)
plot_ts(explanatory_with_response_var_and_source_extra_sum_prec, "gwl_cm", title="explain df plus precip sum")

In [None]:
# Plot the gwl_cm column of the merged explanatory + response table
plot_ts(explanatory_with_response_var, "gwl_cm", title="explain df")
