The following notebook builds two different datasets of explanatory variables: temporal and
non-temporal. 
To speed up the process, this notebook prepares the respective point datasets and sends export tasks to 
Earth Engine that use a `reduceRegions` operation; we have found this method to be faster than making individual
calls to the API.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
from pathlib import Path
import ee
from gee_scripts.get_sources import get_s1_image, get_gldas, get_gpm, get_hansen, get_gpm_sum
from gee_scripts.get_sources import get_srtm, get_globcover, get_gedi, get_gldas_stats, get_extra_non_temporal
from gee_scripts import init_ee
init_ee()

The objective is to loop over either the points or the dates.<br>
After testing this script https://code.earthengine.google.com/b18e876cca44266be704924b7354ddff <br>
I found that the best approach is to loop over the dates and, for each date, run `reduceRegions` over all the points. <br>


# 1. Read data

In [33]:
# Load the field measurements, parsing the "date" column as timestamps.
df = pd.read_csv(
    'data/field_data_unique_coords.csv',
    parse_dates=["date"],
)
# Stop here if the dates were not parsed as nanosecond-precision datetimes.
assert df.dtypes["date"] == "datetime64[ns]"
len(df)

267218

In [34]:
# Table with the observations whose explanatory variables are already available.
requested_data = pd.read_csv(
    "data/9_clean_training_data/all_training_data_with_extra_and_locations_and_precipSum.csv",
    parse_dates=["date"],
)

# Keep only the observations that were not requested yet; an observation is
# identified by its (date, id) pair.
match_keys = ["date", "id"]
pending_index = pd.MultiIndex.from_frame(df[match_keys])
requested_index = pd.MultiIndex.from_frame(requested_data[match_keys])
df = df[~pending_index.isin(requested_index)]
len(df)

235671

##########################
## Set type of output
##########################

In [35]:
# Switch checked by the export cells before they call `ee_task.start()`:
#   True  -> the Earth Engine task is started and "Exported <task name>" is printed
#   False -> the task is only built and "Not exported <task name>" is printed
send_task = True

##########################

In [36]:
# One row per distinct (id, lon, lat) combination found in the field data.
unique_coords = df[["id", "lon", "lat"]].drop_duplicates()

# Point GeoDataFrame built from the coordinates (x = lon, y = lat).
# No CRS is assigned here and nothing is written to disk.
gdf = gpd.GeoDataFrame(
    unique_coords,
    geometry=gpd.points_from_xy(unique_coords.lon, unique_coords.lat),
)

# Last expression of the cell, so the number of unique coordinates is displayed
# (the previous mid-cell `head()` / `len()` calls were evaluated and discarded).
len(unique_coords)

In [37]:
# Read the regions used to split the requests.
#
# Two region files exist: one covering only the highly correlated stations and one
# covering all the stations. The file name below selects the "all" variant.

shp_path = Path("data/0_shp/")
region_path = "regions_to_request_explanatory_all.gpkg"

gdf_regions = gpd.read_file(shp_path / region_path)
gdf_regions

Unnamed: 0,region_id,geometry
0,1,"MULTIPOLYGON (((96.37854 4.01317, 96.76923 3.9..."
1,2,"MULTIPOLYGON (((102.96446 -0.63790, 104.82488 ..."
2,3,"MULTIPOLYGON (((140.00836 -7.80760, 140.75163 ..."
3,4,"MULTIPOLYGON (((105.23245 -2.56075, 105.62785 ..."
4,5,"MULTIPOLYGON (((101.59551 1.61281, 101.45686 0..."
5,6,"MULTIPOLYGON (((100.69365 2.01094, 100.81080 2..."
6,7,"MULTIPOLYGON (((108.80424 1.60848, 109.83126 1..."
7,8,"MULTIPOLYGON (((110.98152 -2.86934, 114.00610 ..."
8,9,"MULTIPOLYGON (((132.99060 -0.68691, 133.43736 ..."
9,10,"MULTIPOLYGON (((116.84967 3.98347, 117.30926 3..."


In [38]:
# # Remove those date where the gwl measure is out of reasonable range
# upper_thres = 20
# lower_thres = -100

# df = df[(df.gwl_cm < upper_thres) & (df.gwl_cm > lower_thres)]

# # Get the coordinates of the individual points

# unique_coords = df[["id", 'lon', 'lat']].drop_duplicates()
# len(df), len(unique_coords)

In [85]:
# NOTE(review): list of station ids that is not referenced by any other visible cell.
# The name suggests stations with missing coordinates — confirm before relying on it.
missing_coors = ['138_NBR_M13','18_SBA_DTD043','18_SBA_DTP002','18_SBA_DTP025','18_SBA_DTP031','18_SBA_DTP034','18_SBA_DTP038','18_SBA_DTP054','271_RSP_H19','BRG_140301_01','BRG_140302_01','BRG_140302_02','BRG_610117_01']

In [39]:
# Turn the lon/lat columns into point geometries (EPSG:4326)
station_points = gpd.points_from_xy(unique_coords.lon, unique_coords.lat)
gdf_unique_coords = gpd.GeoDataFrame(unique_coords, geometry=station_points, crs="EPSG:4326")

# Tag each point with the id of the region that contains it. It is a left join, so
# points that fall outside every region are kept.
region_polygons = gdf_regions[["region_id", "geometry"]]
gdf_unique_coords = gpd.sjoin(gdf_unique_coords, region_polygons, how="left", predicate="within")

In [40]:
# Inspect the two stations reported as failing: 'BRG_150710_01' and 'BRG_621107_03'
failing_ids = ["BRG_621107_03", "BRG_150710_01"]
gdf_unique_coords.loc[gdf_unique_coords["id"].isin(failing_ids)]

Unnamed: 0,id,lon,lat,geometry,index_right,region_id
222645,BRG_150710_01,103.900168,-1.274317,POINT (103.90017 -1.27432),1,2
245948,BRG_621107_03,114.2206,-2.654022,POINT (114.22060 -2.65402),7,8


# 1. Get temporal explanatory variables

In [41]:
# Display the regions; the temporal requests below are sent once per `region_id`.
gdf_regions

Unnamed: 0,region_id,geometry
0,1,"MULTIPOLYGON (((96.37854 4.01317, 96.76923 3.9..."
1,2,"MULTIPOLYGON (((102.96446 -0.63790, 104.82488 ..."
2,3,"MULTIPOLYGON (((140.00836 -7.80760, 140.75163 ..."
3,4,"MULTIPOLYGON (((105.23245 -2.56075, 105.62785 ..."
4,5,"MULTIPOLYGON (((101.59551 1.61281, 101.45686 0..."
5,6,"MULTIPOLYGON (((100.69365 2.01094, 100.81080 2..."
6,7,"MULTIPOLYGON (((108.80424 1.60848, 109.83126 1..."
7,8,"MULTIPOLYGON (((110.98152 -2.86934, 114.00610 ..."
8,9,"MULTIPOLYGON (((132.99060 -0.68691, 133.43736 ..."
9,10,"MULTIPOLYGON (((116.84967 3.98347, 117.30926 3..."


In [42]:
# Display the points together with the region assigned to them by the spatial join.
gdf_unique_coords

Unnamed: 0,id,lon,lat,geometry,index_right,region_id
0,02_AHL_SBG-B076,117.007750,3.937760,POINT (117.00775 3.93776),9,10
65,02_AHL_SBG-B101,117.010120,3.931860,POINT (117.01012 3.93186),9,10
130,02_AHL_SBG-B103,117.005210,3.926090,POINT (117.00521 3.92609),9,10
195,02_AHL_SBG-C003,117.145430,3.903400,POINT (117.14543 3.90340),9,10
260,02_AHL_SBG-C006,117.148320,3.919380,POINT (117.14832 3.91938),9,10
...,...,...,...,...,...,...
266127,ij2,114.022576,-2.573375,POINT (114.02258 -2.57337),7,8
266206,jambi1,103.589975,-1.238478,POINT (103.58997 -1.23848),1,2
266449,kalbar1,109.394853,-0.210225,POINT (109.39485 -0.21022),6,7
266842,kalteng1,114.058131,-2.319728,POINT (114.05813 -2.31973),7,8


# 2. Get temporal explanatory variables

In [44]:
# Column names passed as "selectors" to the table exports.
# `base_selectors` is shared by every export in this notebook; the other lists are
# appended to it depending on the export.
base_selectors = ["system:index", "lat", "lon", "id", "date"]
# NOTE(review): presumably the bands returned by `get_s1_image` — confirm in gee_scripts
s1_selectors = ["LIA", "VH", "VV", "VVVH_ratio", "angle"]
# NOTE(review): presumably the bands returned by `get_gldas` — confirm in gee_scripts
gldas_selectors = ['sm_1', 'sm_3', 'sm_7', 'sm_30']
# NOTE(review): presumably the bands returned by `get_gpm` — confirm in gee_scripts
gpm_selectors = ['precipitation', 'prec_3', 'prec_7', 'prec_30']
# NOTE(review): presumably the bands returned by `get_gpm_sum` — confirm in gee_scripts
gpm_selectors_sum = ['prec_3_sum', 'prec_7_sum', 'prec_30_sum']

def get_temporal_explanatory(region_id):
    """Build the Drive export of the temporal explanatory variables of one region.

    For each date with field data in the region a composite is requested with
    `get_s1_image`; dates whose composite has no bands are dropped. The images
    returned by `get_gldas`, `get_gpm` and `get_gpm_sum` for the same day are added
    as bands and the stack is sampled at every point of the region
    (`reduceRegions` with `ee.Reducer.first()` at scale 10). Samples without a
    `VH` value are discarded and the remaining ones are tagged with their date.
    The result is exported as a table to the "INDONESIA_GWL" Drive folder.

    The task is only started when the global `send_task` is truthy; in both cases
    the task name is printed.

    Args:
        region_id: value of the `region_id` column (in `gdf_regions` and
            `gdf_unique_coords`) that identifies the region to request.

    Returns:
        None.
    """

    region = gdf_regions[gdf_regions.region_id == region_id].to_crs("EPSG:4326")

    # Points of this region: computed once and reused for the dates and the geometries
    region_stations = gdf_unique_coords[gdf_unique_coords.region_id == region_id]
    dates = df[df.id.isin(region_stations.id.unique())].date.unique()
    points = region_stations[["id", "geometry", "lat", "lon"]].to_crs("EPSG:4326")

    # Convert to ee elements
    ee_dates = ee.FeatureCollection(ee.List([ee.Feature(None, {"date": date}) for date in dates]))
    ee_points = ee.FeatureCollection(points.__geo_interface__)
    ee_region = ee.FeatureCollection(region.__geo_interface__)

    def get_sources(date_feature):
        """Return the composite of one day, tagged with its band count and date."""

        date_range = ee.Date(date_feature.get("date")).getRange('day')

        s1_composite = get_s1_image(date_range, ee_region)

        return s1_composite.set({
            # used below to drop the dates that produced an empty composite
            "numberOfBands": s1_composite.bandNames().size(),
            "date": ee.Date(date_feature.get("date")),
        })

    def reduce_composite(composite):
        """Add the same-day bands to a composite and sample it at the points."""

        # Filter the extra data with the matching date
        date = composite.get("date")
        date_range = ee.Date(date).getRange('day')

        gldas_composite = get_gldas(date_range, ee_region)
        gpm_composite = get_gpm(date_range, ee_region)
        gpm_sum_composite = get_gpm_sum(date_range, ee_region)

        composite = (ee.Image(composite)
            .addBands(gldas_composite)
            .addBands(gpm_composite)
            .addBands(gpm_sum_composite)
        )

        samples = composite.reduceRegions(
            collection=ee_points,
            reducer=ee.Reducer.first(),
            scale=10,
            tileScale=16,
        )

        # Keep only the points that have a VH value and tag them with the date
        return (samples
            .filter(ee.Filter.notNull(['VH']))
            .map(lambda feature: feature.set({"date": date}))
        )

    # One composite per date -> drop the empty ones -> one sample table per date,
    # flattened into a single collection
    task = (ee_dates
        .map(get_sources)
        .filter(ee.Filter.gt('numberOfBands', 0))
        .map(reduce_composite)
        .flatten()
    )

    # task_name = f"All_temporal_non_resample_at_all_region_{region_id}_dates_{len(dates)}_points_{len(points)}_with_date_lon_lat"
    task_name = f"1_Precipitation_sum_non_resample_at_all_region_{region_id}_dates_{len(dates)}_points_{len(points)}_with_date_lon_lat"

    ee_task = ee.batch.Export.table.toDrive(
        collection=task,
        folder="INDONESIA_GWL",
        description=task_name,
        selectors=base_selectors + s1_selectors + gldas_selectors + gpm_selectors + gpm_selectors_sum,
    )

    # Only submit the task when exports are enabled
    if send_task:
        ee_task.start()
    print("Exported" if send_task else "Not exported", task_name)
# Request every region. Whether the tasks are actually started is decided by the
# `send_task` flag set earlier in the notebook, so it is not overridden here.
for region_id in gdf_regions.region_id.unique():
    get_temporal_explanatory(region_id)

Exported 1_Precipitation_sum_non_resample_at_all_region_1_dates_444_points_24_with_date_lon_lat
Exported 1_Precipitation_sum_non_resample_at_all_region_2_dates_1767_points_148_with_date_lon_lat
Exported 1_Precipitation_sum_non_resample_at_all_region_3_dates_421_points_1_with_date_lon_lat
Exported 1_Precipitation_sum_non_resample_at_all_region_4_dates_986_points_348_with_date_lon_lat
Exported 1_Precipitation_sum_non_resample_at_all_region_5_dates_1799_points_718_with_date_lon_lat
Exported 1_Precipitation_sum_non_resample_at_all_region_6_dates_430_points_43_with_date_lon_lat
Exported 1_Precipitation_sum_non_resample_at_all_region_7_dates_1219_points_477_with_date_lon_lat
Exported 1_Precipitation_sum_non_resample_at_all_region_8_dates_1664_points_221_with_date_lon_lat
Exported 1_Precipitation_sum_non_resample_at_all_region_9_dates_379_points_17_with_date_lon_lat
Exported 1_Precipitation_sum_non_resample_at_all_region_10_dates_780_points_77_with_date_lon_lat


[None, None, None, None, None, None, None, None, None, None]

# 2.2 Get "yearly" temporal explanatory variables (Hansen)

In [None]:
# Columns added to `base_selectors` in the yearly export
hansen_selectors = ["year", "B3","B4","B5","B7","ndvi","ndmi","ndbri"]

# Get all the years from the field data.
# NOTE(review): 2013 is skipped; the reason is not documented in this notebook.
years = sorted([y for y in df.date.dt.year.unique() if y != 2013])

# The points are the same for every year, so the collection is built once
# instead of on every iteration.
points = df[["id", "lon", "lat"]].drop_duplicates()
points = gpd.GeoDataFrame(points, geometry=gpd.points_from_xy(points.lon, points.lat), crs="EPSG:4326")
ee_points = ee.FeatureCollection(points.__geo_interface__)

for year in years:

    image = get_hansen(year)

    # The year is stored as a string property on every sampled point
    year_label = str(year)
    result = image.reduceRegions(
        collection=ee_points,
        reducer=ee.Reducer.first(),
        scale=30,
        tileScale=16,
    ).map(lambda feature: feature.set("year", year_label))

    task_name = f"Hansen_year_{year}_points_{len(points)}_f"

    ee_task = ee.batch.Export.table.toDrive(
        collection=result,
        folder="INDONESIA_GWL",
        description=task_name,
        selectors=base_selectors + hansen_selectors,
    )

    # Only submit the task when exports are enabled
    if send_task:
        ee_task.start()
    print("Exported" if send_task else "Not exported", task_name)


## 3. Get non temporal explanatory variables (others)

In [51]:
# This request is not computationally expensive, so there is no need to chunk it:
# all the regions and all the points are sent at once instead of region by region.
region = gdf_regions.to_crs("EPSG:4326")
ee_region = ee.FeatureCollection(region.__geo_interface__)

point_columns = ["id", "geometry", "lat", "lon"]
points = gdf_unique_coords[point_columns].to_crs("EPSG:4326")
ee_points = ee.FeatureCollection(points.__geo_interface__)
len(points)

2074

In [None]:
# Stack the non-temporal layers into a single composite
composite = (
    get_srtm()
        .addBands(get_globcover())
        .addBands(get_gedi(ee_region))
        .addBands(get_gldas_stats(ee_region))
)
# Request the band names from the server. The returned list is not displayed
# because this is not the last expression of the cell.
composite.bandNames().getInfo()

# Sample the composite at every point and keep the points with a canopy height value
result = composite.reduceRegions(
    collection=ee_points,
    reducer=ee.Reducer.first(),
    scale=10,
    tileScale=16,
).filter(ee.Filter.notNull(['canopy_height']))

task_name = f"All_Non_temporal_points_{len(points)}"

ee_task = ee.batch.Export.table.toDrive(
    collection=result,
    folder="INDONESIA_GWL",
    description=task_name,
    selectors=base_selectors + ['elevation', 'aspect', 'slope', 'land_cov', 'canopy_height', "gldas_mean", "gldas_stddev"],
)

# Only submit the task when exports are enabled
if send_task:
    ee_task.start()
print("Exported" if send_task else "Not exported", task_name)

## 4. Get Extra Non temporal explanatory variables (others)

This data comes from https://code.earthengine.google.com/6c3eeb929a5ee8a42f55234b58796c0a


In [49]:
# Build the composite with the extra non-temporal layers and list its band names.
# Note that this re-assigns `composite`, which the previous section also uses.
composite = get_extra_non_temporal()
composite.bandNames().getInfo()

['distance', 'dir', 'acc', 'land_forms']

In [52]:
# NOTE(review): `phu` is not used by the rest of this cell — confirm whether it is
# still needed.
phu = ee.FeatureCollection(
    "users/marortpab/FAO/SEPAL/2023_trainings/smm/AOI__Province__865_PHUs__INDONESIA"
)

# Sample the extra composite at every point and keep the points with a distance value.
# `ee_points` and `points` come from the non-temporal section above.
result = composite.reduceRegions(
    collection=ee_points,
    reducer=ee.Reducer.first(),
    scale=10,
    tileScale=16,
).filter(ee.Filter.notNull(['distance']))

task_name = f"1_All_Non_temporal_extra_points_latlon_{len(points)}"

ee_task = ee.batch.Export.table.toDrive(
    collection=result,
    folder="INDONESIA_GWL",
    description=task_name,
    selectors=base_selectors + ['distance', 'dir', 'acc'],
)

# Only submit the task when exports are enabled
if send_task:
    ee_task.start()
print("Exported" if send_task else "Not exported", task_name)

Exported 1_All_Non_temporal_extra_points_latlon_2074


# 4. Merge explanatory variables

## 4.1 Read temporal variables

In [23]:
from pathlib import Path
import pandas as pd

In [24]:
# Folder that contains the CSV tables read in this section
explanatory_path = Path("data/7_training_data/")
# Key of the file group to read; "all" is the only group defined below
dataset = "all"
temporal_file_names_groups = {
    "all": [
        "All_temporal_non_resample_at_all_region_1_dates_520_points_24_with_date_lon_lat.csv",
        "All_temporal_non_resample_at_all_region_2_dates_1773_points_148_with_date_lon_lat.csv",
        "All_temporal_non_resample_at_all_region_3_dates_479_points_1_with_date_lon_lat.csv",
        "All_temporal_non_resample_at_all_region_4_dates_988_points_348_with_date_lon_lat.csv",
        "All_temporal_non_resample_at_all_region_5_dates_1796_points_717_with_date.csv",
        "All_temporal_non_resample_at_all_region_6_dates_489_points_43_with_date_lon_lat.csv",
        "All_temporal_non_resample_at_all_region_7_dates_1274_points_477_with_date_lon_lat.csv",
        "All_temporal_non_resample_at_all_region_8_dates_1671_points_220_with_date_lon_lat.csv",
        "All_temporal_non_resample_at_all_region_9_dates_379_points_17_with_date_lon_lat.csv",
        "All_temporal_non_resample_at_all_region_10_dates_846_points_77_with_date_lon_lat.csv",
    ],
}

# File names of the selected group (`dataset` is assigned at the top of this cell)
temporal_file_names = temporal_file_names_groups[dataset]

# Read one table per region and stack them into a single dataframe
regional_tables = [pd.read_csv(explanatory_path / file_name) for file_name in temporal_file_names]
temp_explanatory_dfs = pd.concat(regional_tables)

# The dates are read as text: convert them to timestamps
temp_explanatory_dfs["date"] = pd.to_datetime(temp_explanatory_dfs["date"])
# temp_explanatory_dfs

In [73]:
# The notebook was modified on the 31/05/2024 to include the sum of the precipitation
temporal_precip_sum = {
    "all": [
        "0_Precipitation_sum_non_resample_at_all_region_1_dates_520_points_24_with_date_lon_lat.csv",
        "0_Precipitation_sum_non_resample_at_all_region_2_dates_1773_points_148_with_date_lon_lat.csv",
        # For region 3 this export has only 362 dates, where the other one has 479
        "0_Precipitation_sum_non_resample_at_all_region_3_dates_362_points_1_with_date_lon_lat.csv",
        "0_Precipitation_sum_non_resample_at_all_region_4_dates_988_points_348_with_date_lon_lat.csv",
        "0_Precipitation_sum_non_resample_at_all_region_5_dates_1796_points_718_with_date_lon_lat.csv",
        "0_Precipitation_sum_non_resample_at_all_region_6_dates_489_points_43_with_date_lon_lat.csv",
        "0_Precipitation_sum_non_resample_at_all_region_7_dates_1273_points_477_with_date_lon_lat.csv",
        "0_Precipitation_sum_non_resample_at_all_region_8_dates_1671_points_219_with_date_lon_lat.csv",
        "0_Precipitation_sum_non_resample_at_all_region_9_dates_379_points_17_with_date_lon_lat.csv",
        "0_Precipitation_sum_non_resample_at_all_region_10_dates_846_points_77_with_date_lon_lat.csv",
    ],
}

# File names of the selected group. `dataset` and `explanatory_path` are defined in the
# cell that reads the temporal variables; note that `temporal_file_names` is re-assigned here.
temporal_file_names = temporal_precip_sum[dataset]

# Read one table per region and stack them into a single dataframe
precip_tables = [pd.read_csv(explanatory_path / file_name) for file_name in temporal_file_names]
temp_precip_sum = pd.concat(precip_tables)

# The dates are read as text: convert them to timestamps
temp_precip_sum["date"] = pd.to_datetime(temp_precip_sum["date"])
temp_precip_sum

In [None]:
# The notebook was modified on the 12/06/2024 to include the stations that were removed
# at the beginning by applying the threshold of the gwl (20 and -100)
missed_stations = {
    "all": [
        "1_Precipitation_sum_non_resample_at_all_region_1_dates_444_points_24_with_date_lon_lat.csv",
        "1_Precipitation_sum_non_resample_at_all_region_2_dates_1767_points_148_with_date_lon_lat.csv",
        "1_Precipitation_sum_non_resample_at_all_region_3_dates_421_points_1_with_date_lon_lat.csv",
        "1_Precipitation_sum_non_resample_at_all_region_4_dates_986_points_348_with_date_lon_lat.csv",
        "1_Precipitation_sum_non_resample_at_all_region_5_dates_1799_points_718_with_date_lon_lat.csv",
        "1_Precipitation_sum_non_resample_at_all_region_6_dates_430_points_43_with_date_lon_lat.csv",
        "1_Precipitation_sum_non_resample_at_all_region_7_dates_1219_points_477_with_date_lon_lat.csv",
        "1_Precipitation_sum_non_resample_at_all_region_8_dates_1664_points_221_with_date_lon_lat.csv",
        "1_Precipitation_sum_non_resample_at_all_region_9_dates_379_points_17_with_date_lon_lat.csv",
        "1_Precipitation_sum_non_resample_at_all_region_10_dates_780_points_77_with_date_lon_lat.csv",
    ],
}

# File names of the selected group. `dataset` and `explanatory_path` are defined in the
# cell that reads the temporal variables.
missed_temporal_file_names = missed_stations[dataset]

# Read one table per region and stack them into a single dataframe
missed_tables = [pd.read_csv(explanatory_path / file_name) for file_name in missed_temporal_file_names]
missed_temp = pd.concat(missed_tables)

# The dates are read as text: convert them to timestamps
missed_temp["date"] = pd.to_datetime(missed_temp["date"])
missed_temp

## 4.2 Read Hansen yearly variables

In [None]:
# Folder that contains the CSV tables read in this section
explanatory_path = Path("data/7_training_data/")
# One table per year
hansen_file_names = [
    "Hansen_year_2018_points_2075_f.csv",
    "Hansen_year_2019_points_2075_f.csv",
    "Hansen_year_2020_points_2075_f.csv",
    "Hansen_year_2021_points_2075_f.csv",
    "Hansen_year_2022_points_2075_f.csv",
    "Hansen_year_2023_points_2075_f.csv",
]

# Read the yearly tables and stack them row-wise
yearly_tables = [pd.read_csv(explanatory_path / file_name) for file_name in hansen_file_names]
hansen_df = pd.concat(yearly_tables, axis=0)
hansen_df

## 4.3 Read non temporal explanatory

In [None]:
# The non-temporal variables of a point do not change with the date, so their values
# only need to be repeated for each date of the point,
# i.e. 1 point with 10 dates gets the same non-temporal variables on each of the 10 dates.

non_temporal_file_name = "All_Non_temporal_points_2074.csv"
non_temporal_df = (
    pd.read_csv(explanatory_path / non_temporal_file_name)
    # the coordinates are not needed from this table
    .drop(columns=["lat", "lon"])
)
non_temporal_df

## 4.4 Read extra non temporal explanatory (accumulation, distance to rivers/canals)

In [None]:
# The non-temporal variables of a point do not change with the date, so their values
# only need to be repeated for each date of the point,
# i.e. 1 point with 10 dates gets the same non-temporal variables on each of the 10 dates.

non_temporal_extra_file_name = "All_Non_temporal_extra_points_latlon_2072.csv"
non_temporal_extra_df = (
    pd.read_csv(explanatory_path / non_temporal_extra_file_name)
    # the coordinates and the date column are not needed from this table
    .drop(columns=["lat", "lon", "date"])
)
non_temporal_extra_df

## 4.4 Create final explanatory variables dataset

In [None]:
# Number of temporal rows before merging, to compare with the row counts after each merge
len(temp_explanatory_dfs)

In [None]:
# Attach the non-temporal variables of each point to its temporal rows (matched on "id")
explanatory_df = pd.merge(temp_explanatory_dfs, non_temporal_df, on="id")
len(explanatory_df)

In [None]:
# Attach the extra non-temporal variables as well (matched on "id")
explanatory_df = pd.merge(explanatory_df, non_temporal_extra_df, on="id")
len(explanatory_df)

In [None]:
# Preview the merged table
explanatory_df.head()

In [None]:
# The Hansen variables are yearly: join them on the point id and the year of each row.
# `hansen_selectors` is defined in the cell that requests the Hansen data.
explanatory_df["year"] = explanatory_df["date"].dt.year
hansen_df["year"] = hansen_df["year"].astype(int)
hansen_columns = ["id"] + hansen_selectors
explanatory_df = pd.merge(
    explanatory_df,
    hansen_df[hansen_columns],
    on=["id", "year"],
    how="left",
)

# I get more values here because I have requested Hansen for all the years
explanatory_df

In [33]:
# Columns of `explanatory_df` that are kept when it is merged with the response variable.
# "id" and "date" are the merge keys; the rest are explanatory variables.
export_vars = [
    'id', 'date', 'LIA', 'VH', 'VV', 'VVVH_ratio',
    'angle', 'sm_1', 'sm_3', 'sm_7', 'sm_30', 'precipitation', 'prec_3',
    'prec_7', 'prec_30', 'elevation',
    'aspect', 'slope', 'land_cov', 'canopy_height', 'gldas_mean',
    'gldas_stddev', 'B3', 'B4',
    'B5', 'B7', 'ndvi', 'ndmi', 'ndbri',
    'distance', 'dir', 'acc',
]

# 4.5 Final step: Merge explanatory variables with response variable

In [95]:
# Join each field measurement with its explanatory variables (matched on id and date)
response_columns = ["source", "id", "date", "gwl_cm", "lat", "lon"]
explanatory_with_response_var = pd.merge(
    df[response_columns],
    explanatory_df[export_vars],
    on=["id", "date"],
)

# Add day of the year as a variable
explanatory_with_response_var["doy"] = explanatory_with_response_var["date"].dt.dayofyear
# explanatory_with_response_var.to_csv("data/7_training_data/explanatory_with_response_var_and_source_extra.csv", index=False)
len(explanatory_with_response_var)

NameError: name 'explanatory_df' is not defined

# 4.6 Final step: Add the extra "accumulated precipitation" variable

In [96]:
# Reload the saved table so that it can be merged with the one that comes with the
# sum of the accumulated precipitation.
# NOTE(review): the `to_csv` call that would write this file is commented out in the
# previous code cell, so the file must already exist on disk.
import pandas as pd
saved_table_path = "data/7_training_data/explanatory_with_response_var_and_source_extra.csv"
explanatory_with_response_var = pd.read_csv(saved_table_path)
len(explanatory_with_response_var)

33420

In [97]:
# The table was read without date parsing: convert the "date" column to timestamps
explanatory_with_response_var["date"] = pd.to_datetime(explanatory_with_response_var.date)

In [99]:
# Add the accumulated precipitation columns, matched on id and date
precip_sum_columns = ['id', 'date', "prec_3_sum","prec_7_sum","prec_30_sum"]
explanatory_with_response_plus_precip = pd.merge(
    explanatory_with_response_var,
    temp_precip_sum[precip_sum_columns],
    on=["id", "date"],
)

In [100]:
# Write the final table (without the dataframe index)
explanatory_with_response_plus_precip.to_csv(
    "data/7_training_data/explanatory_with_response_var_and_source_extra_sum_prec.csv",
    index=False,
)