In [8]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [9]:
import pandas as pd
from pathlib import Path
import geopandas as gpd

## 1. Read in the data

In [10]:
# Load the training table from CSV.
explanatory_file_path = "data/7_training_data/explanatory_with_response_var_and_source_extra_sum_prec.csv"
base_df = pd.read_csv(explanatory_file_path)
print("lenght of base_df: ", len(base_df))

# Keep only the first record for every (id, date) pair.
base_df = base_df.drop_duplicates(subset=["id", "date"], keep="first")
print("lenght of base_df after dropping duplicates: ", len(base_df))

# Discard every row that has a missing value in any column.
# Some NaN values are still present in the dataset, probably because some of
# the points fall outside the bounds of the GLDAS dataset.
base_df = base_df.dropna(axis=0, how="any")

print("lenght of base_df after droping any nan values: ", len(base_df))

# One row per distinct (id, lon, lat) combination, to compare against the
# total number of observations.
unique_coords = base_df.loc[:, ["id", "lon", "lat"]].drop_duplicates()
print(len(base_df), len(unique_coords))

lenght of base_df:  36144
lenght of base_df after dropping duplicates:  35040
lenght of base_df after droping any nan values:  32783
32783 2047


In [11]:
# Preview the table after de-duplication and NaN removal; the rendered output
# abbreviates the middle rows and columns with "...".
base_df

Unnamed: 0,source,id,date,gwl_cm,lat,lon,LIA,VH,VV,VVVH_ratio,...,ndvi,ndmi,ndbri,distance,dir,acc,doy,prec_3_sum,prec_7_sum,prec_30_sum
0,pkeg,02_AHL_SBG-B076,2021-06-01,-41.0,3.937760,117.007750,32.785855,-11.481278,-5.556430,0.207099,...,0.644068,0.227848,0.590164,4519.468722,1,1,152,8.140000,16.280000,110.139998
1,pkeg,02_AHL_SBG-B076,2021-08-24,-38.0,3.937760,117.007750,32.785295,-12.812067,-5.960235,0.201164,...,0.644068,0.227848,0.590164,4519.468722,1,1,236,2.440000,4.570000,111.279997
2,pkeg,02_AHL_SBG-B076,2022-04-09,-22.0,3.937760,117.007750,32.795191,-18.437775,-10.636812,0.072032,...,0.359223,-0.156627,0.147541,4519.468722,1,1,99,13.550000,51.119999,173.629999
3,pkeg,02_AHL_SBG-B076,2023-02-15,-31.0,3.937760,117.007750,32.793740,-13.051827,-9.007584,0.076149,...,0.359223,-0.156627,0.147541,4519.468722,1,1,46,11.090000,11.090000,80.609997
4,pkeg,02_AHL_SBG-B101,2021-06-01,-66.0,3.931860,117.010120,39.267563,-9.778736,-6.648683,0.111111,...,0.562500,0.136364,0.500000,5020.676546,128,36,152,8.140000,16.280000,110.139998
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36139,old_brg,kalteng1,2020-02-24,-23.9,-2.319728,114.058131,32.478706,-12.878587,-8.271499,0.097345,...,0.620000,0.208955,0.588235,189.692997,128,16,55,17.449999,43.629998,206.569996
36140,old_brg,kalteng1,2020-03-07,-31.1,-2.319728,114.058131,32.478589,-13.078360,-6.593164,0.169898,...,0.620000,0.208955,0.588235,189.692997,128,16,67,5.650000,24.889999,178.319996
36141,old_brg,kalteng1,2020-03-19,-14.7,-2.319728,114.058131,32.479968,-12.563061,-7.793232,0.110794,...,0.620000,0.208955,0.588235,189.692997,128,16,79,54.520000,56.490000,164.119998
36142,old_brg,kalteng1,2020-03-31,-27.1,-2.319728,114.058131,32.480674,-13.179518,-7.523130,0.128794,...,0.620000,0.208955,0.588235,189.692997,128,16,91,16.759999,31.479999,139.929998


## 2. Add regions to the points

<center><img src="img/training_regions.png" alt="image" width="900"/></center>
<p>Below we add the region attribute to each point. This is useful if we want to train different models on different subsets of the data.</p>

In [12]:
# Folder holding the vector layers, and the layer with the training regions.
shp_path = Path("data/0_shp/")
region_path = "regions_to_request_explanatory_all.gpkg"
gdf_regions = gpd.read_file(shp_path / region_path)

# One row per distinct (id, lon, lat) combination, so the spatial join below
# runs on these rows instead of on every observation.
unique_coords = base_df[["id", "lon", "lat"]].drop_duplicates()
print(len(base_df), len(unique_coords))

# Create a GeoDataFrame of points from the lon (x) / lat (y) coordinates.
station_with_region_df = gpd.GeoDataFrame(
    unique_coords,
    geometry=gpd.points_from_xy(unique_coords.lon, unique_coords.lat),
    crs="EPSG:4326",
)

# Add the region id of the polygon each point falls within. With a left join,
# points outside every region are kept with a missing region_id.
# NOTE(review): assumes gdf_regions is also in EPSG:4326 - confirm.
station_with_region_df = gpd.sjoin(
    station_with_region_df,
    gdf_regions[["region_id", "geometry"]],
    how="left",
    predicate="within",
)

print("lenght of station_with_region_df: ", len(station_with_region_df))

# Merge the region id into the main dataframe. The lookup is keyed on "id"
# only, so it must contain each id exactly once; validate="many_to_one" raises
# a MergeError instead of silently duplicating observations when a station
# matched several regions or appears with more than one coordinate pair.
df = base_df.merge(
    station_with_region_df[["id", "region_id"]],
    on="id",
    how="left",
    validate="many_to_one",
)

# Print the number of rows; it should equal len(base_df).
print("lengh of df: ", len(df))

# Convert to a GeoDataFrame of points in WGS84.
df = gpd.GeoDataFrame(
    df,
    geometry=gpd.points_from_xy(df.lon, df.lat),
    crs="EPSG:4326",
)

# Read the PHU regions layer.
phu_regions = gpd.read_file(shp_path / "all_phus_numbered.gpkg")

# Add the PHU region attributes to each point. A row count larger than the one
# printed above would mean a point fell within more than one PHU polygon.
df = gpd.sjoin(df, phu_regions, how="left", predicate="within")
print("lenght of df after adding phu regions: ", len(df))

32783 2047
lenght of station_with_region_df:  2047
lengh of df:  32783
lenght of df after adding phu regions:  32783


In [13]:
# This dataframe is the base for all subsequent calculations; uncomment the line below to save it to a file.
# df.to_csv("data/9_clean_training_data/all_training_data_with_extra_and_locations_and_precipSum.csv")