In [1]:
# Enable IPython's autoreload extension in mode 2, so edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
from pathlib import Path
import geopandas as gpd

## 1. Read in the data

In [None]:
# CSV holding the explanatory variables together with the response variable.
explanatory_file_path = "data/7_training_data/explanatory_with_response_var_and_source_extra_sum_prec.csv"

# Load the raw table and report how many rows it contains.
base_df = pd.read_csv(explanatory_file_path)
print("lenght of base_df: ", base_df.shape[0])

# Keep a single row per (id, date) pair.
base_df = base_df.drop_duplicates(subset=["id", "date"])
print("lenght of base_df after dropping duplicates: ", base_df.shape[0])

# Discard every row that has a missing value in any column.
# The original author notes that the remaining NaNs probably come from points
# that fall outside the bounds of the "GLDSA" dataset (possibly GLDAS - TODO confirm).
base_df = base_df.dropna(axis=0, how="any")

print("lenght of base_df after droping any nan values: ", base_df.shape[0])

# Distinct (id, lon, lat) combinations, printed next to the total row count.
unique_coords = base_df.loc[:, ["id", "lon", "lat"]].drop_duplicates()
print(base_df.shape[0], unique_coords.shape[0])

## 2. Add regions to the points

<center><img src="img/training_regions.png" alt="image" width="900"/></center>
<p>Below we will add the region attribute to each of the points, this will be useful if we want to train different models over different subsets.</p>

In [None]:
# Attach two polygon attributes to every observation:
#   1. `region_id` from the training-region layer (looked up once per station),
#   2. the attributes of the PHU layer (joined per observation row).
# Both joins are left joins, so rows that fall in no polygon are kept with NaNs.
shp_path = Path("data/0_shp/")
region_path = "regions_to_request_explanatory_all.gpkg"
phu_region_path = "all_phus_numbered.gpkg"

# gpd.read_file is the recommended reader (GeoDataFrame.from_file is an alias of it)
# and matches how the PHU layer is read below.
gdf_regions = gpd.read_file(shp_path / region_path)

# One row per distinct (id, lon, lat), so the spatial join runs on stations
# rather than on every observation.
unique_coords = base_df[["id", 'lon', 'lat']].drop_duplicates()
print(len(base_df), len(unique_coords))

# Create geodataframe from x y coordinates
station_with_region_df = gpd.GeoDataFrame(
    unique_coords,
    geometry=gpd.points_from_xy(unique_coords.lon, unique_coords.lat),
    crs="EPSG:4326",
)

# Add the region id to each point
station_with_region_df = gpd.sjoin(
    station_with_region_df,
    gdf_regions[["region_id", "geometry"]],
    how="left",
    predicate="within",
)

print("lenght of station_with_region_df: ", len(station_with_region_df))

# The lookup is merged on `id` alone, so it must hold exactly one row per id.
# An id with several coordinate pairs, or a point lying inside overlapping
# region polygons, would otherwise multiply every observation of that id.
station_region_lookup = station_with_region_df[["id", "region_id"]].drop_duplicates(subset="id")
n_ambiguous_stations = len(station_with_region_df) - len(station_region_lookup)
if n_ambiguous_stations:
    print("dropped ambiguous station/region matches (kept first per id): ", n_ambiguous_stations)

# Merge the region id to the main dataframe
df = base_df.merge(station_region_lookup, on="id", how="left")

# Row count after the merge; it should equal the row count of base_df
print("lengh of df: ", len(df))

# Convert to GeoDataFrame in WGS84 (CRS passed directly, as for the station frame above)
df = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df.lon, df.lat), crs="EPSG:4326")

# read phu regions shapefile
phu_regions = gpd.read_file(shp_path / phu_region_path)

# add phu region to each point
df = gpd.sjoin(df, phu_regions, how="left", predicate="within")

# A left sjoin repeats the left index for every matching polygon; keep only the
# first match so each observation appears exactly once in the training data.
is_repeated_row = df.index.duplicated(keep="first")
if is_repeated_row.any():
    print("dropped rows matching more than one phu region (kept first): ", int(is_repeated_row.sum()))
    df = df[~is_repeated_row]
print("lenght of df after adding phu regions: ", len(df))

In [None]:
# Persist the enriched dataframe; it is the base for all later calculations.
output_path = Path("data/9_clean_training_data/all_training_data_with_extra_and_locations_and_precipSum.csv")
# to_csv does not create missing folders, so make sure the target directory
# exists (e.g. on a fresh checkout) before writing.
output_path.parent.mkdir(parents=True, exist_ok=True)
# The index is written as the first, unnamed column (pandas default); this is
# kept unchanged so existing readers of the file see the same layout.
df.to_csv(output_path)