In [8]:
import geopandas as gpd
import pandas as pd

from fwi_predict.constants import WQ_RANGES
from fwi_predict.wq import get_in_required_range

In [9]:
# Read the measurement table from ../data/clean, parsing both date columns
# up front so they arrive as datetimes rather than strings.
measurements = pd.read_csv(
    "../data/clean/ara_measurements_clean.csv",
    parse_dates=['sample_dt', 'prescribed_collection_date'],
)

# Read the pond metadata GeoJSON (carries the pond geometries used later).
ponds = gpd.read_file("../data/clean/pond_metadata_clean.geojson")

In [10]:
# Sanity check: display the parsed sample timestamps so their dtype and
# UTC offset are visible in the cell output.
measurements.loc[:, 'sample_dt']

0      2021-08-26 07:45:00+05:30
1      2021-08-26 17:25:00+05:30
2      2021-09-13 06:45:00+05:30
3      2021-09-13 16:50:00+05:30
4      2021-09-28 06:37:00+05:30
                  ...           
6728   2024-01-04 15:49:00+05:30
6729   2024-02-04 07:45:00+05:30
6730   2024-02-04 16:25:00+05:30
6731   2024-03-06 07:30:00+05:30
6732   2024-03-06 16:30:00+05:30
Name: sample_dt, Length: 6733, dtype: datetime64[ns, UTC+05:30]

In [11]:
# Columns kept from the pond metadata table.
ponds_cols = [
    'pond_id',
    'property_area_acres',
    'pond_area_acres',
    'pond_depth_meters',
    'geometry',
]

# Columns kept from the measurements table.
pond_measurement_cols = [
    'pond_id',
    'region',
    'farm_id',
    'sample_dt',
    'do_mg_per_L',
    'ph',
    'turbidity_cm',
    'ammonia_mg_per_L',
    'time_of_day',
]

# TODO: construct a "days since last measurement" variable.
# Original note: this would suggest including water quality correction.

In [12]:
# NOTE: this cell rebinds `ponds` and `measurements`. Re-running it without
# reloading the data raises KeyError, because 'follow_up' is not one of the
# columns kept in `pond_measurement_cols`.

# Ponds: keep the columns of interest, then only rows that have a geometry
# (a location is required to get weather data).
ponds = ponds[ponds_cols]
has_location = ponds['geometry'].notna()
ponds = ponds.loc[has_location]

# Measurements: keep rows whose follow_up flag is False, restricted to the
# columns of interest.
not_follow_up = measurements['follow_up'].eq(False)
measurements = measurements.loc[not_follow_up, pond_measurement_cols]

In [13]:
# Remove measurements whose recorded `time_of_day` label disagrees with the
# label implied by the sample timestamp (hour < 12 -> "morning", otherwise
# "evening").
#
# The implied label is built as a standalone Series instead of a temporary
# column on `measurements`: `measurements` is already a filtered slice, so
# writing a column into it can trigger SettingWithCopyWarning, and the column
# would only have to be dropped again afterwards. `.dt.hour` is the vectorized
# equivalent of reading `.hour` from each timestamp; missing timestamps yield
# NaN, which compares as not-less-than-12 and therefore maps to "evening",
# exactly as the row-wise version did.
tod_from_timestamp = (
    measurements['sample_dt'].dt.hour
    .lt(12)
    .map({True: "morning", False: "evening"})
)
is_consistent = tod_from_timestamp == measurements['time_of_day']

print(f"Observations with inconsistent times: {(~is_consistent).sum()}")
measurements = measurements[is_consistent]

Observations with inconsistent times: 44


In [14]:
# Attach pond metadata to every measurement. The join is an inner join on
# pond_id because each row needs a pond location; the merged table is then
# wrapped as a GeoDataFrame that reuses the ponds' geometry column and CRS.
combined = gpd.GeoDataFrame(
    pd.merge(measurements, ponds, on='pond_id', how='inner'),
    geometry='geometry',
    crs=ponds.crs,
)

In [15]:
# Ponds (with a location) whose pond_id never appears in the combined table.
has_measurements = ponds['pond_id'].isin(combined['pond_id'])
ponds_not_included = ponds[~has_measurements]
print(f"No measurements for {len(ponds_not_included)} ponds")

No measurements for 64 ponds


In [16]:
# Summary statistics for the numeric columns, one row per variable.
combined.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
do_mg_per_L,4877.0,6.500465,3.624134,0.0,3.6,4.8,9.4,80.8
ph,4877.0,8.182413,0.314831,3.38,8.0,8.21,8.39,9.54
turbidity_cm,3414.0,28.950644,7.524521,9.0,24.0,29.0,32.0,89.0
ammonia_mg_per_L,2965.0,0.13196,0.257797,0.0,0.01,0.06,0.18,9.0
property_area_acres,4969.0,9.259086,10.982728,0.25,2.5,5.0,13.0,53.0
pond_area_acres,4194.0,8.298865,10.499382,0.13,1.65,3.45,10.81,49.8
pond_depth_meters,4969.0,2.163614,0.808669,1.0,1.9,2.0,2.5,11.0


In [17]:
# Give every row of the combined table a positional identifier 0..n-1, then
# write the result to GeoJSON.
#
# The ids are assigned from a plain `range` rather than a `pd.Series`: a Series
# is aligned on index labels at assignment time, so it only numbers rows
# correctly while `combined` has a default 0..n-1 index. A range is assigned
# strictly by position and stays correct if `combined` is ever filtered or
# re-indexed before this cell.
combined['sample_idx'] = range(len(combined))
combined.to_file("../data/clean/measurements_with_metadata.geojson")

In [16]:
# report