In [31]:
import geopandas as gpd
import pandas as pd

from fwi_predict.constants import WQ_RANGES
from fwi_predict.wq import check_in_range

In [32]:
# Read the cleaned inputs: pond metadata (GeoJSON) and the measurement table (CSV).
ponds = gpd.read_file("../data/clean/pond_metadata_clean.geojson")

# Both timestamp columns are parsed to datetimes at read time.
measurements = pd.read_csv(
    "../data/clean/ara_measurements_clean.csv",
    parse_dates=['sample_dt', 'prescribed_collection_date'],
)

In [33]:
# Columns of interest, used to subset each table.

# Columns kept from the pond metadata table.
ponds_cols = [
    'pond_id',
    'property_area_acres',
    'pond_area_acres',
    'pond_depth_meters',
    'geometry',
]

# Columns kept from the measurement table.
pond_measurement_cols = [
    'pond_id',
    'region',
    'farm_id',
    'group',
    'treatment_group',
    'sample_dt',
    'time_of_day',
    'do_mg_per_L',
    'ph',
    'turbidity_cm',
    'ammonia_mg_per_L',
    'fish_per_acre',
    'species',
    'primary_productivity_gpp_mg_per_L',
]

# TODO: Construct a days-since-last-measurement variable. This would suggest including water quality correction.

In [34]:
# Ponds: keep the columns of interest and only rows with a geometry,
# since a location is required to get weather data.
has_location = ponds['geometry'].notna()
ponds = ponds[has_location][ponds_cols]

# Measurements: drop follow-up rows, then keep the columns of interest.
# NOTE: 'follow_up' is not in pond_measurement_cols, so it is dropped here and
# this cell cannot be re-run without reloading `measurements` first.
not_follow_up = measurements['follow_up'] == False
measurements = measurements.loc[not_follow_up, pond_measurement_cols]

In [35]:
# Attach pond attributes and geometry to every measurement. The join is inner
# because a measurement without a pond location cannot be used.
measurements_with_ponds = measurements.merge(ponds, how='inner', on='pond_id')

# Re-wrap the merged frame as a GeoDataFrame, carrying over the ponds' CRS.
predict_ds = gpd.GeoDataFrame(
    measurements_with_ponds,
    geometry='geometry',
    crs=ponds.crs,
)

In [36]:
# Report how many of the retained ponds have no row in the merged dataset.
measured_pond_ids = predict_ds['pond_id']
is_measured = ponds['pond_id'].isin(measured_pond_ids)
ponds_not_included = ponds[~is_measured]
print(f"No measurements for {len(ponds_not_included)} ponds")

No measurements for 66 ponds


In [37]:
# For each parameter listed in WQ_RANGES, add a "<param>_in_req_range" column
# holding the result of check_in_range for that parameter.
for wq_param in WQ_RANGES:
  # The new column name uses the text before the first underscore
  # (e.g. 'do_mg_per_L' -> 'do').
  param_no_unit = wq_param.partition('_')[0]
  flag_col = f"{param_no_unit}_in_req_range"
  predict_ds[flag_col] = check_in_range(
    wq_param,
    predict_ds[wq_param],
    predict_ds['time_of_day'],
  )

  in_range[values.isna() | periods.isna()] = np.nan
  in_range[values.isna()] = np.nan
  in_range[values.isna()] = np.nan
  in_range[values.isna()] = np.nan


In [38]:
# Summary statistics table, transposed so each described column is one row.
predict_ds.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
do_mg_per_L,4611.0,6.553019,3.630798,0.0,3.6,4.84,9.4,80.8
ph,4610.0,8.183351,0.311804,3.38,8.0,8.21,8.39,9.54
turbidity_cm,3319.0,28.831335,7.264532,12.0,24.0,29.0,32.0,85.0
ammonia_mg_per_L,2819.0,0.13168,0.197666,0.0,0.01,0.07,0.19,3.0
fish_per_acre,2881.0,5128.636585,8646.901526,0.0,2300.0,3000.0,3882.0,66667.0
primary_productivity_gpp_mg_per_L,1001.0,2.840345,2.33823,0.0,1.6,2.7,3.71,16.9
property_area_acres,4692.0,9.519267,11.22171,0.25,2.0,5.0,13.0,53.0
pond_area_acres,4033.0,8.512341,10.649789,0.13,1.65,3.49,11.24,49.8
pond_depth_meters,4692.0,2.145908,0.821149,1.0,1.9,2.0,2.5,11.0


In [39]:
# Give every row a sequential integer id (0..n-1) before exporting.
# Assigning a plain range is positional. Assigning a pd.Series would instead be
# aligned on index labels, which is only correct while predict_ds has a default
# RangeIndex and would misalign or produce NaN otherwise.
predict_ds['measurement_idx'] = range(len(predict_ds))

# Write the prediction dataset next to the other cleaned files.
predict_ds.to_file("../data/clean/ara_predict_ds.geojson")