In [1]:
import geopandas as gpd
import pandas as pd

from fwi_predict.constants import WQ_RANGES
from fwi_predict.wq import check_in_range

In [2]:
# Load the two inputs from the clean data directory: pond metadata (GeoJSON, read
# as a GeoDataFrame) and the measurements table (CSV). The two date columns of
# the CSV are parsed to datetimes at read time.
ponds = gpd.read_file("../data/clean/pond_metadata_clean.geojson")
measurements = pd.read_csv(
    "../data/clean/ara_measurements_clean.csv",
    parse_dates=['sample_dt', 'prescribed_collection_date'],
)

In [3]:
# Columns of interest.
# `ponds_cols` is applied to the pond metadata frame; 'geometry' must stay in the
# list because the later merge builds a GeoDataFrame from it.
ponds_cols = [
    'pond_id',
    'property_area_acres',
    'pond_area_acres',
    'pond_depth_meters',
    'geometry',
]

# `pond_measurement_cols` is applied to the measurements frame.
pond_measurement_cols = [
    'pond_id',
    'region',
    'farm_id',
    'group',
    'treatment_group',
    'sample_dt',
    'time_of_day',
    'do_mg_per_L',
    'ph',
    'turbidity_cm',
    'ammonia_mg_per_L',
    'fish_per_acre',
    'species',
    'primary_productivity_gpp_mg_per_L',
]

# TODO: construct a days-since-last-measurement variable. This would suggest including a water quality correction.

In [4]:
# Ponds: keep only the columns of interest and only rows that have a geometry
# (a location is required to get weather data).
ponds = ponds.loc[ponds['geometry'].notna(), ponds_cols]

# Measurements: keep rows whose `follow_up` flag is False, then the columns of
# interest (`follow_up` itself is not retained).
measurements = measurements.loc[measurements['follow_up'].eq(False), pond_measurement_cols]

In [5]:
# Remove measurements with inconsistent times of day.
# The timestamp implies "morning" when its hour is < 12 and "evening" otherwise;
# rows whose recorded `time_of_day` label disagrees with that are dropped.
# The implied label is computed with the vectorised `.dt` accessor and held in a
# standalone Series, so no temporary column is added to (and later dropped from)
# the already-filtered `measurements` frame.
tod_from_timestamp = (
    measurements['sample_dt'].dt.hour
    .lt(12)  # missing timestamps compare False and therefore map to "evening"
    .map({True: "morning", False: "evening"})
)
tod_matches = tod_from_timestamp == measurements['time_of_day']
print(f"Observations with inconsistent times: {int((~tod_matches).sum())}")
measurements = measurements[tod_matches]

Observations with inconsistent times: 44


In [6]:
# Attach pond attributes and location to every measurement. The join is an inner
# join on `pond_id` because a measurement without a pond location cannot be used;
# the result is wrapped as a GeoDataFrame carrying the ponds' CRS.
predict_ds = gpd.GeoDataFrame(
  pd.merge(measurements, ponds, how='inner', on='pond_id'),
  crs=ponds.crs,
  geometry='geometry',
)

In [7]:
# Ponds (already restricted to those with a geometry) whose `pond_id` does not
# appear anywhere in the merged dataset.
ponds_not_included = ponds.loc[~ponds['pond_id'].isin(predict_ds['pond_id'])]
print(f"No measurements for {len(ponds_not_included)} ponds")

No measurements for 64 ponds


In [8]:
# For every parameter listed in WQ_RANGES, add a `<name>_in_req_range` column,
# where <name> is the parameter's column name up to its first underscore
# (e.g. 'do_mg_per_L' -> 'do_in_req_range').
for wq_param in WQ_RANGES:
  flag_col = wq_param.partition('_')[0] + "_in_req_range"
  predict_ds[flag_col] = check_in_range(
    wq_param,
    predict_ds[wq_param],
    predict_ds['time_of_day'],
  )

  in_range[values.isna() | periods.isna()] = np.nan
  in_range[values.isna()] = np.nan
  in_range[values.isna()] = np.nan
  in_range[values.isna()] = np.nan


In [9]:
# Summary statistics of the numeric columns, transposed to one row per column.
predict_ds.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
do_mg_per_L,4877.0,6.500465,3.624134,0.0,3.6,4.8,9.4,80.8
ph,4877.0,8.182413,0.314831,3.38,8.0,8.21,8.39,9.54
turbidity_cm,3414.0,28.950644,7.524521,9.0,24.0,29.0,32.0,89.0
ammonia_mg_per_L,2965.0,0.13196,0.257797,0.0,0.01,0.06,0.18,9.0
fish_per_acre,2868.0,5118.866806,8632.648089,0.0,2300.0,3000.0,3882.0,66667.0
primary_productivity_gpp_mg_per_L,1202.0,3.440512,2.700201,0.0,1.85,3.0,4.7,16.9
property_area_acres,4969.0,9.259086,10.982728,0.25,2.5,5.0,13.0,53.0
pond_area_acres,4194.0,8.298865,10.499382,0.13,1.65,3.45,10.81,49.8
pond_depth_meters,4969.0,2.163614,0.808669,1.0,1.9,2.0,2.5,11.0


In [10]:
# Give every row a sequential 0-based identifier. The values are assigned
# positionally (a plain range rather than a pd.Series), so the result does not
# depend on predict_ds having a default 0..n-1 index: a Series would be aligned
# on index labels and silently yield NaN for any label it does not contain.
predict_ds['measurement_idx'] = range(len(predict_ds))

# Write the dataset out. The driver is named explicitly so the output format
# does not depend on extension-based inference in the installed geopandas.
predict_ds.to_file("../data/clean/ara_predict_ds.geojson", driver="GeoJSON")

In [11]:
# Display the final column set of the dataset that was written out above.
predict_ds.columns

Index(['pond_id', 'region', 'farm_id', 'group', 'treatment_group', 'sample_dt',
       'time_of_day', 'do_mg_per_L', 'ph', 'turbidity_cm', 'ammonia_mg_per_L',
       'fish_per_acre', 'species', 'primary_productivity_gpp_mg_per_L',
       'property_area_acres', 'pond_area_acres', 'pond_depth_meters',
       'geometry', 'do_in_req_range', 'ph_in_req_range',
       'ammonia_in_req_range', 'turbidity_in_req_range', 'measurement_idx'],
      dtype='object')