In [26]:
import pickle
from pathlib import Path

import click
import geopandas as gpd
import pandas as pd

from fwi_predict.constants import TZ_STRING
from fwi_predict.pipeline import create_standard_dataset

In [27]:
# Load the raw measurement log and reshape it in a single chain:
#   * sample_dt  - date and time columns combined, parsed, and localized to TZ_STRING
#   * sample_idx - positional row counter (0..n-1) used to identify each sample
# The original date/time/serial-number columns are then dropped and the pond
# identifier column is renamed to `pond_id`.
measurements = (
    pd.read_excel("../data/raw/Testing Data Jun-Dec 2024_ID,Date,Time only.xls")
    .assign(
        sample_dt=lambda df: pd.to_datetime(
            df['Date of data collection'].dt.strftime('%Y-%m-%d')
            + ' '
            + df['Time of data collection'].astype(str)
        ).dt.tz_localize(TZ_STRING),
        sample_idx=lambda df: pd.Series(range(len(df))),
    )
    .drop(columns=['Date of data collection', 'Time of data collection', 'Sr. No'])
    .rename(columns={'Pond ID': 'pond_id'})
)

In [28]:
# Pond metadata to be merged onto the measurements: keep only the identifier,
# the size/depth attributes, and the geometry.
# TODO (carried over from the original note): fix the time one-hot encoding.
keep_cols = [
    'pond_id',
    'property_area_acres',
    'pond_area_acres',
    'pond_depth_meters',
    'geometry',
]
ponds = gpd.read_file("../data/clean/pond_metadata_clean.geojson")[keep_cols]

In [29]:
# Attach pond attributes and geometry to every measurement row. A left join
# keeps measurements whose pond has no metadata (their geometry will be null);
# `validate` makes the merge fail if `pond_id` is not unique in `ponds`.
samples = gpd.GeoDataFrame(
    data=pd.merge(
        measurements,
        ponds,
        how='left',
        on='pond_id',
        validate='many_to_one',
    ),
    crs=ponds.crs,
)

In [30]:
# Split off samples whose pond has no geometry, report which ponds they belong
# to, and continue with only the locatable samples.
missing_geometry = samples['geometry'].isna()

no_geom_samples = samples.loc[missing_geometry].copy()
print(f"Ponds without locations: {no_geom_samples['pond_id'].unique().tolist()}")

samples = samples.loc[~missing_geometry]
assert not samples['geometry'].isna().any()

Ponds without locations: ['WG-KLR1', 'WG-GPR5', 'WG-SRR3', 'WG-SRR2', 'WG-SRR1']


In [31]:
# Locations used for the prediction dataset:
#   predict_df_path   - local CSV the assembled dataset is written to
#   gfs_gcs_filepath  - path handed to create_standard_dataset for the GFS data
#   gfs_download_root - absolute local directory handed to create_standard_dataset
#   description       - file stem of the GFS path ("testing_data_jun_dec")
predict_df_path = Path("../data/predict_dfs/trial/testing_data_jun_dec.csv")
gfs_gcs_filepath = "trial/gfs/testing_data_jun_dec.csv"
gfs_download_root = Path("../data/gcs").resolve()
description = Path(gfs_gcs_filepath).stem

# Per the recorded cell output, this call starts an Earth Engine export of GFS
# forecast data and waits for the task to complete.
predict_df = create_standard_dataset(
    samples,
    gfs_gcs_filepath,
    gfs_download_root,
    description=description,
)

# Make sure the output directory exists before persisting the dataset.
predict_df_path.parent.mkdir(parents=True, exist_ok=True)
predict_df.to_csv(predict_df_path)

Exporting GFS forecast data to fwi-predict/trial/gfs/testing_data_jun_dec.csv.
Visit https://code.earthengine.google.com/tasks to monitor the export.
Task completed.


In [59]:
# Load the fitted classifier and the label encoder used to decode its output.
# NOTE: unpickling executes arbitrary code, so these files must come from a
# trusted source (here: the project's own ../models directory).
with Path("../models/measurements_with_metadata_simple/do_in_range/Random Forest.pkl").open('rb') as model_file:
    model = pickle.load(model_file)

with Path("../models/measurements_with_metadata_simple/do_in_range/encoder.pkl").open('rb') as encoder_file:
    encoder = pickle.load(encoder_file)

In [38]:
# Reload the prediction dataset from the CSV written earlier.
# NOTE(review): that file was written with `to_csv` defaults, which include the
# index - confirm the index column does not end up as an unintended feature.
predict_df = pd.read_csv(filepath_or_buffer=predict_df_path)

In [44]:
# Preview the columns that get excluded from the model inputs.
# Use `+` rather than `list.extend`: `extend` returns None and mutates a
# throwaway list, so the cell would otherwise compute and display nothing.
# Note: `ponds` is subset to `keep_cols` when it is loaded, so the second term
# is expected to be empty as long as that cell has been run.
['sample_dt', 'pond_id', 'geometry', 'sample_idx'] + ponds.columns[~ponds.columns.isin(keep_cols)].tolist()

In [47]:
# Build the feature matrix: remove identifier/geometry columns plus any pond
# column that is not part of `keep_cols`.
drop_cols = ['sample_dt', 'pond_id', 'geometry', 'sample_idx']
drop_cols += [col for col in ponds.columns if col not in keep_cols]
X = predict_df.drop(columns=drop_cols)

# Hold on to the sample index so predictions can be merged back later.
sample_idx = predict_df['sample_idx'].copy()

# Prediction targets and the frame that will collect the results.
targets = [
    'do_in_range',
    'ph_in_range',
    'ammonia_in_range',
    'turbidity_in_range',
]
results_df = measurements.copy()

In [60]:
# Predict encoded classes, decode them back to their original labels, and key
# the result by `sample_idx` so each prediction can be traced to its sample.
encoded_preds = model.predict(X)
preds = pd.Series(encoder.inverse_transform(encoded_preds), index=sample_idx)

In [61]:
# Display the decoded predictions, indexed by sample_idx.
preds

sample_idx
0       within
1       within
2       within
3       within
4       within
         ...  
1495    within
1496    within
1497    within
1498    within
1499    within
Length: 1486, dtype: object