In [2]:
import sys

import pandas as pd

from fwi_predict.constants import TZ_STRING
from fwi_predict.wq import WQ_RANGES, get_in_required_range

# Add the parent directory to the import search path so that the `scripts`
# package imported on the next line can be found.
# NOTE(review): ".." is resolved against the kernel's working directory —
# presumably the folder containing this notebook; confirm before running elsewhere.
sys.path.append("..")
from scripts.clean_ara_measurements import column_map

In [3]:
# Load the raw Jun-Dec 2024 testing workbook into a DataFrame.
jun_dec_samples = pd.read_excel(
    io="../data/raw/Testing Data Jun-Dec 2024.xls",
)


In [4]:
# Normalise column names in two passes: first the shared `column_map` used by
# the measurement-cleaning script, then the headers specific to this workbook.
# The passes are kept separate so the second one sees the output of the first.
jun_dec_samples = (
    jun_dec_samples
    .rename(columns=column_map)
    .rename(
        columns={
            'DO (mg/L)': 'do_mg_per_L',
            'Turbidity (in cm)': 'turbidity_cm',
            'Is follow up': 'follow_up',
        }
    )
)

In [5]:
# Build the sample timestamp: format the collection date as YYYY-MM-DD, append
# the collection time as text, parse the combined string, and attach the
# project timezone (TZ_STRING) to the resulting naive datetimes.
jun_dec_samples['sample_dt'] = (
    pd.to_datetime(
        jun_dec_samples['Date of data collection'].dt.strftime('%Y-%m-%d')
        + ' '
        + jun_dec_samples['Time of data collection'].astype(str)
    )
    .dt.tz_localize(TZ_STRING)
)

In [6]:
# Keep only the timestamp, the measurement columns, and the pond identifier.
# The explicit .copy() makes the result an independent frame: new columns are
# assigned to `jun_dec_samples` further down, and assigning into the result of
# a column selection triggers pandas' SettingWithCopyWarning when copy-on-write
# is not enabled.
jun_dec_samples = jun_dec_samples[
    [
        'sample_dt',
        'do_mg_per_L',
        'turbidity_cm',
        'ph',
        'ammonia_mg_per_L',
        'pond_id',
    ]
].copy()

In [12]:
# Tag every sample with its time of day: readings taken before hour 12 are
# "morning", everything else is "evening".
jun_dec_samples['morning'] = jun_dec_samples['sample_dt'].dt.hour.lt(12)
jun_dec_samples['time_of_day'] = jun_dec_samples['morning'].apply(
    lambda is_am: 'morning' if is_am else 'evening'
)

# For each parameter listed in WQ_RANGES, store the result of
# get_in_required_range in a column named "<first token of param>_in_range".
for param in WQ_RANGES:
  range_name = f"{param.partition('_')[0]}_in_range"
  jun_dec_samples[range_name] = get_in_required_range(
    param,
    jun_dec_samples[param],
    jun_dec_samples['time_of_day'],
  )

In [13]:
# Load the training prediction frame, parsing `sample_dt` as a datetime column.
predict_df = pd.read_csv(
    "../data/predict_dfs/train/measurements_with_metadata_predict_df.csv",
    parse_dates=['sample_dt'],
)

# Get parameters for classification problem.
# Time of day comes from the precomputed `hour` column: before 12 is "morning".
predict_df['morning'] = predict_df['hour'].lt(12)
predict_df['time_of_day'] = predict_df['morning'].apply(
    lambda is_am: 'morning' if is_am else 'evening'
)

# One "<first token of param>_in_range" column per parameter in WQ_RANGES.
for param in WQ_RANGES:
  range_name = f"{param.partition('_')[0]}_in_range"
  predict_df[range_name] = get_in_required_range(
    param,
    predict_df[param],
    predict_df['time_of_day'],
  )

In [None]:
# Record which dataset each row came from so the two sources can still be told
# apart once they are stacked into a single frame.
jun_dec_samples["ds"] = 'jun_dec_24_test'
predict_df["ds"] = 'train'

In [18]:
# Columns present in both frames, listed in the order they appear in
# `jun_dec_samples`.
common_cols = [
    col for col in jun_dec_samples.columns if col in predict_df.columns
]

# Stack the training rows on top of the Jun-Dec test rows, restricted to the
# shared columns, and renumber the index from zero.
combined = pd.concat(
    [predict_df[common_cols], jun_dec_samples[common_cols]],
    ignore_index=True,
)

In [20]:
# Persist the combined frame as CSV, leaving out the row index.
combined.to_csv(
    path_or_buf="../data/clean/combined_josiah_compare.csv",
    index=False,
)