In [4]:
# TODO: Merge these imports into the cells where they are used. 
import os
from glob import glob
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import typing as t
# %pip install pyaqsapi
# %pip install certifi
# %pip install requests
import pyaqsapi as aqs
from datetime import date
from functools import reduce

In [5]:
# Parse the measurement and reference dataframe lists from csv files in project folder.
# Each dict is keyed by the file name without directory or ".csv" extension.
# glob() returns paths in arbitrary, filesystem-dependent order; sorting makes the dict
# order (and therefore the column order of the merged dataset built later) reproducible.
measurement_files = sorted(glob("./BEACO2N_measurements/*.csv"))
measurement_df = {
    os.path.splitext(os.path.basename(path))[0]: pd.read_csv(path)
    for path in measurement_files
}
reference_files = sorted(glob("./reference_measurements/*.csv"))
reference_df = {
    os.path.splitext(os.path.basename(path))[0]: pd.read_csv(path)
    for path in reference_files
}
print(reference_files)
print(list(reference_df.keys()))

# Clean measurement and reference data.

def clean_BEACO2N(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of one BEACO2N site's raw dataframe.

    The input frame is left untouched, so the function can safely be called
    more than once on the same raw data.

    Steps:
      * rename "datetime" -> "timestamp" and "co2_raw" -> "co2";
      * parse "timestamp" as UTC and round it to the hour;
      * drop the redundant time / node bookkeeping columns;
      * multiply every "*_wrk_aux" column by 1000 (Volts -> milliVolts) and
        strip the "_wrk_aux" suffix;
      * discard every column named "co" at that point and promote
        "co_corrected" to "co";
      * drop every row that still has a missing value.

    Parameters
    ----------
    df : pd.DataFrame
        Raw frame. Must contain "datetime", "local_timestamp", "epoch",
        "node_file_id", "node_id" and a "co" column once the suffix is stripped.

    Returns
    -------
    pd.DataFrame
        New frame with "timestamp" kept as a regular column (the later merge
        step selects and joins on a "timestamp" column).

    Raises
    ------
    KeyError
        If one of the columns that is dropped unconditionally is missing.
    """
    # rename() returns a new frame, so everything below works on a copy.
    cleaned = df.rename(columns={"datetime": "timestamp", "co2_raw": "co2"})

    # Store time in Pandas datetime format.
    cleaned["timestamp"] = pd.to_datetime(cleaned["timestamp"], utc=True).dt.round("h")

    # Drop redundant time columns
    cleaned = cleaned.drop(columns=["local_timestamp", "epoch", "node_file_id", "node_id"])

    # For all columns suffixed by "_wrk_aux", convert from Volts to milliVolts (*1000) and remove suffix
    wrk_aux_cols = cleaned.filter(regex=r"_wrk_aux$").columns
    cleaned[wrk_aux_cols] *= 1000
    cleaned = cleaned.rename(columns={col: col.replace("_wrk_aux", "") for col in wrk_aux_cols})

    # Use only corrected BEACO2N data: this removes every column called 'co'
    # (including one produced by the suffix stripping above) before promoting
    # the corrected series.
    cleaned = cleaned.drop(columns='co')
    cleaned = cleaned.rename(columns={'co_corrected': 'co'})

    # cleaned['co'] /= 1000 # Convert from ppb to ppm to be consistent with AQS

    # Drop all datapoints with incomplete data (e.g. missing co measurement)
    return cleaned.dropna()

# Run the BEACO2N cleaner over every site, keeping the same site keys.
measurement_df = {
    name: clean_BEACO2N(frame)
    for name, frame in measurement_df.items()
}

# Clean data for reference (QuantAQ) analogously
# NOTE: QuantAQ CO output is in ppb, because hourly data is 'final' (i.e. corrected). 
# See: https://docs.quant-aq.com/hardware/modulair/modulair#id-3.1-data-structure-and-outputs
def clean_QuantAQ(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of one QuantAQ reference dataframe.

    The input frame is left untouched. The columns "period_start",
    "period_end", "period_end_utc" and "sn" are dropped when present,
    "period_start_utc" becomes a UTC-aware "timestamp" column, "pm25" is
    renamed to "pm2_5", and rows containing any missing value are removed.

    Parameters
    ----------
    df : pd.DataFrame
        Raw frame; must contain "period_start_utc".

    Returns
    -------
    pd.DataFrame
        New frame with "timestamp" kept as a regular column (the later merge
        step selects and joins on a "timestamp" column).
    """
    redundant = [col for col in ["period_start", "period_end", "period_end_utc", "sn"] if col in df.columns]
    # drop() returns a new frame, so the caller's data is never modified.
    cleaned = df.drop(columns=redundant)
    # TODO: Find out if BEACO2N corrected hourly data timestamps reflect the average for the next or previous hour.
    cleaned = cleaned.rename(columns={"period_start_utc": "timestamp", "pm25": "pm2_5"})
    cleaned["timestamp"] = pd.to_datetime(cleaned["timestamp"], utc=True)
    # cleaned['co'] /= 1000 # convert from ppb to ppm [to be consistent with AQS]
    return cleaned.dropna()

# Every reference whose key does not contain "aqs" goes through the QuantAQ cleaner.
reference_df.update({
    name: clean_QuantAQ(frame)
    for name, frame in reference_df.items()
    if "aqs" not in name
})

def clean_aqs(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned two-column ("timestamp", "co") copy of an AQS dataframe.

    The input frame is left untouched. "sample_measurement" is renamed to
    "co", "date_gmt" and "time_gmt" are joined with a space and parsed into a
    UTC-aware "timestamp", every other column is discarded, and rows with a
    missing value are removed.

    Parameters
    ----------
    df : pd.DataFrame
        Raw frame; must contain string columns "date_gmt" and "time_gmt" and
        either "sample_measurement" or an existing "co" column.

    Returns
    -------
    pd.DataFrame
        New frame with columns ["timestamp", "co"] in that order; "timestamp"
        stays a regular column (the later merge step selects and joins on it).
    """
    # rename() returns a new frame, so the caller's data is never modified.
    cleaned = df.rename(columns={"sample_measurement": "co"})
    # should already be hourly, don't need to round.
    cleaned["timestamp"] = pd.to_datetime(
        cleaned["date_gmt"] + ' ' + cleaned["time_gmt"], utc=True
    )
    # Select the two kept columns and drop NaNs in one non-inplace chain, which
    # avoids running an inplace operation on a column-subset slice.
    return cleaned[["timestamp", "co"]].dropna()

# Every reference whose key contains "aqs" goes through the AQS cleaner.
reference_df.update({
    name: clean_aqs(frame)
    for name, frame in reference_df.items()
    if "aqs" in name
})

# Remove AQS references from keys
# TODO: Temporary
# reference_df = {key : val for key, val in reference_df.items() if "aqs" not in key}

# Remove CO outliers from every site's data, including reference. 
def _rm_co_outliers(df: pd.DataFrame, z_threshold: float = 3) -> pd.DataFrame:
    """Keep only rows whose 'co' z-score is strictly below ``z_threshold``.

    The z-score is ``|co - mean(co)| / std(co)`` computed over the whole frame.
    When the standard deviation is 0 or NaN (constant series, single row,
    empty frame) the z-score is undefined; comparing NaN against the threshold
    would discard every row, so in that case only rows with a missing 'co'
    are removed.
    """
    co = df['co']
    spread = co.std()
    if not spread > 0:  # True for both 0 and NaN
        return df[co.notna()]
    co_zscore = (co - co.mean()).abs() / spread
    return df[co_zscore < z_threshold]


def rm_reference_outliers(df: pd.DataFrame) -> pd.DataFrame:
    """Return the rows of a reference frame whose 'co' lies within 3 standard deviations of its mean."""
    return _rm_co_outliers(df)


def rm_measurement_outliers(df: pd.DataFrame) -> pd.DataFrame:
    """Return the rows of a measurement frame whose 'co' lies within 3 standard deviations of its mean."""
    return _rm_co_outliers(df)

# Apply the z-score filter to every frame; both dicts are rebuilt with the same keys.
measurement_df = {
    name: rm_measurement_outliers(frame)
    for name, frame in measurement_df.items()
}
reference_df = {
    name: rm_reference_outliers(frame)
    for name, frame in reference_df.items()
}

# Site groupings. The keys (dpw, pema, pha) match the non-AQS reference file stems
# loaded by this cell, and each key also appears in its own list.
# NOTE(review): presumably each list is the set of sites assigned to that reference
# monitor's zone - confirm.
zones = dict(
    dpw=[
        "reservoir", "medschool", "dpw", "ccri", "southprovlib",
        "prek", "gym", "cfs", "myron",
    ],
    pema=[
        "ecubed", "rochambeau", "smithhill", "martialarts",
        "blackstone", "rocklib", "provcollege", "pema",
    ],
    pha=[
        "silverlake", "carnevale", "zuccolo", "wecc",
        "unitedway", "pha", "mtpleasant", "ricollege",
    ],
)

['./reference_measurements/aqs_cranston.csv', './reference_measurements/pha.csv', './reference_measurements/pema.csv', './reference_measurements/dpw.csv', './reference_measurements/aqs_myron.csv']
['aqs_cranston', 'pha', 'pema', 'dpw', 'aqs_myron']


In [10]:
from functools import reduce

# Accumulator for the per-site frames (columns already renamed) that are merged further down.
dfs = list()

# Helper function to rename columns for each site
def rename_columns(df, site, is_aqs=False, is_quantaq=False, is_beaco2n=False):
    """Return a copy of ``df`` reduced to timestamp/co/rh/temp with site-tagged names.

    'timestamp' is always kept (and must exist); 'co', 'rh' and 'temp' are kept
    only when present. Every kept column except 'timestamp' is renamed to
    ``<column><suffix>`` where the suffix is, in order of precedence:

      * ``_aqs_<site>``      when ``is_aqs``
      * ``_<site>_quantaq``  when ``is_quantaq``
      * ``_<site>_beaco2n``  when ``is_beaco2n``
      * ``_<site>``          otherwise

    Exception: for AQS frames the 'co' column becomes ``co_<site>`` instead of
    using the AQS suffix. The input frame is not modified.
    """
    # Optional columns are kept in this fixed order, after 'timestamp'.
    present = [name for name in ('co', 'rh', 'temp') if name in df.columns]
    subset = df[['timestamp'] + present].copy()

    if is_aqs:
        suffix = f"_aqs_{site}"
    elif is_quantaq:
        suffix = f"_{site}_quantaq"
    elif is_beaco2n:
        suffix = f"_{site}_beaco2n"
    else:
        suffix = f"_{site}"

    new_names = {name: f"{name}{suffix}" for name in present}
    if is_aqs and 'co' in new_names:
        # For AQS, use "co_[site]" so the result is not tagged with "aqs" twice
        # when the site key already carries it.
        new_names['co'] = f"co_{site}"

    return subset.rename(columns=new_names)

# Reference frames go first. A key containing "aqs" marks an AQS frame; any other
# reference that shares its key with a measurement site is tagged as QuantAQ.
for site, df in reference_df.items():
    is_aqs = 'aqs' in site
    is_quantaq = not is_aqs and site in measurement_df
    dfs.append(rename_columns(df, site, is_aqs=is_aqs, is_quantaq=is_quantaq))

# Measurement frames follow; one that shares its key with a reference is tagged as BEACO2N.
for site, df in measurement_df.items():
    is_beaco2n = site in reference_df
    dfs.append(rename_columns(df, site, is_beaco2n=is_beaco2n))


def _outer_join_on_timestamp(left, right):
    """Outer-merge two frames on their shared 'timestamp' column."""
    return pd.merge(left, right, on='timestamp', how='outer')


# Fold all frames into one wide table, order it by time, and expose the time
# axis as an index named "date" before writing the CSV.
merged_all = (
    reduce(_outer_join_on_timestamp, dfs)
    .sort_values('timestamp')
    .set_index('timestamp')
    .rename_axis("date")
)
merged_all.to_csv("./combined_dataset.csv")