This script calibrates BEACO2N carbon monoxide data using QuantAQ sensors as reference. The script only applies this calibration to colocated sites, i.e. Department of Public Works, Providence Emergency Management Agency, and Providence Housing Authority. 

In [1]:
import os
from glob import glob
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [2]:
# Parse the measurement and reference dataframe lists from csv files in project folder.
measurement_files = glob("./BEACO2N_measurements/*.csv")
measurement = {os.path.splitext(os.path.basename(f))[0] : pd.read_csv(f) for f in measurement_files}
reference_files = glob("./reference_measurements/*.csv")
reference = {os.path.splitext(os.path.basename(f))[0] : pd.read_csv(f) for f in reference_files}

# Clean measurement and reference data.

def clean_measurement(df: pd.DataFrame) -> pd.DataFrame :
    # Store time in Pandas datetime format.
    df = df.rename(columns={"datetime":"timestamp", "co2_raw":"co2"})
    df["timestamp"] = pd.to_datetime(df["timestamp"], utc=True).dt.round("h")

    # Drop redundant time columns
    df = df.drop(columns=[col for col in ["local_timestamp", "epoch", "node_file_id", "node_id"] if col in df.columns])
    
    # For all columns suffixed by "_wrk_aux", convert from Volts to milliVolts (*1000) and remove suffix
    wrk_aux_cols = df.filter(regex=r"_wrk_aux$").columns
    df[wrk_aux_cols] *= 1000
    df.rename(columns= {col : col.replace("_wrk_aux", "") for col in wrk_aux_cols}, inplace=True)
    
    # Drop all datapoints with incomplete data (e.g. missing co measurement)
    df = df.dropna()
    return df

# Clean each site's dataframe
measurement = {site: clean_measurement(df) for site, df in measurement.items()}

# Clean data for reference (QuantAQ) analogously
def clean_reference(df: pd.DataFrame) -> pd.DataFrame :
    df = df.drop(columns=[col for col in ["period_start", "period_end", "period_end_utc", "sn"] if col in df.columns])
    df = df.rename(columns={"period_start_utc": "timestamp", "pm25": "pm2_5"})
    df["timestamp"] = pd.to_datetime(df["timestamp"], utc=True)
    df = df.dropna()
    return df

reference = {key: clean_reference(df) for key, df in reference.items()}

Find time intervals when RSD(co) < .10 for all reference sensors in the network:

In [None]:
from functools import reduce
rsd_df = []

# Add each site's timestamp and co data to a list of co tables indexed by site
[rsd_df.append(df[["timestamp","co"]].rename(columns={'co':site})) for site, df in reference.items()]
# Merge the co tables into one table. 
rsd_df = reduce(lambda table, to_merge: pd.merge(table, to_merge, on="timestamp", how="inner"), rsd_df) # type: ignore

def rsd(row:pd.Series) -> float:
    ''' Helper function to calculate residual standard deviation of a dataframe row. '''
    vals = row[1:].values.astype(float)
    mean = np.mean(vals)
    sd = np.std(vals)
    return float(sd/mean) if mean != 0 else np.nan

rsd_df["rsd"]=rsd_df.apply(rsd, axis=1)
timestamps_rsd_lt_10pc = rsd_df[rsd_df["rsd"]<.10]["timestamp"]
print(f"10pc_times:\n{timestamps_rsd_lt_10pc}\n")
# Filter datasets to include only data contained by intersection(timestamps_rsd_lt_10pc, measurement, reference)

common_timestamps = timestamps_rsd_lt_10pc
for ref_site in reference: common_timestamps = common_timestamps[common_timestamps.isin(reference[ref_site]["timestamp"])]

common_times_by_meas = {}
for meas_site in measurement.keys(): common_times_by_meas[meas_site] = common_timestamps[common_timestamps.isin(measurement[meas_site]["timestamp"])]

# excluded_meas_sites = ["zuccolo", "gym", "reservoir", "mtpleasant", "rochambeau", "ricollege"]

# Find the common timestamps
# common_timestamps = timestamps_rsd_lt_10pc
# # print("23:", common_timestamps)
# for site in measurement:
#     if site in excluded_meas_sites: continue
#     common_timestamps = common_timestamps[common_timestamps.isin(measurement[site]["timestamp"])]
#     print(f"meas_site:{site}, commontimes:\n{common_timestamps}\n")
# for site in reference:
#     common_timestamps = common_timestamps[common_timestamps.isin(reference[site]["timestamp"])]
#     print(f"ref_site:{site}, commontimes:\n{common_timestamps}\n")

# Dicts containing dataframes filtered to only contain timestamps in 
# intersection(timestamps_rsd_lt_10pc, measurement, reference)
measurement_filtered_common = {}
reference_filtered_common = {}

# Filter dataframes to only include timestamps in common_timestamps

all_timestamps = pd.concat(
    [meas_df["timestamp"] for meas_df in measurement.values()] + 
    [ref_df["timestamp"] for ref_df in reference.values()]
).tolist()
all_timestamps = all_timestamps

# for site in measurement.keys():
#     # measurement_filtered_common[site] = \
#         # measurement[site][measurement[site]["timestamp"].isin(common_timestamps)].drop("timestamp")
#     # measurement_filtered_common[site] = measurement[site].filter([measurement[site]["timestamp"].isin(common_timestamps)].reset_index(drop=True).sort_values("timestamp")
#     print("T:", common_timestamps)
#     # print("[DF]:", measurement_filtered_common)
# for site in reference.keys():
#     reference_filtered_common[site] = reference[site][reference[site]["timestamp"].isin(common_timestamps)].reset_index(drop=True).sort_values("timestamp")

# # Print the first few rows of each key dataframe for inspection
# print("rsd_df (first 5 rows):")
# display(rsd_df.head())

# for name, data in [("merged", merged), ("measurement", measurement), ("reference", reference)]:
#     print(f"\n{name} (first 2 rows per site):")
#     for k, v in data.items():
#         print(f"Site: {k}")
#         display(v.head(2))


10pc_times:
0      2024-12-17 12:00:00+00:00
1      2024-12-17 13:00:00+00:00
2      2024-12-17 14:00:00+00:00
3      2024-12-17 15:00:00+00:00
4      2024-12-17 16:00:00+00:00
                  ...           
3725   2025-05-22 12:00:00+00:00
3726   2025-05-22 13:00:00+00:00
3727   2025-05-22 14:00:00+00:00
3728   2025-05-22 15:00:00+00:00
3729   2025-05-22 16:00:00+00:00
Name: timestamp, Length: 3458, dtype: datetime64[ns, UTC]

meas_site:cfs, commontimes:
0      2024-12-17 12:00:00+00:00
1      2024-12-17 13:00:00+00:00
2      2024-12-17 14:00:00+00:00
3      2024-12-17 15:00:00+00:00
4      2024-12-17 16:00:00+00:00
                  ...           
3725   2025-05-22 12:00:00+00:00
3726   2025-05-22 13:00:00+00:00
3727   2025-05-22 14:00:00+00:00
3728   2025-05-22 15:00:00+00:00
3729   2025-05-22 16:00:00+00:00
Name: timestamp, Length: 3400, dtype: datetime64[ns, UTC]

meas_site:silverlake, commontimes:
0      2024-12-17 12:00:00+00:00
1      2024-12-17 13:00:00+00:00
2      2024-12-

Fit per-site regression models to the data where RSC(co)<0.10:

In [6]:
zones = {
    "dpw" : ["reservoir", "medschool", "dpw", "ccri", "southprovlib", "prek", "gym", "cfs", "myron"],
    "pema" : ["ecubed", "rochambeau", "smithhill", "martialarts", "blackstone", "rocklib", "provcollege", "pema"],
    "pha" : ["silverlake", "carnevale", "zuccolo", "wecc", "unitedway", "pha", "mtpleasant", "ricollege"]
}

models = {ref_site : {meas_site : LinearRegression() for meas_site in zones[ref_site]} for ref_site in zones.keys()}
X_train = {ref_site : {meas_site : pd.DataFrame() for meas_site in zones[ref_site]} for ref_site in zones.keys()}
X_test = {ref_site : {meas_site : pd.DataFrame() for meas_site in zones[ref_site]} for ref_site in zones.keys()}
y_train = {ref_site : pd.DataFrame() for ref_site in zones.keys()}
y_test = {ref_site : pd.DataFrame() for ref_site in zones.keys()}
y_pred = {ref_site : {meas_site : np.empty(0) for meas_site in zones[ref_site]} for ref_site in zones.keys()} # TODO: Need to make sure np.empty(0) will work. 

for i, ref_site in enumerate(zones.keys()):
    print((reference_filtered_common[ref_site]))
    train_indxs, test_indxs = train_test_split(range(len(reference_filtered_common[ref_site])), random_state=0)
    y_train[ref_site] = reference_filtered_common[ref_site].iloc[train_indxs]
    y_test[ref_site] = reference_filtered_common[ref_site].iloc[test_indxs]

    for j, meas_site in enumerate(zones.get(ref_site)): # type: ignore
        X_train[ref_site][meas_site] = measurement_filtered_common[meas_site].iloc[train_indxs].drop("timestamp", axis=1)
        X_test[ref_site][meas_site] = measurement_filtered_common[meas_site].iloc[train_indxs].drop("timestamp", axis=1)
        models[ref_site][meas_site].fit(X_train[ref_site][meas_site], y_train[ref_site])
        y_pred[ref_site][meas_site] = models[ref_site][meas_site].predict(X_test[ref_site][meas_site])
# for i, site in enumerate(ref_sites):
#     x_train_i, x_test_i, y_train_i, y_test_i = train_test_split(measurement[site].drop("timestamp", axis=1), reference[site]["co"], random_state=42)
#     x_train[site] = x_train_i
#     x_test[site] = x_test_i
#     y_train[site] = y_train_i
#     y_test[site] = y_test_i
#     models[site] = LinearRegression()
#     models[site].fit(x_train[site], y_train[site])
#     y_pred[site] = models[site].predict(x_test[site])

Empty DataFrame
Columns: [timestamp, n_datapoints, rh, temp, pm1, pm2_5, pm10, co, no, no2, o3]
Index: []


ValueError: With n_samples=0, test_size=0.25 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.

Display residual graphs and statistics for each site model. 

In [None]:
fig = make_subplots(rows=1, cols=3, subplot_titles=[f"{site}: R^2={models[site].score(x_test[site], y_test[site]):.4f}" for site in refsites])
fig_ledgend = {"Corrected CO":"blue", "Uncorrected CO":"red"}
hover_vals = {"Corrected CO":
            'Reference CO: %{x}<br>' + 
            'Corrected CO:%{y}<br>' +
            'Timestamp: %{customdata}<extra></extra>', 
            "Uncorrected CO":
            'Reference CO: %{x}<br>' + 
            'Uncorrected CO:%{y}<br>' +
            'Timestamp: %{customdata}<extra></extra>'}
for i, site in enumerate(refsites):
    # Plot each site's data in a subplot
    showledgend = (i==1)
    y_uncorrected = measurement[site].sort_values(by="timestamp")["co"]
    for label in fig_ledgend.keys():
        # Plot corrected and uncorrected CO data for each site
        fig.add_trace(go.Scatter(
            x=y_test[site],
            y = y_pred[site] if label=="Corrected CO" else y_uncorrected,
            mode='markers',
            name=label,
            marker_color=fig_ledgend[label],
            showlegend=showledgend,
            customdata=common_timestamps.sort_values(),
            hovertemplate=hover_vals[label] 
        ), 
        row=1, col=(i+1))

    min_val = min(y_test[site].min(), y_pred[site].min(), y_uncorrected.min())
    max_val = max(y_test[site].max(), y_pred[site].max(), y_uncorrected.max())
    fig.add_shape(
        type='line',
        x0=min_val, y0=min_val,
        x1=max_val, y1=max_val,
        line=dict(color='red', dash='dash'),
        name='1:1 Line',
        row=1, col=(i+1)
    )
    fig.update_layout(
        title="Corrected BEACO2N CO data at reference sites:",
        xaxis_title="Reference CO (mV)",
        yaxis_title="Measured CO (mV)",
        legend_title="Legend", 
    )
    fig.update_xaxes(title_text="Reference CO (mV)")

fig.show()
# Collect coefficients for each site into a DataFrame for tabular display
coef_table = []
for site, model in models.items():
    for name, coef in zip(x_train[site].columns, model.coef_):
        coef_table.append({'Site': site, 'Parameter': name, 'Coefficient': coef})
coef_df = pd.DataFrame(coef_table)
display(coef_df.pivot(index='Parameter', columns='Site', values='Coefficient').round(3))

Assign each measurement (BEACO2N) sensor to a reference sensor and train a calibration model (for each measurement sensor) using timestamps with RSC(co)<0.10 within the reference network.

Note: Measurement nodes are assigned to the nearest reference node according to calculations done in QGIS with Grace's BPP network map. May be worth confirming this using ArcGIS at some point. (Perhaps RIDEM data can eventually be used to improve spacial accuracy... some nodes are >2mi from reference.)

| Reference | Measurement locations |
|------------|-------------------------|
| dpw | reservoir, medschool, dpw, ccri, southprovlib, prek, gym, cfs, myron|
| pema | ecubed, rochambeau, smithhill, martialarts, blackstone, rocklib, provcollege, pema|
| pha | silverlake, carnevale, zuccolo, wecc, unitedway, pha, mtpleasant, ricollege|
