This script calibrates BEACO2N carbon monoxide (CO) data using QuantAQ sensors as the reference. The calibration is applied only to the colocated sites, i.e., the Department of Public Works (DPW), the Providence Emergency Management Agency (PEMA), and the Providence Housing Authority (PHA).

_TODO:_ Ensure timestamps are not becoming misaligned. [A manual check of the rightmost data point for PHA (graphed below) showed that the QuantAQ timestamp for that CO value did not match the timestamp displayed on the graph.]

In [None]:
import os
from glob import glob
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import plotly.graph_objects as go

In [39]:
# Load every CSV in the two project folders into dictionaries keyed by the
# file name without its extension (used as the site name downstream).
measurement_files = glob("./BEACO2N_measurements/*.csv")
measurement = dict(
    (os.path.splitext(os.path.basename(path))[0], pd.read_csv(path))
    for path in measurement_files
)
reference_files = glob("./reference_measurements/*.csv")
reference = dict(
    (os.path.splitext(os.path.basename(path))[0], pd.read_csv(path))
    for path in reference_files
)

# Clean measurement and reference data.
def clean_measurement(df: pd.DataFrame) -> pd.DataFrame :
    """Return a cleaned copy of one site's BEACO2N measurement dataframe.

    Steps, in order:
      * rename "datetime" -> "timestamp" and "co2_raw" -> "co2";
      * parse "timestamp" as UTC and round it to the nearest hour;
      * drop the bookkeeping columns "local_timestamp", "epoch" and
        "node_file_id" when they are present;
      * multiply every column whose name contains "_wrk_aux" by 1000 and
        strip that suffix from its name (presumably a V -> mV conversion;
        TODO confirm units);
      * drop every row that still contains a missing value.

    The input dataframe is not modified.
    """
    bookkeeping = ("local_timestamp", "epoch", "node_file_id")

    cleaned = df.rename(columns={"datetime":"timestamp", "co2_raw":"co2"})
    cleaned["timestamp"] = pd.to_datetime(cleaned["timestamp"], utc=True).dt.round("h")
    cleaned = cleaned.drop(columns=[name for name in bookkeeping if name in cleaned.columns])

    # Scale the "_wrk_aux" channels first, then give them their short names.
    aux_columns = cleaned.filter(regex=r"_wrk_aux").columns
    cleaned[aux_columns] *= 1000
    short_names = {name : name.replace("_wrk_aux", "") for name in aux_columns}
    cleaned = cleaned.rename(columns=short_names)

    return cleaned.dropna()

# Replace each site's raw measurement dataframe with its cleaned version,
# keeping the same site keys.
measurement = {key: clean_measurement(df) for key, df in measurement.items()}

def clean_reference(df: pd.DataFrame) -> pd.DataFrame :
    """Return a cleaned copy of one site's reference dataframe.

    Drops the columns "period_start", "period_end", "period_end_utc" and
    "sn" when present, renames "period_start_utc" -> "timestamp" and
    "pm25" -> "pm2_5", parses "timestamp" as a UTC datetime, and removes
    every row that contains a missing value. The input is not modified.
    """
    redundant = ("period_start", "period_end", "period_end_utc", "sn")
    present = [name for name in redundant if name in df.columns]

    cleaned = df.drop(columns=present)
    cleaned = cleaned.rename(columns={"period_start_utc": "timestamp", "pm25": "pm2_5"})
    cleaned["timestamp"] = pd.to_datetime(cleaned["timestamp"], utc=True)
    return cleaned.dropna()

# Replace each site's raw reference dataframe with its cleaned version.
reference = {site: clean_reference(frame) for site, frame in reference.items()}

# Inner-join measurement and reference data on "timestamp" for every site that
# has a reference sensor, giving one dict of merged per-site dataframes.
# Columns present in both inputs get the "_meas" / "_ref" suffixes.
merged = {
    site: measurement[site]
    .merge(reference[site], on="timestamp", how="inner", suffixes=("_meas","_ref"))
    .dropna()
    for site in reference
}

Find the timestamps at which the relative standard deviation (RSD) of CO across all reference sensors in the network is below 0.10:

In [42]:
# One two-column frame per reference site: "timestamp" plus that site's CO
# reading, with the CO column renamed to the site key.
co_df = [
    frame[["timestamp","co"]].rename(columns={"co":site})
    for site, frame in reference.items()
]

# Inner-join the per-site frames on "timestamp" so that only timestamps
# reported by every reference site remain.
rsd_df = co_df[0]
for site_frame in co_df[1:]:
    rsd_df = rsd_df.merge(site_frame, on="timestamp", how="inner")

def rsd(row:pd.Series) -> float:
    """Return the relative standard deviation (std / mean) of one row.

    The first element of the row is skipped (it holds the timestamp when
    the function is applied to ``rsd_df``); the remaining elements are cast
    to float. The standard deviation is NumPy's default population form.
    Returns NaN when the mean is exactly zero.
    """
    readings = row[1:].values.astype(float)
    center = np.mean(readings)
    spread = np.std(readings)
    if center == 0:
        return np.nan
    return float(spread / center)

# Relative standard deviation of reference CO across sites, one value per
# timestamp, and the timestamps at which it is below 10 %.
rsd_df["rsd"] = rsd_df.apply(rsd, axis=1)
timestamps_rsd_lt_10pc = rsd_df[rsd_df["rsd"] < .10]["timestamp"]

# Restrict the low-RSD timestamps to those present in every dataframe of
# measurement, reference and merged (the intersection over all sites).
common_timestamps = timestamps_rsd_lt_10pc
for frames in (measurement, reference, merged):
    for frame in frames.values():
        common_timestamps = common_timestamps[common_timestamps.isin(frame["timestamp"])]

# Keep only the rows at common_timestamps in every dataframe.
# Sort first and reset the index afterwards so that index labels follow
# timestamp order; resetting before sorting leaves the labels out of row order.
for frames in (measurement, reference, merged):
    for site in frames:
        keep = frames[site]["timestamp"].isin(common_timestamps)
        frames[site] = frames[site][keep].sort_values("timestamp").reset_index(drop=True)

# Print the first few rows of each key dataframe for inspection
print("rsd_df (first 5 rows):")
display(rsd_df.head())

for label, frames in (("merged", merged), ("measurement", measurement), ("reference", reference)):
    print(f"\n{label} (first 2 rows per site):")
    for k, v in frames.items():
        print(f"\nSite: {k}")
        display(v.head(2))

rsd_df (first 5 rows):


Unnamed: 0,timestamp,pha,pema,dpw,rsd
0,2024-12-17 12:00:00+00:00,754.405,744.122,701.595,0.031171
1,2024-12-17 13:00:00+00:00,740.072,754.683,716.186,0.021531
2,2024-12-17 14:00:00+00:00,734.513,762.251,717.17,0.025159
3,2024-12-17 15:00:00+00:00,753.998,777.728,725.462,0.028399
4,2024-12-17 16:00:00+00:00,744.177,788.258,746.952,0.026531



merged (first 2 rows per site):

Site: pha


Unnamed: 0,timestamp,co2,co_meas,no2_meas,no_meas,o3_meas,pm2_5_meas,pressure,rh_meas,temp_meas,...,wd,ws,pm1,pm2_5_ref,pm10,co_ref,no_ref,no2_ref,o3_ref,ws_scalar
0,2024-12-17 12:00:00+00:00,462.892583,54.672333,3.436667,-3.593667,25.721,1.37736,1013.650033,90.182884,15.007287,...,239.755,2.118,1.017,1.968,4.642,754.405,3.398,7.128,28.949,2.335
1,2024-12-17 13:00:00+00:00,463.73,55.103167,1.539167,-2.300833,21.973167,2.22611,1013.795214,80.274058,15.498985,...,237.424,4.287,1.651,2.289,4.248,740.072,2.726,6.928,27.273,5.033



Site: pema


Unnamed: 0,timestamp,co2,co_meas,no2_meas,no_meas,o3_meas,pm2_5_meas,pressure,rh_meas,temp_meas,...,wd,ws,pm1,pm2_5_ref,pm10,co_ref,no_ref,no2_ref,o3_ref,ws_scalar
0,2024-12-17 12:00:00+00:00,479.616675,52.614833,-2.157,31.801667,20.7315,0.0,1015.855074,77.371347,15.97132,...,234.356,5.086,0.356,1.313,5.894,744.122,2.724,10.146,37.536,6.087
1,2024-12-17 13:00:00+00:00,481.544955,57.119667,-3.358167,32.823167,16.757,0.20352,1016.041298,72.557519,16.265786,...,227.197,5.219,0.935,1.563,4.782,754.683,2.289,9.596,34.508,6.064



Site: dpw


Unnamed: 0,timestamp,co2,co_meas,no2_meas,no_meas,o3_meas,pm2_5_meas,pressure,rh_meas,temp_meas,...,n_datapoints,rh_ref,temp_ref,pm1,pm2_5_ref,pm10,co_ref,no_ref,no2_ref,o3_ref
0,2024-12-17 12:00:00+00:00,445.549768,64.3545,-7.146,25.432333,5.628167,1.19892,1015.462369,66.358414,17.519487,...,60,87.588,14.107,1.176,3.01,12.846,701.595,3.22,10.06,35.551
1,2024-12-17 13:00:00+00:00,449.488032,70.413,-9.695167,26.669833,1.483,1.6987,1015.694863,61.404523,17.984605,...,60,84.245,14.078,1.825,2.798,7.874,716.186,2.776,10.311,32.842



measurement (first 2 rows per site):

Site: pha


Unnamed: 0,timestamp,co2,co,no2,no,o3,pm2_5,pressure,rh,temp,node_id
0,2024-12-17 12:00:00+00:00,462.892583,54.672333,3.436667,-3.593667,25.721,1.37736,1013.650033,90.182884,15.007287,257
1,2024-12-17 13:00:00+00:00,463.73,55.103167,1.539167,-2.300833,21.973167,2.22611,1013.795214,80.274058,15.498985,257



Site: ccri


Unnamed: 0,timestamp,co2,co,no2,no,o3,pm2_5,pressure,rh,temp,node_id
0,2024-12-17 12:00:00+00:00,452.537464,79.797833,9.830667,9.464333,12.063833,0.4,1014.462575,70.761765,16.44903,259
1,2024-12-17 13:00:00+00:00,455.761568,86.676,8.098667,10.777167,7.265833,0.58726,1014.633889,65.207693,16.792317,259



Site: pema


Unnamed: 0,timestamp,co2,co,no2,no,o3,pm2_5,pressure,rh,temp,node_id
0,2024-12-17 12:00:00+00:00,479.616675,52.614833,-2.157,31.801667,20.7315,0.0,1015.855074,77.371347,15.97132,271
1,2024-12-17 13:00:00+00:00,481.544955,57.119667,-3.358167,32.823167,16.757,0.20352,1016.041298,72.557519,16.265786,271



Site: myron


Unnamed: 0,timestamp,co2,co,no2,no,o3,pm2_5,pressure,rh,temp,node_id
0,2024-12-17 12:00:00+00:00,509.253555,75.746333,15.918167,29.749833,26.8555,0.05679,1014.940731,80.805683,15.662446,250
1,2024-12-17 13:00:00+00:00,510.426209,79.467167,14.273667,30.407833,24.457,0.20538,1015.135324,73.930475,16.063199,250



Site: library


Unnamed: 0,timestamp,co2,co,no2,no,o3,pm2_5,pressure,rh,temp,node_id
0,2024-12-17 12:00:00+00:00,469.62794,69.236833,2.262667,6.270167,12.543167,4.24057,1012.785028,81.562849,15.029639,253
1,2024-12-17 13:00:00+00:00,472.766053,66.911333,0.227667,7.463667,8.096333,5.03286,1012.97048,74.553787,15.491189,253



Site: dpw


Unnamed: 0,timestamp,co2,co,no2,no,o3,pm2_5,pressure,rh,temp,node_id
0,2024-12-17 12:00:00+00:00,445.549768,64.3545,-7.146,25.432333,5.628167,1.19892,1015.462369,66.358414,17.519487,276
1,2024-12-17 13:00:00+00:00,449.488032,70.413,-9.695167,26.669833,1.483,1.6987,1015.694863,61.404523,17.984605,276



reference (first 2 rows per site):

Site: pha


Unnamed: 0,timestamp,n_datapoints,rh,temp,wd,ws,pm1,pm2_5,pm10,co,no,no2,o3,ws_scalar
0,2024-12-17 12:00:00+00:00,60,85.778,13.992,239.755,2.118,1.017,1.968,4.642,754.405,3.398,7.128,28.949,2.335
1,2024-12-17 13:00:00+00:00,60,82.692,13.952,237.424,4.287,1.651,2.289,4.248,740.072,2.726,6.928,27.273,5.033



Site: pema


Unnamed: 0,timestamp,n_datapoints,rh,temp,wd,ws,pm1,pm2_5,pm10,co,no,no2,o3,ws_scalar
0,2024-12-17 12:00:00+00:00,60,84.795,14.302,234.356,5.086,0.356,1.313,5.894,744.122,2.724,10.146,37.536,6.087
1,2024-12-17 13:00:00+00:00,60,81.717,14.182,227.197,5.219,0.935,1.563,4.782,754.683,2.289,9.596,34.508,6.064



Site: dpw


Unnamed: 0,timestamp,n_datapoints,rh,temp,pm1,pm2_5,pm10,co,no,no2,o3
0,2024-12-17 12:00:00+00:00,60,87.588,14.107,1.176,3.01,12.846,701.595,3.22,10.06,35.551
1,2024-12-17 13:00:00+00:00,60,84.245,14.078,1.825,2.798,7.874,716.186,2.776,10.311,32.842


Fit per-site regression models to the data where RSD(co) < 0.10:

In [41]:
# Fit a multivariate linear regression (sklearn LinearRegression) for each
# colocated BEACO2N node: every measurement column except "timestamp" is a
# feature and the reference CO at the same timestamp is the target.
# %pip install plotly.express
refsites = list(reference.keys())

models = {}
x_train = {}
x_test = {}
y_train = {}
y_test = {}
y_pred = {}
test_timestamps = {}  # timestamp of every test row, index-aligned with x_test / y_test

for site in refsites:
    # Pair each measurement row with the reference CO of the same timestamp
    # explicitly, rather than relying on the two dataframes happening to
    # share the same row order.
    ref_co = reference[site][["timestamp", "co"]].rename(columns={"co": "co_ref"})
    aligned = (
        measurement[site]
        .merge(ref_co, on="timestamp", how="inner")
        .sort_values("timestamp")
        .reset_index(drop=True)
    )
    features = aligned.drop(columns=["timestamp", "co_ref"])
    target = aligned["co_ref"]

    x_train[site], x_test[site], y_train[site], y_test[site] = train_test_split(
        features, target, random_state=42
    )
    # train_test_split shuffles the rows but keeps their index labels, so the
    # labels of the test split identify the matching timestamps.
    test_timestamps[site] = aligned.loc[x_test[site].index, "timestamp"]

    models[site] = LinearRegression()
    models[site].fit(x_train[site], y_train[site])
    y_pred[site] = models[site].predict(x_test[site])
    print(site, "\tR^2: ", models[site].score(x_test[site], y_test[site]), '\n')

for site in refsites:
    # Raw CO and hover timestamps come from the test rows themselves, so
    # every plotted point carries the timestamp of its own observation.
    y_raw = x_test[site]["co"]
    hover_times = test_timestamps[site].dt.strftime("%Y-%m-%d %H:%M %Z")

    fig = go.Figure()

    # Add corrected (predicted) values
    fig.add_trace(go.Scatter(
        x=y_test[site],
        y=y_pred[site],
        mode='markers',
        name='Corrected CO',
        customdata=hover_times,
        hovertemplate=
            'Reference CO: %{x}<br>' + 
            'Corrected CO:%{y}<br>' +
            'Timestamp: %{customdata}<extra></extra>'
    ))

    # Add raw (uncorrected) values
    fig.add_trace(go.Scatter(
        x=y_test[site],
        y=y_raw,
        mode='markers',
        name='Raw CO',
        customdata=hover_times,
        hovertemplate=
            'Reference CO: %{x}<br>' +
            'Raw CO: %{y}<br>' +
            'Timestamp: %{customdata}<extra></extra>'
    ))

    # Add 1:1 reference line
    min_val = min(y_test[site].min(), y_pred[site].min(), y_raw.min())
    max_val = max(y_test[site].max(), y_pred[site].max(), y_raw.max())
    fig.add_shape(
        type='line',
        x0=min_val, y0=min_val,
        x1=max_val, y1=max_val,
        line=dict(color='red', dash='dash'),
        name='1:1 Line'
    )

    # NOTE(review): both axis titles say mV, but the corrected values are in
    # the reference sensor's units - confirm the intended unit labels.
    fig.update_layout(
        title=f"Site: {site}",
        xaxis_title="Reference CO (mV)",
        yaxis_title="Measured CO (mV)",
        legend_title="Legend"
    )
    fig.show()

pha 	R^2:  0.9816143738945818 

pema 	R^2:  0.9844974907372382 

dpw 	R^2:  0.9826169878421204 

