This script calibrates BEACO2N carbon monoxide data using QuantAQ sensors as reference. 

In [20]:
# %pip install scikit-learn
# %pip install matplotlib 
import os
import pandas as pd
from datetime import datetime
import numpy as np
from matplotlib import pyplot as plt
# import sklearn
from glob import glob

In [None]:
# Read every CSV in the BEACO2N and reference folders into two dicts of raw
# dataframes, keyed by site name (the file name without directory or extension).
def _site_name(csv_path: str) -> str:
    """Return the base name of `csv_path` with its extension stripped."""
    stem, _ext = os.path.splitext(os.path.basename(csv_path))
    return stem

measurement_files = glob("./BEACO2N_measurements/*.csv")
measurement = {_site_name(csv_path): pd.read_csv(csv_path) for csv_path in measurement_files}
reference_files = glob("./reference_measurements/*.csv")
reference = {_site_name(csv_path): pd.read_csv(csv_path) for csv_path in reference_files}

# Clean measurement and reference data.
def clean_measurement(df: pd.DataFrame) -> pd.DataFrame :
    """Return a cleaned copy of one raw BEACO2N measurement frame.

    Steps, in order:
      1. Rename "datetime" -> "timestamp" and "co2_raw" -> "co2".
      2. Parse "timestamp" as UTC and round it to the nearest hour.
      3. Drop the bookkeeping columns "local_timestamp", "epoch" and
         "node_file_id" when present.
      4. Multiply every column whose name contains "_wrk_aux" by 1000
         (presumably volts -> millivolts; confirm against the data source)
         and strip "_wrk_aux" from those column names.
      5. Drop rows containing any missing value.

    The input frame is not modified. Rounding can map several rows onto the
    same hour; no aggregation or de-duplication is performed here.
    """
    cleaned = df.rename(columns={"datetime":"timestamp", "co2_raw":"co2"})
    cleaned["timestamp"] = pd.to_datetime(cleaned["timestamp"], utc=True).dt.round("h")

    bookkeeping = [name for name in ("local_timestamp", "epoch", "node_file_id") if name in cleaned.columns]
    cleaned = cleaned.drop(columns=bookkeeping)

    electrode_cols = cleaned.filter(regex=r"_wrk_aux").columns
    cleaned[electrode_cols] *= 1000
    short_names = {name: name.replace("_wrk_aux", "") for name in electrode_cols}

    return cleaned.rename(columns=short_names).dropna()

# Replace each raw measurement frame with its cleaned version, keeping the site keys.
measurement = {site_key: clean_measurement(raw_frame) for site_key, raw_frame in measurement.items()}

def clean_reference(df: pd.DataFrame) -> pd.DataFrame :
    """Return a cleaned copy of one raw reference-sensor frame.

    Drops the columns "period_start", "period_end", "period_end_utc" and "sn"
    when present, renames "period_start_utc" -> "timestamp" and
    "pm25" -> "pm2_5", parses "timestamp" as UTC, and finally drops rows
    containing any missing value. The input frame is not modified.
    """
    unwanted = [name for name in ("period_start", "period_end", "period_end_utc", "sn") if name in df.columns]
    cleaned = (
        df.drop(columns=unwanted)
        .rename(columns={"period_start_utc": "timestamp", "pm25": "pm2_5"})
        .assign(timestamp=lambda frame: pd.to_datetime(frame["timestamp"], utc=True))
    )
    return cleaned.dropna()

# Replace each raw reference frame with its cleaned version, keeping the site keys.
reference = {site_key: clean_reference(raw_frame) for site_key, raw_frame in reference.items()}

# Inner-join each reference site's measurement and reference frames on timestamp.
# The result is a dict of merged dataframes keyed by site; columns present in both
# inputs receive the "_meas" / "_ref" suffixes.
merged = {
    site_key: measurement[site_key]
    .merge(reference[site_key], on="timestamp", how="inner", suffixes=("_meas","_ref"))
    .dropna()
    for site_key in reference
}

Find the timestamps at which the relative standard deviation (RSD) of CO across all reference sensors in the network is below 0.10:

In [None]:
# One two-column frame per reference site: the timestamp plus that site's CO
# reading, with the CO column renamed to the site name.
co_df = [
    site_frame[["timestamp","co"]].rename(columns={"co":site_name})
    for site_name, site_frame in reference.items()
]
# Inner-join the per-site frames so that only timestamps present at every
# reference site remain; each site contributes one column.
rsd_df = co_df[0]
for site_co in co_df[1:]:
    rsd_df = rsd_df.merge(site_co, on="timestamp", how="inner")

def rsd(row:pd.Series) -> float:
    """Relative standard deviation (sd / |mean|) of one row of sensor readings.

    Parameters
    ----------
    row : pd.Series
        A row whose first element is skipped (the caller passes rows whose
        first column is the timestamp); all remaining elements are cast to
        float and treated as the readings.

    Returns
    -------
    float
        Population standard deviation (numpy's default, ddof=0) divided by
        the absolute value of the mean, or NaN when the mean is exactly zero.
    """
    readings = row.iloc[1:].to_numpy(dtype=float)
    mean = np.mean(readings)
    sd = np.std(readings)
    # Divide by |mean|: with a signed mean, a negative average would yield a
    # negative RSD, which would always pass a "rsd < threshold" filter no
    # matter how widely the sensors disagree.
    return float(sd / abs(mean)) if mean != 0 else np.nan

# Compute the across-site RSD for every timestamp, then keep the timestamps
# whose RSD is below 0.10.
rsd_df["rsd"] = rsd_df.apply(rsd, axis=1)
low_rsd_rows = rsd_df["rsd"] < .10
timestamps_rsd_lt_10pc = rsd_df.loc[low_rsd_rows, "timestamp"]

# Narrow the low-RSD timestamps down to those that appear in every measurement
# frame, every reference frame and every merged frame.
common_timestamps = timestamps_rsd_lt_10pc
for frames in (measurement, reference, merged):
    for site in frames:
        present_here = common_timestamps.isin(frames[site]["timestamp"])
        common_timestamps = common_timestamps[present_here]

# Restrict every per-site frame to the timestamps in common_timestamps and put
# the rows in chronological order.
def _keep_common(frame: pd.DataFrame) -> pd.DataFrame:
    """Return the rows of `frame` whose timestamp is in common_timestamps, sorted by time.

    The index is reset *after* sorting so that it runs 0..n-1 in chronological
    order; resetting before the sort would leave a permuted index whenever the
    input rows were not already ordered by timestamp.
    """
    kept = frame[frame["timestamp"].isin(common_timestamps)]
    return kept.sort_values("timestamp").reset_index(drop=True)

for frames in (measurement, reference, merged):
    # Only existing keys are reassigned, so the dict size is unchanged while iterating.
    for site in frames:
        frames[site] = _keep_common(frames[site])

# Preview the first rows of the RSD table and of every per-site frame.
print("rsd_df:\n", rsd_df.head())
for heading, frames in (("merged:\n", merged), ("measurement:\n", measurement), ("reference:\n", reference)):
    print(heading, {site_key: frame.head() for site_key, frame in frames.items()})

Fit per-site regression models to the data where RSD(co) < 0.10:

In [49]:
# Fit a linear calibration model (scikit-learn LinearRegression) for each BEACO2N
# node that has a reference sensor, then plot predicted against reference CO.
# %pip install plotly.express
# !pip install nbformat
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import plotly.express as px

refsites = list(reference.keys())

# Per-site results, each keyed by site name.
models = {}
x_train = {}
x_test = {}
y_train = {}
y_test = {}
y_pred = {}

for site in refsites:
    # Features: every cleaned measurement column except the timestamp.
    # Target: the reference sensor's CO reading.
    features = measurement[site].drop("timestamp", axis=1)
    target = reference[site]["co"]

    # Features and targets come from two separately filtered frames and are
    # paired purely by row position. Fail loudly if the timestamps do not line
    # up row for row, instead of silently fitting on mismatched pairs.
    if measurement[site]["timestamp"].tolist() != reference[site]["timestamp"].tolist():
        raise ValueError(
            f"Site {site}: measurement and reference timestamps are not row-aligned; "
            "features and targets would be paired incorrectly."
        )

    # Fixed random_state keeps the train/test partition reproducible.
    x_train[site], x_test[site], y_train[site], y_test[site] = train_test_split(
        features, target, random_state=42
    )

    models[site] = LinearRegression()
    models[site].fit(x_train[site], y_train[site])
    y_pred[site] = models[site].predict(x_test[site])
    print(site, "\tR^2: ", models[site].score(x_test[site], y_test[site]), '\n')

    # Predicted vs. reference scatter, with a dashed 1:1 line spanning the
    # range of the held-out reference values.
    fig = px.scatter(
        x=y_test[site],
        y=y_pred[site],
        labels={'x':'Reference CO (mV)', 'y':'Corrected measured CO (mV)'},
        title=f"Site: {site}"
    )
    lowest = y_test[site].min()
    highest = y_test[site].max()
    fig.add_shape(
        type='line',
        x0=lowest, y0=lowest,
        x1=highest, y1=highest,
        line=dict(color='red', dash='dash')
    )
    # Rendering in the notebook needs nbformat>=4.2.0 installed in the kernel
    # environment; without it plotly raises ValueError here.
    fig.show()

pha 	R^2:  0.9816143738945818 



ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed