This script calibrates BEACO2N carbon monoxide data using QuantAQ sensors as reference. 

In [None]:
# %pip install scikit-learn
import os
import pandas as pd
from datetime import datetime
import numpy as np
# import sklearn
from glob import glob

In [5]:
# Load every CSV found in the two project folders. Each result is a dict that
# maps the file name (without directory or extension) to its raw dataframe;
# those keys are used as the site identifiers further down.
measurement_files = glob("./BEACO2N_measurements/*.csv")
reference_files = glob("./reference_measurements/*.csv")

measurement = {
    os.path.splitext(os.path.basename(csv_path))[0]: pd.read_csv(csv_path)
    for csv_path in measurement_files
}
reference = {
    os.path.splitext(os.path.basename(csv_path))[0]: pd.read_csv(csv_path)
    for csv_path in reference_files
}

# Clean measurement and reference data.
def clean_measurement(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of one raw measurement dataframe.

    Steps, in order:

    1. Rename ``datetime`` to ``timestamp`` and ``co2_raw`` to ``co2``.
    2. Parse ``timestamp`` as timezone-aware UTC and round it to the hour.
    3. Drop ``local_timestamp``, ``epoch`` and ``node_file_id`` when present.
    4. Multiply every column whose name contains ``_wrk_aux`` by 1000 and
       strip that marker from the column name.
    5. Drop every row that still contains a missing value.

    Args:
        df: Raw measurement data; must contain a ``datetime`` (or
            ``timestamp``) column.

    Returns:
        A new dataframe; the input is left untouched.
    """
    cleaned = df.rename(columns={"datetime": "timestamp", "co2_raw": "co2"})
    cleaned["timestamp"] = pd.to_datetime(cleaned["timestamp"], utc=True).dt.round("h")

    unused = [
        name
        for name in ("local_timestamp", "epoch", "node_file_id")
        if name in cleaned.columns
    ]
    cleaned = cleaned.drop(columns=unused)

    # NOTE(review): the factor 1000 is presumably a unit conversion
    # (e.g. V -> mV) -- confirm against the sensor documentation.
    scaled_cols = cleaned.filter(regex=r"_wrk_aux").columns
    cleaned[scaled_cols] = cleaned[scaled_cols] * 1000
    cleaned = cleaned.rename(
        columns={name: name.replace("_wrk_aux", "") for name in scaled_cols}
    )

    return cleaned.dropna()

# Replace every raw measurement dataframe with its cleaned version.
measurement = {
    site_key: clean_measurement(raw_df)
    for site_key, raw_df in measurement.items()
}

def clean_reference(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of one raw reference dataframe.

    Drops ``period_start``, ``period_end``, ``period_end_utc`` and ``sn``
    when present, renames ``period_start_utc`` to ``timestamp`` and ``pm25``
    to ``pm2_5``, parses ``timestamp`` as timezone-aware UTC, and finally
    drops every row that contains a missing value.

    Args:
        df: Raw reference data; must contain a ``period_start_utc`` (or
            ``timestamp``) column.

    Returns:
        A new dataframe; the input is left untouched.
    """
    unused = [
        name
        for name in ("period_start", "period_end", "period_end_utc", "sn")
        if name in df.columns
    ]
    cleaned = df.drop(columns=unused).rename(
        columns={"period_start_utc": "timestamp", "pm25": "pm2_5"}
    )
    cleaned["timestamp"] = pd.to_datetime(cleaned["timestamp"], utc=True)
    return cleaned.dropna()

# Replace every raw reference dataframe with its cleaned version.
reference = {
    site_key: clean_reference(raw_df)
    for site_key, raw_df in reference.items()
}

# For every reference site, inner-join the measurement dataframe stored under
# the same key on "timestamp". The result is a dict of per-site dataframes;
# column names present on both sides get the "_meas" / "_ref" suffixes.
merged = {
    key: measurement[key]
    .merge(reference[key], on="timestamp", how="inner", suffixes=("_meas", "_ref"))
    .dropna()
    for key in reference
}

# for site, df in measurement.items():
#     print(f"Site: {site}")
#     display(df)

Find the time intervals in which the relative standard deviation (RSD) of the CO readings across all reference sensors in the network is below 0.10:

In [None]:
# One two-column frame (timestamp, CO) per reference site; the CO column is
# renamed to the site key so the frames can be joined side by side.
co_data = [
    site_df[["timestamp", "co"]].rename(columns={"co": site_key})
    for site_key, site_df in reference.items()
]

# Inner-join all sites on "timestamp", which keeps only the timestamps that
# are present for every site.
rsd_df = co_data[0]
for df in co_data[1:]:
    rsd_df = rsd_df.merge(df, on="timestamp", how="inner")

def rsd(row: pd.Series) -> float:
    """Return the relative standard deviation of the readings in one row.

    The first element of *row* is skipped (in ``rsd_df`` it is the
    timestamp); the remaining elements are converted to float.

    The population standard deviation (``np.std`` default, ``ddof=0``) is
    divided by the *absolute* mean. Dividing by the signed mean would give a
    negative result for a negative mean, and a negative value always passes
    an ``rsd < threshold`` test no matter how scattered the readings are.

    Args:
        row: One row whose first element is a non-numeric label and whose
            remaining elements are numeric readings.

    Returns:
        The non-negative ratio ``std / |mean|``, or NaN when the mean is
        exactly zero.
    """
    readings = row.iloc[1:].to_numpy(dtype=float)
    mean = np.mean(readings)
    if mean == 0:
        return np.nan
    return float(np.std(readings) / abs(mean))

# Row-wise RSD across the per-site CO columns, then keep the timestamps whose
# RSD is below 10 %.
rsd_df["rsd"] = rsd_df.apply(rsd, axis=1)
uniform_mask = rsd_df["rsd"] < 0.10
timestamps_rsd_lt_10pc = rsd_df.loc[uniform_mask, "timestamp"]

def _only_uniform(frames):
    """Restrict every dataframe in *frames* to the low-RSD timestamps.

    Returns a new dict with the same keys; each dataframe keeps only the rows
    whose "timestamp" is in ``timestamps_rsd_lt_10pc`` and has rows with
    missing values dropped.
    """
    return {
        key: frame.loc[frame["timestamp"].isin(timestamps_rsd_lt_10pc)].dropna()
        for key, frame in frames.items()
    }


merged = _only_uniform(merged)
measurement = _only_uniform(measurement)
reference = _only_uniform(reference)

# Preview the first rows of the RSD table and of every filtered per-site frame.
print("rsd_df:\n", rsd_df.head())
for heading, frames in (
    ("merged:\n", merged),
    ("measurement:\n", measurement),
    ("reference:\n", reference),
):
    print(heading, {k: v.head() for k, v in frames.items()})


Fit a linear regression model for each site to the data from the uniform (RSD < 0.10) time intervals:

In [None]:
refsites = list(reference.keys())
from sklearn import linear_model

# One independent linear regression model per reference site.
models = {site: linear_model.LinearRegression() for site in refsites}

# Fit each model on that site's merged (measurement + reference) rows.
# `fit` requires the training data; calling it without arguments raises
# TypeError.
# NOTE(review): this regresses the reference CO ("co_ref") on the measured CO
# ("co_meas"). Those names assume the raw measurement CO column was called
# "co_wrk_aux", so that it becomes "co" after cleaning and "co_meas" after the
# suffixed merge -- confirm against the CSV headers, and add further
# predictor columns here if the calibration should use them.
for site, model in models.items():
    site_df = merged[site]
    if site_df.empty:
        # Nothing survived the RSD filter for this site; leave the model unfitted.
        print(f"Skipping {site}: no rows left after the RSD filter.")
        continue
    model.fit(site_df[["co_meas"]], site_df["co_ref"])