# Cleaner: cleaning HOBO data for pressure sensor calibration

The objective is to clean the data generated for several calibrations in a thermostatic chamber.

The input data are CSV files stored in the directory given by the `folder` variable (change its path to match your setup).

First, choose names for the columns that hold the pressure sensor voltages and enter them, in column order, in the `sensors` list.

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
from scipy import stats

In [None]:
# Directory holding the raw HOBO CSV exports (adjust to your setup).
folder = './data_clean/'
# Keep only CSV files: os.listdir also returns hidden files and sub-directories
# (e.g. notebook checkpoints) that would break the CSV reader further down.
# Sorting makes the processing order reproducible, since os.listdir returns
# entries in arbitrary order.
paths = sorted(
    folder + name for name in os.listdir(folder) if name.endswith('.csv')
)
# Names given to the pressure-sensor voltage columns, in column order.
sensors = ['XV', 'XVI', 'VI']

paths, sensors

In [None]:
def convert_to_dates(df, col_name):
    """Parse a column of timestamp strings in place and check its ordering.

    The column is parsed with the format ``%m/%d/%y %I:%M:%S %p``
    (e.g. ``10/29/25 07:13:45 PM``) and replaced by the parsed datetimes.

    Args:
        df: DataFrame modified in place.
        col_name: Name of the column holding the timestamp strings.

    Raises:
        ValueError: If a value does not match the expected format (raised by
            ``pd.to_datetime``) or if the parsed timestamps are not sorted in
            non-decreasing order. ``df`` is left untouched in both cases.
    """
    new_times = pd.to_datetime(df[col_name], format="%m/%d/%y %I:%M:%S %p")

    # Downstream code selects time windows and assumes chronological rows, so
    # refuse anything that goes backwards in time.
    if not new_times.is_monotonic_increasing:
        raise ValueError(
            f"column {col_name!r} is not in chronological order "
            "(or contains missing timestamps)"
        )

    df[col_name] = new_times

def remove_outliers_z_score(df, col_names, threshold=3):
    """Return the rows of ``df`` that are not outliers in any given column.

    A row is an outlier when the absolute z-score of one of its values,
    computed per column, is greater than or equal to ``threshold``.

    Args:
        df: Input DataFrame (not modified).
        col_names: Iterable of numeric column names to screen.
        threshold: Absolute z-score at or above which a value is rejected.

    Returns:
        The subset of ``df`` whose rows pass the test for every column.
    """
    mask = pd.Series(True, index=df.index)
    for col_name in col_names:
        values = df[col_name].to_numpy(dtype=float)
        z_scores = np.abs(stats.zscore(values, nan_policy='omit'))
        # The z-score is NaN for missing values and for zero-variance columns.
        # Such values cannot be called outliers, so only reject scores that
        # are actually >= threshold instead of requiring "score < threshold"
        # (which is False for NaN and would discard every row).
        mask &= ~(z_scores >= threshold)
    return df[mask]

def read_df(path):
    """Load one HOBO CSV export and return it cleaned.

    The first line of the file is skipped, the ``#`` column becomes the index
    and only the first five remaining columns are kept. They are renamed to
    ``date``, the names listed in the module-level ``sensors``, and ``T``.
    If the original header of the last kept column mentions ``°F``, ``T`` is
    converted to degrees Celsius. Timestamps are parsed by
    ``convert_to_dates`` and rows rejected by ``remove_outliers_z_score`` on
    any non-date column are dropped.
    """
    df = pd.read_csv(path, sep=',', skiprows=1, index_col="#").iloc[:, :5]

    # Look at the original header now: it is overwritten on the next line.
    is_fahrenheit = '°F' in df.columns[-1]
    df.columns = ['date', *sensors, 'T']

    if is_fahrenheit:
        df['T'] = (df['T'] - 32) / 1.8
    convert_to_dates(df, 'date')

    # Every column except the leading 'date' column is screened for outliers.
    value_columns = df.columns[1:]
    return remove_outliers_z_score(df, value_columns)

The `add_Tc` function must be configured so that each calibration provides the temperature recorded by the thermostatic chamber.

Below are the two main types of constraints:
- simple ramp programs
- more complex programs with many ramps (case `n == '5'`)

In [None]:
# Start time of the chamber program for each calibration id.
_TC_START_TIMES = {
    '2': '10/29/25 07:13:45 PM',
    '3': '10/31/25 10:00:00 AM',
    '4': '11/03/25 02:46:30 PM',
    '5': '10/27/25 05:32:02 PM',
    '6': '11/05/25 10:31:21 AM',
}

# Multi-ramp program of calibration '5', played after a one-hour plateau at 20:
# successive (target temperature, ramp duration in seconds) pairs.
_TC_MULTI_RAMPS = [
    (22, 5520),
    (24, 6900),
    (25, 6540),
    (25, 5280),
    (24, 6540),
    (22, 6900),
    (19, 8280),
    (17, 6000),
    (16, 3780),
    (15, 6360),
    (15, 6480),
    (17, 9300),
    (19, 5760),
    (21, 5520),
    (23, 5760),
    (25, 9300),
    (25, 6480),
    (24, 6360),
    (23, 3780),
    (21, 6000),
    (18, 8280),
    (16, 6900),
    (15, 6540),
    (15, 5280),
    (16, 6540),
    (18, 6900),
    (20, 5520),
]


def _set_plateau(df, start, duration, value):
    """Set 'Tc' to `value` on rows dated within [start, start + duration]."""
    window = (df['date'] >= start) & (df['date'] <= start + duration)
    df.loc[window, 'Tc'] = value


def _set_ramp(df, start, duration, T_from, T_to):
    """Set 'Tc' to a linear ramp T_from -> T_to over [start, start + duration]."""
    window = (df['date'] >= start) & (df['date'] <= start + duration)
    # Interpolate on elapsed time rather than on row position: the result stays
    # correct when rows are missing or when the window holds a single sample.
    progress = (df.loc[window, 'date'] - start) / duration
    df.loc[window, 'Tc'] = T_from + (T_to - T_from) * progress


def add_Tc(n, df):
    """Add elapsed time and the chamber set-point temperature to ``df`` in place.

    Two columns are written:

    * ``t``: whole seconds elapsed since the program start of calibration ``n``;
    * ``Tc``: the programmed chamber temperature at each row's date, NaN for
      rows outside the program.

    Calibration ``'5'`` follows a one-hour plateau at 20 and then the ramps of
    ``_TC_MULTI_RAMPS``. Every other calibration follows plateau / single ramp
    / plateau: 5 -> 25 (4 h, 7 h, 4 h) for ``'2'``, and 25 -> 5
    (3 h, 8 h, 4 h) for the others.

    Args:
        n: Calibration id, one of the keys of ``_TC_START_TIMES``.
        df: DataFrame with a datetime ``date`` column; modified in place.

    Returns:
        The same ``df`` object, for convenience.

    Raises:
        KeyError: If no start time is configured for ``n``.
    """
    try:
        start_text = _TC_START_TIMES[n]
    except KeyError:
        raise KeyError(
            f"no chamber start time configured for calibration {n!r}"
        ) from None
    t0 = pd.to_datetime(start_text)

    # elapsed time in seconds
    df['t'] = (df['date'] - t0).dt.total_seconds().astype(int)
    # set-point column, filled window by window below
    df['Tc'] = np.nan

    # multi-ramp program (labelled "sinus" in the original notes)
    if n == '5':
        first_plateau = pd.Timedelta(hours=1)
        _set_plateau(df, t0, first_plateau, 20)

        start, T_from = t0 + first_plateau, 20
        for T_target, seconds in _TC_MULTI_RAMPS:
            duration = pd.Timedelta(seconds=seconds)
            _set_ramp(df, start, duration, T_from, T_target)
            # the next ramp starts where (and at the value) this one ends
            start += duration
            T_from = T_target
        return df

    # single-ramp program; only calibration '2' ramps upwards
    up = n == '2'
    T0, Tf = (5, 25) if up else (25, 5)

    plateau_1 = pd.Timedelta(hours=4 if up else 3)
    ramp = pd.Timedelta(hours=7 if up else 8)
    plateau_2 = pd.Timedelta(hours=4)

    _set_plateau(df, t0, plateau_1, T0)
    _set_ramp(df, t0 + plateau_1, ramp, T0, Tf)
    _set_plateau(df, t0 + plateau_1 + ramp, plateau_2, Tf)

    return df

**Warning:** the cell below must be adapted when processing different sensors.

In [None]:
# Sensor exported by this cell; change it here to process another sensor.
sensor = 'XV'
out_dir = f"./{sensor}/data"
# to_csv does not create missing directories
os.makedirs(out_dir, exist_ok=True)

# Cleaned frames keyed by input path, reused by the plotting cell.
dfs = {}
for path in paths:
    print(f"reading {path}")
    # The calibration id is the single character right before '.csv'.
    n = path[path.index('.csv') - 1]

    df = read_df(path)
    # post processing: keep only the voltage of the selected sensor
    df = df.drop(columns=[name for name in sensors if name != sensor])
    # Compare the calibration id itself: testing `'3' in path` would also
    # match any other file whose name merely contains a "3" (e.g. a date).
    if n == '3':
        # NOTE(review): the 1.3 cut-off presumably removes a known bad
        # stretch of calibration 3 - confirm with the data owner.
        df = df[df[sensor] > 1.3]
    # Work on an explicit copy so the in-place edits below never target a
    # slice of another frame.
    df = df.rename(columns={sensor: "U"}).copy()

    add_Tc(n, df)
    df = df[['t', 'Tc', 'T', 'U']].copy()

    # temperatures are stored with two decimals
    df[['T', 'Tc']] = df[['T', 'Tc']].round(2)

    dfs[path] = df
    df.to_csv(f"{out_dir}/{n}.csv")

In [None]:
def plot_df(df, n):
    """Plot one cleaned calibration and save it as ``./XV/plots/<n>.png``.

    ``T`` (red) and ``Tc`` (black) are drawn against the left axis and ``U``
    against a second y-axis on the right, all versus time in hours.

    Args:
        df: DataFrame with the columns ``t`` (seconds), ``T``, ``Tc`` and ``U``.
        n: Calibration id, used in the title and in the output file name.
    """
    hours = df['t'] / 3600

    fig, ax1 = plt.subplots(figsize=(10, 4))
    ax2 = ax1.twinx()
    ax2.plot(hours, df['U'], label='U', linewidth=0.5)
    ax1.plot(hours, df['T'], label="T", color="red")
    ax1.plot(hours, df['Tc'], label="Tc", color="black")

    # A legend only lists the artists of its own axes: gather the handles of
    # both axes so that U appears next to T and Tc. It is attached to ax2,
    # which is drawn last, so no curve is painted over it.
    handles_1, labels_1 = ax1.get_legend_handles_labels()
    handles_2, labels_2 = ax2.get_legend_handles_labels()
    ax2.legend(handles_1 + handles_2, labels_1 + labels_2)

    ax1.set_xlabel("t (h)")
    ax1.set_ylabel("T (°C)")
    ax2.set_ylabel("U (V)")
    ax1.grid()
    ax1.set_title(f"calibration {n}")

    # savefig does not create missing directories
    out_dir = "./XV/plots"
    os.makedirs(out_dir, exist_ok=True)
    fig.savefig(f"{out_dir}/{n}.png")

# Draw and save one figure per cleaned calibration; the calibration id is the
# single character right before '.csv' in the input path.
for path, df in dfs.items():
    print(path)
    plot_df(df, path[path.index('.csv') - 1])