In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.signal import find_peaks

### Write functions as I explore in the notebook

In [5]:
# Load the raw log of one experiment.
# The experiment number determines the name of the raw file that is read.

exp_no = 104
file_name = f"{exp_no}_SHT_SMD.txt"
file_path = f"../data/01_raw/{file_name}"

# Restrict the read to the five listed columns.
df = pd.read_csv(
    file_path,
    sep=',',
    usecols=[
        'timestamp',
        'SHT40_temp',
        'SHT40_Humidity',
        'A1_Sensor',
        'A1_Resistance',
    ],
)




In [20]:
# identify all peaks including high and low peaks, and then find the index of smaller peak by compariing
# the height of two adjacent peaks
# put the index of the smaller and larger peaks into two lists

def _hi_lo_peak(x: pd.DataFrame) -> pd.DataFrame:
    peaks, properties = find_peaks(x['A1_Sensor'], width=50, height=1)
    peak_heights = properties['peak_heights']
# Determine smaller and larger peaks
    smaller_peaks, larger_peaks = [], []
    for i in range(len(peaks) - 1):
        if peak_heights[i] > peak_heights[i + 1]:
            larger_peaks.append(peaks[i])
            smaller_peaks.append(peaks[i + 1])
    # smaller_peaks_df = x.iloc[smaller_peaks]
    return smaller_peaks

In [21]:
# Display the peak indices that _hi_lo_peak returns for the loaded data
_hi_lo_peak(df)

[4956,
 8596,
 12239,
 15882,
 19520,
 23160,
 26801,
 30443,
 34083,
 37726,
 41364,
 44999,
 48642,
 52284,
 55923,
 59565,
 63207,
 66848,
 70485,
 74129,
 77769,
 81411,
 85048,
 88686,
 92330,
 95971,
 99609,
 103251,
 106892,
 110534,
 114176,
 117809,
 121456,
 125093,
 128734,
 132375,
 136015,
 139658,
 143297,
 146940,
 150578,
 154219,
 157858,
 161500,
 165138,
 168783,
 172419,
 176060,
 179699,
 183342,
 186984,
 190624,
 194264,
 197905,
 201547,
 205186,
 208826,
 212471,
 216108,
 219746,
 223390,
 227031,
 230666,
 234310,
 237949,
 241588,
 245233,
 248873,
 252511,
 256154,
 259793,
 263440,
 267074,
 270713,
 274354,
 277996,
 281632,
 285276,
 288918,
 292561,
 296200,
 299839,
 303481,
 307120,
 310759,
 314402,
 318041,
 321678,
 325318,
 328960,
 332601,
 336242,
 339882,
 343520,
 347163,
 350803,
 354443,
 358088,
 361727,
 365365,
 369005,
 372650,
 376286,
 379927,
 383570,
 387211,
 390857,
 394492,
 398136,
 401773,
 405413,
 409052,
 412693,
 416335,
 41

In [25]:
def preprocess_data_stack(sp: list, df: pd.DataFrame) -> pd.DataFrame:
    """Cut ``df`` at consecutive split positions and stack the pieces into one frame.

    Every pair of neighbouring entries ``(sp[i], sp[i + 1])`` defines one segment
    ``df.iloc[sp[i]:sp[i + 1]]``. Each segment is labelled with its ordinal in a
    new ``exp_no`` column and its ``timestamp`` column is shifted so that the
    segment starts at 0. ``df`` itself is not modified.

    Args:
        sp: Positional row indices at which ``df`` is cut, in the order the
            segments should be numbered.
        df: Frame with a ``timestamp`` column.

    Returns:
        All segments concatenated with a fresh 0..n-1 index. When ``sp`` has
        fewer than two entries there is no complete segment and an empty frame
        with the columns of ``df`` plus ``exp_no`` is returned.

    Raises:
        IndexError: If a segment is empty (two equal neighbouring split points),
            because its first timestamp cannot be read.
    """
    segments = []
    for exp_no, (start, stop) in enumerate(zip(sp, sp[1:])):
        segment = df.iloc[start:stop].copy()
        segment['exp_no'] = exp_no
        # Re-base time so that every segment starts at 0.
        segment['timestamp'] -= segment['timestamp'].iloc[0]
        segments.append(segment)

    if not segments:
        # pd.concat refuses an empty list; hand back an empty frame with the
        # same output schema instead of failing.
        empty = df.iloc[0:0].copy()
        empty['exp_no'] = pd.Series(dtype='int64')
        return empty

    # Concatenate once after the loop: doing it per iteration re-copies every
    # previously collected row each time.
    return pd.concat(segments, ignore_index=True)

In [38]:
# Cut df at the indices returned by _hi_lo_peak and stack the resulting segments into one frame
df_stacked = preprocess_data_stack(_hi_lo_peak(df), df)

# Bucketing
Bucket all time values into uniform buckets to align features
- Define minimum t and maximum t
- Form a bucket per 200ms
- Average n number of values in the bucket

Examine all `exp_no` groups to
determine the min and max time that will be applied uniformly.


In [53]:
#search the maximum and minimum of time in the df_stacked
def _max_min_time(df_stacked: pd.DataFrame) -> pd.DataFrame:
    max_time = df_stacked['timestamp'].max()
    min_time = df_stacked['timestamp'].min()
    return max_time, min_time

(199000, 0)

In [36]:
# Obtain the number of buckets in one experiment by dividing the maximum time by the bucket size
def _split_bucket(df_stacked: pd.DataFrame, x: int) -> float:
    """Return the maximum timestamp of ``df_stacked`` divided by the bucket size ``x``.

    The quotient is returned as-is (true division); it is not rounded to a
    whole number of buckets.

    Args:
        df_stacked: Frame with a ``timestamp`` column.
        x: Bucket size, in the same unit as ``timestamp``. Must be positive.

    Returns:
        ``max(timestamp) / x``.

    Raises:
        ValueError: If ``x`` is zero or negative.
    """
    if x <= 0:
        raise ValueError(f"bucket size must be positive, got {x!r}")
    # Only the maximum is needed here; the minimum returned by the helper is ignored.
    max_time, _ = _max_min_time(df_stacked)
    return max_time / x
    

In [60]:
# group experiments by using groupby function
# create a new column called timestamp_bucket
# assign timestamps in a range of 100ms to a bucket
# iterate through the groupby object timestamp and add a new column called timestamp_bucket
# each bucket is 100ms i.e., items with timestamp between 0 and 100ms are assigned to bucket 1
# items with timestamp between 100ms and 200ms are assigned to bucket 2
# and so on...
def _group_by_bucket(df_stacked: pd.DataFrame, bucket_size_ms: int) -> pd.DataFrame:
    df_list = []
    grouped = df_stacked.groupby('exp_no')
    for name, group in grouped:
        group['timestamp_bucket'] = group['timestamp'].floordiv(bucket_size_ms)
        df_list.append(group)
    return pd.concat(df_list)
# Bucket the stacked data using a bucket size of 100 (in the units of `timestamp`).
# NOTE(review): the bucketing notes earlier in this notebook mention 200ms buckets, while 100 is used here — confirm which is intended.
bucketed = _group_by_bucket(df_stacked, 100)

Unnamed: 0,timestamp,SHT40_temp,SHT40_Humidity,A1_Sensor,A1_Resistance,exp_no,timestamp_bucket
844593,181900,26.56,40.67,1598.0,2311577.0,231,1819
844594,181949,26.56,40.72,1596.0,2314661.5,231,1819
844595,182000,26.58,40.8,1597.0,2313118.25,231,1820
844596,182050,26.57,40.84,1601.0,2306964.5,231,1820
844597,182100,26.58,40.89,1601.0,2306964.5,231,1821


In [61]:
# Persist the bucketed frame as CSV in the intermediate data folder (row index omitted)
bucketed.to_csv("../data/02_intermediate/bucketed.csv", index=False)