In [1]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np

### Thresholds for the devices we have. Devices for which we do not have a threshold are assigned zero.

In [2]:
# Per-device thresholds keyed by device id. A value of 0 is the placeholder
# used for devices whose threshold is not known.
thresholds = {
    "dev-5t2uup6709t3": 0,
    "dev-5t2utpi5paxv": 500,
    "dev-5t2uyos6ulyr": 0,
    "dev-5t2uv5yxfbft": 1500,
    "dev-5t2ure2gsucf": 0,
    "dev-5t2uy4thkcdt": 0,
    "dev-5t2uvus0me23": 2000,
    "dev-5t2v0iy5iw8p": 4000,
    "dev-5t2uxn4wxw33": 3500,
    "dev-5t2v2bfwnagv": 5000,
    "dev-5t2ux5l4mfn1": 1000,
    "dev-5t2ut9n2f0yp": 0,
    "dev-5t2upjdaeobf": 0,
    "dev-5t2v17d5skor": 0,
    "dev-5t2uza97fe7p": 1500,
    "dev-5t2uqb77ohb1": 1500,
    "dev-5t2uu82uirad": 4000,
    "dev-5t2uzv34fzcn": 1700,
    "dev-5t2v1puiqwfx": 0,
}

### Takes one entry of a column — a string of comma-separated floats, optionally wrapped in square brackets — and splits it into three separate values: p_avg, p_min, and p_max. These values are then returned as a pandas Series.

In [3]:
def parse_values(row):
    """Turn a comma-separated string of three numbers into a labelled Series.

    Each comma-separated piece has surrounding spaces and square brackets
    stripped before being converted to ``float``.

    Parameters
    ----------
    row : str
        Text such as ``"[1.0, 0.5, 2.0]"``.

    Returns
    -------
    pandas.Series
        The parsed floats indexed by ``p_avg``, ``p_min`` and ``p_max``,
        in the order they appear in ``row``.
    """
    labels = ['p_avg', 'p_min', 'p_max']
    numbers = []
    for piece in row.split(','):
        numbers.append(float(piece.strip(' []')))
    return pd.Series(numbers, index=labels)

### Reads the entire dataset, converts timestamp_device to datetime values (it stays a regular column and is not set as the index), keeps only the rows where the tag equals "data", selects the columns needed for the final DataFrame, calls the parse_values function on column P, and then drops column P. The processed DataFrame is then returned:

In [4]:
# Read the data
def load_and_process_data(file_path):
    """Load the CSV at ``file_path`` and return its "data" rows with parsed P values.

    Parameters
    ----------
    file_path : str or path-like
        CSV file to read. It must contain the columns ``tag``, ``P``,
        ``timestamp_device`` and ``device_id``.

    Returns
    -------
    pandas.DataFrame
        One row per input row whose ``tag`` equals ``"data"``, with a fresh
        0..n-1 index and the columns ``timestamp_device`` (converted with
        ``pd.to_datetime``), ``device_id``, ``p_avg``, ``p_min`` and
        ``p_max``. If no row is tagged ``"data"`` the frame is empty but
        still has all five columns.
    """
    power_columns = ['p_avg', 'p_min', 'p_max']

    # No parse_dates here: the timestamp column is converted explicitly below.
    raw = pd.read_csv(file_path, low_memory=False)

    # Filter first so timestamps are only parsed for the rows that are kept.
    data = (
        raw.loc[raw['tag'] == 'data', ['P', 'timestamp_device', 'device_id']]
        .reset_index(drop=True)
    )
    data['timestamp_device'] = pd.to_datetime(data['timestamp_device'])

    if data.empty:
        # Series.apply on an empty column yields an empty Series, which cannot
        # be assigned to three columns; create the empty float columns directly.
        for column in power_columns:
            data[column] = pd.Series(dtype=float)
    else:
        data[power_columns] = data['P'].apply(parse_values)

    return data.drop(columns=['P'])

In [5]:
# Load and preprocess data/data.csv; `df` is the input to the per-device split below.
df = load_and_process_data("data/data.csv")

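### The DataFrame is split by device_id. Each subset is sorted by timestamp_device and given a binary label column y by comparing p_avg with the device's threshold. Each subset is then saved as a CSV file named device1, device2, ..., device19, corresponding to the 19 different devices.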

In [6]:
# Split the processed frame into one DataFrame per device (keyed 1..N in
# groupby order), sorted by timestamp, and attach a binary label column `y`.
dfs = {}
grouped_data = df.groupby('device_id')
for i, (device_id, device_rows) in enumerate(grouped_data, start=1):
    device_df = device_rows.sort_values(by='timestamp_device').reset_index(drop=True)

    # Devices that are not listed in `thresholds` fall back to 0, the same
    # placeholder used for listed devices without a known threshold.
    threshold = thresholds.get(device_id, 0)

    # y is 1.0 when p_avg is strictly above the threshold and 0.0 otherwise,
    # so a reading exactly equal to the threshold is labelled 0.0 instead of
    # being left unlabelled. Rows with a missing p_avg keep a NaN label.
    p_avg = device_df['p_avg']
    device_df['y'] = (p_avg > threshold).astype(float).where(p_avg.notna())

    dfs[i] = device_df

In [7]:
# Write every per-device frame to data/device<N>.csv, where N is its key in
# `dfs`. Iterating over `dfs` itself keeps the number of files in step with
# the number of devices actually present in the data.
for i, device_df in dfs.items():
    device_df.to_csv('data/device' + str(i) + '.csv', index=False)