In [1]:
import pandas as pd
from pathlib import Path
import copy
from datetime import timedelta
import numpy as np
import re

In [2]:
class CreateDataset:

    base_dir = ''
    granularity = 0
    data_table = None

    def __init__(self, base_dir, granularity):
        self.base_dir = base_dir
        self.granularity = granularity

    # Create an initial data table with entries from start till end time, with steps
    # of size granularity. Granularity is specified in milliseconds
    def create_timestamps(self, start_time, end_time):
        return pd.date_range(start_time, end_time, freq=str(self.granularity)+'ms')

    def create_dataset(self, start_time, end_time, cols, prefix):
        c = copy.deepcopy(cols)
        if not prefix == '':
            for i in range(0, len(c)):
                c[i] = str(prefix) + str(c[i])
        timestamps = self.create_timestamps(start_time, end_time)

        #Specify the datatype here to prevent an issue
        self.data_table = pd.DataFrame(index=timestamps, columns=c, dtype=object)

    # Add numerical data, we assume timestamps in the form of nanoseconds from the epoch
    def add_numerical_dataset(self, file, timestamp_col, value_cols, aggregation='avg', prefix=''):
        print(f'Reading data from {file}')
        dataset = pd.read_csv(file)

        # Convert timestamps to dates
        dataset[timestamp_col] = pd.to_datetime(dataset[timestamp_col], unit='s')

        # Create a table based on the times found in the dataset
        if self.data_table is None:
            self.create_dataset(min(dataset[timestamp_col]), max(dataset[timestamp_col]), value_cols, prefix)
        else:
            for col in value_cols:
                self.data_table[str(prefix) + str(col)] = np.nan

        # Over all rows in the new table
        for i in range(0, len(self.data_table.index)):
            # Select the relevant measurements.
            relevant_rows = dataset[
                (dataset[timestamp_col] >= self.data_table.index[i]) &
                (dataset[timestamp_col] < (self.data_table.index[i] +
                                           timedelta(milliseconds=self.granularity)))
            ]
            for col in value_cols:
                # Take the average value
                if len(relevant_rows) > 0:
                    if aggregation == 'avg':
                        self.data_table.loc[self.data_table.index[i], str(prefix)+str(col)] = np.average(relevant_rows[col])
                    else:
                        raise ValueError(f"Unknown aggregation {aggregation}")
                else:
                    self.data_table.loc[self.data_table.index[i], str(prefix)+str(col)] = np.nan

    # Remove undesired value from the names.
    def clean_name(self, name):
        return re.sub('[^0-9a-zA-Z]+', '', name)

    # Add data in which we have rows that indicate the occurrence of a certain event with a given start and end time.
    # 'aggregation' can be 'sum' or 'binary'.
    def add_event_dataset(self, file, start_timestamp_col, end_timestamp_col, value_col, aggregation='sum'):
        print(f'Reading data from {file}')
        dataset = pd.read_csv(file)

        # Convert timestamps to datetime.
        dataset[start_timestamp_col] = pd.to_datetime(dataset[start_timestamp_col], unit='s')
        dataset[end_timestamp_col] = pd.to_datetime(dataset[end_timestamp_col], unit='s')

        # Clean the event values in the dataset
        dataset[value_col] = dataset[value_col].apply(self.clean_name)
        event_values = dataset[value_col].unique()

        # Add columns for all possible values (or create a new dataset if empty), set the default to 0 occurrences
        if self.data_table is None:
            self.create_dataset(min(dataset[start_timestamp_col]), max(dataset[end_timestamp_col]), event_values, value_col)
        for col in event_values:
            self.data_table[(str(value_col) + str(col))] = 0

        # Now we need to start counting by passing along the rows....
        for i in range(0, len(dataset.index)):
            # identify the time points of the row in our dataset and the value
            start = dataset[start_timestamp_col][i]
            end = dataset[end_timestamp_col][i]
            value = dataset[value_col][i]
            border = (start - timedelta(milliseconds=self.granularity))

            # get the right rows from our data table
            relevant_rows = self.data_table[(start <= (self.data_table.index +timedelta(milliseconds=self.granularity))) & (end > self.data_table.index)]

            # and add 1 to the rows if we take the sum
            if aggregation == 'sum':
                self.data_table.loc[relevant_rows.index, (str(value_col) + str(col))] += 1
            # or set to 1 if we just want to know it happened
            elif aggregation == 'binary':
                self.data_table.loc[relevant_rows.index, (str(value_col) + str(col))] = 1
            else:
                raise ValueError("Unknown aggregation '" + aggregation + "'")

    # This function returns the column names that have one of the strings expressed by 'ids' in the column name.
    def get_relevant_columns(self, ids):
        relevant_dataset_cols = []
        cols = list(self.data_table.columns)

        for id in ids:
            relevant_dataset_cols.extend([col for col in cols if id in col])

        return relevant_dataset_cols

In [3]:
miliseconds_per_instance = 500
dataset = CreateDataset(Path('.'), granularity = miliseconds_per_instance)

In [4]:
dataset.add_numerical_dataset('Accelerometer.csv', 'Time (s)', ['X (m/s^2)','Y (m/s^2)','Z (m/s^2)'], 'avg', 'acc_')

Reading data from Accelerometer.csv


In [5]:
dataset.add_numerical_dataset('Barometer.csv', 'Time (s)', ['X (hPa)'], 'avg', 'bar_')

Reading data from Barometer.csv


In [6]:
dataset.add_numerical_dataset('Gyroscope.csv', 'Time (s)', ['X (rad/s)', 'Y (rad/s)', 'Z (rad/s)'], 'avg', 'gyr_')

Reading data from Gyroscope.csv


In [7]:
dataset.add_numerical_dataset('Linear Accelerometer.csv', 'Time (s)', ['X (m/s^2)','Y (m/s^2)','Z (m/s^2)'], 'avg', 'lin_acc_')

Reading data from Linear Accelerometer.csv


In [8]:
dataset.add_numerical_dataset('Location.csv', 'Time (s)', ["Latitude (°)","Longitude (°)","Height (m)","Velocity (m/s)","Direction (°)","Horizontal Accuracy (m)","Vertical Accuracy (°)"], 'avg', 'loc_')

Reading data from Location.csv


In [9]:
dataset.add_numerical_dataset('Magnetometer.csv', 'Time (s)', ['X (µT)', 'Y (µT)', 'Z (µT)'], 'avg', 'mag_')

Reading data from Magnetometer.csv


In [10]:
dataset.add_numerical_dataset('Proximity.csv', 'Time (s)', ['Distance (cm)'], 'avg', 'prox_')

Reading data from Proximity.csv


In [11]:
dataset.data_table.index = dataset.data_table.index + pd.Timedelta(weeks = 2788, days = 1, hours=9, minutes=6, seconds=10)

In [12]:
# dataset.add_event_dataset('labels.csv', 'time_start', 'time_end', 'label', 'binary')

In [13]:
dataset.data_table

Unnamed: 0,acc_X (m/s^2),acc_Y (m/s^2),acc_Z (m/s^2),bar_X (hPa),gyr_X (rad/s),gyr_Y (rad/s),gyr_Z (rad/s),lin_acc_X (m/s^2),lin_acc_Y (m/s^2),lin_acc_Z (m/s^2),...,loc_Longitude (°),loc_Height (m),loc_Velocity (m/s),loc_Direction (°),loc_Horizontal Accuracy (m),loc_Vertical Accuracy (°),mag_X (µT),mag_Y (µT),mag_Z (µT),prox_Distance (cm)
2023-06-09 09:06:09.999236959,-1.172664,8.057956,5.466192,1015.039825,-0.119477,-0.608310,0.105744,-0.084164,0.022770,0.020265,...,4.881865,0.311493,-1.0,-1.0,12.348833,12.348833,-4.650503,-43.537346,-11.629228,5.0
2023-06-09 09:06:10.499236959,1.004768,6.315789,5.739167,,-0.821283,0.550055,0.774829,0.383327,-1.097207,-0.505725,...,4.881865,0.431227,-1.0,-1.0,12.348833,12.348833,-12.604013,-39.005757,-14.994400,
2023-06-09 09:06:10.999236959,-3.849626,-2.796973,10.943311,1015.049591,-2.458637,2.157408,0.753934,-1.861990,-0.985122,3.221120,...,,,,,,,-0.478520,-2.631660,-29.270551,
2023-06-09 09:06:11.499236959,-6.871075,-4.585406,2.951065,,-1.041971,-0.365091,-0.374745,1.099491,0.076223,-0.190910,...,,,,,,,29.189895,14.278607,-16.707217,0.0
2023-06-09 09:06:11.999236959,-1.424692,-9.191968,2.640943,1015.070114,-0.292931,-0.422819,-2.335696,-0.014413,-0.433022,-0.192992,...,,,,,,,0.535455,27.210411,-12.999481,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-06-09 10:29:38.499236959,-1.571497,9.123457,3.242583,,0.030172,0.033033,0.041151,0.001590,-0.022836,0.071354,...,,,,,,,2.124366,-42.455074,3.847730,
2023-06-09 10:29:38.999236959,-1.482957,9.344588,2.759042,,0.240415,0.369608,0.010300,0.077714,0.000282,0.220769,...,,,,,,,1.198954,-41.821915,5.101600,
2023-06-09 10:29:39.499236959,-1.739428,9.314045,1.957213,,-0.172657,-0.172551,0.032654,-0.003317,-0.068599,-0.184658,...,,,,,,,1.239135,-40.997622,6.326578,
2023-06-09 10:29:39.999236959,-1.677615,9.431776,2.071836,,0.199488,0.181392,-0.015356,0.030111,-0.030445,0.141775,...,,,,,,,1.128178,-40.942677,6.927644,


In [14]:
dataset.data_table.to_csv('data_v2.csv')

In [21]:
df = pd.read_csv("data_v2.csv", index_col=0)
l = pd.read_csv("labels4.csv")

In [22]:
df

Unnamed: 0,acc_X (m/s^2),acc_Y (m/s^2),acc_Z (m/s^2),bar_X (hPa),gyr_X (rad/s),gyr_Y (rad/s),gyr_Z (rad/s),lin_acc_X (m/s^2),lin_acc_Y (m/s^2),lin_acc_Z (m/s^2),...,loc_Longitude (°),loc_Height (m),loc_Velocity (m/s),loc_Direction (°),loc_Horizontal Accuracy (m),loc_Vertical Accuracy (°),mag_X (µT),mag_Y (µT),mag_Z (µT),prox_Distance (cm)
2023-06-09 09:06:09.999236959,-1.172664,8.057956,5.466192,1015.039825,-0.119477,-0.608310,0.105744,-0.084164,0.022770,0.020265,...,4.881865,0.311493,-1.0,-1.0,12.348833,12.348833,-4.650503,-43.537346,-11.629228,5.0
2023-06-09 09:06:10.499236959,1.004768,6.315789,5.739167,,-0.821283,0.550055,0.774829,0.383327,-1.097207,-0.505725,...,4.881865,0.431227,-1.0,-1.0,12.348833,12.348833,-12.604013,-39.005757,-14.994400,
2023-06-09 09:06:10.999236959,-3.849626,-2.796973,10.943311,1015.049591,-2.458637,2.157408,0.753934,-1.861990,-0.985122,3.221120,...,,,,,,,-0.478520,-2.631660,-29.270551,
2023-06-09 09:06:11.499236959,-6.871075,-4.585406,2.951065,,-1.041971,-0.365091,-0.374745,1.099491,0.076223,-0.190910,...,,,,,,,29.189895,14.278607,-16.707217,0.0
2023-06-09 09:06:11.999236959,-1.424692,-9.191968,2.640943,1015.070114,-0.292931,-0.422819,-2.335696,-0.014413,-0.433022,-0.192992,...,,,,,,,0.535455,27.210411,-12.999481,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-06-09 10:29:38.499236959,-1.571497,9.123457,3.242583,,0.030172,0.033033,0.041151,0.001590,-0.022836,0.071354,...,,,,,,,2.124366,-42.455074,3.847730,
2023-06-09 10:29:38.999236959,-1.482957,9.344588,2.759042,,0.240415,0.369608,0.010300,0.077714,0.000282,0.220769,...,,,,,,,1.198954,-41.821915,5.101600,
2023-06-09 10:29:39.499236959,-1.739428,9.314045,1.957213,,-0.172657,-0.172551,0.032654,-0.003317,-0.068599,-0.184658,...,,,,,,,1.239135,-40.997622,6.326578,
2023-06-09 10:29:39.999236959,-1.677615,9.431776,2.071836,,0.199488,0.181392,-0.015356,0.030111,-0.030445,0.141775,...,,,,,,,1.128178,-40.942677,6.927644,


In [23]:
l

Unnamed: 0,time,label
0,09:06:10,sitting
1,09:14:20,standing
2,09:21:54,laying
3,09:31:10,standing
4,09:36:50,laying
5,09:48:48,sitting
6,09:59:36,standing
7,10:06:29,laying
8,10:15:15,standing
9,10:20:19,sitting


In [28]:
l['time'] = pd.to_datetime(l['time']) - pd.Timedelta(days=2)

In [29]:
l

Unnamed: 0,time,label
0,2023-06-09 09:06:10,sitting
1,2023-06-09 09:14:20,standing
2,2023-06-09 09:21:54,laying
3,2023-06-09 09:31:10,standing
4,2023-06-09 09:36:50,laying
5,2023-06-09 09:48:48,sitting
6,2023-06-09 09:59:36,standing
7,2023-06-09 10:06:29,laying
8,2023-06-09 10:15:15,standing
9,2023-06-09 10:20:19,sitting


In [32]:
df.index[0]

'2023-06-09 09:06:09.999236959'

In [41]:
new_row = {'time': '2023-06-09 09:06:09.999236959' , 'label':'sitting'}
# l2 = l.append(new_row)

  l2 = l.append(new_row)


TypeError: Can only append a dict if ignore_index=True

In [42]:
l2

Unnamed: 0,time,label
0,2023-06-09 09:06:10,sitting
1,2023-06-09 09:14:20,standing
2,2023-06-09 09:21:54,laying
3,2023-06-09 09:31:10,standing
4,2023-06-09 09:36:50,laying
5,2023-06-09 09:48:48,sitting
6,2023-06-09 09:59:36,standing
7,2023-06-09 10:06:29,laying
8,2023-06-09 10:15:15,standing
9,2023-06-09 10:20:19,sitting
