In [1]:
%load_ext autoreload
%autoreload 2
from src.edgeml import edgeml
import time
import random
import pandas as pd
import tsfresh

# Upload randomly generated data to the server using the edge-ml Python library

In [4]:
readKey = "e0da63d2292850c65d73208f7aa4a5d6" # Device API-key
writeKey = '074426a41a118b8c0f896e8c3e2cd070'
localUrl = "http://localhost:3004"   # Backend URL
datasetName = "Example Dataset" # Name of the dataset
useDeviceTime = False           # Change it to true if you want to use timestamps generated by the server 
startTime = time.time() * 1000  # TODO isn't the use device time name confusing when its function is to allow use of the timestamps generated by 
                                # the "SERVER" and not the DEVICE?

# create collector object to collect and upload data                                
collector = edgeml.DatasetCollector(localUrl,
                                    writeKey,
                                    datasetName,
                                    useDeviceTime, ['Accelerometer', 'Magnetometer'], {'meta': 'data'}, None)

# upload randomly generated data to the project using the collector
for i in range(5000):
    currentTime = startTime + i * 10 
    await collector.addDataPoint(currentTime, "Accelerometer", random.randint(1,50)/10.0) 
    await collector.addDataPoint(currentTime, "Magnetometer", random.randint(-50,550)/10.0) 

# signal data collection is complete
await collector.onComplete()


True

# Retrieve existing real world data from remote server

In [96]:
# Fetch an existing project from the remote server and keep its datasets for the cells below.
# NOTE(review): the key below is hardcoded in the notebook; consider loading it from the environment instead.
remoteUrl = "https://app.edge-ml.org"  # URL of the remote server
remoteKey = "PSJSIwhSuN6TSK9LEMzuFbfXZwACWKYFFtKWxXy2Upkrmhv4U7MjD25b+QcPl0d20n6vsfBWV4Gz4iWBhSmnXw=="  # key passed to getProject together with the URL
# NOTE(review): the recorded output of this cell is an AttributeError stating that the edgeml
# module has no attribute 'getProject' — confirm the installed library version exposes it.
project = edgeml.getProject(remoteUrl, remoteKey) # retrieve project data from remote server
datasets = project['datasets']  # the 'datasets' entry of the project; later cells iterate over it
# print(datasets)

AttributeError: module 'src.edgeml.edgeml' has no attribute 'getProject'

# Create intervals from the labelset to label the datapoints given their timestamps 
# Assign ids to the different labels

In [99]:
# labelset: label name -> list of (start, end) intervals recorded for that label
# labelIds: label name -> distinct integer id, handed out in order of first appearance
#           (the ids are what the training step uses as targets)
labelset = {}
labelIds = {}
labelId = 0   # next id to hand out
for dataset in datasets:
    for labelGroup in dataset['labels']:
        for entry in labelGroup:
            labelName = entry['name']
            interval = (entry['start'], entry['end'])
            if labelName not in labelIds:
                # first time this label is seen: give it the next free id
                labelIds[labelName] = labelId
                labelId += 1
            labelset.setdefault(labelName, []).append(interval)
print(labelIds)
print(labelset)

{'Circle': 0, 'Still': 1}
{'Circle': [(1637617321055.8777, 1637617323285.569), (1637617326032.4146, 1637617328451.6357), (1637617330336.8848, 1637617332950.0796)], 'Still': [(1637617328514.5383, 1637617330267.1917), (1637617323269.7556, 1637617325892.147)]}


# Labels in the dataset

In [109]:
# Display the 'labels' entry of `dataset`.
# `dataset` is not assigned in this cell: it is whatever the most recently executed
# `for dataset in datasets` loop left behind, i.e. the last element of `datasets`.
labels = dataset['labels']
labels

[[{'labelingName': 'Gestures',
   'name': 'Circle',
   'start': 1637617321055.8777,
   'end': 1637617323285.569},
  {'labelingName': 'Gestures',
   'name': 'Circle',
   'start': 1637617326032.4146,
   'end': 1637617328451.6357},
  {'labelingName': 'Gestures',
   'name': 'Circle',
   'start': 1637617330336.8848,
   'end': 1637617332950.0796},
  {'labelingName': 'Gestures',
   'name': 'Still',
   'start': 1637617328514.5383,
   'end': 1637617330267.1917},
  {'labelingName': 'Gestures',
   'name': 'Still',
   'start': 1637617323269.7556,
   'end': 1637617325892.147}]]

# Sensors used in data collection

In [100]:
# Print the name of every sensor of every dataset, one name per line.
for dataset in datasets:
    for sensorName in (sensor['name'] for sensor in dataset['sensors']):
        print(sensorName)


ACC_x
ACC_y
ACC_z
GYRO_x
GYRO_y
GYRO_z


# Fuse the values of different sensors by timestamp and identify the earliest and latest timestamp of all datapoints

In [101]:
# Group the readings of the selected sensors by timestamp and track the overall time span.
windowStart = float('inf')     # earliest timestamp seen so far
windowEnd = float('-inf')      # latest timestamp seen so far
dataTimeValueSensor = {}       # timestamp -> list of {'value': ..., 'sensor': ...} readings
usedSensors = {"ACC_x", "ACC_y", "ACC_z",      # sensors whose data should be used
               "GYRO_x", "GYRO_y", "GYRO_z"}

for dataset in datasets:
    for sensor in dataset['sensors']:
        sensorName, data = sensor['name'], sensor['data']
        if sensorName in usedSensors:
            for dataPoint in data:
                timestamp = dataPoint['timestamp']
                reading = {'value': dataPoint['datapoint'], 'sensor': sensorName}
                dataTimeValueSensor.setdefault(timestamp, []).append(reading)
                # widen the [windowStart, windowEnd] span when this timestamp lies outside it
                if timestamp < windowStart:
                    windowStart = timestamp
                if timestamp > windowEnd:
                    windowEnd = timestamp

# Create a dataframe from the fused data values. Beware that a data point may belong to multiple windows because of the sliding-window logic

In [102]:
# Sliding Window
# Window w covers the datapoint indices w * stepSize ... w * stepSize + windowSize - 1.
# The diagrams below use a window length of 4.
#
# stepSize = 1
# datapoints: 0 1 2 3 4 5 6 7 8
# window  id: 0 0 0 0
#         id:   1 1 1 1
#         id:     2 2 2 2
#         id:       3 3 3 3
#         id:         4 4 4 4
#         id:           5 5 5 5
#         id:             6 6 6
#         id:               7 7
#         id:                 8
#         Datapoint 7 belongs to windows 4,5,6,7; whereas datapoint 1 belongs only to 0 and 1
#
# stepSize = 2
# datapoints: 0 1 2 3 4 5 6 7 8
# window  id: 0 0 0 0
#         id:     1 1 1 1
#         id:         2 2 2 2
#         id:             3 3 3
#         id:                 4
#
# stepSize = 3
# datapoints: 0 1 2 3 4 5 6 7 8
# window  id: 0 0 0 0
#         id:       1 1 1 1
#         id:             2 2 2
#

values = {'id': []}         # column name -> list of cells: 'id' (window id) plus one column per sensor
dataPointLabels = {}        # (window id, label) -> number of labeled sensor readings counted for that pair
windowSize = 10             # length of the window (number of consecutive datapoints it covers)
stepSize = 1                # describes how many datapoints the window slides between consecutive windows
if windowSize < 1 or stepSize < 1:
    raise ValueError("windowSize and stepSize must be positive integers")

# A "datapoint" is one timestamp; its index is its position in dataTimeValueSensor's iteration order.
for pointIndex, (timestamp, timestampData) in enumerate(dataTimeValueSensor.items()):
    # Ids of all windows that contain this datapoint, i.e. every w with
    #   w * stepSize <= pointIndex <= w * stepSize + windowSize - 1.
    # The lower bound is ceil((pointIndex - windowSize + 1) / stepSize), clamped at 0.
    # With stepSize == 1 this is max(0, pointIndex - windowSize + 1) .. pointIndex.
    firstId = max(0, -((windowSize - 1 - pointIndex) // stepSize))
    lastId = pointIndex // stepSize
    windowIds = range(firstId, lastId + 1)

    for data in timestampData:
        value = data['value']
        sensor = data['sensor']
        if sensor not in values:
            values[sensor] = []
        # Only timestamps that fall inside a labeled interval produce rows. A timestamp
        # inside several intervals produces one set of rows per matching interval.
        for label, intervals in labelset.items():
            for start, end in intervals:
                if start <= timestamp <= end:
                    for idd in windowIds:
                        values[sensor].append(value)
                        # add the window id once per timestamp (for the first reading only),
                        # so the 'id' column grows in step with each sensor column
                        if data == timestampData[0]:
                            values['id'].append(idd)
                        dataPointLabels[(idd, label)] = dataPointLabels.get((idd, label), 0) + 1

# NOTE(review): this presumably requires every labeled timestamp to carry exactly one reading
# per used sensor; otherwise the columns would differ in length — confirm against the data.
dataFrame = pd.DataFrame(values)                                # create dataframe
print(dataFrame.to_string())

       id  ACC_x  ACC_y  ACC_z  GYRO_x  GYRO_y  GYRO_z
0      19    314   3980    750      -2       1       0
1      20    314   3980    750      -2       1       0
2      21    314   3980    750      -2       1       0
3      22    314   3980    750      -2       1       0
4      23    314   3980    750      -2       1       0
5      24    314   3980    750      -2       1       0
6      25    314   3980    750      -2       1       0
7      26    314   3980    750      -2       1       0
8      27    314   3980    750      -2       1       0
9      28    314   3980    750      -2       1       0
10     20    320   3978    745      -1       0       1
11     21    320   3978    745      -1       0       1
12     22    320   3978    745      -1       0       1
13     23    320   3978    745      -1       0       1
14     24    320   3978    745      -1       0       1
15     25    320   3978    745      -1       0       1
16     26    320   3978    745      -1       0       1
17     27 

# Extract features using tsfresh

In [103]:
# Extract tsfresh's minimal feature set from the windowed frame; rows are grouped by the "id" column.
settings = tsfresh.feature_extraction.settings.MinimalFCParameters()
extracted = tsfresh.extract_features(
    dataFrame,
    column_id="id",
    default_fc_parameters=settings,
)
print(extracted)

Feature Extraction: 100%|██████████| 30/30 [00:00<00:00, 44.76it/s]


     ACC_x__sum_values  ACC_x__median  ACC_x__mean  ACC_x__length  \
19               314.0          314.0   314.000000            1.0   
20               634.0          317.0   317.000000            2.0   
21               954.0          320.0   318.000000            3.0   
22              1270.0          318.0   317.500000            4.0   
23              1593.0          320.0   318.600000            5.0   
..                 ...            ...          ...            ...   
629           -17419.0        -3499.0 -3483.800000            5.0   
630           -14085.0        -3512.0 -3521.250000            4.0   
631           -10595.0        -3525.0 -3531.666667            3.0   
632            -7024.0        -3512.0 -3512.000000            2.0   
633            -3499.0        -3499.0 -3499.000000            1.0   

     ACC_x__standard_deviation  ACC_x__variance  ACC_x__root_mean_square  \
19                    0.000000         0.000000               314.000000   
20                 

# For each window id, pick the label that matches it most often; the resulting list of label ids is passed to the train function

In [104]:
# For every window id keep the (label, count) pair with the highest count; on a tie the
# pair that was encountered first wins.
mostMatchingLabelPerId = {}
for (windowId, label), count in dataPointLabels.items():
    best = mostMatchingLabelPerId.get(windowId)
    if best is None or best[1] < count:
        mostMatchingLabelPerId[windowId] = (label, count)

# Translate the winning label names into their numeric ids, in the order in which the
# window ids were first encountered.
trainingLabels = [labelIds[label] for label, _ in mostMatchingLabelPerId.values()]
print(trainingLabels)

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

# Split the labeled dataset into two parts, one for training and one for testing the trained model

In [105]:
from sklearn.model_selection import train_test_split

# Split features and labels into a training part and a test part; the fixed random_state
# makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    extracted,
    trainingLabels,
    random_state=5,
)

# Normalize data using scaler

In [106]:
from sklearn.preprocessing import RobustScaler

# Fit the scaler on the training features only, then apply the same scaling to both splits.
scaler = RobustScaler()
scaler.fit(x_train)

# Wrap the transformed values in DataFrames again so the feature names are kept as column labels.
trans_x_train = pd.DataFrame(scaler.transform(x_train), columns=x_train.columns)
trans_x_test = pd.DataFrame(scaler.transform(x_test), columns=x_test.columns)

# Summary statistics of the scaled training features.
trans_x_train.describe()

Unnamed: 0,ACC_x__sum_values,ACC_x__median,ACC_x__mean,ACC_x__length,ACC_x__standard_deviation,ACC_x__variance,ACC_x__root_mean_square,ACC_x__maximum,ACC_x__minimum,ACC_y__sum_values,...,GYRO_y__minimum,GYRO_z__sum_values,GYRO_z__median,GYRO_z__mean,GYRO_z__length,GYRO_z__standard_deviation,GYRO_z__variance,GYRO_z__root_mean_square,GYRO_z__maximum,GYRO_z__minimum
count,461.0,461.0,461.0,461.0,461.0,461.0,461.0,461.0,461.0,461.0,...,461.0,461.0,461.0,461.0,461.0,461.0,461.0,461.0,461.0,461.0
mean,0.244196,0.369929,0.365733,-0.396963,0.363747,0.810973,-0.313019,0.264319,0.253738,-0.008988,...,-0.97672,-0.129683,-0.19735,-0.12441,-0.396963,0.322124,0.643261,0.364405,0.582589,-0.518582
std,0.62127,0.653448,0.637401,1.409869,0.739042,1.497741,0.642497,0.572338,0.713428,0.611799,...,2.254297,2.801474,4.018935,2.65799,1.409869,0.649975,1.221966,0.594578,1.409546,1.086893
min,-0.527147,-0.396979,-0.39277,-9.0,-0.286162,-0.074997,-1.551936,-0.391589,-0.651963,-1.367773,...,-6.95915,-7.553001,-11.34476,-7.164625,-9.0,-0.225091,-0.04862,-0.154251,-1.731123,-3.504277
25%,-0.22913,-0.066264,-0.070394,0.0,-0.240218,-0.073064,-0.900164,-0.202622,-0.342657,-0.579075,...,-0.921569,-0.854548,-0.797258,-0.858393,0.0,-0.204051,-0.048195,-0.143413,-0.044813,-0.98109
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.77087,0.933736,0.929606,0.0,0.759782,0.926936,0.099836,0.797378,0.657343,0.420925,...,0.078431,0.145452,0.202742,0.141607,0.0,0.795949,0.951805,0.856587,0.955187,0.01891
max,1.634016,1.866264,1.789684,1.0,2.553271,7.308895,0.581061,1.437193,2.030662,1.437383,...,2.166667,7.534838,10.203722,7.147395,1.0,2.794948,8.703713,1.740694,4.275629,1.797389


# Train the machine learning model 

In [107]:
from sklearn.ensemble import RandomForestClassifier

# Train a random forest on the scaled training features.
# random_state is fixed (same value as the train/test split) so that re-running the
# notebook trains the same model instead of a differently seeded one each time.
clf = RandomForestClassifier(random_state=5)
clf.fit(trans_x_train,y_train)

RandomForestClassifier()

# Classify test data using the trained model and evaluate the accuracy

In [108]:
from sklearn.metrics import accuracy_score

# Predict labels for the held-out test features and compare them with the true test labels.
prediction = clf.predict(trans_x_test)
# The score is computed on the test split, so the printed text says "test" (it said "train" before).
print("accuracy_score test :", accuracy_score(y_test,prediction))

accuracy_score train : 1.0
