In [17]:
import os
import random
import time

import pandas as pd
import tsfresh
from edgeml import edgeml

# Upload randomly generated data to the server using the edge-ml python library

In [18]:
# Connection settings for the local edge-ml backend.
# The device API key can be supplied through the EDGEML_LOCAL_KEY environment
# variable so that it does not have to live in the notebook.
# NOTE(review): the fallback below is the literal that used to be hard-coded
# in this cell; prefer setting the environment variable and dropping it.
localKey = os.environ.get(
    "EDGEML_LOCAL_KEY",
    "RuW7CkG0keGCn9OH7yG40XY3nZz4Vfdl+gUgvpeiqxOP1eIwCwg90BjMrQVQV5fz3/zK1zBZ4uP+WFelZUmYTw==",
)                               # Device API-key
localUrl = "http://localhost"   # Backend URL
datasetName = "Example Dataset" # Name of the dataset

# Change it to True if you want to use timestamps generated by the server.
# TODO: isn't the name "useDeviceTime" confusing when its function is to allow
#       use of the timestamps generated by the SERVER and not the DEVICE?
useDeviceTime = False

# Reference time for the generated samples (time.time() is in seconds).
# NOTE(review): the cell that downloads data divides received timestamps by
# 1000, which suggests the server may report milliseconds - confirm which unit
# addDataPoint expects.
startTime = time.time()

# create collector object to collect and upload data
collector = edgeml.datasetCollector(localUrl,
                                    localKey,
                                    datasetName,
                                    useDeviceTime)

# upload randomly generated data to the project using the collector:
# 50 samples per sensor, consecutive samples spaced 10 time units apart
for i in range(50):
    currentTime = startTime + i * 10
    collector.addDataPoint("Accelerometer", random.randint(1, 50) / 10.0, currentTime)
    collector.addDataPoint("Magnetometer", random.randint(1, 50) / 10.0, currentTime)

# signal data collection is complete
collector.onComplete()


# Retrieve existing real world data from remote server

In [19]:
remoteUrl = "https://app.edge-ml.org"
# The API key of the remote project can be supplied through the
# EDGEML_REMOTE_KEY environment variable so it does not have to live in the
# notebook.
# NOTE(review): the fallback below is the literal that used to be hard-coded
# in this cell; prefer setting the environment variable and dropping it.
remoteKey = os.environ.get(
    "EDGEML_REMOTE_KEY",
    "EoXl+XUYu71AQ/g3A2BTE1MS2Lpwi07Ud8BQ6Cm9IDgKV/QaiZN0oLIvF+/1jRAJSPT+HJivHDrtx6xRqg+6xQ==",
)
project = edgeml.getProject(remoteUrl, remoteKey) # retrieve project data from remote server
datasets = project['datasets']                    # later cells read 'labels' and 'sensors' from each entry

# Create intervals from the labelset to label the datapoints given their timestamps 
# Assign ids to different labels

In [20]:
labelset = {}   # label name -> list of (start, end) intervals belonging to that label
labelIds = {}   # label name -> distinct integer id, required for training with data
labelId = 0     # next id to hand out
divideBy = 1000 # divisor applied to every received timestamp
for dataset in datasets:
    labels = dataset['labels']
    for labelData in labels:
        for label in labelData:
            name = label['name']
            start = label['start'] / divideBy # normalize
            end = label['end'] / divideBy     # normalize
            if name not in labelset:
                # first time this label name is seen: register it and assign the next id
                labelset[name] = []
                labelIds[name] = labelId
                # advance only for a NEW label so the ids stay consecutive (0, 1, 2, ...)
                # no matter how often each label occurs in the data
                labelId += 1
            labelset[name].append((start, end)) # add interval to the label
print(labelIds)

{'Drinking': 0, 'Not Drinking': 1}


# Fuse different sensor values to a single timestamp, identify start-/endtime of all datapoints

In [21]:
# Running bounds over every timestamp encountered below.
windowStart = float('inf')  # smallest timestamp seen so far
windowEnd = float('-inf')   # largest timestamp seen so far
# timestamp -> list of {'value': ..., 'sensor': ...} readings sharing that timestamp
dataTimeValueSensor = {}

for dataset in datasets:
    for sensor in dataset['sensors']:
        sensorName = sensor['name']
        for dataPoint in sensor['data']:
            timestamp = dataPoint['timestamp'] / divideBy # normalize
            reading = {'value': dataPoint['datapoint'], 'sensor': sensorName}
            # group the reading with every other reading taken at this timestamp
            dataTimeValueSensor.setdefault(timestamp, []).append(reading)
            # widen the overall [windowStart, windowEnd] range when needed
            if timestamp < windowStart:
                windowStart = timestamp
            if timestamp > windowEnd:
                windowEnd = timestamp


# Create dataframe using fused data values, beware that a data point may belong to multiple windows because of the sliding window logic

In [22]:
# Sliding window (the sketch below uses a window size of 4 for readability)
# datapoints: 0 1 2 3 4 5 6 7 8
#         id: 0 0 0 0
#         id:   1 1 1 1
#         id:     2 2 2 2
#         id:       3 3 3 3
#         id:         4 4 4 4
#         id:           5 5 5 5
#         id:             6 6 6
#         id:               7 7
#         id:                 8
#         Datapoint 7 belongs to ids 4,5,6,7; whereas datapoint 1 belongs only to 0 and 1

values = {'id': []}         # column name -> list of cell values ('id' plus one column per sensor)
dataPointLabels = {}        # (window id, label) -> how many readings were counted for that pair
windowSize = 10             # window size

for timestamp, timestampData in dataTimeValueSensor.items():
    for data in timestampData:
        value = data['value']
        sensor = data['sensor']
        values.setdefault(sensor, [])                           # make sure the sensor has a column
        normalized = int(timestamp - windowStart)               # whole-number offset from the earliest timestamp
        # the reading belongs to every window whose id lies in
        # [max(0, normalized - windowSize + 1), normalized]
        firstId = max(0, normalized - windowSize + 1)
        lastId = normalized
        # the 'id' column is only filled once per timestamp, by its first reading
        isFirstReading = data == timestampData[0]
        for label, intervals in labelset.items():
            for start, end in intervals:
                if not (start <= timestamp <= end):
                    continue                                    # reading lies outside this labelled interval
                for id in range(firstId, lastId + 1):
                    values[sensor].append(value)
                    if isFirstReading:
                        values['id'].append(id)
                    key = (id, label)
                    dataPointLabels[key] = dataPointLabels.get(key, 0) + 1

dataFrame = pd.DataFrame(values)                                # one row per (reading, window id) pair
print(dataFrame)

        id  ACC_x  ACC_y  ACC_z
0        0     -2    -41   4139
1        0      0    -30   4128
2        0      0    -30   4128
3        0      4    -45   4136
4        0      0    -35   4136
...    ...    ...    ...    ...
62218  438    374   -410   4068
62219  439    374   -410   4068
62220  440    374   -410   4068
62221  441    374   -410   4068
62222  442    374   -410   4068

[62223 rows x 4 columns]


# Extract features using tsfresh

In [23]:
# Use tsfresh's minimal feature-calculator settings and compute the features
# for every group of rows that shares the same value in the "id" column.
settings = tsfresh.feature_extraction.settings.MinimalFCParameters()
extracted = tsfresh.extract_features(
    dataFrame,
    column_id="id",
    default_fc_parameters=settings,
)
print(extracted)

Feature Extraction: 100%|██████████| 30/30 [00:00<00:00, 360.41it/s]

     ACC_x__sum_values  ACC_x__median  ACC_x__mean  ACC_x__length  \
0               -570.0            0.0    -1.250000          456.0   
1             -69015.0            0.0  -138.584337          498.0   
2            -179220.0           -1.0  -359.879518          498.0   
3            -256319.0           -2.0  -509.580517          503.0   
4            -320243.0           -3.0  -636.666004          503.0   
..                 ...            ...          ...            ...   
438           -74866.0         -494.0  -310.647303          241.0   
439           -36456.0         -464.5  -187.917526          194.0   
440            -5463.0         -383.0   -37.937500          144.0   
441            19412.0          209.0   206.510638           94.0   
442            27500.0          636.5   625.000000           44.0   

     ACC_x__standard_deviation  ACC_x__variance  ACC_x__root_mean_square  \
0                    14.747249       217.481360                14.800130   
1                  




# For each window id, pick the label that matches it most often; the resulting list of labels is passed to the train function

In [29]:
# For every window id keep the label that was counted most often
# (on a tie the label encountered first is kept).
mostMatchingLabelPerId = {}
for (windowId, label), count in dataPointLabels.items():
    best = mostMatchingLabelPerId.get(windowId)
    if best is None or best[1] < count:
        mostMatchingLabelPerId[windowId] = (label, count)

# Emit the label ids in the row order of the extracted feature table, so that
# trainingLabels[i] always describes row i of `extracted`. Relying on the dict's
# insertion order instead would only line up if the ids happened to be inserted
# in the same order as the feature rows.
trainingLabels = [
    labelIds[mostMatchingLabelPerId[windowId][0]]
    for windowId in extracted.index
]
print(trainingLabels)

[1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


# Split the labeled dataset into two parts, one for training and one for testing the trained model

In [60]:
from sklearn.model_selection import train_test_split

# Split features and labels together; the fixed random_state keeps the split
# identical between runs.
splits = train_test_split(extracted, trainingLabels, random_state=5)
x_train, x_test, y_train, y_test = splits

# Normalize data using scaler

In [61]:
from sklearn.preprocessing import RobustScaler

# Fit the scaler on the training split only, then apply that same scaling to
# both splits.
scaler = RobustScaler()
scaler.fit(x_train)

# Wrap the transformed values in DataFrames again so the feature column names are kept.
trans_x_train = pd.DataFrame(scaler.transform(x_train), columns=x_train.columns)
trans_x_test = pd.DataFrame(scaler.transform(x_test), columns=x_test.columns)
trans_x_train.describe()

Unnamed: 0,ACC_x__sum_values,ACC_x__median,ACC_x__mean,ACC_x__length,ACC_x__standard_deviation,ACC_x__variance,ACC_x__root_mean_square,ACC_x__maximum,ACC_x__minimum,ACC_y__sum_values,...,ACC_y__minimum,ACC_z__sum_values,ACC_z__median,ACC_z__mean,ACC_z__length,ACC_z__standard_deviation,ACC_z__variance,ACC_z__root_mean_square,ACC_z__maximum,ACC_z__minimum
count,105.0,105.0,105.0,105.0,105.0,105.0,105.0,105.0,105.0,105.0,...,105.0,105.0,105.0,105.0,105.0,105.0,105.0,105.0,105.0,105.0
mean,-0.200001,-0.469962,-0.193377,-5.19619,-0.123602,0.057157,-0.031818,0.402844,0.132233,0.116766,...,0.005602,-0.188586,-0.669953,-0.268379,-5.19619,0.16653,0.400405,-0.235407,0.153207,0.197728
std,0.727433,0.738492,0.744787,12.144902,0.506408,0.622465,0.533344,1.171785,0.479575,1.237979,...,0.457162,0.700668,1.195867,0.64746,12.144902,0.524668,0.687253,0.650072,0.844429,0.46158
min,-1.989916,-2.009105,-1.973882,-45.2,-0.821412,-0.599197,-0.728048,-1.572827,-0.597481,-3.910509,...,-0.681955,-2.146331,-6.024213,-2.119894,-45.2,-0.397645,-0.147574,-2.13418,-0.599603,-0.504883
25%,-0.572539,-0.974203,-0.561185,-0.8,-0.75668,-0.594992,-0.624364,-0.330477,-0.26287,-0.280441,...,-0.538346,-0.613247,-0.973366,-0.734855,-0.8,-0.358376,-0.145998,-0.689895,-0.58041,-0.240723
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.427461,0.025797,0.438815,0.2,0.24332,0.405008,0.375636,0.669523,0.73713,0.719559,...,0.461654,0.386753,0.026634,0.265145,0.2,0.641624,0.854002,0.310105,0.41959,0.759277
max,0.599661,0.927921,1.477059,0.7,0.818799,1.7767,1.042422,3.876377,0.831325,3.307789,...,0.585213,0.455756,0.079903,0.299051,0.7,1.082565,1.88178,0.377105,2.4818,0.78125


# Train the machine learning model 

In [62]:
from sklearn.ensemble import RandomForestClassifier

# A fixed random_state makes training repeatable, so a fresh
# "Restart & Run All" reproduces the same model and the same score.
clf = RandomForestClassifier(random_state=5)
clf.fit(trans_x_train, y_train)

RandomForestClassifier()

# Classify test data using the trained model and evaluate the accuracy

In [63]:
from sklearn.metrics import accuracy_score

# Predict on the scaled test split and compare against its labels; the score is
# therefore a TEST accuracy, and the printed label says so.
prediction = clf.predict(trans_x_test)
print("accuracy_score test :", accuracy_score(y_test, prediction))

accuracy_score train : 0.9142857142857143
