# Training a Machine Learning algorithm 

## Load data

First, load the SDK of the Data-centric design Hub to connect to your Thing

Note: as in the Python script for data collection, you need a .env file containing your Thing ID and token.

In [None]:
from dotenv import load_dotenv
import os
from dcd.entities.thing import Thing

# Pull the variables of the .env file into the process environment, then
# read the Thing credentials from it (a missing variable raises KeyError).
load_dotenv()
THING_ID, THING_TOKEN = os.environ['THING_ID'], os.environ['THING_TOKEN']

# Build the Thing handle with those credentials and call read() on it.
my_thing = Thing(thing_id=THING_ID, token=THING_TOKEN)
my_thing.read()

Provide the start and end dates, defining when to look for data

In [None]:
from datetime import datetime
# Time window to query, written as naive date strings in DATE_FORMAT.
# NOTE(review): naive datetimes are interpreted in the machine's local
# timezone by datetime.timestamp() — confirm this matches the recording.
DATE_FORMAT = '%Y-%m-%d %H:%M:%S'
START_DATE = "2019-05-16 10:53:00"
END_DATE = "2019-05-16 10:56:00"

# Parse both bounds and convert seconds to milliseconds (hence the * 1000).
from_ts, to_ts = [
    datetime.strptime(moment, DATE_FORMAT).timestamp() * 1000
    for moment in (START_DATE, END_DATE)
]

Retrieve data and label

In [None]:
# Names of the two Thing properties to fetch: the sensor readings and the
# class labels recorded alongside them.
FSR_PROP_NAME = "FSR"
CLASS_PROP_NAME = "Sitting Posture"

# Look up the sensor property by name and load its values for the
# [from_ts, to_ts] window.
fsr = my_thing.find_property_by_name(FSR_PROP_NAME)
fsr.read(from_ts, to_ts)
# NOTE(review): each row presumably starts with a time element (the split
# cell strips the first element of every row) — confirm against the SDK.
data = fsr.values

# Same lookup and window for the label property.
sitting = my_thing.find_property_by_name(CLASS_PROP_NAME)
sitting.read(from_ts, to_ts)
# NOTE(review): later cells index `label` in lockstep with `data`, so both
# are assumed to have one row per sample in the same order — confirm.
label = sitting.values

Extract classes from the CLASS property

In [None]:
# Class names declared on the label property, kept in their original order.
classes = [entry['name'] for entry in sitting.classes]

# Print the index -> name mapping so label values can be read back.
for position, class_name in enumerate(classes):
    print(position, " => ", class_name)

# Prepare

Split the data into training data (60%), cross validation data (20%) and test data (20%)

In [None]:
# Output containers for the three subsets, plus the intermediate 80%
# ("leftover") that is split again into cross-validation and training data.
train_data = []
train_label = []
cv_data = []
cv_label = []
test_data = []
test_label = []
leftover_data = []
leftover_label = []

# First pass: every fifth sample (20%) goes to the test set, the rest (80%)
# to the leftover pool.
for index, row in enumerate(data):
    # Remove the leading time element with a slice rather than pop(0):
    # `data` and `label` stay untouched, so re-running this cell produces
    # the same split instead of stripping one more column each time.
    features = row[1:]
    target = label[index][1:]
    if index % 5 == 0:
        # 20% to test data
        test_data.append(features)
        test_label.append(target)
    else:
        # 80% leftover data
        leftover_data.append(features)
        leftover_label.append(target)

# Second pass: every fourth leftover sample (a quarter of 80% = 20% overall)
# goes to cross validation, the remaining 60% to training.
for index, (features, target) in enumerate(zip(leftover_data, leftover_label)):
    if index % 4 == 0:
        # 20% to cross validate
        cv_data.append(features)
        cv_label.append(target)
    else:
        # 60% to train
        train_data.append(features)
        train_label.append(target)

Check the distribution

In [None]:
# Report the size of every data/label pair so the split can be checked
# against the intended proportions.
for caption, rows in (
    ("nb total data: ", data),
    ("nb total labels: ", label),
    ("nb train data: ", train_data),
    ("nb train labels: ", train_label),
    ("nb cv data: ", cv_data),
    ("nb cv labels: ", cv_label),
    ("nb test data: ", test_data),
    ("nb test labels: ", test_label),
):
    print(caption + str(len(rows)))

# Train

We use a k-Nearest Neighbour (kNN) algorithm

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbour classifier configured with a single neighbour.
neigh = KNeighborsClassifier(n_neighbors=1)
# Fit on the training subset only; the cross-validation and test subsets
# are kept aside for the evaluation cells.
# NOTE(review): train_label holds one-element lists rather than scalars, so
# scikit-learn may emit a column-vector warning here — confirm.
neigh.fit(train_data, train_label)

# Evaluate

Import evaluation functions from scikit learn

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import numpy

Use the cross validation data to evaluate the algorithm

In [None]:
# Ground-truth labels of the cross-validation subset as a NumPy array.
cvLabel = numpy.array(cv_label)
# Predictions of the trained classifier for the same subset.
predicted = neigh.predict(cv_data)

# Fraction of cross-validation samples whose prediction matches the label.
result = accuracy_score(y_true=cvLabel, y_pred=predicted)
print("cv accuracy: {}".format(result))

## Cross Validation accuracy > 80%

The validation passed, so we can display the test performance.

In [None]:
# Ground-truth labels of the held-out test subset as a NumPy array.
testLabel = numpy.array(test_label)
# Predictions for the test subset; note this rebinds `predicted`.
predicted = neigh.predict(test_data)

# Fraction of test samples whose prediction matches the label.
testResult = accuracy_score(y_true=testLabel, y_pred=predicted)
print("test accuracy: {}".format(testResult))

Confusion matrix

In [None]:
# Confusion matrix of the test labels against the predictions currently
# held in `predicted`; left as a bare expression so the notebook shows it.
confusion_matrix(testLabel, predicted)

Precision score

In [None]:
# Macro-averaged precision of `predicted` against the test labels.
precision_score(testLabel, predicted, average="macro")

Recall score

In [None]:
# Macro-averaged recall of `predicted` against the test labels.
recall_score(testLabel, predicted, average="macro")

F1 score

In [None]:
# F1 score on the test labels with average="weighted" (unlike the "macro"
# averaging used for precision and recall in the neighbouring cells).
f1_score(testLabel, predicted, average="weighted")

In [None]:
# average=None turns averaging off, giving one F1 score per class.
f1_score(testLabel, predicted, average=None)

Finally, we can show the classification report.

In [None]:
# Text report for the test labels, with rows named after `classes`.
# NOTE(review): assumes the order of `classes` matches the sorted label
# values present in testLabel — confirm.
print(classification_report(testLabel, predicted, target_names=classes))

## Cross Validation accuracy < 80%

The validation failed, so we display the cross validation performance instead.

Confusion matrix

In [None]:
# Predict on the cross-validation data here instead of reusing `predicted`,
# which holds test-set predictions once the test-accuracy cell has run.
confusion_matrix(cvLabel, neigh.predict(cv_data))

Precision score

In [None]:
# Macro-averaged precision on the cross-validation data. Predictions are
# recomputed because `predicted` holds test-set predictions once the
# test-accuracy cell has run.
precision_score(cvLabel, neigh.predict(cv_data), average="macro")

Recall score

In [None]:
# Macro-averaged recall on the cross-validation data. Predictions are
# recomputed because `predicted` holds test-set predictions once the
# test-accuracy cell has run.
recall_score(cvLabel, neigh.predict(cv_data), average="macro")

F1 score

In [None]:
# F1 score with average="weighted" on the cross-validation data.
# Predictions are recomputed because `predicted` holds test-set predictions
# once the test-accuracy cell has run.
f1_score(cvLabel, neigh.predict(cv_data), average="weighted")

In [None]:
# One F1 score per class (average=None) on the cross-validation data.
# Predictions are recomputed because `predicted` holds test-set predictions
# once the test-accuracy cell has run.
f1_score(cvLabel, neigh.predict(cv_data), average=None)

# Save the model in a file

In [None]:
# Libraries used to serialise the trained classifier
import io
import pickle

# Destination file for the pickled model
MODEL_FILE_NAME = "model.pickle"

# Serialise the classifier to bytes with pickle protocol 2, then write them
# to the destination file in binary mode.
# NOTE(review): protocol 2 is presumably chosen so an older Python can load
# the file — confirm with the consumer of the model.
with io.open(MODEL_FILE_NAME, mode="wb") as model_file:
    model_file.write(pickle.dumps(neigh, protocol=2))