In [182]:
from db.binStore.binaryStore import BinaryStore
from db.datasets import get_dataset
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score


# Simulated Input
datasetIds = ["6412ce0cf822bfe2b57fea8f", "6412ce0cf822bfe2b57feaa8", "6412ce0cf822bfe2b57fead1", "6412ce0cf822bfe2b57feae3"]
project = "640842f05e144be461cef648"
labeling_id = "6412cb71f822bfe2b57fe96e"
ts = ["6412ce0cf822bfe2b57fea90", "6412ce0cf822bfe2b57fea91", "6412ce0cf822bfe2b57fea92"]

labelings = {"Square": "6412cb71f822bfe2b57fe96d", "W": "6412cb71f822bfe2b57fe973"}

In [172]:
import json
with open("datasets.json", "r") as f:
    all_datasets = json.load(f)

In [173]:
# Get relevant datasets

datasets = []
for d in all_datasets:
    if (str(d["_id"]["$oid"]) in datasetIds):
        datasets.append(d)
print(len(datasets))

4


In [174]:
# Build mal for labelings. Label => Number

labelMap = {"6412cb71f822bfe2b57fe96d": 1, "6412cb71f822bfe2b57fe973": 2}

In [191]:
# Process each dataset


def processDataset(dataset):
    
    # Get labels in datasets
    labels = [x for x in dataset["labelings"] if x["labelingId"]["$oid"] == labeling_id][0]["labels"]
    # Get the time-series
    dfs = []
    
    for ts in dataset["timeSeries"]:
        binStore = BinaryStore(ts["_id"]['$oid'])
        binStore.loadSeries()
        ts_data = binStore.getFull()
        data = ts_data["data"]
        time = ts_data["time"]
        df = pd.DataFrame({"time": time, ts["name"]: data})
        dfs.append(df)


    # Merge the dataframes
    df = dfs[0]
    for d in dfs[1:]:
        df = pd.merge(df, d, how='outer', on="time")

    df = df.interpolate(method='linear', limit_direction='both')
    
    dims = df.columns
    arr = df.to_numpy()
    label_arr = np.empty((arr.shape[0], 1))

    arr = np.concatenate([arr, label_arr], axis=1)

    for i, t in enumerate(arr):
        for l in labels:
            if t[0] >= int(l["start"]) and t[0] <= int(l["end"]):
                arr[i][-1] = labelMap[l["type"]['$oid']]
                break
            else:
                arr[i][-1] = 0
    
    return arr

def getDatasetWindows(dataset, window_size, stride):
    window_size = 100
    stride = 20

    fused = []
    idx = 0
    while idx < dataset.shape[0]:
        if idx+window_size > dataset.shape[0]:
            break
        fused.append(dataset[idx: idx+window_size])
        idx += stride

    labels = []
    windows = []

    for w in fused:
        windows.append(w[:, :-1])
        counts = np.bincount(w[:,-1].astype(int))
        label = np.argmax(counts)
        labels.append(label)

    windows = np.array(windows)
    labels = np.array(labels)
    return windows, labels


features = [np.sum, np.median, np.mean, np.std, np.var, np.max, lambda x : np.abs(np.max(x)), np.min]

def extract_features(data):
    return np.array([f(data) for f in features])


def calculateFeatures(windows):
    window_features = []
    for w in windows:
        stack = []
        for i in range(1, windows.shape[-1]):
            stack.append(extract_features(w[:, i]))
        window_features.append(np.stack(stack))

    return np.array(window_features)    


def trainClassifier(windows, labels):
    clf = DecisionTreeClassifier()

    # Reshape the windows
    windows = np.reshape(windows, (windows.shape[0], np.multiply(*windows.shape[1:])))
    
    clf.fit(windows, labels)
    y_pred = clf.predict(windows)
    acc = accuracy_score(labels, y_pred)
    print("Acc: ", acc)

    pass

all_windows = []
all_labels = []
for dataset in datasets:
    arr = processDataset(dataset)
    windows, labels = getDatasetWindows(arr, window_size=50, stride=20)
    windows = calculateFeatures(windows)
    print(windows.shape)
    all_windows.append(windows)
    all_labels.extend(labels)

all_windows = np.concatenate(all_windows, axis=0)
all_labels = np.array(all_labels)
print("-"*50)
print(all_windows.shape)
print(all_labels.shape)
filter = np.array([x > 0 for x in all_labels])

all_windows = all_windows[filter]
all_labels = all_labels[filter]


trainClassifier(all_windows, all_labels)


(135, 3, 8)
(53, 3, 8)
(133, 3, 8)
(99, 3, 8)
--------------------------------------------------
(420, 3, 8)
(420,)
Acc:  1.0


In [None]:
import json

with open("datasets.json", "r") as f:
    datasets = json.load(f)

dataset = None
for d in datasets:
    if (d["_id"]["$oid"] == datasetId):
        dataset = d

labels = [x for x in dataset["labelings"] if x["labelingId"]["$oid"]][0]["labels"]
print(labels)


[{'start': 263288, 'end': 266220, 'type': {'$oid': '6412cb71f822bfe2b57fe96d'}, '_id': {'$oid': '6412ce0cf822bfe2b57fea93'}, 'metaData': {}}, {'start': 267290, 'end': 269944, 'type': {'$oid': '6412cb71f822bfe2b57fe96d'}, '_id': {'$oid': '6412ce0cf822bfe2b57fea94'}, 'metaData': {}}, {'start': 271173, 'end': 274303, 'type': {'$oid': '6412cb71f822bfe2b57fe96d'}, '_id': {'$oid': '6412ce0cf822bfe2b57fea95'}, 'metaData': {}}, {'start': 275848, 'end': 277949, 'type': {'$oid': '6412cb71f822bfe2b57fe96d'}, '_id': {'$oid': '6412ce0cf822bfe2b57fea96'}, 'metaData': {}}, {'start': 278979, 'end': 281277, 'type': {'$oid': '6412cb71f822bfe2b57fe96d'}, '_id': {'$oid': '6412ce0cf822bfe2b57fea97'}, 'metaData': {}}, {'start': 283099, 'end': 286031, 'type': {'$oid': '6412cb71f822bfe2b57fe96d'}, '_id': {'$oid': '6412ce0cf822bfe2b57fea98'}, 'metaData': {}}, {'start': 288171, 'end': 290825, 'type': {'$oid': '6412cb71f822bfe2b57fe96d'}, '_id': {'$oid': '6412ce0cf822bfe2b57fea99'}, 'metaData': {}}, {'start': 29

In [None]:

binStore = BinaryStore(ts[0])
binStore.loadSeries()
data = binStore.getFull()
time = data["time"]
data = data["data"]

In [None]:
import numpy as np

d = np.array([time, data])
n = np.empty((1, len(time)))

d = np.concatenate([d, n]).T

In [None]:
print(d.shape)

for i, t in enumerate(d):
    for l in labels:
        if t[0] >= int(l["start"]) and t[0] <= int(l["end"]):
            d[i][2] = 1
            break
        else:
            d[i][2] = 0

np.savetxt("test.csv", d, delimiter=",")

(2780, 3)


In [None]:
idx = 0

window_size = 100
stride = 20

fused = []

while idx < d.shape[0]:
    if idx+window_size > d.shape[0]:
        break
    fused.append(d[idx: idx+window_size])
    idx += stride

labels = []
windows = []

for w in fused:
    windows.append(w[:, :-1])
    counts = np.bincount(w[:,2].astype(int))
    label = np.argmax(counts)
    labels.append(label)

windows = np.array(windows)
labels = np.array(labels)


In [None]:
import numpy as np

features = [np.sum, np.median, np.mean, np.std, np.var, np.max, lambda x : np.abs(np.max(x)), np.min]

def extract_features(data):
    return np.array([f(data) for f in features])



window_features = []
for w in windows:
    stack = []
    for i in range(1, windows.shape[-1]):
        stack.append(extract_features(w[:, i]))
    window_features.append(np.stack(stack))

window_features = np.array(window_features)

print(window_features.shape)
print(len(labels))

(135, 1, 8)
135


In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

windows_train = np.reshape(window_features, (len(window_features), np.multiply(*window_features.shape[1:])))

print(windows_train.shape)

clf = DecisionTreeClassifier()

clf.fit(windows_train, labels)
y_pred = clf.predict(windows_train)
acc = accuracy_score(labels, y_pred)
print(acc)

(135, 8)
1.0
