In [47]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
from glob import glob
import json
import zipfile
import tempfile

In [48]:
glob("dataset_for_movelets/simple_yao/*")

['dataset_for_movelets/simple_yao/res_simple_yaotrain.csv',
 'dataset_for_movelets/simple_yao/res_simple_yaomoveletsOnTest.json',
 'dataset_for_movelets/simple_yao/train_simple_yao.zip',
 'dataset_for_movelets/simple_yao/res_simple_yaotest.csv',
 'dataset_for_movelets/simple_yao/res_simple_yaomoveletsOnTrain.json',
 'dataset_for_movelets/simple_yao/test_simple_yao.zip']

In [59]:
def get_trajectory(zip_file, trajectory_id):
    with tempfile.TemporaryDirectory() as tempdir:
        with zipfile.ZipFile(zip_file) as zf:
            zf.extractall(str(tempdir))

        matching_files = glob(str(tempdir)+f"/{trajectory_id} *.r2")
        assert len(matching_files) == 1
        print(matching_files)

        df = pd.read_csv(matching_files[0], names=["t", "lon_lat"])
        df[["lon", "lat"]] = list(df.lon_lat.apply(lambda x: x.split(" ")))

        return df.drop(columns=["lon_lat"]).astype(float)

get_trajectory("dataset_for_movelets/simple_yao/train_simple_yao.zip", "10")

['/tmp/tmpf16vl8uz/10 s10 ccircling.r2']


Unnamed: 0,t,lon,lat
0,0.0,-0.000000,0.000000
1,1.0,0.238461,-0.386110
2,2.0,-0.375518,5.978642
3,3.0,-5.547720,19.118422
4,5.0,-16.246067,38.526781
...,...,...,...
78,119.0,-15.151487,12.906636
79,121.0,-5.253783,2.192211
80,122.0,0.435037,5.846320
81,123.0,-5.230255,19.020728


In [60]:
def get_shapelets(json_file):
    with open(json_file, 'r') as f:
        data = json.load(f)

    shapelets = []
    for shapelet_info in data["shapelets"]:
        shapelets.append(
            (
                {
                    "tid": shapelet_info["trajectory"],
                    "start": shapelet_info["start"],
                    "end": shapelet_info["end"],
                    "label": shapelet_info["label"],
                },
                get_trajectory(json_file.split("moveletsOnTrain")[0].split("moveletsOnTest")[0].replace("res", "train")+".zip", 
                               shapelet_info["trajectory"]).iloc[shapelet_info["start"]:shapelet_info["end"]+1]
            )
        )

    return shapelets

get_shapelets('dataset_for_movelets/simple_yao/res_simple_yaomoveletsOnTrain.json')

['/tmp/tmpqqln_ovl/62 s62 cstraight.r2']
['/tmp/tmp9a9nyvlr/92 s92 ccircling.r2']


[({'tid': 62, 'start': 13, 'end': 21, 'label': 'straight'},
         t        lon        lat
  13  21.0  16.882680 -38.823174
  14  23.0  18.271881 -42.296518
  15  25.0  18.083068 -46.380317
  16  26.0  20.920598 -47.520239
  17  28.0  21.153652 -51.729385
  18  30.0  23.564555 -54.519458
  19  31.0  25.163470 -56.212069
  20  32.0  25.503553 -58.056581
  21  34.0  25.906038 -62.517936),
 ({'tid': 92, 'start': 15, 'end': 16, 'label': 'circling'},
         t       lon        lat
  15  20.0 -0.532731  20.267276
  16  21.0  0.124456  -0.027089)]

In [93]:
def retrive_true_label(dataset_path):
    dataset_name = dataset_path.split("/")[-1]
    train_path = f"{dataset_path}/train_{dataset_name}.zip"
    test_path = f"{dataset_path}/test_{dataset_name}.zip"

    labels = dict(
        #dict(tid: label)
    )

    with tempfile.TemporaryDirectory() as tempdir:
        with zipfile.ZipFile(train_path) as zf:
            zf.extractall(str(tempdir))

        for filename in glob(f"{tempdir}/*.r2"):
            tid, target = filename[:-3].split("/")[-1].split(" ")[::2]
            labels["train_"+tid] = target[1:]

    with tempfile.TemporaryDirectory() as tempdir:
        with zipfile.ZipFile(test_path) as zf:
            zf.extractall(str(tempdir))

        for filename in glob(f"{tempdir}/*.r2"):
            tid, target = filename[:-3].split("/")[-1].split(" ")[::2]
            labels["test_"+tid] = target[1:]

    return labels

In [112]:
def retrive_similarities_and_prediction(dataset_path):
    dataset_name = dataset_path.split("/")[-1]
    train_path = f"{dataset_path}/res_{dataset_name}moveletsOnTrain.json"
    test_path = f"{dataset_path}/res_{dataset_name}moveletsOnTest.json"

    true_labels = retrive_true_label(dataset_path)

    with open(train_path, 'r') as f:
        data = json.load(f)
    dataset = dict()
    for i, el in enumerate(data["classes"]):
        dataset[f"train_{el['tid']}"] = {
            'tid': f"train_{el['tid']}",
            'set': 'train',
            'label': el['label'],
            'true_label': true_labels[f"train_{el['tid']}"]
        }

        for shape_idx, shape_info in enumerate(data["shapelets"]):
            dataset[f"train_{el['tid']}"] |= {f"shape_{shape_idx}": shape_info["distances"][i]}

    with open(test_path, 'r') as f:
        data = json.load(f)
    for i, el in enumerate(data["classes"]):
        dataset[f"test_{el['tid']}"] = {
            'tid': f"test_{el['tid']}",
            'set': 'test',
            'label': el['label'],
            'true_label': true_labels[f"test_{el['tid']}"]
        }

        for shape_idx, shape_info in enumerate(data["shapelets"]):
            dataset[f"test_{el['tid']}"] |= {f"shape_{shape_idx}": shape_info["distances"][i]}

    
    return pd.DataFrame.from_dict(dataset).T

df = retrive_similarities_and_prediction('dataset_for_movelets/simple_yao')

df

Unnamed: 0,tid,set,label,true_label,shape_0,shape_1
train_0,train_0,train,straight,straight,69.717405,14.449962
train_1,train_1,train,bending,bending,62.685772,14.391581
train_10,train_10,train,circling,circling,157.803932,14.338552
train_100,train_100,train,bending,bending,45.586129,14.40647
train_101,train_101,train,straight,straight,46.241823,14.479542
...,...,...,...,...,...,...
test_86,test_86,test,straight,straight,65.624115,14.627296
test_87,test_87,test,straight,straight,45.606478,14.544125
test_88,test_88,test,circling,circling,116.293559,14.336801
test_89,test_89,test,straight,straight,45.836359,14.412516


In [116]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, f1_score

def train_models_syntetic_datasets(df):
    X_train = df[df.set == 'train'].drop(columns=["tid", 'set', 'label', 'true_label']).values
    X_test = df[df.set == 'test'].drop(columns=["tid", 'set', 'label', 'true_label']).values
    y_train = df[df.set == 'train'].label.values
    y_test = df[df.set == 'test'].label.values
    
    knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
    dt = DecisionTreeClassifier(max_depth=4).fit(X_train, y_train)
    rf = RandomForestClassifier(n_estimators=500).fit(X_train, y_train)
    
    knn_y_pred = knn.predict(X_test)
    dt_y_pred = dt.predict(X_test)
    rf_y_pred = rf.predict(X_test)
    
    return {
        'knn_f1': round(f1_score(y_pred = knn_y_pred, y_true=y_test, average='macro'), 3),
        'dt_f1': round(f1_score(y_pred = dt_y_pred, y_true=y_test, average='macro'), 3),
        'rf_f1': round(f1_score(y_pred = rf_y_pred, y_true=y_test, average='macro'), 3),
    }

train_models_syntetic_datasets(retrive_similarities_and_prediction('dataset_for_movelets/simple_yao'))

{'knn_f1': 0.68, 'dt_f1': 0.72, 'rf_f1': 0.722}

In [87]:
with open('dataset_for_movelets/simple_yao/res_simple_yaomoveletsOnTrain.json', 'r') as f:
    data = json.load(f)

from IPython.display import JSON

JSON(data)

<IPython.core.display.JSON object>

In [62]:
data["shapelets"]

[{'start': 13,
  'end': 21,
  'trajectory': 62,
  'label': 'straight',
  'features': {},
  'maxValues': {'space': -1.0, 'points': -1.0},
  'distances': [65.57292242306785,
   43.14478511409989,
   62.19580684223062,
   126.99831330445035,
   61.830954736911494,
   45.26678860757834,
   124.71576543100426,
   43.59791973775341,
   60.514536609940876,
   165.54047041567947,
   46.364639420991054,
   44.85262531530147,
   65.19738890122309,
   56.0490685062589,
   73.03444717946535,
   66.53024002092062,
   84.70203107925506,
   139.11459674644772,
   146.6899806401477,
   66.78033435166468,
   175.58025513991768,
   82.24582877541393,
   50.5173594874929,
   41.590305234802294,
   50.23957330120357,
   43.11555792981215,
   104.65544264107467,
   103.4350195933107,
   68.3002140139387,
   53.03049909958833,
   56.82739662112454,
   111.74201867522937,
   105.42950040496667,
   174.60097504074758,
   60.39085726430178,
   67.62946992125785,
   71.800182165217,
   67.63493716632925,
   160

In [46]:
pd.read_csv("../../data/simple_yao.zip")

Unnamed: 0,time,lat,lon,y,tid
0,0,0.000000,0.000000,straight,0
1,2,3.137486,1.208551,straight,0
2,3,5.648241,0.420390,straight,0
3,4,7.336309,1.062335,straight,0
4,5,8.582438,3.009159,straight,0
...,...,...,...,...,...
20594,111,-208.012613,82.456516,bending,299
20595,112,-210.036307,82.648491,bending,299
20596,114,-214.358151,83.389841,bending,299
20597,116,-219.251445,82.911010,bending,299
