In [1]:
import random
import os
import pandas as pd
from glob import glob
from tqdm.auto import tqdm
import geopandas
from sklearn.model_selection import train_test_split
import shutil
import numpy as np

In [2]:
# Column renames applied to every SUMO trajectory export right after it is
# read: "vehicle_id" becomes "tid" and "class" becomes "y", the column names
# the rest of this notebook works with.
sumo_rename_map = {"vehicle_id": "tid", "class": "y"}

def undersample(df, n_sample_per_class, random_state=None):
    """Keep a random subset of ``n_sample_per_class`` trajectories per class.

    Parameters
    ----------
    df : pandas.DataFrame
        Point-level frame with at least the columns ``y`` (class label),
        ``tid`` (trajectory id) and ``time``.
    n_sample_per_class : int
        Number of distinct ``tid`` values to keep for every value of ``y``.
    random_state : int or None, optional
        Seed for a private random generator, making the selection
        reproducible. With the default ``None`` the module-level ``random``
        generator is used, exactly as before.

    Returns
    -------
    pandas.DataFrame
        All rows of the selected trajectories, sorted by ``tid`` then ``time``.

    Raises
    ------
    ValueError
        If a class has fewer than ``n_sample_per_class`` distinct trajectories.
    """
    # A dedicated generator keeps a seeded call from disturbing global state.
    rng = random if random_state is None else random.Random(random_state)

    selected_tid = []
    for classe in df["y"].unique().tolist():
        class_tids = df.loc[df["y"] == classe, "tid"].unique().tolist()
        if n_sample_per_class > len(class_tids):
            # random.sample would raise too, but without saying which class.
            raise ValueError(
                f"class {classe!r} has only {len(class_tids)} trajectories, "
                f"cannot sample {n_sample_per_class}"
            )
        selected_tid += rng.sample(class_tids, n_sample_per_class)

    return df[df["tid"].isin(selected_tid)].sort_values(by=["tid", "time"])

# Trajectory archive of each simulated SUMO city, keyed by the short city name
# that the loaders below also use as label value and as id prefix.
_SUMO_CITY_ARCHIVES = {
    "grid": "../../Code/generation/sumo_stuff/Grid-Empire-2.5km/trajectories.zip",
    "borgo": "../../Code/generation/sumo_stuff/Borgo-Rome-2Km/trajectories.zip",
    "mixed": "../../Code/generation/sumo_stuff/Mixed-Athens-2km/trajectories.zip",
}


def _read_sumo_city(city):
    """Read one city's trajectory archive and apply ``sumo_rename_map``."""
    return pd.read_csv(_SUMO_CITY_ARCHIVES[city]).rename(columns=sumo_rename_map)


def _combine_sumo_cities(cities, label_mode):
    """Load several cities, relabel them and stack them into one frame.

    Parameters
    ----------
    cities : sequence of str
        Keys of ``_SUMO_CITY_ARCHIVES``; frames are concatenated in this order.
    label_mode : {"city", "city_and_class", "class"}
        How the ``y`` column is filled:
        ``"city"`` overwrites it with the city name,
        ``"city_and_class"`` prefixes the existing label with ``"<city>_"``,
        ``"class"`` leaves the label read from the archive untouched.

    Returns
    -------
    pandas.DataFrame
        Concatenation with a fresh 0..n-1 index. ``tid`` is always prefixed
        with ``"<city>_"`` so ids coming from different cities cannot collide.
    """
    if label_mode not in ("city", "city_and_class", "class"):
        raise ValueError(f"unknown label_mode: {label_mode!r}")

    frames = []
    for city in cities:
        df_city = _read_sumo_city(city)
        # Item assignment (not ``df_city.y = ...``): attribute assignment only
        # sets a plain Python attribute when the column does not exist yet.
        if label_mode == "city":
            df_city["y"] = city
        elif label_mode == "city_and_class":
            df_city["y"] = df_city["y"].apply(lambda label, city=city: f"{city}_{label}")
        df_city["tid"] = df_city["tid"].apply(lambda tid, city=city: f"{city}_{tid}")
        frames.append(df_city)

    return pd.concat(frames, ignore_index=True)


def sumo_cities():
    """Grid, borgo and mixed trajectories, labelled by city name."""
    return _combine_sumo_cities(["grid", "borgo", "mixed"], label_mode="city")


def sumo_cities_grid_vs_borgo():
    """Grid and borgo trajectories, labelled by city name."""
    return _combine_sumo_cities(["grid", "borgo"], label_mode="city")


def sumo_cities_borgo_vs_mixed():
    """Borgo and mixed trajectories, labelled by city name."""
    return _combine_sumo_cities(["borgo", "mixed"], label_mode="city")


def sumo_cities_mixed_vs_grid():
    """Grid and mixed trajectories (grid rows first), labelled by city name."""
    return _combine_sumo_cities(["grid", "mixed"], label_mode="city")


def sumo_all_car_vs_bikes():
    """All three cities, labelled ``"<city>_<label from the archive>"``."""
    return _combine_sumo_cities(["grid", "borgo", "mixed"], label_mode="city_and_class")


def sumo_cities_car_vs_bikes():
    """All three cities, keeping the label stored in each archive."""
    return _combine_sumo_cities(["grid", "borgo", "mixed"], label_mode="class")


def sumo_grid_car_vs_bikes():
    """Grid trajectories as stored in the archive (columns renamed only)."""
    return _read_sumo_city("grid")


def sumo_borgo_car_vs_bikes():
    """Borgo trajectories as stored in the archive (columns renamed only)."""
    return _read_sumo_city("borgo")


def sumo_mixed_car_vs_bikes():
    """Mixed trajectories as stored in the archive (columns renamed only)."""
    return _read_sumo_city("mixed")

In [3]:
# Synthetic benchmark datasets: one zipped CSV per dataset under ../../data.
df_simple_shape, df_simple_shape_time, df_simple_time, df_simple_yao = [
    pd.read_csv(f"../../data/{dataset_name}.zip")
    for dataset_name in ("simple_shape", "simple_shape_time", "simple_time", "simple_yao")
]

# SUMO datasets: city-vs-city tasks first, then the variants that keep (or
# extend) the label stored in the archives.
df_sumo_cities = sumo_cities()
df_sumo_cities_grid_vs_borgo = sumo_cities_grid_vs_borgo()
df_sumo_cities_borgo_vs_mixed = sumo_cities_borgo_vs_mixed()
df_sumo_cities_mixed_vs_grid = sumo_cities_mixed_vs_grid()
df_sumo_all_car_vs_bikes = sumo_all_car_vs_bikes()
df_sumo_cities_car_vs_bikes = sumo_cities_car_vs_bikes()
df_sumo_grid_car_vs_bikes = sumo_grid_car_vs_bikes()
df_sumo_borgo_car_vs_bikes = sumo_borgo_car_vs_bikes()
df_sumo_mixed_car_vs_bikes = sumo_mixed_car_vs_bikes()

# NOTE(review): the preview shows an "Unnamed: 0" column (the row index saved
# with the CSV). prepareForShapelet exports every column it is not told about,
# so this one ends up in the .r2 files -- confirm that is intended.
df_simple_shape.head()

Unnamed: 0,y,lat,lon,time,tid
0,right,0.250687,-0.887452,0,0
1,right,0.359131,-0.815455,1,0
2,right,0.267477,-0.796666,2,0
3,right,0.340948,-0.700614,3,0
4,right,0.257886,-0.600954,4,0


In [4]:
from sklearn.preprocessing import LabelEncoder

def _write_split_files(trajectories, ids, out_dir, targetAttribute, attributes, desc):
    """Write one headerless ``.r2`` CSV per trajectory id into ``out_dir``.

    ``trajectories`` is the frame grouped by trajectory id; files are named
    ``"<n> s<n> c<class>.r2"`` where ``n`` is the position of the id in ``ids``.
    """
    for file_idx, tid in enumerate(tqdm(ids, leave=False, desc=desc)):
        df_tid = trajectories.get_group(tid)
        # Every row of a trajectory is expected to share the label; use the first.
        classe = df_tid[targetAttribute].iloc[0]
        df_tid[attributes].to_csv(F"{out_dir}/{file_idx} s{file_idx} c{classe}.r2", index=False, header=False)


def prepareForShapelet(df=pd.DataFrame, trajectoryIDAttribute=None, targetAttribute=None,
                       timestampAttribute=None, latAttr=None, lonAttr=None,
                       train_path="train", test_path="test"
                      ):
    """Export a trajectory frame as zipped train/test folders of ``.r2`` files.

    The trajectory ids are split 70/30 (stratified on the label, fixed
    ``random_state=32``). Each trajectory becomes one headerless CSV whose
    columns are: timestamp, ``"lat lon"`` joined by a space, then every
    remaining column of ``df`` in its original order. The two folders are
    zipped to ``<train_path>.zip`` / ``<test_path>.zip`` and then removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Point-level data; it is copied, the caller's frame is not modified.
    trajectoryIDAttribute, targetAttribute, timestampAttribute, latAttr, lonAttr : str
        Names of the id, label, timestamp, latitude and longitude columns.
    train_path, test_path : str
        Working folders (and archive base names). Existing folders with these
        names are deleted first.

    Raises
    ------
    TypeError
        If ``df`` is not a DataFrame instance (e.g. the class default was used).
    """
    if not isinstance(df, pd.DataFrame):
        raise TypeError("df must be a pandas DataFrame instance")

    reserved = (trajectoryIDAttribute, targetAttribute, timestampAttribute, latAttr, lonAttr)
    # The timestamp must be the first exported column, followed by latLon.
    attributes = [timestampAttribute, "latLon"] + [col for col in df.columns if col not in reserved]

    df = df.copy()

    # Start from clean folders so files of a previous export cannot leak in.
    for stale_dir in (train_path, test_path):
        if os.path.exists(stale_dir):
            shutil.rmtree(stale_dir)

    df["latLon"] = df[latAttr].astype(str) + " " + df[lonAttr].astype(str)

    # One label per trajectory: the label found on the id's first row.
    ids, ids_idx = np.unique(df[trajectoryIDAttribute], return_index=True)
    y = df[targetAttribute].values[ids_idx]
    ids_train, ids_test, _, _ = train_test_split(ids, y, test_size=0.3, random_state=32, stratify=y)

    os.makedirs(train_path)
    os.makedirs(test_path)

    # Group once instead of re-filtering the whole frame for every trajectory.
    trajectories = df.groupby(trajectoryIDAttribute, sort=False)
    _write_split_files(trajectories, ids_train, train_path, targetAttribute, attributes, desc="train")
    _write_split_files(trajectories, ids_test, test_path, targetAttribute, attributes, desc="test")

    shutil.make_archive(train_path, 'zip', train_path)
    shutil.make_archive(test_path, 'zip', test_path)

    shutil.rmtree(train_path)
    shutil.rmtree(test_path)

In [5]:
datasets = {
    "simple_shape": df_simple_shape,
    "simple_shape_time": df_simple_shape_time,
    "simple_time": df_simple_time,
    "simple_yao": df_simple_yao,

    "sumo_cities": df_sumo_cities,
    "sumo_cities_grid_vs_borgo": df_sumo_cities_grid_vs_borgo,
    "sumo_cities_borgo_vs_mixed": df_sumo_cities_borgo_vs_mixed,
    "sumo_cities_mixed_vs_grid": df_sumo_cities_mixed_vs_grid,
    "sumo_all_car_vs_bikes": df_sumo_all_car_vs_bikes,
    "sumo_cities_car_vs_bikes": df_sumo_cities_car_vs_bikes,
    "sumo_grid_car_vs_bikes": df_sumo_grid_car_vs_bikes,
    "sumo_borgo_car_vs_bikes": df_sumo_borgo_car_vs_bikes,
    "sumo_mixed_car_vs_bikes": df_sumo_mixed_car_vs_bikes,
}

params = {'trajectoryIDAttribute': "tid", 'targetAttribute': "y", 'timestampAttribute': "time", 'latAttr': "lat", 'lonAttr': "lon"}

# SUMO trajectories are kept only if their number of rows lies strictly
# between these bounds; each class is then cut down to a fixed sample size.
SUMO_MIN_POINTS = 60
SUMO_MAX_POINTS = 60 * 20
SUMO_TRAJECTORIES_PER_CLASS = 300
UNDERSAMPLE_SEED = 32

# undersample() draws from the global `random` generator; seeding it makes the
# exported datasets identical across re-runs of this cell.
random.seed(UNDERSAMPLE_SEED)

progress = tqdm(datasets.items())
for name, d in progress:
    df = d
    progress.set_description(name)
    base_path = "dataset_for_movelets/"+name
    # Also creates the "dataset_for_movelets" parent on a fresh checkout.
    os.makedirs(base_path, exist_ok=True)

    if "sumo" in name:
        points_per_tid = df.groupby("tid").size()
        kept_tids = points_per_tid[(points_per_tid > SUMO_MIN_POINTS) & (points_per_tid < SUMO_MAX_POINTS)].index
        df = undersample(df[df.tid.isin(kept_tids)], SUMO_TRAJECTORIES_PER_CLASS)

    prepareForShapelet(df=df, train_path=f"{base_path}/train_{name}", test_path=f"{base_path}/test_{name}", **params)

  0%|          | 0/13 [00:00<?, ?it/s]

train:   0%|          | 0/140 [00:00<?, ?it/s]

test:   0%|          | 0/60 [00:00<?, ?it/s]

train:   0%|          | 0/140 [00:00<?, ?it/s]

test:   0%|          | 0/60 [00:00<?, ?it/s]

train:   0%|          | 0/140 [00:00<?, ?it/s]

test:   0%|          | 0/60 [00:00<?, ?it/s]

train:   0%|          | 0/210 [00:00<?, ?it/s]

test:   0%|          | 0/90 [00:00<?, ?it/s]

train:   0%|          | 0/630 [00:00<?, ?it/s]

test:   0%|          | 0/270 [00:00<?, ?it/s]

train:   0%|          | 0/420 [00:00<?, ?it/s]

test:   0%|          | 0/180 [00:00<?, ?it/s]

train:   0%|          | 0/420 [00:00<?, ?it/s]

test:   0%|          | 0/180 [00:00<?, ?it/s]

train:   0%|          | 0/420 [00:00<?, ?it/s]

test:   0%|          | 0/180 [00:00<?, ?it/s]

train:   0%|          | 0/1260 [00:00<?, ?it/s]

test:   0%|          | 0/540 [00:00<?, ?it/s]

train:   0%|          | 0/420 [00:00<?, ?it/s]

test:   0%|          | 0/180 [00:00<?, ?it/s]

train:   0%|          | 0/420 [00:00<?, ?it/s]

test:   0%|          | 0/180 [00:00<?, ?it/s]

train:   0%|          | 0/420 [00:00<?, ?it/s]

test:   0%|          | 0/180 [00:00<?, ?it/s]

train:   0%|          | 0/420 [00:00<?, ?it/s]

test:   0%|          | 0/180 [00:00<?, ?it/s]

In [None]:
# Scratch inspection: the sorted unique trajectory ids of the combined SUMO
# frame, together with the row position of each id's first occurrence (the
# same call prepareForShapelet uses to pick one label per trajectory).
import numpy as np
np.unique(df_sumo_cities["tid"], return_index=True)

In [None]:
# One-off export of the simple_shape dataset; the train/test archives are
# written directly under dataset_for_movelets/ (no per-dataset subfolder).
prepareForShapelet(
    df=df_simple_shape,
    trajectoryIDAttribute="tid",
    targetAttribute="y",
    timestampAttribute="time",
    latAttr="lat",
    lonAttr="lon",
    train_path="dataset_for_movelets/train_simple_shape",
    test_path="dataset_for_movelets/test_simple_shape",
)

In [None]:
# Create the output root for the exported datasets. Unlike the shell `mkdir`,
# this needs no IPython automagic and is a no-op when the folder already
# exists. The export cells write below this folder, so on a fresh checkout
# run this cell before them.
os.makedirs("dataset_for_movelets", exist_ok=True)