# Test normalizzazione cella - Veicoli

In [1]:
import geolib.geohash
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import pandas as pd
from tqdm.auto import tqdm
from cri98tj.partitioners.Geohash_partitioner import Geohash_partitioner
from cri98tj.normalizers.FirstPoint_normalizer import FirstPoint_normalizer
from cri98tj.selectors.RandomInformationGain_selector import RandomInformationGain_selector
from cri98tj.distancers.Euclidean_distancer import Euclidean_distancer
from sklearn.model_selection import train_test_split
from cri98tj.distancers.Euclidean_distancer import euclideanBestFitting

In [3]:
df = pd.read_csv('../examples/Animals Dataset/data/animals_preapred.zip').sort_values(by=["tid", "t"])# precision=5, 50 movelet, DTW
df = df[["tid", "class", "c1", "c2", "t"]]
df.head()

Unnamed: 0,tid,class,c1,c2,t
0,1,D,50.1066,3.79665,0
1,1,D,50.1045,3.79455,4
2,1,D,50.1111,3.79845,7
3,1,D,50.1072,3.79845,9
4,1,D,50.1132,3.79965,15


In [5]:
tid_train, tid_test, _, _ = train_test_split(df.groupby(by=["tid"]).max().reset_index()["tid"],
                                                        df.groupby(by=["tid"]).max().reset_index()["class"],
                                                        test_size=.3,
                                                        stratify=df.groupby(by=["tid"]).max().reset_index()["class"],
                                                        random_state=3)

spatioTemporalCols = ["c1", "c2", "t"]
n_jobs = 12
verbose = True
precision = 2

In [6]:
from cri98tj.distancers.InterpolatedRootDistance_distancer import InterpolatedRootDistanceBestFitting

normalizer = FirstPoint_normalizer(spatioTemporalColumns=spatioTemporalCols, fillna=None, verbose=verbose)
selector = RandomInformationGain_selector(top_k=20, bestFittingMeasure=InterpolatedRootDistanceBestFitting, movelets_per_class=300, trajectories_for_orderline=100, n_jobs=n_jobs, spatioTemporalColumns=spatioTemporalCols, normalizer=normalizer, verbose=verbose)

In [7]:
import random

partitioner = Geohash_partitioner(precision=precision, spatioTemporalColumns=spatioTemporalCols, verbose=verbose)
part = partitioner.fit_transform(df[df.tid.isin(tid_train)].values)
df_part = pd.DataFrame(part, columns=["tid", "class"]+spatioTemporalCols+["partId"])

n_per_class = int(100)
selected_tid = []
for classe in df_part["class"].unique():
    df_tmp = df_part[df_part["class"] == classe]
    selected_tid += random.sample(df_tmp.partId.unique().tolist(), k=min(len(df_tmp.partId.unique()), n_per_class))

df_shape = part[df_part.partId.isin(selected_tid)]

shapelets = normalizer.fit_transform(df_shape)

Encoding 10600 points with precision 2


  0%|          | 0/10600 [00:00<?, ?it/s]

Cutting sub-trajectories length at 201.0 over 281
Pivoting tables


  0%|          | 0/10060 [00:00<?, ?it/s]

In [8]:
from geolib import geohash


def my_InterpolatedRootDistanceBestFitting(trajectory, movelet, spatioTemporalColumns):  # nan == end
    if len(trajectory) % len(spatioTemporalColumns) != 0:
        raise Exception(f"la lunghezza della traiettoria deve essere divisivile per {len(spatioTemporalColumns)}")
    if len(movelet) % len(spatioTemporalColumns) != 0:
        raise Exception(f"la lunghezza della traiettoria deve essere divisivile per {len(spatioTemporalColumns)}")

    offset_trajectory = int(len(trajectory) / len(spatioTemporalColumns))
    offset_movelet = int(len(movelet) / len(spatioTemporalColumns))

    len_mov = 0
    for el in movelet:
        if np.isnan(el) or len_mov >= offset_movelet:
            break
        len_mov += 1

    len_t = 0
    for el in trajectory:
        if np.isnan(el) or len_t >= offset_trajectory:
            break
        len_t += 1

    trajectory_dict = [None for x in spatioTemporalColumns]
    movelet_dict = [None for x in spatioTemporalColumns]

    for i, col in enumerate(spatioTemporalColumns):
        trajectory_dict[i] = trajectory[i * offset_trajectory:(i * offset_trajectory) + len_t]
        movelet_dict[i] = movelet[i * offset_movelet:(i * offset_movelet) + len_mov]

    bestScore = math.inf
    best_i = -1
    partitions = geohashPartition(trajectory_dict, precision)

    for i in range(len(partitions)): #mi assicuro di fare almeno 1 iterazione
        returned = trajectory_distance(partitions[i], movelet_dict)
        if returned is not None and returned < bestScore:
            bestScore = returned
            best_i = i

    return best_i, bestScore

def geohashPartition(trajectory_dict=[], precision=5):
    prec_index = 0
    prec_gh = geohash.encode(trajectory_dict[0][0], trajectory_dict[1][0], precision)
    prec_sw = geohash.bounds(prec_gh).sw
    partizioni = []
    for i, (c1, c2, t) in enumerate(zip(trajectory_dict[0], trajectory_dict[1], trajectory_dict[2])):
        if geohash.encode(c1, c2, precision) != prec_gh:
            dict_tmp = {}
            dict_tmp[0] = trajectory_dict[0][prec_index:i] - prec_sw.lat
            dict_tmp[1] = trajectory_dict[1][prec_index:i] - prec_sw.lon
            dict_tmp[2] = trajectory_dict[2][prec_index:i] - trajectory_dict[2][prec_index]
            partizioni.append(dict_tmp)

            prec_index = i
            prec_gh = geohash.encode(c1, c2, precision)
            prec_sw = geohash.bounds(prec_gh).sw
    dict_tmp = {}
    dict_tmp[0] = trajectory_dict[0][prec_index:]- prec_sw.lat
    dict_tmp[1] = trajectory_dict[1][prec_index:]- prec_sw.lon
    dict_tmp[2] = trajectory_dict[2][prec_index:]- trajectory_dict[2][prec_index]
    partizioni.append(dict_tmp)

    return partizioni

In [9]:
from cri98tj.normalizers.normalizer_utils import dataframe_pivot
from cri98tj.distancers.InterpolatedRootDistance_distancer import trajectory_distance
import math
import numpy as np

trajectories_df = df.copy()
trajectories_df["partId"] = trajectories_df.tid
df_pivot = dataframe_pivot(df=trajectories_df, maxLen=None, verbose=verbose, fillna_value=None, columns=spatioTemporalCols)

dist_matrix = np.zeros((len(df_pivot), len(shapelets)))



Pivoting tables


  0%|          | 0/14990 [00:00<?, ?it/s]

In [10]:
for i, traj in enumerate(tqdm(df_pivot.values[:, 1:])):
    for j, mov in enumerate(shapelets.values[:, 1:]):
        _, dist_matrix[i, j] = my_InterpolatedRootDistanceBestFitting(traj, mov, spatioTemporalCols)

  0%|          | 0/102 [00:00<?, ?it/s]

In [25]:
clf = RandomForestClassifier(max_depth=3, random_state=3, n_jobs=n_jobs, n_estimators=5000)

dist_np_df = pd.DataFrame(dist_matrix)

In [26]:
dist_matrix

array([[706575.34430803, 708107.12837611, 708800.80899823, ...,
        707712.49031949, 707143.11954273, 707530.74083997],
       [706308.74640126, 708018.46307817, 708601.07482498, ...,
        707182.61213296, 706557.9529265 , 706864.57331855],
       [708700.23310677, 710390.68386956, 710893.9105306 , ...,
        709253.13178729, 709074.39972942, 709149.34355899],
       ...,
       [709909.6880162 , 711627.84212013, 711850.85929592, ...,
        710066.05400057, 710262.24317682, 710039.04338037],
       [709950.16618371, 711715.58785893, 711990.21743484, ...,
        711222.55007813, 710523.41591442, 710910.65600607],
       [708392.93482972, 710142.97342988, 710307.0962887 , ...,
        709379.85769903, 708962.95219502, 709242.72410845]])

In [27]:

X = dist_matrix
y = df_pivot["class"].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, stratify=y, random_state=3)

In [28]:
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           C       0.67      1.00      0.80        10
           D       0.43      0.33      0.38         9
           E       0.89      0.67      0.76        12

    accuracy                           0.68        31
   macro avg       0.66      0.67      0.65        31
weighted avg       0.68      0.68      0.66        31



Risultati: anche in questo caso, la normalizzazione basata sulla cella sembra essere peggiore di quella basata sul primo punto. (?? vs ??, nonostante siano state usate molte più shapelets)

In [29]:
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score


accuracy_score(y_test, y_pred)

0.6774193548387096