# Testing delle misure di distanza al variare della precision

In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import pandas as pd
from tqdm.auto import tqdm
from datetime import datetime

In [2]:
from cri98tj.partitioners.Geohash_partitioner import Geohash_partitioner
from cri98tj.normalizers.FirstPoint_normalizer import FirstPoint_normalizer
from cri98tj.selectors.RandomInformationGain_selector import RandomInformationGain_selector
from cri98tj.distancers.Euclidean_distancer import Euclidean_distancer
from sklearn.model_selection import train_test_split
from cri98tj.selectors.Random_selector import Random_selector

In [3]:
df0 = pd.read_csv('../examples/Vehicles Dataset/data/vehicles_preapred.zip').sort_values(by=["tid", "t"])# precision=5, 50 movelet, DTW
df0["c1"] = df0.c1/100000
df0["c2"] = df0.c2/100000

df0.head()

Unnamed: 0,tid,class,t,c1,c2
0,30901,B,0,42.07716,4.738411
1,30901,B,30,42.077246,4.739088
2,30901,B,60,42.077259,4.739096
3,30901,B,90,42.077369,4.739158
4,30901,B,120,42.077635,4.739343


In [4]:
tid_train, tid_test, _, _ = train_test_split(df0.groupby(by=["tid"]).max().reset_index()["tid"],
                                                        df0.groupby(by=["tid"]).max().reset_index()["class"],
                                                        test_size=.3,
                                                        stratify=df0.groupby(by=["tid"]).max().reset_index()["class"],
                                                        random_state=3)

df = df0[["tid", "class", "c1", "c2", "t"]]
n_movelets=10
n_jobs = 12
verbose = False

import warnings
warnings.filterwarnings("ignore")

In [5]:
import warnings
warnings.filterwarnings("ignore")

from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score

def compute_measures(test, pred):
    return (accuracy_score(test, pred), precision_score(test, pred, average="micro"), f1_score(test, pred, average="micro"), recall_score(test, pred, average="micro"))


## Distanza euclidea, k=range(2,7)

In [6]:
from cri98tj.distancers.Euclidean_distancer import euclideanBestFitting

spatioTemporalCols = ["c1", "c2"]

normalizer = FirstPoint_normalizer(spatioTemporalColumns=spatioTemporalCols, fillna=None, verbose=verbose)
selector = RandomInformationGain_selector(top_k=n_movelets,
                                                  bestFittingMeasure=euclideanBestFitting, 
                                                  movelets_per_class=300, trajectories_for_orderline=100, 
                                                  n_jobs=n_jobs, spatioTemporalColumns=spatioTemporalCols, 
                                                  normalizer=normalizer, verbose=verbose)
distancer = Euclidean_distancer(normalizer=normalizer, spatioTemporalColumns=spatioTemporalCols, n_jobs=n_jobs, verbose=verbose)

res = []
time = []
i=0
for precision in tqdm(range(3,8)):    
    res.append((.0, .0, .0, .0, .0))
    time.append(.0)
    
    for _ in range(3):
        df = df0.copy()[["tid", "class", "c1", "c2"]]
        
        start = datetime.now()
        
        partitioner = Geohash_partitioner(precision=precision, spatioTemporalColumns=spatioTemporalCols, verbose=verbose)
        part = partitioner.fit_transform(df[df.tid.isin(tid_train)].values)
        shapelets = selector.fit_transform(part)
        _, dist_np = distancer.fit_transform((df.values, shapelets))
        
        stop = start - datetime.now()

        clf = RandomForestClassifier(max_depth=3, random_state=3, n_jobs=n_jobs, n_estimators=5000)

        dist_np_df = pd.DataFrame(dist_np)
        X = dist_np_df.drop(columns=[0]).values
        y = dist_np_df[0].values

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, stratify=y, random_state=3)

        clf.fit(X_train, y_train)

        y_pred = clf.predict(X_test)
        
        res[i] = tuple(a+b for a, b in zip(compute_measures(y_test, y_pred), res[i]))
        time[i] += -stop.total_seconds()*1000 #millisecondi
        
        print(compute_measures(y_test, y_pred))
        
    res[i] = list(map(lambda x: x/3, res[i]))
    time[i] /= 3
    i += 1

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

(0.8521739130434782, 0.8521739130434782, 0.8521739130434782, 0.8521739130434782)


  0%|          | 0/10 [00:00<?, ?it/s]

(0.9217391304347826, 0.9217391304347826, 0.9217391304347826, 0.9217391304347826)


  0%|          | 0/10 [00:00<?, ?it/s]

(0.9043478260869565, 0.9043478260869565, 0.9043478260869565, 0.9043478260869565)


  0%|          | 0/10 [00:00<?, ?it/s]

(0.8521739130434782, 0.8521739130434782, 0.8521739130434782, 0.8521739130434782)


  0%|          | 0/10 [00:00<?, ?it/s]

(0.9478260869565217, 0.9478260869565217, 0.9478260869565217, 0.9478260869565217)


  0%|          | 0/10 [00:00<?, ?it/s]

(0.8521739130434782, 0.8521739130434782, 0.8521739130434782, 0.8521739130434782)


  0%|          | 0/10 [00:00<?, ?it/s]

(0.9130434782608695, 0.9130434782608695, 0.9130434782608695, 0.9130434782608695)


  0%|          | 0/10 [00:00<?, ?it/s]

(0.9565217391304348, 0.9565217391304348, 0.9565217391304348, 0.9565217391304348)


  0%|          | 0/10 [00:00<?, ?it/s]

(0.9826086956521739, 0.9826086956521739, 0.9826086956521739, 0.9826086956521739)


  0%|          | 0/10 [00:00<?, ?it/s]

(0.9565217391304348, 0.9565217391304348, 0.9565217391304348, 0.9565217391304348)


  0%|          | 0/10 [00:00<?, ?it/s]

(0.9478260869565217, 0.9478260869565217, 0.9478260869565217, 0.9478260869565217)


  0%|          | 0/10 [00:00<?, ?it/s]

(0.9478260869565217, 0.9478260869565217, 0.9478260869565217, 0.9478260869565217)


  0%|          | 0/10 [00:00<?, ?it/s]

(0.8434782608695652, 0.8434782608695652, 0.8434782608695653, 0.8434782608695652)


  0%|          | 0/10 [00:00<?, ?it/s]

(0.8521739130434782, 0.8521739130434782, 0.8521739130434782, 0.8521739130434782)


  0%|          | 0/10 [00:00<?, ?it/s]

(0.7304347826086957, 0.7304347826086957, 0.7304347826086957, 0.7304347826086957)


In [7]:
for precisione, t, val in zip(range(3,8), time, res):
    print(F"{precisione}: {val[0]} in {t/1000}s")

3: 0.8927536231884058 in 329.62791533333336s
4: 0.8840579710144927 in 277.14882800000004s
5: 0.9507246376811594 in 97.78541466666667s
6: 0.9507246376811594 in 49.36315333333333s
7: 0.8086956521739129 in 40.830655s


In [8]:
df_res_rig = pd.DataFrame(res, columns=["acc", "prec", "f1", "recall"])

df_res_rig["t"] = time

df_res_rig["precision"] = range(3,8)

df_res_rig.to_csv(f"Test euclidea vehicles.csv", index=None)

## Distanza Prof

In [9]:
from cri98tj.distancers.InterpolatedRootDistance_distancer import InterpolatedRootDistance_distancer, \
    InterpolatedRootDistanceBestFitting
spatioTemporalCols = ["c1", "c2", "t"]

normalizer = FirstPoint_normalizer(spatioTemporalColumns=spatioTemporalCols, fillna=None, verbose=verbose)
selector = RandomInformationGain_selector(top_k=int(1.4**i), 
                                                  bestFittingMeasure=InterpolatedRootDistanceBestFitting, 
                                                  movelets_per_class=300, trajectories_for_orderline=100, 
                                                  n_jobs=n_jobs, spatioTemporalColumns=spatioTemporalCols, 
                                                  normalizer=normalizer, verbose=verbose)
distancer = InterpolatedRootDistance_distancer(normalizer=normalizer, spatioTemporalColumns=spatioTemporalCols, n_jobs=n_jobs, verbose=verbose)

res = []
time = []
i=0
for precision in tqdm(range(3,8)):
    res.append((.0, .0, .0, .0, .0))
    time.append(.0)
    
    for _ in range(3):
        df = df0.copy()[["tid", "class", "c1", "c2", "t"]]
        start = datetime.now()
    
        partitioner = Geohash_partitioner(precision=precision, spatioTemporalColumns=spatioTemporalCols, verbose=verbose)
        part = partitioner.fit_transform(df[df.tid.isin(tid_train)].values)
        shapelets = selector.fit_transform(part)
        _, dist_np = distancer.fit_transform((df.values, shapelets))
        
        stop = start - datetime.now()

        clf = RandomForestClassifier(max_depth=3, random_state=3, n_jobs=n_jobs, n_estimators=5000)

        dist_np_df = pd.DataFrame(dist_np)
        X = dist_np_df.drop(columns=[0]).values
        y = dist_np_df[0].values

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, stratify=y, random_state=3)

        clf.fit(X_train, y_train)

        y_pred = clf.predict(X_test)
        
        print(compute_measures(y_test, y_pred)[0])
        
        res[i] = tuple(a+b for a, b in zip(compute_measures(y_test, y_pred), res[i]))
        time[i] += stop.total_seconds()*1000 #millisecondi
        
    res[i] = list(map(lambda x: x/3, res[i]))
    time[i] /= 3
    i += 1

  0%|          | 0/5 [00:00<?, ?it/s]

0.8
0.8
0.7913043478260869
0.9130434782608695
0.8608695652173913
0.8782608695652174
0.9304347826086956
0.9478260869565217
0.9043478260869565
0.9130434782608695
0.9043478260869565
0.9391304347826087
0.9478260869565217
0.8608695652173913
0.9478260869565217


In [10]:
for precisione, t, val in zip(range(3,8), time, res):
    print(F"{precisione}: {val[0]} in {t/1000}s")

3: 0.7971014492753623 in -2230.779322s
4: 0.8840579710144928 in -2379.063755666666s
5: 0.927536231884058 in -697.998319s
6: 0.918840579710145 in -183.81517166666666s
7: 0.918840579710145 in -66.635127s


In [11]:
df_res_rig = pd.DataFrame(res, columns=["acc", "prec", "f1", "recall"])

df_res_rig["t"] = time

df_res_rig["t"] *= -1

df_res_rig["precision"] = range(3,8)

df_res_rig.to_csv(f"Test IRD vehicles.csv", index=None)

## DTW

In [12]:
from cri98tj.distancers.DTW_distancer import DTW_distancer, DTWBestFitting

spatioTemporalCols = ["c1", "c2"]

normalizer = FirstPoint_normalizer(spatioTemporalColumns=spatioTemporalCols, fillna=None, verbose=verbose)
selector = RandomInformationGain_selector(top_k=int(1.4**i), 
                                                  bestFittingMeasure=DTWBestFitting, 
                                                  movelets_per_class=300, trajectories_for_orderline=100, 
                                                  n_jobs=n_jobs, spatioTemporalColumns=spatioTemporalCols, 
                                                  normalizer=normalizer, verbose=verbose)
distancer = DTW_distancer(normalizer=normalizer, spatioTemporalColumns=spatioTemporalCols, n_jobs=n_jobs, verbose=verbose)

res = []
time = []
i=0
for precision in tqdm(range(7,2,-1)):
    res.append((.0, .0, .0, .0, .0))
    time.append(.0)
    
    for _ in range(3):
        spatioTemporalCols = ["c1", "c2"]
        df = df0.copy()[["tid", "class", "c1", "c2"]]
        start = datetime.now()
        
        partitioner = Geohash_partitioner(precision=precision, spatioTemporalColumns=spatioTemporalCols, verbose=verbose)
        part = partitioner.fit_transform(df[df.tid.isin(tid_train)].values)
        shapelets = selector.fit_transform(part)
        dist_np = distancer.fit_transform((df.values, shapelets))
        
        stop = start - datetime.now()

        clf = RandomForestClassifier(max_depth=3, random_state=3, n_jobs=n_jobs, n_estimators=5000)

        dist_np_df = pd.DataFrame(dist_np)
        X = dist_np_df.drop(columns=[0]).values
        y = dist_np_df[0].values

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, stratify=y, random_state=3)

        clf.fit(X_train, y_train)

        y_pred = clf.predict(X_test)
        
        print(compute_measures(y_test, y_pred)[0])
        
        res[i] = tuple(a+b for a, b in zip(compute_measures(y_test, y_pred), res[i]))
        time[i] += stop.total_seconds()*1000 #millisecondi
        
    res[i] = list(map(lambda x: x/3, res[i]))
    time[i] /= 3
    i += 1

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

0.7217391304347827


  0%|          | 0/5 [00:00<?, ?it/s]

0.8695652173913043


  0%|          | 0/5 [00:00<?, ?it/s]

0.7217391304347827


  0%|          | 0/5 [00:00<?, ?it/s]

0.9217391304347826


  0%|          | 0/5 [00:00<?, ?it/s]

0.8695652173913043


  0%|          | 0/5 [00:00<?, ?it/s]

0.9217391304347826


  0%|          | 0/5 [00:00<?, ?it/s]

0.8782608695652174


  0%|          | 0/5 [00:00<?, ?it/s]

0.9304347826086956


  0%|          | 0/5 [00:00<?, ?it/s]

0.9217391304347826


KeyboardInterrupt: 

In [13]:
for precisione, t, val in zip(range(7,2,-1), time, res):
    print(F"{precisione}: {val[0]} in {t/1000}s")

7: 0.7710144927536232 in -101.124988s
6: 0.9043478260869565 in -320.0141786666667s
5: 0.910144927536232 in -3740.563357666667s
4: 0.0 in 0.0s


In [23]:
df_res_rig = pd.DataFrame(res, columns=["acc", "prec", "f1", "recall"])

df_res_rig["t"] = time

df_res_rig["t"] *= -1

df_res_rig["precision"] = range(7,3,-1)

df_res_rig.to_csv(f"Test DTW vehicles.csv", index=None)

In [21]:
res

[[0.7710144927536232,
  0.7710144927536232,
  0.7710144927536232,
  0.7710144927536232],
 [0.9043478260869565,
  0.9043478260869565,
  0.9043478260869565,
  0.9043478260869565],
 [0.910144927536232, 0.910144927536232, 0.910144927536232, 0.910144927536232],
 (0.0, 0.0, 0.0, 0.0)]

In [20]:
res[-1] = (.0,.0,.0,.0)