# Testing delle misure di distanza al variare della precision

In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import pandas as pd
from tqdm.auto import tqdm
from datetime import datetime

In [2]:
from cri98tj.partitioners.Geohash_partitioner import Geohash_partitioner
from cri98tj.normalizers.FirstPoint_normalizer import FirstPoint_normalizer
from cri98tj.selectors.RandomInformationGain_selector import RandomInformationGain_selector
from cri98tj.distancers.Euclidean_distancer import Euclidean_distancer
from sklearn.model_selection import train_test_split
from cri98tj.selectors.Random_selector import Random_selector

In [3]:
# Load the prepared Animals dataset, then order the rows by the "tid" and "t" columns.
# NOTE(review): an earlier inline note here read "precision=5, 50 movelet, DTW";
# presumably a leftover reference configuration -- confirm before relying on it.
df0 = pd.read_csv('../examples/Animals Dataset/data/animals_preapred.zip')
df0 = df0.sort_values(by=["tid", "t"])

In [4]:
# Display the first rows of the loaded frame as a quick sanity check.
df0.head()

Unnamed: 0,tid,class,t,c1,c2
0,1,D,0,50.1066,3.79665
1,1,D,4,50.1045,3.79455
2,1,D,7,50.1111,3.79845
3,1,D,9,50.1072,3.79845
4,1,D,15,50.1132,3.79965


In [5]:
# One row per trajectory id ("tid"), built once and reused for the ids, the
# labels and the stratification key (the same groupby was previously recomputed
# three times).
tid_labels = df0.groupby(by=["tid"]).max().reset_index()

# Stratified 70/30 split of the trajectory ids by class. Only the id partitions
# are kept; the label partitions returned by train_test_split are discarded.
tid_train, tid_test, _, _ = train_test_split(tid_labels["tid"],
                                             tid_labels["class"],
                                             test_size=.3,
                                             stratify=tid_labels["class"],
                                             random_state=3)

# Shared configuration for the experiment cells.
df = df0[["tid", "class", "c1", "c2", "t"]]
n_movelets = 50
n_jobs = 12
verbose = False

# Silence library warnings for the rest of the session.
import warnings
warnings.filterwarnings("ignore")

In [6]:
import warnings
warnings.filterwarnings("ignore")

from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score

def compute_measures(test, pred, average="micro"):
    """Return ``(accuracy, precision, f1, recall)`` for a set of predictions.

    Args:
        test: Ground-truth labels.
        pred: Predicted labels, aligned with ``test``.
        average: Averaging mode forwarded to ``precision_score``,
            ``f1_score`` and ``recall_score``. Defaults to ``"micro"``, the
            value that used to be hard-coded, so existing callers are
            unaffected.

    Returns:
        A 4-tuple in the order accuracy, precision, F1, recall.

    Note:
        With ``average="micro"`` the four values come out identical in the
        runs recorded in this notebook (micro-averaged precision/recall/F1
        coincide with accuracy for single-label classification). Pass e.g.
        ``"macro"`` or ``"weighted"`` to obtain metrics that differ from
        accuracy.
    """
    return (
        accuracy_score(test, pred),
        precision_score(test, pred, average=average),
        f1_score(test, pred, average=average),
        recall_score(test, pred, average=average),
    )


## Distanza euclidea, precision=range(3,8)

In [7]:
from cri98tj.distancers.Euclidean_distancer import euclideanBestFitting

# Euclidean-distance experiment.
# For every geohash precision in range(3, 8) the pipeline
# (partition -> movelet selection -> distance transform -> random forest)
# is run n_repeats times; the scores and the elapsed time are then averaged.
spatioTemporalCols = ["c1", "c2"]

normalizer = FirstPoint_normalizer(spatioTemporalColumns=spatioTemporalCols, fillna=None, verbose=verbose)
selector = RandomInformationGain_selector(top_k=n_movelets, bestFittingMeasure=euclideanBestFitting, movelets_per_class=None, trajectories_for_orderline=None, n_jobs=n_jobs, spatioTemporalColumns=spatioTemporalCols, normalizer=normalizer, verbose=verbose)
distancer = Euclidean_distancer(normalizer=normalizer, spatioTemporalColumns=spatioTemporalCols, n_jobs=n_jobs, verbose=verbose)

n_repeats = 3

# NOTE: the name "time" is read by the following cells, so it is kept even
# though it would shadow the stdlib module of the same name if that were imported.
res = []   # one entry per precision: averaged [accuracy, precision, f1, recall]
time = []  # one entry per precision: averaged elapsed time in milliseconds
for precision in tqdm(range(3, 8)):
    # One running total per value returned by compute_measures (four of them).
    score_totals = (.0, .0, .0, .0)
    elapsed_ms_total = .0

    for _ in range(n_repeats):
        df = df0.copy()[["tid", "class", "c1", "c2"]]

        # Timed section: partitioning, movelet selection and distance transform.
        start = datetime.now()

        partitioner = Geohash_partitioner(precision=precision, spatioTemporalColumns=spatioTemporalCols, verbose=verbose)
        # Movelets are selected from the training trajectories only ...
        part = partitioner.fit_transform(df[df.tid.isin(tid_train)].values)
        shapelets = selector.fit_transform(part)
        # ... while distances are computed for every trajectory in df.
        _, dist_np = distancer.fit_transform((df.values, shapelets))

        # end - start, so the duration is positive without a later sign flip.
        elapsed = datetime.now() - start

        clf = RandomForestClassifier(max_depth=3, random_state=3, n_jobs=n_jobs, n_estimators=5000)

        # Column 0 of the distancer output is used as the label, the rest as features.
        dist_np_df = pd.DataFrame(dist_np)
        X = dist_np_df.drop(columns=[0]).values
        y = dist_np_df[0].values

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, stratify=y, random_state=3)

        clf.fit(X_train, y_train)

        y_pred = clf.predict(X_test)

        # Score once and reuse the result for both logging and accumulation.
        scores = compute_measures(y_test, y_pred)
        print(scores)

        score_totals = tuple(new + total for new, total in zip(scores, score_totals))
        elapsed_ms_total += elapsed.total_seconds() * 1000  # milliseconds

    res.append([total / n_repeats for total in score_totals])
    time.append(elapsed_ms_total / n_repeats)

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

(0.8709677419354839, 0.8709677419354839, 0.8709677419354839, 0.8709677419354839)


  0%|          | 0/50 [00:00<?, ?it/s]

(0.8709677419354839, 0.8709677419354839, 0.8709677419354839, 0.8709677419354839)


  0%|          | 0/50 [00:00<?, ?it/s]

(0.8387096774193549, 0.8387096774193549, 0.8387096774193549, 0.8387096774193549)


  0%|          | 0/50 [00:00<?, ?it/s]

(0.8709677419354839, 0.8709677419354839, 0.8709677419354839, 0.8709677419354839)


  0%|          | 0/50 [00:00<?, ?it/s]

(0.8709677419354839, 0.8709677419354839, 0.8709677419354839, 0.8709677419354839)


  0%|          | 0/50 [00:00<?, ?it/s]

(0.9032258064516129, 0.9032258064516129, 0.9032258064516129, 0.9032258064516129)


  0%|          | 0/50 [00:00<?, ?it/s]

(0.8709677419354839, 0.8709677419354839, 0.8709677419354839, 0.8709677419354839)


  0%|          | 0/50 [00:00<?, ?it/s]

(0.8387096774193549, 0.8387096774193549, 0.8387096774193549, 0.8387096774193549)


  0%|          | 0/50 [00:00<?, ?it/s]

(0.8709677419354839, 0.8709677419354839, 0.8709677419354839, 0.8709677419354839)


  0%|          | 0/50 [00:00<?, ?it/s]

(0.6129032258064516, 0.6129032258064516, 0.6129032258064516, 0.6129032258064516)


  0%|          | 0/50 [00:00<?, ?it/s]

(0.7096774193548387, 0.7096774193548387, 0.7096774193548389, 0.7096774193548387)


  0%|          | 0/50 [00:00<?, ?it/s]

(0.5161290322580645, 0.5161290322580645, 0.5161290322580645, 0.5161290322580645)


  0%|          | 0/50 [00:00<?, ?it/s]

(0.3870967741935484, 0.3870967741935484, 0.3870967741935484, 0.3870967741935484)


  0%|          | 0/50 [00:00<?, ?it/s]

(0.3870967741935484, 0.3870967741935484, 0.3870967741935484, 0.3870967741935484)


  0%|          | 0/50 [00:00<?, ?it/s]

(0.3870967741935484, 0.3870967741935484, 0.3870967741935484, 0.3870967741935484)


In [8]:
# Print, for each precision, the averaged accuracy (first score) and the
# averaged time converted from milliseconds to seconds.
for precision_level, avg_ms, avg_scores in zip(range(3, 8), time, res):
    accuracy = avg_scores[0]
    seconds = avg_ms / 1000
    print(f"{precision_level}: {accuracy} in {seconds}s")

3: 0.8602150537634409 in 25.242951333333334s
4: 0.8817204301075269 in 18.02610333333333s
5: 0.8602150537634409 in 14.060889333333332s
6: 0.6129032258064516 in 10.462801333333335s
7: 0.3870967741935483 in 9.996637999999999s


In [9]:
# Tabulate the averaged Euclidean results (one row per precision) and save them.
df_res_rig = pd.DataFrame(res, columns=["acc", "prec", "f1", "recall"])

# Averaged elapsed time, in milliseconds as accumulated by the experiment loop.
df_res_rig["t"] = time

df_res_rig["precision"] = range(3, 8)

# Plain string (there is nothing to interpolate) and an explicit boolean for
# "index", which is the documented way to leave the row index out of the file.
df_res_rig.to_csv("Test euclidea animals.csv", index=False)

## Distanza Prof (Interpolated Root Distance, IRD)

In [10]:
from cri98tj.distancers.InterpolatedRootDistance_distancer import InterpolatedRootDistance_distancer, \
    InterpolatedRootDistanceBestFitting
# Interpolated Root Distance experiment.
# Same protocol as the other distance experiments in this notebook, but the
# "t" column is part of the spatio-temporal columns.
spatioTemporalCols = ["c1", "c2", "t"]

normalizer = FirstPoint_normalizer(spatioTemporalColumns=spatioTemporalCols, fillna=None, verbose=verbose)
selector = RandomInformationGain_selector(top_k=n_movelets, bestFittingMeasure=InterpolatedRootDistanceBestFitting, movelets_per_class=None, trajectories_for_orderline=None, n_jobs=n_jobs, spatioTemporalColumns=spatioTemporalCols, normalizer=normalizer, verbose=verbose)
distancer = InterpolatedRootDistance_distancer(normalizer=normalizer, spatioTemporalColumns=spatioTemporalCols, n_jobs=n_jobs, verbose=verbose)

n_repeats = 3

res = []   # one entry per precision: averaged [accuracy, precision, f1, recall]
time = []  # one entry per precision: averaged time in milliseconds (see sign note below)
for precision in tqdm(range(3, 8)):
    # One running total per value returned by compute_measures (four of them).
    score_totals = (.0, .0, .0, .0)
    elapsed_ms_total = .0

    for _ in range(n_repeats):
        df = df0.copy()[["tid", "class", "c1", "c2", "t"]]

        # Timed section: partitioning, movelet selection and distance transform.
        start = datetime.now()

        partitioner = Geohash_partitioner(precision=precision, spatioTemporalColumns=spatioTemporalCols, verbose=verbose)
        part = partitioner.fit_transform(df[df.tid.isin(tid_train)].values)
        shapelets = selector.fit_transform(part)
        _, dist_np = distancer.fit_transform((df.values, shapelets))

        # NOTE: start - end yields a NEGATIVE duration. The sign is kept as is
        # because the cell that saves these results turns the values positive
        # again; changing it here alone would invert the saved timings.
        stop = start - datetime.now()

        clf = RandomForestClassifier(max_depth=3, random_state=3, n_jobs=n_jobs, n_estimators=5000)

        # Column 0 of the distancer output is used as the label, the rest as features.
        dist_np_df = pd.DataFrame(dist_np)
        X = dist_np_df.drop(columns=[0]).values
        y = dist_np_df[0].values

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, stratify=y, random_state=3)

        clf.fit(X_train, y_train)

        y_pred = clf.predict(X_test)

        # Score once and reuse the result for both logging and accumulation.
        scores = compute_measures(y_test, y_pred)
        print(scores[0])

        score_totals = tuple(new + total for new, total in zip(scores, score_totals))
        elapsed_ms_total += stop.total_seconds() * 1000  # milliseconds (negative)

    res.append([total / n_repeats for total in score_totals])
    time.append(elapsed_ms_total / n_repeats)

  0%|          | 0/5 [00:00<?, ?it/s]

0.9032258064516129
0.9354838709677419
0.9032258064516129
0.8064516129032258
0.9032258064516129
0.7741935483870968
0.8064516129032258
0.7741935483870968
0.7741935483870968
0.5806451612903226
0.6451612903225806
0.5483870967741935
0.3870967741935484
0.25806451612903225
0.3870967741935484


In [11]:
# Print, for each precision, the averaged accuracy (first score) and the
# averaged time converted from milliseconds to seconds.
# NOTE(review): the time is printed exactly as stored in "time"; the recorded
# output shows negative figures, so the stored sign is presumably inverted -- confirm.
for precision_level, avg_ms, avg_scores in zip(range(3, 8), time, res):
    accuracy = avg_scores[0]
    seconds = avg_ms / 1000
    print(f"{precision_level}: {accuracy} in {seconds}s")

3: 0.9139784946236559 in -102.07380033333334s
4: 0.8279569892473116 in -87.923401s
5: 0.7849462365591396 in -66.85863866666666s
6: 0.5913978494623656 in -20.539392333333332s
7: 0.3440860215053763 in -11.350185333333332s


In [12]:
# Tabulate the averaged IRD results (one row per precision) and save them.
df_res_rig = pd.DataFrame(res, columns=["acc", "prec", "f1", "recall"])

df_res_rig["t"] = time

# The timings may arrive with a negative sign (elapsed time accumulated as
# start - end). abs() stores positive durations whatever the incoming sign,
# whereas an unconditional "*= -1" would corrupt values that are already positive.
df_res_rig["t"] = df_res_rig["t"].abs()

df_res_rig["precision"] = range(3, 8)

# Plain string (there is nothing to interpolate) and an explicit boolean for
# "index", which is the documented way to leave the row index out of the file.
df_res_rig.to_csv("Test IRD animals.csv", index=False)

## DTW

In [21]:
from cri98tj.distancers.DTW_distancer import DTW_distancer, DTWBestFitting

# DTW experiment.
# Same protocol as the other distance experiments in this notebook, with the
# precisions visited from 7 down to 3.
spatioTemporalCols = ["c1", "c2"]

normalizer = FirstPoint_normalizer(spatioTemporalColumns=spatioTemporalCols, fillna=None, verbose=verbose)
selector = RandomInformationGain_selector(top_k=n_movelets, bestFittingMeasure=DTWBestFitting, movelets_per_class=None, trajectories_for_orderline=None, n_jobs=n_jobs, spatioTemporalColumns=spatioTemporalCols, normalizer=normalizer, verbose=verbose)
distancer = DTW_distancer(normalizer=normalizer, spatioTemporalColumns=spatioTemporalCols, n_jobs=n_jobs, verbose=verbose)

n_repeats = 3

res = []   # one entry per precision: averaged [accuracy, precision, f1, recall]
time = []  # one entry per precision: averaged time in milliseconds (see sign note below)
for precision in tqdm(range(7, 2, -1)):
    # One running total per value returned by compute_measures (four of them).
    score_totals = (.0, .0, .0, .0)
    elapsed_ms_total = .0

    for _ in range(n_repeats):
        df = df0.copy()[["tid", "class", "c1", "c2"]]

        # Timed section: partitioning, movelet selection and distance transform.
        start = datetime.now()

        partitioner = Geohash_partitioner(precision=precision, spatioTemporalColumns=spatioTemporalCols, verbose=verbose)
        part = partitioner.fit_transform(df[df.tid.isin(tid_train)].values)
        shapelets = selector.fit_transform(part)
        # NOTE(review): the result is used directly here rather than unpacked
        # as a pair -- presumably DTW_distancer returns only the distance
        # matrix; confirm against its implementation.
        dist_np = distancer.fit_transform((df.values, shapelets))

        # NOTE: start - end yields a NEGATIVE duration. The sign is kept as is
        # because the cell that saves these results turns the values positive
        # again; changing it here alone would invert the saved timings.
        stop = start - datetime.now()

        clf = RandomForestClassifier(max_depth=3, random_state=3, n_jobs=n_jobs, n_estimators=5000)

        # Column 0 of the distancer output is used as the label, the rest as features.
        dist_np_df = pd.DataFrame(dist_np)
        X = dist_np_df.drop(columns=[0]).values
        y = dist_np_df[0].values

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, stratify=y, random_state=3)

        clf.fit(X_train, y_train)

        y_pred = clf.predict(X_test)

        # Score once and reuse the result for both logging and accumulation.
        scores = compute_measures(y_test, y_pred)
        print(scores[0])

        score_totals = tuple(new + total for new, total in zip(scores, score_totals))
        elapsed_ms_total += stop.total_seconds() * 1000  # milliseconds (negative)

    res.append([total / n_repeats for total in score_totals])
    time.append(elapsed_ms_total / n_repeats)

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

0.2903225806451613


  0%|          | 0/50 [00:00<?, ?it/s]

0.2903225806451613


  0%|          | 0/50 [00:00<?, ?it/s]

0.2903225806451613


  0%|          | 0/50 [00:00<?, ?it/s]

0.6774193548387096


  0%|          | 0/50 [00:00<?, ?it/s]

0.7096774193548387


  0%|          | 0/50 [00:00<?, ?it/s]

0.7096774193548387


  0%|          | 0/50 [00:00<?, ?it/s]

0.8709677419354839


  0%|          | 0/50 [00:00<?, ?it/s]

0.9032258064516129


  0%|          | 0/50 [00:00<?, ?it/s]

0.8709677419354839


  0%|          | 0/50 [00:00<?, ?it/s]

0.967741935483871


  0%|          | 0/50 [00:00<?, ?it/s]

0.9354838709677419


  0%|          | 0/50 [00:00<?, ?it/s]

0.9354838709677419


  0%|          | 0/50 [00:00<?, ?it/s]

0.9354838709677419


  0%|          | 0/50 [00:00<?, ?it/s]

0.9354838709677419


  0%|          | 0/50 [00:00<?, ?it/s]

0.9354838709677419


In [27]:
# Print, for each precision (7 down to 3), the averaged accuracy (first score)
# and the averaged time converted from milliseconds to seconds.
# NOTE(review): the time is printed exactly as stored in "time"; the recorded
# output shows negative figures, so the stored sign is presumably inverted -- confirm.
for precision_level, avg_ms, avg_scores in zip(range(7, 2, -1), time, res):
    accuracy = avg_scores[0]
    seconds = avg_ms / 1000
    print(f"{precision_level}: {accuracy} in {seconds}s")

7: 0.2903225806451613 in -320.35854600000005s
6: 0.6989247311827956 in -440.5930543333334s
5: 0.8817204301075269 in -1914.1983566666668s
4: 0.946236559139785 in -2318.1647826666663s
3: 0.9354838709677419 in -3038.728571666667s


In [26]:
# Tabulate the averaged DTW results (one row per precision, 7 down to 3) and save them.
df_res_rig = pd.DataFrame(res, columns=["acc", "prec", "f1", "recall"])

df_res_rig["t"] = time

# The timings may arrive with a negative sign (elapsed time accumulated as
# start - end). abs() stores positive durations whatever the incoming sign,
# whereas an unconditional "*= -1" would corrupt values that are already positive.
df_res_rig["t"] = df_res_rig["t"].abs()

df_res_rig["precision"] = range(7, 2, -1)

# Plain string (there is nothing to interpolate) and an explicit boolean for
# "index", which is the documented way to leave the row index out of the file.
df_res_rig.to_csv("Test DTW animals.csv", index=False)

[[0.2903225806451613,
  0.2903225806451613,
  0.2903225806451613,
  0.2903225806451613],
 [0.6989247311827956,
  0.6989247311827956,
  0.6989247311827959,
  0.6989247311827956],
 [0.8817204301075269,
  0.8817204301075269,
  0.8817204301075269,
  0.8817204301075269],
 [0.946236559139785, 0.946236559139785, 0.946236559139785, 0.946236559139785],
 [0.9354838709677419,
  0.9354838709677419,
  0.9354838709677419,
  0.9354838709677419]]