# Testing delle misure di distanza al variare della precision - Animals

In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import pandas as pd
from tqdm.auto import tqdm

In [8]:
from cri98tj.partitioners.Geohash_partitioner import Geohash_partitioner
from cri98tj.normalizers.FirstPoint_normalizer import FirstPoint_normalizer
from cri98tj.selectors.RandomInformationGain_selector import RandomInformationGain_selector
from cri98tj.distancers.Euclidean_distancer import Euclidean_distancer
from sklearn.model_selection import train_test_split
from cri98tj.selectors.Random_selector import Random_selector

import warnings
warnings.filterwarnings("ignore")

In [9]:
# Load the prepared Animals dataset, then order the rows by the `tid` and `t` columns.
# Original inline note, kept for reference: precision=5, 50 movelet, DTW
# NOTE(review): that note does not match anything done in this statement - confirm it is stale.
df = pd.read_csv('../examples/Animals Dataset/data/animals_preapred.zip')
df = df.sort_values(by=["tid", "t"])

In [10]:
# Show the first rows of the loaded frame as a quick visual check.
df.head()

Unnamed: 0,tid,class,t,c1,c2
0,1,D,0,50.1066,3.79665
1,1,D,4,50.1045,3.79455
2,1,D,7,50.1111,3.79845
3,1,D,9,50.1072,3.79845
4,1,D,15,50.1132,3.79965


In [11]:
# One row per `tid`, holding the per-group maximum of every column.
# Computed once and reused below instead of repeating the same groupby
# three times inside the call.
# NOTE(review): taking max() of "class" presumably assumes every tid carries
# a single class label - confirm against the dataset.
tid_labels = df.groupby(by=["tid"]).max().reset_index()

# Stratified 70/30 split of the tid values; the label halves of the split
# are discarded because only the tid lists are needed.
tid_train, tid_test, _, _ = train_test_split(tid_labels["tid"],
                                             tid_labels["class"],
                                             test_size=.3,
                                             stratify=tid_labels["class"],
                                             random_state=3)

# Settings shared by the experiment cells.
spatioTemporalCols = ["c1", "c2", "t"]
n_movelets = 50
n_jobs = 20
verbose = False

## Distanza euclidea, precision=range(1,9)

In [25]:
from cri98tj.distancers.Euclidean_distancer import euclideanBestFitting

# Pipeline pieces for the Euclidean experiment; they are built once and
# reused for every precision value.
normalizer = FirstPoint_normalizer(
    spatioTemporalColumns=spatioTemporalCols,
    fillna=None,
    verbose=verbose,
)
selector = RandomInformationGain_selector(
    top_k=n_movelets,
    bestFittingMeasure=euclideanBestFitting,
    movelets_per_class=n_movelets*3,
    trajectories_for_orderline=100,
    n_jobs=n_jobs,
    spatioTemporalColumns=spatioTemporalCols,
    normalizer=normalizer,
    verbose=verbose,
)
distancer = Euclidean_distancer(
    normalizer=normalizer,
    spatioTemporalColumns=spatioTemporalCols,
    n_jobs=n_jobs,
    verbose=verbose,
)

# Maps str(precision) -> text of the classification report for that run.
res = {}
for precision in tqdm(range(1, 9)):
    # Only rows whose tid is in tid_train are partitioned and handed to the selector.
    partitioner = Geohash_partitioner(
        precision=precision,
        spatioTemporalColumns=spatioTemporalCols,
        verbose=verbose,
    )
    part = partitioner.fit_transform(df[df.tid.isin(tid_train)].values)
    shapelets = selector.fit_transform(part)

    # The distancer receives the whole frame (train and test rows alike).
    _, dist_np = distancer.fit_transform((df.values, shapelets))

    # Column 0 is used as the target, every other column as a feature.
    # NOTE(review): presumably column 0 holds the class label - confirm
    # against the distancer's output layout.
    dist_np_df = pd.DataFrame(dist_np)
    y = dist_np_df[0].values
    X = dist_np_df.drop(columns=[0]).values

    # NOTE(review): this split reuses test_size/random_state of the tid split;
    # whether it selects the same trajectories depends on the row order of
    # dist_np - confirm.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=.3, stratify=y, random_state=3,
    )

    clf = RandomForestClassifier(
        max_depth=3,
        random_state=3,
        n_jobs=n_jobs,
        n_estimators=1000,
    )
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    res[str(precision)] = classification_report(y_test, y_pred)

  0%|          | 0/8 [00:00<?, ?it/s]

In [26]:
# Print each stored report under a PRECISION=<value> header, followed by
# the blank-line separator string; sep="\n" emits the same bytes as three
# separate print calls.
for precisione, r in res.items():
    print(F"PRECISION={precisione}", r, "\r\n\r\n\r\n\r\n", sep="\n")

PRECISION=1
              precision    recall  f1-score   support

           C       0.91      1.00      0.95        10
           D       0.60      0.67      0.63         9
           E       0.80      0.67      0.73        12

    accuracy                           0.77        31
   macro avg       0.77      0.78      0.77        31
weighted avg       0.78      0.77      0.77        31






PRECISION=2
              precision    recall  f1-score   support

           C       0.91      1.00      0.95        10
           D       0.60      0.67      0.63         9
           E       0.80      0.67      0.73        12

    accuracy                           0.77        31
   macro avg       0.77      0.78      0.77        31
weighted avg       0.78      0.77      0.77        31






PRECISION=3
              precision    recall  f1-score   support

           C       0.90      0.90      0.90        10
           D       0.67      0.67      0.67         9
           E       0.

## Interpolated Root Distance

In [27]:
from cri98tj.distancers.InterpolatedRootDistance_distancer import InterpolatedRootDistance_distancer, \
    InterpolatedRootDistanceBestFitting

# Pipeline pieces for the Interpolated Root Distance experiment; they are
# built once and reused for every precision value.
normalizer = FirstPoint_normalizer(
    spatioTemporalColumns=spatioTemporalCols,
    fillna=None,
    verbose=verbose,
)
selector = RandomInformationGain_selector(
    top_k=n_movelets,
    bestFittingMeasure=InterpolatedRootDistanceBestFitting,
    movelets_per_class=n_movelets*3,
    trajectories_for_orderline=100,
    n_jobs=n_jobs,
    spatioTemporalColumns=spatioTemporalCols,
    normalizer=normalizer,
    verbose=verbose,
)
distancer = InterpolatedRootDistance_distancer(
    normalizer=normalizer,
    spatioTemporalColumns=spatioTemporalCols,
    n_jobs=n_jobs,
    verbose=verbose,
)

# Maps str(precision) -> text of the classification report for that run.
res = {}
for precision in tqdm(range(1, 9)):
    # Only rows whose tid is in tid_train are partitioned and handed to the selector.
    partitioner = Geohash_partitioner(
        precision=precision,
        spatioTemporalColumns=spatioTemporalCols,
        verbose=verbose,
    )
    part = partitioner.fit_transform(df[df.tid.isin(tid_train)].values)
    shapelets = selector.fit_transform(part)

    # The distancer receives the whole frame (train and test rows alike).
    _, dist_np = distancer.fit_transform((df.values, shapelets))

    # Column 0 is used as the target, every other column as a feature.
    # NOTE(review): presumably column 0 holds the class label - confirm
    # against the distancer's output layout.
    dist_np_df = pd.DataFrame(dist_np)
    y = dist_np_df[0].values
    X = dist_np_df.drop(columns=[0]).values

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=.3, stratify=y, random_state=3,
    )

    clf = RandomForestClassifier(
        max_depth=3,
        random_state=3,
        n_jobs=n_jobs,
        n_estimators=1000,
    )
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    res[str(precision)] = classification_report(y_test, y_pred)

  0%|          | 0/8 [00:00<?, ?it/s]

In [28]:
# Print each stored report under a PRECISION=<value> header, followed by
# the blank-line separator string; sep="\n" emits the same bytes as three
# separate print calls.
for precisione, r in res.items():
    print(F"PRECISION={precisione}", r, "\r\n\r\n\r\n\r\n", sep="\n")

PRECISION=1
              precision    recall  f1-score   support

           C       0.91      1.00      0.95        10
           D       0.56      0.56      0.56         9
           E       0.73      0.67      0.70        12

    accuracy                           0.74        31
   macro avg       0.73      0.74      0.73        31
weighted avg       0.74      0.74      0.74        31






PRECISION=2
              precision    recall  f1-score   support

           C       0.91      1.00      0.95        10
           D       0.67      0.67      0.67         9
           E       0.82      0.75      0.78        12

    accuracy                           0.81        31
   macro avg       0.80      0.81      0.80        31
weighted avg       0.80      0.81      0.80        31






PRECISION=3
              precision    recall  f1-score   support

           C       0.83      1.00      0.91        10
           D       0.50      0.56      0.53         9
           E       0.

## DTW

In [12]:
from cri98tj.distancers.DTW_distancer import DTW_distancer
from cri98tj.distancers.InterpolatedRootDistance_distancer import InterpolatedRootDistance_distancer, \
    InterpolatedRootDistanceBestFitting

# DTW experiment: same precision sweep as the other experiment cells, but
# with Random_selector and DTW_distancer.
# NOTE(review): the recorded run of this cell ended with
# "AttributeError: 'Random_selector' object has no attribute 'n_movelets'";
# the traceback is not shown, so the failing call must be confirmed.
normalizer = FirstPoint_normalizer(spatioTemporalColumns=spatioTemporalCols, fillna=None, verbose=verbose)
# Random_selector is asked for int(n_movelets/3) movelets per class.
selector = Random_selector(movelets_per_class=int(n_movelets/3), n_jobs=n_jobs, spatioTemporalColumns=spatioTemporalCols, normalizer=normalizer, verbose=verbose)
distancer = DTW_distancer(normalizer=normalizer, spatioTemporalColumns=spatioTemporalCols, n_jobs=n_jobs, verbose=verbose)

# Maps str(precision) -> text of the classification report for that run.
res = {}
for precision in tqdm(range(1,9)):
    partitioner = Geohash_partitioner(precision=precision, spatioTemporalColumns=spatioTemporalCols, verbose=verbose)
    # Only rows whose tid is in tid_train are partitioned and handed to the selector.
    part = partitioner.fit_transform(df[df.tid.isin(tid_train)].values)
    shapelets = selector.fit_transform(part)
    # NOTE(review): the Euclidean and Interpolated Root Distance cells unpack
    # this call as `_, dist_np = ...`; here the whole return value is kept.
    # Confirm what DTW_distancer.fit_transform returns.
    dist_np = distancer.fit_transform((df.values, shapelets))

    clf = RandomForestClassifier(max_depth=3, random_state=3, n_jobs=n_jobs, n_estimators=1000)

    # Column 0 is used as the target, every other column as a feature.
    dist_np_df = pd.DataFrame(dist_np)
    X = dist_np_df.drop(columns=[0]).values
    y = dist_np_df[0].values

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, stratify=y, random_state=3)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    res[str(precision)] = classification_report(y_test, y_pred)

  0%|          | 0/8 [00:00<?, ?it/s]

AttributeError: 'Random_selector' object has no attribute 'n_movelets'

In [31]:
# Print each stored report under a PRECISION=<value> header, followed by
# the blank-line separator string; sep="\n" emits the same bytes as three
# separate print calls.
for precisione, r in res.items():
    print(F"PRECISION={precisione}", r, "\r\n\r\n\r\n\r\n", sep="\n")

PRECISION=1
              precision    recall  f1-score   support

           C       0.91      1.00      0.95        10
           D       0.50      0.56      0.53         9
           E       0.70      0.58      0.64        12

    accuracy                           0.71        31
   macro avg       0.70      0.71      0.71        31
weighted avg       0.71      0.71      0.71        31






PRECISION=2
              precision    recall  f1-score   support

           C       0.91      1.00      0.95        10
           D       0.45      0.56      0.50         9
           E       0.67      0.50      0.57        12

    accuracy                           0.68        31
   macro avg       0.68      0.69      0.67        31
weighted avg       0.68      0.68      0.67        31






PRECISION=3
              precision    recall  f1-score   support

           C       0.91      1.00      0.95        10
           D       0.45      0.56      0.50         9
           E       0.