In [1]:
import geolib.geohash
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import pandas as pd
from tqdm.auto import tqdm
from cri98tj.partitioners.Geohash_partitioner import Geohash_partitioner
from cri98tj.normalizers.FirstPoint_normalizer import FirstPoint_normalizer
from cri98tj.selectors.RandomInformationGain_selector import RandomInformationGain_selector
from cri98tj.distancers.Euclidean_distancer import Euclidean_distancer
from sklearn.model_selection import train_test_split
from cri98tj.distancers.Euclidean_distancer import euclideanBestFitting

In [2]:
# Load the prepared Animals dataset, order the rows by trajectory id ("tid")
# and by the "t" column, and keep only the columns used by the experiments.
df_original = (
    pd.read_csv('../examples/Animals Dataset/data/animals_preapred.zip')
    .sort_values(by=["tid", "t"])
    .loc[:, ["tid", "class", "c1", "c2", "t"]]
)
df_original.head()

Unnamed: 0,tid,class,c1,c2,t
0,1,D,50.1066,3.79665,0
1,1,D,50.1045,3.79455,4
2,1,D,50.1111,3.79845,7
3,1,D,50.1072,3.79845,9
4,1,D,50.1132,3.79965,15


In [3]:
# One row per trajectory id with its class label. The per-tid aggregation is
# computed once here instead of being repeated for every argument of the split.
tid_class = df_original.groupby(by=["tid"]).max().reset_index()[["tid", "class"]]

# Stratified 70/30 split of the trajectory ids; the label outputs are not needed.
tid_train, tid_test, _, _ = train_test_split(tid_class["tid"],
                                             tid_class["class"],
                                             test_size=.3,
                                             stratify=tid_class["class"],
                                             random_state=3)

# Settings shared by every experiment cell below.
spatioTemporalCols = ["c1", "c2", "t"]  # columns handed to the cri98tj components
n_jobs = 24                             # passed as n_jobs to the cri98tj components and the random forest
verbose = False

In [4]:
import warnings
# Silences every warning for the rest of the session, not only the ones
# raised by the metric functions below.
warnings.filterwarnings("ignore")

from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score

def compute_measures(test, pred, average="micro"):
    """Return (accuracy, precision, f1, recall) for the given predictions.

    Parameters
    ----------
    test : array-like
        Ground-truth labels.
    pred : array-like
        Predicted labels, aligned with ``test``.
    average : str, default "micro"
        Averaging mode forwarded to ``precision_score``, ``f1_score`` and
        ``recall_score``. The default keeps the original behaviour. With
        "micro" averaging on a single-label multiclass problem, precision,
        F1 and recall are all equal to accuracy; pass e.g. "macro" or
        "weighted" to obtain distinct values.

    Returns
    -------
    tuple of float
        ``(accuracy, precision, f1, recall)``, in that order.
    """
    return (
        accuracy_score(test, pred),
        precision_score(test, pred, average=average),
        f1_score(test, pred, average=average),
        recall_score(test, pred, average=average),
    )


In [27]:
%%time

from datetime import datetime
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score
from cri98tj.distancers.InterpolatedRootDistance_distancer import InterpolatedRootDistance_distancer, \
    InterpolatedRootDistanceBestFitting

precision = 2

res = []
n_mov_rig = []
time = []
for i in tqdm(range(2, 20)):
    df = df_original[["tid", "class", "c1", "c2", "t"]].copy()
    
    res.append((.0, .0, .0, .0, .0))
    n_mov_rig.append(0)
    time.append(.0)

    for _ in range(3):
        normalizer = FirstPoint_normalizer(spatioTemporalColumns=spatioTemporalCols, fillna=None, verbose=verbose)
        distancer = InterpolatedRootDistance_distancer(normalizer=normalizer, spatioTemporalColumns=spatioTemporalCols, n_jobs=n_jobs, verbose=verbose)
        partitioner = Geohash_partitioner(precision=precision, spatioTemporalColumns=spatioTemporalCols, verbose=verbose)

        start = datetime.now()
        part = partitioner.fit_transform(df[df.tid.isin(tid_train)].values)
        selector = RandomInformationGain_selector(top_k=int(1.4**i), bestFittingMeasure=InterpolatedRootDistanceBestFitting, movelets_per_class=None, trajectories_for_orderline=None, n_jobs=n_jobs, spatioTemporalColumns=spatioTemporalCols, normalizer=normalizer, verbose=verbose)
        shapelets = selector.fit_transform(part)
        _, dist_np = distancer.fit_transform((df.values, shapelets))
        stop = start - datetime.now()

        n_mov_rig[i-2] += (dist_np.shape[1])

        clf = RandomForestClassifier(max_depth=3, random_state=3, n_jobs=n_jobs, n_estimators=5000)

        dist_np_df = pd.DataFrame(dist_np)
        X = dist_np_df.drop(columns=[0]).values
        y = dist_np_df[0].values

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, stratify=y, random_state=3)

        clf.fit(X_train, y_train)

        y_pred = clf.predict(X_test)

        res[i-2] = tuple(a+b for a, b in zip(compute_measures(y_test, y_pred), res[i-2]))
        time[i-2] += stop.total_seconds()*1000 #millisecondi

    res[i-2] = list(map(lambda x: x/3, res[i-2]))
    n_mov_rig[i-2] /= 3
    time[i-2] /= 3

  0%|          | 0/18 [00:00<?, ?it/s]

CPU times: user 7min 41s, sys: 1min 23s, total: 9min 5s
Wall time: 29min 41s


In [28]:
# Gather the averaged RIG results into one table and save it.
# The loop stores elapsed time as (start - end), i.e. negative milliseconds,
# so the sign is flipped here to obtain positive durations in column "t".
df_res_rig = (
    pd.DataFrame(res, columns=["acc", "prec", "f1", "recall"])
    .assign(t=[-1 * elapsed_ms for elapsed_ms in time], n=n_mov_rig)
)

df_res_rig.to_csv(f"Test n_movelet animals RIG {precision}.csv", index=None)

In [5]:

# Reload the saved RIG results for Geohash precision 4; `precision` is also
# used by the experiment cells that follow.
# NOTE(review): the RIG experiment cell sets precision = 2, so this CSV must
# come from a separate run of that cell with precision = 4 - confirm.
precision = 4

rig_results_path = f"Test n_movelet animals RIG {precision}.csv"
df_res_rig = pd.read_csv(rig_results_path)

df_res_rig

Unnamed: 0,acc,prec,f1,recall,t,n
0,0.316129,0.316129,0.316129,0.316129,22300.6424,2.0
1,0.490323,0.490323,0.490323,0.490323,24325.7862,3.0
2,0.606452,0.606452,0.606452,0.606452,24646.384,4.0
3,0.741935,0.741935,0.741935,0.741935,27821.274,6.0
4,0.806452,0.806452,0.806452,0.806452,27863.3256,8.0
5,0.748387,0.748387,0.748387,0.748387,27946.102,11.0
6,0.812903,0.812903,0.812903,0.812903,28597.8178,15.0
7,0.877419,0.877419,0.877419,0.877419,29347.0732,21.0
8,0.883871,0.883871,0.883871,0.883871,30171.6186,29.0
9,0.851613,0.851613,0.851613,0.851613,32568.5104,41.0


In [6]:
## Test with random selection


In [7]:
%%time
from cri98tj.selectors.Random_selector import Random_selector
from sklearn.metrics import accuracy_score
from cri98tj.distancers.InterpolatedRootDistance_distancer import InterpolatedRootDistance_distancer, \
    InterpolatedRootDistanceBestFitting
from datetime import datetime

res = []
n_mov_r = []
time = []
i=1
for n in tqdm(df_res_rig.n.unique()):
    df = df_original[["tid", "class", "c1", "c2", "t"]].copy()
    
    res.append((.0, .0, .0, .0, .0))
    n_mov_r.append(0)
    time.append(.0)
    
    c = 0
    
    while c < 3: 
        try:
            normalizer = FirstPoint_normalizer(spatioTemporalColumns=spatioTemporalCols, fillna=None, verbose=verbose)
            distancer = InterpolatedRootDistance_distancer(normalizer=normalizer, spatioTemporalColumns=spatioTemporalCols, n_jobs=n_jobs, verbose=verbose)
            partitioner = Geohash_partitioner(precision=precision, spatioTemporalColumns=spatioTemporalCols, verbose=verbose)

            start = datetime.now()
            part = partitioner.fit_transform(df[df.tid.isin(tid_train)].values)
            selector = Random_selector(movelets_per_class=max(1, n//6), n_jobs=n_jobs, spatioTemporalColumns=spatioTemporalCols, normalizer=normalizer, verbose=verbose)
            shapelets = selector.fit_transform(part)

            _, dist_np = distancer.fit_transform((df.values, shapelets))
            stop = start - datetime.now()

            n_mov_r[i-1] += (dist_np.shape[1])

            clf = RandomForestClassifier(max_depth=3, random_state=3, n_jobs=n_jobs, n_estimators=5000)

            dist_np_df = pd.DataFrame(dist_np)
            X = dist_np_df.drop(columns=[0]).values
            y = dist_np_df[0].values

            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, stratify=y, random_state=3)

            clf.fit(X_train, y_train)

            y_pred = clf.predict(X_test)

            res[i-1] = tuple(a+b for a, b in zip(compute_measures(y_test, y_pred), res[i-1]))
            time[i-1] += stop.total_seconds()*1000 #millisecondi

            c += 1
        except:
            print("failed")

    res[i-1] = list(map(lambda x: x/3, res[i-1]))
    n_mov_r[i-1] /= 3
    time[i-1] /= 3
    
    i +=1

  0%|          | 0/16 [00:00<?, ?it/s]

CPU times: user 6min 14s, sys: 58.4 s, total: 7min 13s
Wall time: 14min 49s


In [8]:
# Gather the averaged random-selector results into one table and save it.
# The loop stores elapsed time as (start - end), i.e. negative milliseconds,
# so the sign is flipped here to obtain positive durations in column "t".
df_res_r = (
    pd.DataFrame(res, columns=["acc", "prec", "f1", "recall"])
    .assign(t=[-1 * elapsed_ms for elapsed_ms in time], n=n_mov_r)
)

df_res_r.to_csv(f"Test n_movelet animals R {precision}.csv", index=None)

In [9]:
# Show the averaged results of the random-selector experiment.
df_res_r

Unnamed: 0,acc,prec,f1,recall,t,n
0,0.430108,0.430108,0.430108,0.430108,7081.353,4.0
1,0.526882,0.526882,0.526882,0.526882,7201.945667,4.0
2,0.473118,0.473118,0.473118,0.473118,7191.734667,4.0
3,0.483871,0.483871,0.483871,0.483871,6355.536333,4.0
4,0.462366,0.462366,0.462366,0.462366,6075.208333,4.0
5,0.580645,0.580645,0.580645,0.580645,7393.053333,4.0
6,0.677419,0.677419,0.677419,0.677419,7504.184,7.0
7,0.602151,0.602151,0.602151,0.602151,8290.274667,10.0
8,0.752688,0.752688,0.752688,0.752688,7891.199667,13.0
9,0.752688,0.752688,0.752688,0.752688,8930.935667,19.0


In [26]:
# Voronoi partitioner experiment

In [None]:
# Voronoi partitioner parameters: radius=600, stop_seconds=300/600, stop_distance=10

In [11]:
%%time
from cri98tj.selectors.Random_selector import Random_selector
from sklearn.metrics import accuracy_score
from cri98tj.distancers.InterpolatedRootDistance_distancer import InterpolatedRootDistance_distancer, \
    InterpolatedRootDistanceBestFitting
from datetime import datetime
from cri98tj.partitioners.Voronoi_partitioner import Voronoi_partitioner

res = []
n_mov_r = []
time = []
i=1
for n in tqdm(df_res_rig.n.unique()):
    df = df_original[["tid", "class", "c1", "c2", "t"]].copy()
    
    res.append((.0, .0, .0, .0, .0))
    n_mov_r.append(0)
    time.append(.0)
    
    
    for _ in range(3): 
        normalizer = FirstPoint_normalizer(spatioTemporalColumns=spatioTemporalCols, fillna=None, verbose=verbose)
        distancer = InterpolatedRootDistance_distancer(normalizer=normalizer, spatioTemporalColumns=spatioTemporalCols, n_jobs=n_jobs, verbose=verbose)
        partitioner = Voronoi_partitioner(spatioTemporalColumns=spatioTemporalCols, radius=600, stop_distance=10, stop_seconds=600)

        start = datetime.now()
        part = partitioner.fit_transform(df[df.tid.isin(tid_train)].values)
        selector = RandomInformationGain_selector(top_k=n, bestFittingMeasure=InterpolatedRootDistanceBestFitting, movelets_per_class=None, trajectories_for_orderline=None, n_jobs=n_jobs, spatioTemporalColumns=spatioTemporalCols, normalizer=normalizer, verbose=verbose)
        shapelets = selector.fit_transform(part)

        _, dist_np = distancer.fit_transform((df.values, shapelets))
        stop = start - datetime.now()

        n_mov_r[i-1] += (dist_np.shape[1])

        clf = RandomForestClassifier(max_depth=3, random_state=3, n_jobs=n_jobs, n_estimators=5000)

        dist_np_df = pd.DataFrame(dist_np)
        X = dist_np_df.drop(columns=[0]).values
        y = dist_np_df[0].values

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, stratify=y, random_state=3)

        clf.fit(X_train, y_train)

        y_pred = clf.predict(X_test)

        res[i-1] = tuple(a+b for a, b in zip(compute_measures(y_test, y_pred), res[i-1]))
        time[i-1] += stop.total_seconds()*1000 #millisecondi


    res[i-1] = list(map(lambda x: x/3, res[i-1]))
    n_mov_r[i-1] /= 3
    time[i-1] /= 3
    
    i +=1

  0%|          | 0/16 [00:00<?, ?it/s]

Executing command: cat tmp/gps.csv | sort -t',' -k1 -k2 -k3 -k9 -m | gzip -c > tmp/gps_sorted.csv.gz
Executing command: java -Xmx2048M -jar octo_processor.jar -app spacetime_reconstruct -i tmp/gps_sorted.csv.gz -o tmp/gps_600_50.csv.gz -ds 10 -dt 600
Executing command: java -Xmx2048M -jar octo_processor.jar -app point_aggregate -i tmp/gps_600_50.csv.gz -o tmp/gps_edgelist_200.csv.gz -r 600
         0 12:19:56,442  INFO (ExtractDelaunayFromTrajectoryPoints.java:parseSource:52) - Starting clustering of points
        50 12:19:56,492  INFO (ExtractDelaunayFromTrajectoryPoints.java:parseSource:65) - Processed 10000 points
        51 12:19:56,493  INFO (ExtractDelaunayFromTrajectoryPoints.java:parseSource:71) - Export voronoi diagram
ShapeFileWriter: start mixed-geom-test
       109 12:19:56,551  INFO (ExtractDelaunayFromTrajectoryPoints.java:prepareNext:141) - found 677 edges
Executing command: cat tmp/gps.csv | sort -t',' -k1 -k2 -k3 -k9 -m | gzip -c > tmp/gps_sorted.csv.gz
Executing comm

Executing command: java -Xmx2048M -jar octo_processor.jar -app point_aggregate -i tmp/gps_600_50.csv.gz -o tmp/gps_edgelist_200.csv.gz -r 600
         0 12:30:40,873  INFO (ExtractDelaunayFromTrajectoryPoints.java:parseSource:52) - Starting clustering of points
        49 12:30:40,922  INFO (ExtractDelaunayFromTrajectoryPoints.java:parseSource:65) - Processed 10000 points
        50 12:30:40,923  INFO (ExtractDelaunayFromTrajectoryPoints.java:parseSource:71) - Export voronoi diagram
ShapeFileWriter: start mixed-geom-test
       106 12:30:40,979  INFO (ExtractDelaunayFromTrajectoryPoints.java:prepareNext:141) - found 677 edges
Executing command: cat tmp/gps.csv | sort -t',' -k1 -k2 -k3 -k9 -m | gzip -c > tmp/gps_sorted.csv.gz
Executing command: java -Xmx2048M -jar octo_processor.jar -app spacetime_reconstruct -i tmp/gps_sorted.csv.gz -o tmp/gps_600_50.csv.gz -ds 10 -dt 600
Executing command: java -Xmx2048M -jar octo_processor.jar -app point_aggregate -i tmp/gps_600_50.csv.gz -o tmp/gps_

Executing command: cat tmp/gps.csv | sort -t',' -k1 -k2 -k3 -k9 -m | gzip -c > tmp/gps_sorted.csv.gz
Executing command: java -Xmx2048M -jar octo_processor.jar -app spacetime_reconstruct -i tmp/gps_sorted.csv.gz -o tmp/gps_600_50.csv.gz -ds 10 -dt 600
Executing command: java -Xmx2048M -jar octo_processor.jar -app point_aggregate -i tmp/gps_600_50.csv.gz -o tmp/gps_edgelist_200.csv.gz -r 600
         0 12:42:46,561  INFO (ExtractDelaunayFromTrajectoryPoints.java:parseSource:52) - Starting clustering of points
        54 12:42:46,615  INFO (ExtractDelaunayFromTrajectoryPoints.java:parseSource:65) - Processed 10000 points
        55 12:42:46,616  INFO (ExtractDelaunayFromTrajectoryPoints.java:parseSource:71) - Export voronoi diagram
ShapeFileWriter: start mixed-geom-test
       116 12:42:46,677  INFO (ExtractDelaunayFromTrajectoryPoints.java:prepareNext:141) - found 677 edges
Executing command: cat tmp/gps.csv | sort -t',' -k1 -k2 -k3 -k9 -m | gzip -c > tmp/gps_sorted.csv.gz
Executing comm

Executing command: java -Xmx2048M -jar octo_processor.jar -app point_aggregate -i tmp/gps_600_50.csv.gz -o tmp/gps_edgelist_200.csv.gz -r 600
         0 12:53:35,672  INFO (ExtractDelaunayFromTrajectoryPoints.java:parseSource:52) - Starting clustering of points
        52 12:53:35,724  INFO (ExtractDelaunayFromTrajectoryPoints.java:parseSource:65) - Processed 10000 points
        53 12:53:35,725  INFO (ExtractDelaunayFromTrajectoryPoints.java:parseSource:71) - Export voronoi diagram
ShapeFileWriter: start mixed-geom-test
       112 12:53:35,784  INFO (ExtractDelaunayFromTrajectoryPoints.java:prepareNext:141) - found 677 edges
Executing command: cat tmp/gps.csv | sort -t',' -k1 -k2 -k3 -k9 -m | gzip -c > tmp/gps_sorted.csv.gz
Executing command: java -Xmx2048M -jar octo_processor.jar -app spacetime_reconstruct -i tmp/gps_sorted.csv.gz -o tmp/gps_600_50.csv.gz -ds 10 -dt 600
Executing command: java -Xmx2048M -jar octo_processor.jar -app point_aggregate -i tmp/gps_600_50.csv.gz -o tmp/gps_

Executing command: cat tmp/gps.csv | sort -t',' -k1 -k2 -k3 -k9 -m | gzip -c > tmp/gps_sorted.csv.gz
Executing command: java -Xmx2048M -jar octo_processor.jar -app spacetime_reconstruct -i tmp/gps_sorted.csv.gz -o tmp/gps_600_50.csv.gz -ds 10 -dt 600
Executing command: java -Xmx2048M -jar octo_processor.jar -app point_aggregate -i tmp/gps_600_50.csv.gz -o tmp/gps_edgelist_200.csv.gz -r 600
         0 13:06:04,485  INFO (ExtractDelaunayFromTrajectoryPoints.java:parseSource:52) - Starting clustering of points
        48 13:06:04,533  INFO (ExtractDelaunayFromTrajectoryPoints.java:parseSource:65) - Processed 10000 points
        49 13:06:04,534  INFO (ExtractDelaunayFromTrajectoryPoints.java:parseSource:71) - Export voronoi diagram
ShapeFileWriter: start mixed-geom-test
       106 13:06:04,591  INFO (ExtractDelaunayFromTrajectoryPoints.java:prepareNext:141) - found 677 edges
Executing command: cat tmp/gps.csv | sort -t',' -k1 -k2 -k3 -k9 -m | gzip -c > tmp/gps_sorted.csv.gz
Executing comm

Executing command: java -Xmx2048M -jar octo_processor.jar -app point_aggregate -i tmp/gps_600_50.csv.gz -o tmp/gps_edgelist_200.csv.gz -r 600
         0 13:18:08,238  INFO (ExtractDelaunayFromTrajectoryPoints.java:parseSource:52) - Starting clustering of points
        49 13:18:08,287  INFO (ExtractDelaunayFromTrajectoryPoints.java:parseSource:65) - Processed 10000 points
        50 13:18:08,288  INFO (ExtractDelaunayFromTrajectoryPoints.java:parseSource:71) - Export voronoi diagram
ShapeFileWriter: start mixed-geom-test
       105 13:18:08,343  INFO (ExtractDelaunayFromTrajectoryPoints.java:prepareNext:141) - found 677 edges
CPU times: user 16min 37s, sys: 1min 35s, total: 18min 12s
Wall time: 59min 38s


In [12]:
# Gather the averaged Voronoi + RIG results into one table and save it.
# The loop stores elapsed time as (start - end), i.e. negative milliseconds,
# so the sign is flipped here to obtain positive durations in column "t".
df_res_rig_v = (
    pd.DataFrame(res, columns=["acc", "prec", "f1", "recall"])
    .assign(t=[-1 * elapsed_ms for elapsed_ms in time], n=n_mov_r)
)

df_res_rig_v.to_csv(f"Test n_movelet animals RIG voronoi.csv", index=None)

In [None]:
# Show the averaged results of the Voronoi + RIG experiment.
df_res_rig_v

In [None]:
import plotly.express as px
import plotly.graph_objects as go

# Accuracy against n for the random selector and the random-information-gain
# selector. The series are taken from the result tables: acc_r / acc_rig were
# never defined in this notebook, and the n_mov_r list is overwritten by the
# Voronoi experiment, so the loop variables cannot be plotted directly.
fig = go.Figure()
fig.add_trace(go.Scatter(x=df_res_r["n"], y=df_res_r["acc"], mode='lines', name='Random selector'))
fig.add_trace(go.Scatter(x=df_res_rig["n"], y=df_res_rig["acc"], mode='lines', name='Random info gain selector'))
fig.update_layout(xaxis_title="n", yaxis_title="acc")

fig.show()