In [2]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import math
import glob
from datetime import datetime

In [3]:
df = pd.read_csv("data/seabird_prepared.csv")
df.head()

Unnamed: 0,lat,lon,alt,tid,bird,species,year,date_time,max_depth.m,colony2
0,56.095451,-6.233089,-23.059999,1340627854,1,tCOGU,t2012,2012-06-25 13:37:34,-2.172046,1
1,56.095408,-6.23352,-2.983077,1340627954,1,tCOGU,t2012,2012-06-25 13:39:14,-1.152306,1
2,56.095437,-6.234275,3.470286,1340628054,1,tCOGU,t2012,2012-06-25 13:40:54,-2.172046,1
3,56.095635,-6.234815,1.902667,1340628154,1,tCOGU,t2012,2012-06-25 13:42:34,-2.172046,1
4,56.095821,-6.235293,2.824952,1340628254,1,tCOGU,t2012,2012-06-25 13:44:14,-2.172046,1


In [4]:
df.date_time = df.date_time.apply(lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S').timestamp())

df.head()

Unnamed: 0,lat,lon,alt,tid,bird,species,year,date_time,max_depth.m,colony2
0,56.095451,-6.233089,-23.059999,1340627854,1,tCOGU,t2012,1340624000.0,-2.172046,1
1,56.095408,-6.23352,-2.983077,1340627954,1,tCOGU,t2012,1340624000.0,-1.152306,1
2,56.095437,-6.234275,3.470286,1340628054,1,tCOGU,t2012,1340624000.0,-2.172046,1
3,56.095635,-6.234815,1.902667,1340628154,1,tCOGU,t2012,1340625000.0,-2.172046,1
4,56.095821,-6.235293,2.824952,1340628254,1,tCOGU,t2012,1340625000.0,-2.172046,1


In [5]:
df0 = df[["bird", "species", "lat", "lon", "date_time"]]\
        .rename(columns={"lat": "c1", "lon": "c2", "date_time": "t", "species": "class", "bird": "tid"})

df0.head()

Unnamed: 0,tid,class,c1,c2,t
0,1,tCOGU,56.095451,-6.233089,1340624000.0
1,1,tCOGU,56.095408,-6.23352,1340624000.0
2,1,tCOGU,56.095437,-6.234275,1340624000.0
3,1,tCOGU,56.095635,-6.234815,1340625000.0
4,1,tCOGU,56.095821,-6.235293,1340625000.0


In [6]:
df0["pos"] = df0.groupby(['tid']).cumcount()

In [7]:
def chunkdf(df = pd.DataFrame(), chunkDim=1000):
    prec = 0
    for i in range(chunkDim, len(df)+chunkDim, chunkDim):
        yield df[(df.index >= prec) & (df.index < i)]
        prec = i

def padLastValue(df):
    array = df.values
    for i in tqdm(range(len(array))):
        for j in range(len(array[i])):
            if math.isnan(array[i][j]):
                array[i][j] =array[i][j-1]
    return pd.DataFrame(array)

percentile=.9

print("LAT")
df_lat = pd.DataFrame()
df_lat = df0.groupby(['tid','pos'])['c1'].max().unstack()
df_lat = padLastValue(df_lat)

print("LON")
df_lon = pd.DataFrame()
df_lon = df0.groupby(['tid','pos'])['c2'].max().unstack()
df_lon = padLastValue(df_lon)

print("TIME")
df_time = pd.DataFrame()
df_time = df0.groupby(['tid','pos'])['t'].max().unstack()
df_time = padLastValue(df_time)

LAT


  0%|          | 0/108 [00:00<?, ?it/s]

LON


  0%|          | 0/108 [00:00<?, ?it/s]

TIME


  0%|          | 0/108 [00:00<?, ?it/s]

In [8]:
from sktime.transformations.panel.rocket import Rocket
from sktime.datatypes._panel._convert import from_2d_array_to_nested

nested_lat = from_2d_array_to_nested(df_lat).rename(columns={0: "dim_0"})
nested_lon = from_2d_array_to_nested(df_lon).rename(columns={0: "dim_0"})
nested_time = from_2d_array_to_nested(df_time).rename(columns={0: "dim_0"})


nested = nested_lat
#nested = nested_UTMGridNorth

nested["dim_1"] = nested_lon["dim_0"]
nested["dim_2"] = nested_time["dim_0"]

nested.head()

Unnamed: 0,dim_0,dim_1,dim_2
0,0 56.095451 1 56.095408 2 56...,0 -6.233089 1 -6.233520 2 -6.23...,0 1.340624e+09 1 1.340624e+09 2 ...
1,0 56.087070 1 56.086763 2 56...,0 -6.241752 1 -6.242173 2 -6.24...,0 1.340552e+09 1 1.340552e+09 2 ...
2,0 56.095547 1 56.093861 2 56...,0 -6.232417 1 -6.228412 2 -6.23...,0 1.340642e+09 1 1.340642e+09 2 ...
3,0 56.095661 1 56.094316 2 56...,0 -6.231675 1 -6.230451 2 -6.23...,0 1.340638e+09 1 1.340639e+09 2 ...
4,0 56.096390 1 56.096124 2 56...,0 -6.232935 1 -6.233001 2 -6.23...,0 1.340635e+09 1 1.340635e+09 2 ...


In [9]:
y = df0.groupby(['tid'])['class'].max()

In [10]:
len(nested)

108

In [77]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(nested, y, test_size=.3, stratify=y)

rocket = Rocket(n_jobs=-1, num_kernels=10000)  # by default, ROCKET uses 10,000 kernels
rocket.fit(X_train)
X_train_transform = rocket.transform(X_train)
X_test_transform = rocket.transform(X_test)

In [81]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree

def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                results['mean_test_score'][candidate],
                results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

from sklearn.model_selection import GridSearchCV
parameters = {'n_estimators': range(300, 1500, 300),
              'criterion':["entropy", "gini"],
              'max_depth': range(2, 20, 5)}

clf = GridSearchCV(RandomForestClassifier(n_jobs=1), parameters, cv=10, n_jobs=24, verbose=3, scoring="accuracy")

clf.fit(X_train_transform, y_train)

report(clf.cv_results_, n_top=3)

Fitting 10 folds for each of 32 candidates, totalling 320 fits
Model with rank: 1
Mean validation score: 0.664 (std: 0.189)
Parameters: {'criterion': 'entropy', 'max_depth': 12, 'n_estimators': 300}

Model with rank: 2
Mean validation score: 0.654 (std: 0.156)
Parameters: {'criterion': 'entropy', 'max_depth': 7, 'n_estimators': 600}

Model with rank: 3
Mean validation score: 0.643 (std: 0.181)
Parameters: {'criterion': 'gini', 'max_depth': 17, 'n_estimators': 600}



In [80]:
clf = RandomForestClassifier(max_depth=17, criterion='gini', random_state=5, n_jobs=-1, n_estimators=300)
#clf = tree.DecisionTreeClassifier(max_depth=2)

clf.fit(X_train_transform, y_train)

from sklearn import metrics
y_pred=clf.predict(X_test_transform)


from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       tCOGU       0.75      0.33      0.46         9
       tEUSH       0.75      0.60      0.67         5
       tRAZO       0.68      0.89      0.77        19

    accuracy                           0.70        33
   macro avg       0.73      0.61      0.63        33
weighted avg       0.71      0.70      0.67        33



In [82]:
## Movelets

In [None]:
import random
import os


def prepareForShapelet(df=pd.DataFrame, trajectoryIDAttribute= None, targetAttribute=None,
                       timestampAttribute=None, latAttr=None, lonAttr=None, trainPerc=.80, trajPerc=1.0, maxPoints=None):
    attributes = [timestampAttribute, "latLon"]+ [x for x in list(df.columns) if x not in [trajectoryIDAttribute, targetAttribute, timestampAttribute, latAttr, lonAttr]] #mi assicuro di avere il timestam come primo elemento
    if(os.path.exists("train") | os.path.exists("test")):
        print("Le cartelle train e test esistono già!")
        return

    df["latLon"] = df[latAttr].astype(str) + " " + df[lonAttr].astype(str)

    ids = list(df[trajectoryIDAttribute].unique())
    #ids = random.sample(ids, round(len(ids)*trajPerc))
    ids = ids[:round(len(ids)*trajPerc)]
    ids_train = random.sample(ids, round(trainPerc*len(ids)))
    ids_test = [x for x in ids if x not in ids_train]

    os.makedirs("train")
    os.makedirs("test")

    for id in tqdm(ids_train):
        df_id = df[df[trajectoryIDAttribute] == id]
        classe = df_id[targetAttribute].iloc[0]
        if maxPoints is not None:
            df[df[trajectoryIDAttribute] == id][attributes].head(maxPoints).to_csv(F"train/{id} s{id} c{classe}.r2", index=False, header=False)
        else:
            df[df[trajectoryIDAttribute] == id][attributes].to_csv(F"train/{id} s{id} c{classe}.r2", index=False, header=False)

    for id in tqdm(ids_test):
        df_id = df[df[trajectoryIDAttribute] == id]
        if maxPoints is not None:
            df_id = df_id.head(maxPoints)
        classe = df_id[targetAttribute].iloc[0]
        if maxPoints is not None:
            df[df[trajectoryIDAttribute] == id][attributes].head(maxPoints).to_csv(F"test/{id} s{id} c{classe}.r2", index=False, header=False)
        else:
            df[df[trajectoryIDAttribute] == id][attributes].to_csv(F"test/{id} s{id} c{classe}.r2", index=False, header=False)

prepareForShapelet(df0, "tid", "class", "t", "c1", "c2", trainPerc=.7, trajPerc=1)

In [65]:
movelet_train_df = pd.read_csv("data/Movelet_output/train.csv")
movelet_test_df = pd.read_csv("data/Movelet_output/test.csv")

movelet_train_df.head()

Unnamed: 0,sh_TID37_START1484_SIZE51_CLASStCOGU,sh_TID17_START2799_SIZE18_CLASStCOGU,sh_TID8_START813_SIZE95_CLASStCOGU,sh_TID37_START658_SIZE86_CLASStCOGU,sh_TID1_START2014_SIZE7_CLASStCOGU,sh_TID32_START90_SIZE27_CLASStCOGU,sh_TID84_START987_SIZE2_CLASStRAZO,sh_TID103_START2212_SIZE2_CLASStRAZO,sh_TID81_START223_SIZE39_CLASStRAZO,sh_TID15_START2340_SIZE18_CLASStCOGU,sh_TID23_START1837_SIZE83_CLASStCOGU,sh_TID39_START193_SIZE96_CLASStRAZO,sh_TID17_START435_SIZE18_CLASStCOGU,sh_TID46_START981_SIZE13_CLASStRAZO,sh_TID40_START749_SIZE13_CLASStRAZO,sh_TID9_START1301_SIZE92_CLASStCOGU,sh_TID17_START3076_SIZE19_CLASStCOGU,class
0,1.625842,0.985065,0.9874,1.627654,0.981097,0.906283,1.560816,1.496875,1.570457,0.664531,0.987076,1.612939,0.947763,1.600724,1.603072,0.972232,0.98827,tCOGU
1,1.62561,1.004802,1.015268,1.6274,1.018796,0.985898,1.56076,1.496705,1.570197,1.001187,1.009078,1.612765,0.922504,1.60048,1.602819,1.009958,1.008859,tEUSH
2,1.945494,1.776285,1.770864,1.943541,1.7906,1.782295,1.972289,0.075381,1.958965,1.773291,1.772221,1.961741,1.778535,1.971062,1.968936,1.775907,1.778065,tRAZO
3,1.943944,1.774224,1.768818,1.941982,1.78859,1.780203,1.970683,0.072461,1.95726,1.771228,1.770183,1.960246,1.776458,1.969596,1.967466,1.773868,1.776,tRAZO
4,1.94623,1.773533,1.772168,1.945589,1.787647,1.779918,1.97063,0.071628,1.958964,1.770542,1.772909,1.963817,1.775789,1.969943,1.967828,1.777048,1.775346,tRAZO


In [66]:
movelet_train_df.values[:, -1]

array(['tCOGU', 'tEUSH', 'tRAZO', 'tRAZO', 'tRAZO', 'tCOGU', 'tCOGU',
       'tCOGU', 'tEUSH', 'tRAZO', 'tRAZO', 'tCOGU', 'tCOGU', 'tCOGU',
       'tRAZO', 'tRAZO', 'tRAZO', 'tCOGU', 'tCOGU', 'tRAZO', 'tRAZO',
       'tRAZO', 'tRAZO', 'tCOGU', 'tRAZO', 'tCOGU', 'tRAZO', 'tRAZO',
       'tRAZO', 'tRAZO', 'tRAZO', 'tRAZO', 'tCOGU', 'tCOGU', 'tCOGU',
       'tCOGU', 'tCOGU', 'tCOGU', 'tCOGU', 'tEUSH', 'tEUSH', 'tRAZO',
       'tEUSH', 'tRAZO', 'tRAZO', 'tRAZO', 'tEUSH', 'tEUSH', 'tRAZO',
       'tRAZO', 'tCOGU', 'tCOGU', 'tEUSH', 'tRAZO', 'tEUSH', 'tRAZO',
       'tRAZO', 'tCOGU', 'tRAZO', 'tRAZO', 'tRAZO', 'tRAZO', 'tRAZO',
       'tRAZO', 'tRAZO', 'tEUSH', 'tRAZO', 'tCOGU', 'tEUSH', 'tRAZO',
       'tRAZO', 'tEUSH', 'tRAZO', 'tRAZO', 'tRAZO', 'tRAZO'], dtype=object)

In [78]:
X_train = movelet_train_df.values[:, :-1]
y_train = movelet_train_df.values[:, -1]

X_test = movelet_test_df.values[:, :-1]
y_test = movelet_test_df.values[:, -1]

In [80]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree

def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                results['mean_test_score'][candidate],
                results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

from sklearn.model_selection import GridSearchCV
parameters = {'n_estimators': range(300, 1500, 300),
              'criterion':["entropy", "gini"],
              'max_depth': range(2, 20, 5)}

clf = GridSearchCV(RandomForestClassifier(n_jobs=1), parameters, cv=10, n_jobs=24, verbose=3, scoring="accuracy")

clf.fit(X_train, y_train)

report(clf.cv_results_, n_top=3)

Fitting 10 folds for each of 32 candidates, totalling 320 fits
Model with rank: 1
Mean validation score: 0.777 (std: 0.165)
Parameters: {'criterion': 'entropy', 'max_depth': 12, 'n_estimators': 300}

Model with rank: 2
Mean validation score: 0.764 (std: 0.171)
Parameters: {'criterion': 'entropy', 'max_depth': 7, 'n_estimators': 300}

Model with rank: 3
Mean validation score: 0.764 (std: 0.193)
Parameters: {'criterion': 'entropy', 'max_depth': 7, 'n_estimators': 900}

Model with rank: 3
Mean validation score: 0.764 (std: 0.193)
Parameters: {'criterion': 'entropy', 'max_depth': 17, 'n_estimators': 600}

Model with rank: 3
Mean validation score: 0.764 (std: 0.193)
Parameters: {'criterion': 'gini', 'max_depth': 7, 'n_estimators': 1200}



In [81]:
clf = RandomForestClassifier(max_depth=2, criterion='gini', random_state=5, n_jobs=-1, n_estimators=1200)
#clf = tree.DecisionTreeClassifier(max_depth=2)

clf.fit(X_train, y_train)

from sklearn import metrics
y_pred=clf.predict(X_test)


from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       tCOGU       0.71      0.56      0.63         9
       tEUSH       0.00      0.00      0.00         3
       tRAZO       0.72      0.90      0.80        20

    accuracy                           0.72        32
   macro avg       0.48      0.49      0.48        32
weighted avg       0.65      0.72      0.68        32



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[CV 10/10] END criterion=entropy, max_depth=2, n_estimators=300;, score=0.571 total time=   0.5s
[CV 2/10] END criterion=entropy, max_depth=2, n_estimators=1200;, score=0.625 total time=   1.8s
[CV 7/10] END criterion=entropy, max_depth=7, n_estimators=900;, score=1.000 total time=   1.4s
[CV 2/10] END criterion=entropy, max_depth=12, n_estimators=900;, score=0.625 total time=   1.4s
[CV 9/10] END criterion=entropy, max_depth=17, n_estimators=300;, score=0.571 total time=   0.5s
[CV 9/10] END criterion=entropy, max_depth=17, n_estimators=600;, score=0.571 total time=   0.9s
[CV 3/10] END criterion=gini, max_depth=2, n_estimators=300;, score=0.625 total time=   0.4s
[CV 5/10] END criterion=gini, max_depth=2, n_estimators=300;, score=0.750 total time=   0.4s
[CV 3/10] END criterion=gini, max_depth=2, n_estimators=600;, score=0.750 total time=   0.9s
[CV 9/10] END criterion=gini, max_depth=2, n_estimators=900;, score=0.571 total time=   1.3s
[CV 3/10] END criterion=gini, max_depth=7, n_es

[CV 6/10] END criterion=entropy, max_depth=2, n_estimators=600;, score=0.875 total time=   0.9s
[CV 9/10] END criterion=entropy, max_depth=2, n_estimators=1200;, score=0.571 total time=   1.8s
[CV 6/10] END criterion=entropy, max_depth=7, n_estimators=1200;, score=1.000 total time=   1.9s
[CV 4/10] END criterion=entropy, max_depth=12, n_estimators=1200;, score=0.625 total time=   1.9s
[CV 1/10] END criterion=entropy, max_depth=17, n_estimators=1200;, score=0.875 total time=   1.9s
[CV 2/10] END criterion=gini, max_depth=2, n_estimators=1200;, score=0.625 total time=   1.8s
[CV 10/10] END criterion=gini, max_depth=7, n_estimators=900;, score=0.571 total time=   1.4s
[CV 4/10] END criterion=gini, max_depth=12, n_estimators=900;, score=0.625 total time=   1.3s
[CV 1/10] END criterion=gini, max_depth=17, n_estimators=600;, score=0.875 total time=   0.9s
[CV 5/10] END criterion=gini, max_depth=17, n_estimators=900;, score=0.875 total time=   1.0s
[CV 7/10] END criterion=entropy, max_depth=2

[CV 5/10] END criterion=entropy, max_depth=2, n_estimators=300;, score=0.750 total time=   0.5s
[CV 3/10] END criterion=entropy, max_depth=2, n_estimators=1200;, score=0.625 total time=   1.9s
[CV 9/10] END criterion=entropy, max_depth=7, n_estimators=900;, score=0.571 total time=   1.4s
[CV 3/10] END criterion=entropy, max_depth=12, n_estimators=900;, score=0.500 total time=   1.4s
[CV 1/10] END criterion=entropy, max_depth=17, n_estimators=600;, score=0.875 total time=   0.9s
[CV 5/10] END criterion=entropy, max_depth=17, n_estimators=900;, score=0.875 total time=   1.4s
[CV 7/10] END criterion=gini, max_depth=2, n_estimators=600;, score=1.000 total time=   0.9s
[CV 1/10] END criterion=gini, max_depth=7, n_estimators=300;, score=0.875 total time=   0.5s
[CV 6/10] END criterion=gini, max_depth=7, n_estimators=300;, score=1.000 total time=   0.5s
[CV 4/10] END criterion=gini, max_depth=7, n_estimators=600;, score=0.625 total time=   0.9s
[CV 9/10] END criterion=gini, max_depth=7, n_est

In [34]:
from cri98tj.distancers.Euclidean_distancer import Euclidean_distancer
from cri98tj.distancers.DTW_distancer import DTW_distancer
from cri98tj.distancers.InterpolatedRootDistance_distancer import InterpolatedRootDistance_distancer, \
    InterpolatedRootDistanceBestFitting
from cri98tj.normalizers.FirstPoint_normalizer import FirstPoint_normalizer
from cri98tj.normalizers.normalizer_utils import dataframe_pivot
from cri98tj.partitioners.Geohash_partitioner import Geohash_partitioner
from cri98tj.partitioners.Voronoi_partitioner import Voronoi_partitioner
from cri98tj.selectors.RandomInformationGain_selector import RandomInformationGain_selector
from cri98tj.selectors.Random_selector import Random_selector
from cri98tj.selectors.RandomOrderline_selector import RandomOrderline_selector
from cri98tj.TrajectoryTransformer import TrajectoryTransformer
from sklearn.ensemble import RandomForestClassifier

  from tqdm.autonotebook import tqdm


In [35]:
df0.head()

Unnamed: 0,tid,class,c1,c2,t
0,1,tCOGU,56.095451,-6.233089,1340624000.0
1,1,tCOGU,56.095408,-6.23352,1340624000.0
2,1,tCOGU,56.095437,-6.234275,1340624000.0
3,1,tCOGU,56.095635,-6.234815,1340625000.0
4,1,tCOGU,56.095821,-6.235293,1340625000.0


In [60]:
from sklearn.model_selection import train_test_split

spatioTemporalCols = ["c1", "c2", "t"]
tid_train, tid_test, _, _ = train_test_split(df0.groupby(by=["tid"]).max().reset_index()["tid"],
                                                            df0.groupby(by=["tid"]).max().reset_index()["class"],
                                                            test_size=.3,
                                                            stratify=df0.groupby(by=["tid"]).max().reset_index()["class"],
                                                            random_state=3)

partitioner = Geohash_partitioner(precision=4, spatioTemporalColumns=spatioTemporalCols)


normalizer = FirstPoint_normalizer(spatioTemporalColumns=spatioTemporalCols, fillna=None)
#selector = Random_selector(movelets_per_class=20, normalizer=normalizer,
#                                   spatioTemporalColumns=spatioTemporalCols)
selector = RandomInformationGain_selector(top_k=20, bestFittingMeasure=InterpolatedRootDistanceBestFitting,
                                                  movelets_per_class=550, trajectories_for_orderline=50, n_jobs=24,
                                                  spatioTemporalColumns=spatioTemporalCols, normalizer=normalizer)
distancer = InterpolatedRootDistance_distancer(normalizer=normalizer, spatioTemporalColumns=spatioTemporalCols, n_jobs=24)

part = partitioner.fit_transform(df0[df0.tid.isin(tid_train)].values)
shapelets = selector.fit_transform(part)
best_is, dist_np = distancer.fit_transform((df0.values, shapelets))


clf = RandomForestClassifier(max_depth=2, random_state=3, n_jobs=10, n_estimators=1000)

dist_np_df = pd.DataFrame(dist_np)
X = dist_np_df.drop(columns=[0]).values
y = dist_np_df[0].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, stratify=y, random_state=3)

clf.fit(X_train, y_train)

from sklearn.metrics import classification_report

y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

Encoding 177660 points with precision 4


  0%|          | 0/177660 [00:00<?, ?it/s]

Cutting sub-trajectories length at 1926.0 over 3650
Pivoting tables


  df_pivot = df_pivot.merge(df.groupby(['partId'])['class'].max().reset_index(), on=["partId"])
  result = np.asarray(values, dtype=dtype)


Cutting sub-trajectories length at 3259.0 over 6047
Pivoting tables


  df_pivot = df_pivot.merge(df.groupby(['partId'])['class'].max().reset_index(), on=["partId"])
  result = np.asarray(values, dtype=dtype)


Computing scores


  0%|          | 0/386 [00:00<?, ?it/s]

  0%|          | 0/386 [00:00<?, ?it/s]

  y = column_or_1d(y, warn=True)


0.	 score=0.4767160492994498
1.	 score=0.4340701758039929
2.	 score=0.43014721139064793
3.	 score=0.4289094280100074
4.	 score=0.42829488693396
5.	 score=0.41013060866727824
6.	 score=0.40588238232750373
7.	 score=0.3940225782621163
8.	 score=0.39385931146948594
9.	 score=0.39098641430158243
10.	 score=0.3865798890189778
11.	 score=0.3793197222753435
12.	 score=0.3783550638278239
13.	 score=0.37227527828579565
14.	 score=0.3700572397934936
15.	 score=0.3612204321973935
16.	 score=0.3504037356664116
17.	 score=0.3469296770452943
18.	 score=0.346202879455936
19.	 score=0.3458159759315933
Pivoting tables


  df_pivot = df_pivot.merge(df.groupby(['partId'])['class'].max().reset_index(), on=["partId"])
  result = np.asarray(values, dtype=dtype)


  0%|          | 0/20 [00:00<?, ?it/s]

Collecting distances from 20


  0%|          | 0/20 [00:00<?, ?it/s]

              precision    recall  f1-score   support

       tCOGU       0.67      0.22      0.33         9
       tEUSH       0.80      0.80      0.80         5
       tRAZO       0.72      0.95      0.82        19

    accuracy                           0.73        33
   macro avg       0.73      0.66      0.65        33
weighted avg       0.72      0.73      0.68        33



In [64]:
from sklearn.model_selection import train_test_split

spatioTemporalCols = ["c1", "c2", "t"]
tid_train, tid_test, _, _ = train_test_split(df0.groupby(by=["tid"]).max().reset_index()["tid"],
                                                            df0.groupby(by=["tid"]).max().reset_index()["class"],
                                                            test_size=.3,
                                                            stratify=df0.groupby(by=["tid"]).max().reset_index()["class"],
                                                            random_state=3)

partitioner = Geohash_partitioner(precision=2, spatioTemporalColumns=spatioTemporalCols)


normalizer = FirstPoint_normalizer(spatioTemporalColumns=spatioTemporalCols, fillna=None)
#selector = Random_selector(movelets_per_class=20, normalizer=normalizer,
#                                   spatioTemporalColumns=spatioTemporalCols)
selector = RandomInformationGain_selector(top_k=20, bestFittingMeasure=InterpolatedRootDistanceBestFitting,
                                                  movelets_per_class=100, trajectories_for_orderline=50, n_jobs=24,
                                                  spatioTemporalColumns=spatioTemporalCols, normalizer=normalizer)
distancer = InterpolatedRootDistance_distancer(normalizer=normalizer, spatioTemporalColumns=spatioTemporalCols, n_jobs=24)

part = partitioner.fit_transform(df0[df0.tid.isin(tid_train)].values)
shapelets = selector.fit_transform(part)
best_is, dist_np = distancer.fit_transform((df0.values, shapelets))


clf = RandomForestClassifier(max_depth=2, random_state=3, n_jobs=10, n_estimators=1000)

dist_np_df = pd.DataFrame(dist_np)
X = dist_np_df.drop(columns=[0]).values
y = dist_np_df[0].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, stratify=y, random_state=3)

clf.fit(X_train, y_train)

from sklearn.metrics import classification_report

y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

Encoding 177660 points with precision 2


  0%|          | 0/177660 [00:00<?, ?it/s]

Cutting sub-trajectories length at 2804.0 over 4180
Pivoting tables


  df_pivot = df_pivot.merge(df.groupby(['partId'])['class'].max().reset_index(), on=["partId"])
  result = np.asarray(values, dtype=dtype)


Cutting sub-trajectories length at 3348.0 over 5210
Pivoting tables


  df_pivot = df_pivot.merge(df.groupby(['partId'])['class'].max().reset_index(), on=["partId"])
  result = np.asarray(values, dtype=dtype)


Computing scores


  0%|          | 0/71 [00:00<?, ?it/s]

  0%|          | 0/71 [00:00<?, ?it/s]

  y = column_or_1d(y, warn=True)


0.	 score=0.5362921379434116
1.	 score=0.4754077990633321
2.	 score=0.4616886614471356
3.	 score=0.44507446526361183
4.	 score=0.4401583257357671
5.	 score=0.42704234176930367
6.	 score=0.42281426164836633
7.	 score=0.41387665220253433
8.	 score=0.4110820784876117
9.	 score=0.4083713326958174
10.	 score=0.4076734967176854
11.	 score=0.40512656650324064
12.	 score=0.400346121323083
13.	 score=0.3759945139357612
14.	 score=0.37561430551983865
15.	 score=0.3672483989396462
16.	 score=0.35886637091476103
17.	 score=0.35865728766996363
18.	 score=0.3555987390757007
19.	 score=0.3520496055043467
Pivoting tables


  df_pivot = df_pivot.merge(df.groupby(['partId'])['class'].max().reset_index(), on=["partId"])
  result = np.asarray(values, dtype=dtype)


  0%|          | 0/20 [00:00<?, ?it/s]

Collecting distances from 20


  0%|          | 0/20 [00:00<?, ?it/s]

              precision    recall  f1-score   support

       tCOGU       0.00      0.00      0.00         9
       tEUSH       1.00      0.80      0.89         5
       tRAZO       0.64      0.95      0.77        19

    accuracy                           0.67        33
   macro avg       0.55      0.58      0.55        33
weighted avg       0.52      0.67      0.58        33



In [63]:
clf = RandomForestClassifier(max_depth=2, random_state=3, n_jobs=10, n_estimators=1000)

dist_np_df = pd.DataFrame(dist_np)
X = dist_np_df.drop(columns=[0]).values
y = dist_np_df[0].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, stratify=y, random_state=3)

clf.fit(X_train, y_train)

from sklearn.metrics import classification_report

y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       tCOGU       1.00      0.11      0.20         9
       tEUSH       1.00      0.80      0.89         5
       tRAZO       0.68      1.00      0.81        19

    accuracy                           0.73        33
   macro avg       0.89      0.64      0.63        33
weighted avg       0.81      0.73      0.65        33



In [76]:
clf = RandomForestClassifier(max_depth=2, random_state=3, n_jobs=10, n_estimators=1000)

dist_np_df = pd.DataFrame(dist_np)
X = dist_np_df.drop(columns=[0]).values
y = dist_np_df[0].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, stratify=y, random_state=3)

clf.fit(X_train, y_train)

from sklearn.metrics import classification_report

y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       tCOGU       0.00      0.00      0.00         9
       tEUSH       1.00      0.80      0.89         5
       tRAZO       0.64      0.95      0.77        19

    accuracy                           0.67        33
   macro avg       0.55      0.58      0.55        33
weighted avg       0.52      0.67      0.58        33



In [82]:
df1 = pd.DataFrame()

df1["pos"] = df0.groupby(['tid']).cumcount()

In [84]:
df1.pos.max()

6047