# GeoLife

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import math
import glob
from datetime import datetime

In [2]:
df = pd.read_csv("data/GeoLife_prepared.zip")

df = df[df.tid < df.tid.max()*.2] #prendo il 20% del dataset

df0 = df[["tid", "label", "lat", "lon", "time"]].rename(columns={
    "label": "class", "lat": "c1", "lon": "c2", "time": "t"})

df0.head()

Unnamed: 0,tid,class,c1,c2,t
0,0,train,39.894178,116.3182,1206716000.0
1,0,train,39.894505,116.321132,1206716000.0
2,0,train,39.894953,116.326452,1206716000.0
3,0,train,39.8946,116.332542,1206716000.0
4,0,train,39.889622,116.33704,1206716000.0


In [3]:
remap_class = {
    "subway": "public",
    "motorcycle": "private",
    "run": "private",
    "walk": "private",
    "boat": "private",
    "airplane": "public",
    "train": "public",
    "car": "private",
    "taxi": "public",
    "bike": "private",
    "bus": "public"
}

In [4]:
df0["class"] = df0["class"].apply(lambda x: remap_class[x])

In [5]:
df0["class"].unique()

array(['public', 'private'], dtype=object)

## Rocket

In [29]:
t_rocket = 0

start = datetime.now()

df0["pos"] = df0.groupby(['tid']).cumcount()

df0 = df0[df0.pos < df0.pos.quantile(.9)]

t_rocket = (datetime.now() - start).total_seconds() * 1000 #ms

In [30]:
df0.describe().pos.apply(lambda x: int(x))

count    1410006
mean        3540
std         4581
min            0
25%          403
50%         1423
75%         4917
max        19346
Name: pos, dtype: int64

In [31]:
df0.pos.quantile(.9)

10833.0

In [32]:
start = datetime.now()


def padLastValue(df):
    array = df.values
    for i in tqdm(range(len(array))):
        for j in range(len(array[i])):
            if math.isnan(array[i][j]):
                array[i][j] =array[i][j-1]
    return pd.DataFrame(array)

print("LAT")
df_lat = pd.DataFrame()
df_lat = df0.groupby(['tid','pos'])['c1'].max().unstack()
df_lat = padLastValue(df_lat)

print("LON")
df_lon = pd.DataFrame()
df_lon = df0.groupby(['tid','pos'])['c2'].max().unstack()
df_lon = padLastValue(df_lon)

print("TIME")
df_time = pd.DataFrame()
df_time = df0.groupby(['tid','pos'])['t'].max().unstack()
df_time = padLastValue(df_time)

t_rocket = (datetime.now() - start).total_seconds() * 1000 #ms

LAT


  0%|          | 0/1196 [00:00<?, ?it/s]

LON


  0%|          | 0/1196 [00:00<?, ?it/s]

TIME


  0%|          | 0/1196 [00:00<?, ?it/s]

In [33]:
start = datetime.now()

from sktime.transformations.panel.rocket import Rocket
from sktime.datatypes._panel._convert import from_2d_array_to_nested

nested_lat = from_2d_array_to_nested(df_lat).rename(columns={0: "dim_0"})
nested_lon = from_2d_array_to_nested(df_lon).rename(columns={0: "dim_0"})
nested_time = from_2d_array_to_nested(df_time).rename(columns={0: "dim_0"})


nested = nested_lat
#nested = nested_UTMGridNorth

nested["dim_1"] = nested_lon["dim_0"]
nested["dim_2"] = nested_time["dim_0"]

nested.head()

y = df0.groupby(['tid'])['class'].max()

t_rocket = (datetime.now() - start).total_seconds() * 1000 #ms

In [34]:
start = datetime.now()

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(nested, y, test_size=.3, stratify=y)

rocket = Rocket(n_jobs=-1, num_kernels=10000)  # by default, ROCKET uses 10,000 kernels
rocket.fit(X_train)
X_train_transform = rocket.transform(X_train)
X_test_transform = rocket.transform(X_test)

t_rocket = (datetime.now() - start).total_seconds() * 1000 #ms

In [35]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree

def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                results['mean_test_score'][candidate],
                results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

from sklearn.model_selection import GridSearchCV
parameters = {'n_estimators': range(300, 1500, 300),
              'criterion':["entropy", "gini"],
              'max_depth': range(2, 20, 5)}

clf = GridSearchCV(RandomForestClassifier(n_jobs=1), parameters, cv=10, n_jobs=24, verbose=3, scoring="balanced_accuracy")

clf.fit(X_train_transform, y_train)

report(clf.cv_results_, n_top=3)

Fitting 10 folds for each of 32 candidates, totalling 320 fits
Model with rank: 1
Mean validation score: 0.767 (std: 0.066)
Parameters: {'criterion': 'entropy', 'max_depth': 12, 'n_estimators': 1200}

Model with rank: 2
Mean validation score: 0.764 (std: 0.060)
Parameters: {'criterion': 'entropy', 'max_depth': 7, 'n_estimators': 300}

Model with rank: 3
Mean validation score: 0.762 (std: 0.060)
Parameters: {'criterion': 'entropy', 'max_depth': 7, 'n_estimators': 1200}



In [36]:
clf = RandomForestClassifier(max_depth=12, criterion='entropy', random_state=5, n_jobs=-1, n_estimators=900)
#clf = tree.DecisionTreeClassifier(max_depth=2)

clf.fit(X_train_transform, y_train)

from sklearn import metrics
y_pred=clf.predict(X_test_transform)


from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

     private       0.75      0.79      0.77       173
      public       0.79      0.75      0.77       186

    accuracy                           0.77       359
   macro avg       0.77      0.77      0.77       359
weighted avg       0.77      0.77      0.77       359



In [37]:
t_rocket

1746267.737

[CV 1/10] END criterion=entropy, max_depth=2, n_estimators=900;, score=0.741 total time=  15.5s
[CV 6/10] END criterion=entropy, max_depth=7, n_estimators=300;, score=0.678 total time=  14.2s
[CV 3/10] END criterion=entropy, max_depth=7, n_estimators=900;, score=0.684 total time=  42.1s
[CV 3/10] END criterion=entropy, max_depth=12, n_estimators=600;, score=0.661 total time=  31.9s
[CV 9/10] END criterion=entropy, max_depth=12, n_estimators=900;, score=0.795 total time=  48.5s
[CV 4/10] END criterion=entropy, max_depth=17, n_estimators=900;, score=0.824 total time=  48.6s
[CV 3/10] END criterion=gini, max_depth=2, n_estimators=900;, score=0.684 total time=  10.8s
[CV 1/10] END criterion=gini, max_depth=2, n_estimators=1200;, score=0.776 total time=  14.3s
[CV 5/10] END criterion=gini, max_depth=7, n_estimators=300;, score=0.843 total time=  10.3s
[CV 9/10] END criterion=gini, max_depth=7, n_estimators=900;, score=0.772 total time=  30.0s
[CV 7/10] END criterion=gini, max_depth=12, n_es

[CV 8/10] END criterion=entropy, max_depth=2, n_estimators=300;, score=0.724 total time=   5.4s
[CV 9/10] END criterion=entropy, max_depth=2, n_estimators=900;, score=0.689 total time=  15.4s
[CV 3/10] END criterion=entropy, max_depth=7, n_estimators=600;, score=0.660 total time=  28.4s
[CV 7/10] END criterion=entropy, max_depth=7, n_estimators=1200;, score=0.665 total time=  55.8s
[CV 7/10] END criterion=entropy, max_depth=12, n_estimators=1200;, score=0.678 total time= 1.1min
[CV 4/10] END criterion=entropy, max_depth=17, n_estimators=1200;, score=0.835 total time= 1.1min
[CV 3/10] END criterion=gini, max_depth=7, n_estimators=900;, score=0.673 total time=  30.1s
[CV 8/10] END criterion=gini, max_depth=12, n_estimators=300;, score=0.759 total time=  13.2s
[CV 2/10] END criterion=gini, max_depth=12, n_estimators=900;, score=0.751 total time=  39.1s
[CV 9/10] END criterion=gini, max_depth=17, n_estimators=300;, score=0.771 total time=  14.1s
[CV 4/10] END criterion=gini, max_depth=17, 

[CV 7/10] END criterion=entropy, max_depth=2, n_estimators=300;, score=0.689 total time=   5.4s
[CV 5/10] END criterion=entropy, max_depth=2, n_estimators=900;, score=0.773 total time=  15.3s
[CV 9/10] END criterion=entropy, max_depth=7, n_estimators=300;, score=0.771 total time=  14.3s
[CV 3/10] END criterion=entropy, max_depth=7, n_estimators=1200;, score=0.685 total time=  55.5s
[CV 7/10] END criterion=entropy, max_depth=12, n_estimators=900;, score=0.665 total time=  48.2s
[CV 6/10] END criterion=entropy, max_depth=17, n_estimators=600;, score=0.678 total time=  32.5s
[CV 10/10] END criterion=entropy, max_depth=17, n_estimators=1200;, score=0.784 total time= 1.1min
[CV 8/10] END criterion=gini, max_depth=7, n_estimators=900;, score=0.746 total time=  29.8s
[CV 4/10] END criterion=gini, max_depth=12, n_estimators=600;, score=0.801 total time=  25.9s
[CV 8/10] END criterion=gini, max_depth=12, n_estimators=1200;, score=0.722 total time=  51.9s
[CV 4/10] END criterion=gini, max_depth=

[CV 3/10] END criterion=entropy, max_depth=2, n_estimators=900;, score=0.660 total time=  15.5s
[CV 5/10] END criterion=entropy, max_depth=7, n_estimators=300;, score=0.843 total time=  14.2s
[CV 4/10] END criterion=entropy, max_depth=7, n_estimators=900;, score=0.823 total time=  41.8s
[CV 1/10] END criterion=entropy, max_depth=12, n_estimators=600;, score=0.810 total time=  32.1s
[CV 10/10] END criterion=entropy, max_depth=12, n_estimators=900;, score=0.784 total time=  48.5s
[CV 3/10] END criterion=entropy, max_depth=17, n_estimators=900;, score=0.660 total time=  47.7s
[CV 10/10] END criterion=gini, max_depth=2, n_estimators=600;, score=0.736 total time=   7.3s
[CV 8/10] END criterion=gini, max_depth=2, n_estimators=900;, score=0.723 total time=  10.7s
[CV 2/10] END criterion=gini, max_depth=7, n_estimators=300;, score=0.762 total time=  10.2s
[CV 8/10] END criterion=gini, max_depth=7, n_estimators=300;, score=0.746 total time=  10.3s
[CV 2/10] END criterion=gini, max_depth=7, n_es

## Movelets

In [6]:
import random
import os


def prepareForShapelet(df=pd.DataFrame, trajectoryIDAttribute= None, targetAttribute=None,
                       timestampAttribute=None, latAttr=None, lonAttr=None, trainPerc=.80, trajPerc=1.0, maxPoints=None):
    attributes = [timestampAttribute, "latLon"]+ [x for x in list(df.columns) if x not in [trajectoryIDAttribute, targetAttribute, timestampAttribute, latAttr, lonAttr]] #mi assicuro di avere il timestam come primo elemento
    if(os.path.exists("train") | os.path.exists("test")):
        print("Le cartelle train e test esistono già!")
        return

    df["latLon"] = df[latAttr].astype(str) + " " + df[lonAttr].astype(str)

    ids = list(df[trajectoryIDAttribute].unique())
    #ids = random.sample(ids, round(len(ids)*trajPerc))
    ids = ids[:round(len(ids)*trajPerc)]
    ids_train = random.sample(ids, round(trainPerc*len(ids)))
    ids_test = [x for x in ids if x not in ids_train]

    os.makedirs("train")
    os.makedirs("test")

    for id in tqdm(ids_train):
        df_id = df[df[trajectoryIDAttribute] == id]
        classe = df_id[targetAttribute].iloc[0]
        if maxPoints is not None:
            df[df[trajectoryIDAttribute] == id][attributes].head(maxPoints).to_csv(F"train/{id} s{id} c{classe}.r2", index=False, header=False)
        else:
            df[df[trajectoryIDAttribute] == id][attributes].to_csv(F"train/{id} s{id} c{classe}.r2", index=False, header=False)

    for id in tqdm(ids_test):
        df_id = df[df[trajectoryIDAttribute] == id]
        if maxPoints is not None:
            df_id = df_id.head(maxPoints)
        classe = df_id[targetAttribute].iloc[0]
        if maxPoints is not None:
            df[df[trajectoryIDAttribute] == id][attributes].head(maxPoints).to_csv(F"test/{id} s{id} c{classe}.r2", index=False, header=False)
        else:
            df[df[trajectoryIDAttribute] == id][attributes].to_csv(F"test/{id} s{id} c{classe}.r2", index=False, header=False)

prepareForShapelet(df0, "tid", "class", "t", "c1", "c2", trainPerc=.7, trajPerc=1)

  0%|          | 0/837 [00:00<?, ?it/s]

  0%|          | 0/359 [00:00<?, ?it/s]

In [None]:
# 25733s

In [14]:
movelet_train_df = pd.read_csv("data/Movelet_output/train.csv")
movelet_test_df = pd.read_csv("data/Movelet_output/test.csv")

movelet_train_df.head()

Unnamed: 0,sh_TID69_START24_SIZE2_CLASSC,sh_TID98_START110_SIZE2_CLASSC,sh_TID80_START23_SIZE1_CLASSC,sh_TID75_START90_SIZE2_CLASSC,sh_TID13_START140_SIZE13_CLASSE,sh_TID13_START7_SIZE132_CLASSE,sh_TID98_START118_SIZE2_CLASSC,sh_TID88_START99_SIZE2_CLASSC,class
0,1.588584,1.649389,1.57602,1.61177,0.272882,3.028427e-310,1.588081,1.597207,D
1,0.969382,0.474057,0.896539,0.629428,1.840714,7.079885e-310,0.994866,0.711483,C
2,0.775535,0.922867,0.539484,0.853516,1.671193,7.079885e-310,0.777712,0.751979,C
3,1.244408,1.323665,1.240741,1.283351,0.99876,4.977281e-310,1.249638,1.248647,D
4,1.61024,1.66512,1.605926,1.629457,0.287881,2.273328e-310,1.609066,1.618398,E


In [37]:
movelet_test_df.head()

Unnamed: 0,sh_TID69_START24_SIZE2_CLASSC,sh_TID98_START110_SIZE2_CLASSC,sh_TID80_START23_SIZE1_CLASSC,sh_TID75_START90_SIZE2_CLASSC,sh_TID13_START140_SIZE13_CLASSE,sh_TID13_START7_SIZE132_CLASSE,sh_TID98_START118_SIZE2_CLASSC,sh_TID88_START99_SIZE2_CLASSC,class
0,1.805489,1.836301,1.808063,1.817643,0.323427,4.473884e-310,1.805202,1.809469,E
1,1.127098,0.26902,1.138115,0.541418,1.845579,7.079885e-310,1.133103,0.753968,C
2,1.223616,1.304504,1.191131,1.263482,0.993015,5.58664e-310,1.228602,1.226847,D
3,1.233781,1.305168,1.181491,1.271689,1.660812,7.079885e-310,1.239996,1.234993,D
4,1.135216,0.981418,0.840591,1.17466,1.756175,7.079885e-310,1.14002,0.989248,D


[CV 3/10] END criterion=entropy, max_depth=2, n_estimators=300;, score=0.714 total time=   0.5s
[CV 8/10] END criterion=entropy, max_depth=2, n_estimators=900;, score=0.571 total time=   1.3s
[CV 8/10] END criterion=entropy, max_depth=7, n_estimators=600;, score=0.714 total time=   0.9s
[CV 6/10] END criterion=entropy, max_depth=7, n_estimators=1200;, score=0.857 total time=   1.8s
[CV 10/10] END criterion=entropy, max_depth=12, n_estimators=900;, score=0.714 total time=   1.3s
[CV 4/10] END criterion=entropy, max_depth=17, n_estimators=900;, score=0.571 total time=   1.3s
[CV 2/10] END criterion=gini, max_depth=2, n_estimators=600;, score=0.714 total time=   0.9s
[CV 7/10] END criterion=gini, max_depth=2, n_estimators=900;, score=0.571 total time=   1.3s
[CV 7/10] END criterion=gini, max_depth=7, n_estimators=600;, score=0.857 total time=   0.9s
[CV 2/10] END criterion=gini, max_depth=12, n_estimators=300;, score=0.571 total time=   0.4s
[CV 6/10] END criterion=gini, max_depth=12, n_e

[CV 4/10] END criterion=entropy, max_depth=2, n_estimators=600;, score=0.571 total time=   0.9s
[CV 7/10] END criterion=entropy, max_depth=2, n_estimators=1200;, score=0.714 total time=   1.7s
[CV 8/10] END criterion=entropy, max_depth=7, n_estimators=1200;, score=0.714 total time=   1.7s
[CV 4/10] END criterion=entropy, max_depth=12, n_estimators=1200;, score=0.571 total time=   1.8s
[CV 10/10] END criterion=entropy, max_depth=17, n_estimators=900;, score=0.714 total time=   1.4s
[CV 3/10] END criterion=gini, max_depth=2, n_estimators=900;, score=0.714 total time=   1.3s
[CV 1/10] END criterion=gini, max_depth=7, n_estimators=600;, score=0.625 total time=   0.9s
[CV 5/10] END criterion=gini, max_depth=7, n_estimators=900;, score=1.000 total time=   1.3s
[CV 9/10] END criterion=gini, max_depth=12, n_estimators=600;, score=0.714 total time=   0.9s
[CV 4/10] END criterion=gini, max_depth=17, n_estimators=300;, score=0.571 total time=   0.4s
[CV 7/10] END criterion=gini, max_depth=17, n_e

[CV 1/10] END criterion=entropy, max_depth=2, n_estimators=300;, score=0.750 total time=   0.5s
[CV 6/10] END criterion=entropy, max_depth=2, n_estimators=900;, score=0.714 total time=   1.3s
[CV 4/10] END criterion=entropy, max_depth=7, n_estimators=600;, score=0.571 total time=   0.9s
[CV 2/10] END criterion=entropy, max_depth=7, n_estimators=1200;, score=0.571 total time=   1.8s
[CV 8/10] END criterion=entropy, max_depth=12, n_estimators=900;, score=0.714 total time=   1.3s
[CV 2/10] END criterion=entropy, max_depth=17, n_estimators=900;, score=0.571 total time=   1.4s
[CV 10/10] END criterion=gini, max_depth=2, n_estimators=300;, score=0.857 total time=   0.4s
[CV 10/10] END criterion=gini, max_depth=2, n_estimators=600;, score=0.714 total time=   0.9s
[CV 3/10] END criterion=gini, max_depth=7, n_estimators=300;, score=1.000 total time=   0.4s
[CV 7/10] END criterion=gini, max_depth=7, n_estimators=300;, score=0.714 total time=   0.4s
[CV 6/10] END criterion=gini, max_depth=7, n_es

[CV 10/10] END criterion=entropy, max_depth=2, n_estimators=300;, score=0.714 total time=   0.4s
[CV 3/10] END criterion=entropy, max_depth=2, n_estimators=1200;, score=0.714 total time=   1.8s
[CV 7/10] END criterion=entropy, max_depth=7, n_estimators=900;, score=0.714 total time=   1.3s
[CV 5/10] END criterion=entropy, max_depth=12, n_estimators=600;, score=1.000 total time=   0.9s
[CV 8/10] END criterion=entropy, max_depth=12, n_estimators=1200;, score=0.714 total time=   1.8s
[CV 2/10] END criterion=entropy, max_depth=17, n_estimators=1200;, score=0.571 total time=   1.7s
[CV 9/10] END criterion=gini, max_depth=2, n_estimators=900;, score=0.714 total time=   1.3s
[CV 2/10] END criterion=gini, max_depth=7, n_estimators=900;, score=0.571 total time=   1.3s
[CV 10/10] END criterion=gini, max_depth=12, n_estimators=300;, score=0.714 total time=   0.4s
[CV 7/10] END criterion=gini, max_depth=12, n_estimators=600;, score=0.714 total time=   0.9s
[CV 10/10] END criterion=gini, max_depth=1

[CV 6/10] END criterion=entropy, max_depth=2, n_estimators=600;, score=0.714 total time=   0.9s
[CV 8/10] END criterion=entropy, max_depth=2, n_estimators=1200;, score=0.571 total time=   1.7s
[CV 9/10] END criterion=entropy, max_depth=7, n_estimators=1200;, score=0.857 total time=   1.8s
[CV 2/10] END criterion=entropy, max_depth=17, n_estimators=300;, score=0.571 total time=   0.4s
[CV 6/10] END criterion=entropy, max_depth=17, n_estimators=300;, score=0.714 total time=   0.4s
[CV 5/10] END criterion=entropy, max_depth=17, n_estimators=600;, score=1.000 total time=   0.9s
[CV 5/10] END criterion=entropy, max_depth=17, n_estimators=1200;, score=1.000 total time=   1.8s
[CV 5/10] END criterion=gini, max_depth=2, n_estimators=1200;, score=0.714 total time=   1.7s
[CV 9/10] END criterion=gini, max_depth=7, n_estimators=900;, score=0.857 total time=   1.3s
[CV 1/10] END criterion=gini, max_depth=12, n_estimators=900;, score=0.750 total time=   1.3s
[CV 1/10] END criterion=gini, max_depth=

[CV 2/10] END criterion=entropy, max_depth=2, n_estimators=300;, score=0.714 total time=   0.5s
[CV 2/10] END criterion=entropy, max_depth=2, n_estimators=1200;, score=0.571 total time=   1.8s
[CV 6/10] END criterion=entropy, max_depth=7, n_estimators=900;, score=0.857 total time=   1.4s
[CV 8/10] END criterion=entropy, max_depth=12, n_estimators=600;, score=0.714 total time=   0.9s
[CV 10/10] END criterion=entropy, max_depth=12, n_estimators=1200;, score=0.714 total time=   1.8s
[CV 6/10] END criterion=entropy, max_depth=17, n_estimators=1200;, score=0.857 total time=   1.8s
[CV 4/10] END criterion=gini, max_depth=2, n_estimators=1200;, score=0.571 total time=   1.7s
[CV 1/10] END criterion=gini, max_depth=7, n_estimators=1200;, score=0.875 total time=   1.8s
[CV 7/10] END criterion=gini, max_depth=12, n_estimators=900;, score=0.714 total time=   1.3s
[CV 1/10] END criterion=gini, max_depth=17, n_estimators=900;, score=0.625 total time=   1.3s
[CV 6/10] END criterion=entropy, max_dept

In [16]:
X_train = movelet_train_df.values[:, :-1]
y_train = movelet_train_df.values[:, -1]

X_test = movelet_test_df.values[:, :-1]
y_test = movelet_test_df.values[:, -1]

In [48]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree

def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                results['mean_test_score'][candidate],
                results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

from sklearn.model_selection import GridSearchCV
parameters = {'n_estimators': range(300, 1500, 300),
              'criterion':["entropy", "gini"],
              'max_depth': range(2, 20, 5)}

clf = GridSearchCV(RandomForestClassifier(n_jobs=1), parameters, cv=3, n_jobs=24, verbose=3, scoring="accuracy")

clf.fit(X_train, y_train)

report(clf.cv_results_, n_top=3)

Fitting 3 folds for each of 32 candidates, totalling 96 fits
Model with rank: 1
Mean validation score: 0.747 (std: 0.030)
Parameters: {'criterion': 'entropy', 'max_depth': 17, 'n_estimators': 600}

Model with rank: 2
Mean validation score: 0.733 (std: 0.049)
Parameters: {'criterion': 'entropy', 'max_depth': 7, 'n_estimators': 900}

Model with rank: 2
Mean validation score: 0.733 (std: 0.049)
Parameters: {'criterion': 'entropy', 'max_depth': 12, 'n_estimators': 300}

Model with rank: 2
Mean validation score: 0.733 (std: 0.049)
Parameters: {'criterion': 'entropy', 'max_depth': 12, 'n_estimators': 600}

Model with rank: 2
Mean validation score: 0.733 (std: 0.049)
Parameters: {'criterion': 'entropy', 'max_depth': 12, 'n_estimators': 900}

Model with rank: 2
Mean validation score: 0.733 (std: 0.049)
Parameters: {'criterion': 'entropy', 'max_depth': 12, 'n_estimators': 1200}

Model with rank: 2
Mean validation score: 0.733 (std: 0.049)
Parameters: {'criterion': 'entropy', 'max_depth': 17, 'n

In [49]:
#rf = RandomForestClassifier(max_depth=7, random_state=3, n_jobs=-1, n_estimators=300, criterion="entropy")
rf = clf.best_estimator_

rf.fit(X_train, y_train)

from sklearn import metrics
y_pred=rf.predict(X_test)


from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           C       0.67      0.89      0.76         9
           D       0.50      0.25      0.33        12
           E       0.38      0.50      0.43        10

    accuracy                           0.52        31
   macro avg       0.52      0.55      0.51        31
weighted avg       0.51      0.52      0.49        31



In [46]:
clf.best_estimator_

In [47]:
rf = clf.best_estimator_
#clf = tree.DecisionTreeClassifier(max_depth=2)

rf.fit(X_test, y_test)

from sklearn import metrics
y_pred=rf.predict(X_train)


from sklearn.metrics import classification_report

print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           C       0.95      0.76      0.84        25
           D       0.35      0.39      0.37        18
           E       0.45      0.50      0.47        28

    accuracy                           0.56        71
   macro avg       0.58      0.55      0.56        71
weighted avg       0.60      0.56      0.58        71



## Geolet

In [39]:
from cri98tj.distancers.Euclidean_distancer import Euclidean_distancer
from cri98tj.distancers.DTW_distancer import DTW_distancer
from cri98tj.distancers.InterpolatedRootDistance_distancer import InterpolatedRootDistance_distancer, \
    InterpolatedRootDistanceBestFitting
from cri98tj.normalizers.FirstPoint_normalizer import FirstPoint_normalizer
from cri98tj.normalizers.normalizer_utils import dataframe_pivot
from cri98tj.partitioners.Geohash_partitioner import Geohash_partitioner
from cri98tj.partitioners.Voronoi_partitioner import Voronoi_partitioner
from cri98tj.selectors.RandomInformationGain_selector import RandomInformationGain_selector
from cri98tj.selectors.Random_selector import Random_selector
from cri98tj.selectors.RandomOrderline_selector import RandomOrderline_selector
from cri98tj.TrajectoryTransformer import TrajectoryTransformer
from sklearn.ensemble import RandomForestClassifier

  from tqdm.autonotebook import tqdm


In [40]:
df_geo = df0[["tid", "class", "c1", "c2", "t"]].copy()

df_geo.head()

Unnamed: 0,tid,class,c1,c2,t
0,0,public,39.894178,116.3182,1206716000.0
1,0,public,39.894505,116.321132,1206716000.0
2,0,public,39.894953,116.326452,1206716000.0
3,0,public,39.8946,116.332542,1206716000.0
4,0,public,39.889622,116.33704,1206716000.0


In [41]:
%%time

from sklearn.model_selection import train_test_split

spatioTemporalCols = ["c1", "c2", "t"]
tid_train, tid_test, _, _ = train_test_split(df_geo.groupby(by=["tid"]).max().reset_index()["tid"],
                                                            df_geo.groupby(by=["tid"]).max().reset_index()["class"],
                                                            test_size=.3,
                                                            stratify=df_geo.groupby(by=["tid"]).max().reset_index()["class"],
                                                            random_state=3)

partitioner = Geohash_partitioner(precision=6, spatioTemporalColumns=spatioTemporalCols)


normalizer = FirstPoint_normalizer(spatioTemporalColumns=spatioTemporalCols, fillna=None)
#selector = Random_selector(movelets_per_class=20, normalizer=normalizer,
#                                   spatioTemporalColumns=spatioTemporalCols)
selector = RandomInformationGain_selector(top_k=50, bestFittingMeasure=InterpolatedRootDistanceBestFitting,
                                                  movelets_per_class=100, trajectories_for_orderline=50, n_jobs=24,
                                                  spatioTemporalColumns=spatioTemporalCols, normalizer=normalizer)
distancer = InterpolatedRootDistance_distancer(normalizer=normalizer, spatioTemporalColumns=spatioTemporalCols, n_jobs=24)

part = partitioner.fit_transform(df_geo[df_geo.tid.isin(tid_train)].values)
shapelets = selector.fit_transform(part)
best_is, dist_np = distancer.fit_transform((df_geo.values, shapelets))

Encoding 956839 points with precision 6


  0%|          | 0/956839 [00:00<?, ?it/s]

Cutting sub-trajectories length at 622.0 over 1039
Pivoting tables


  0%|          | 0/7963 [00:00<?, ?it/s]

Cutting sub-trajectories length at 15937.949999999997 over 19346
Pivoting tables


  0%|          | 0/64753 [00:00<?, ?it/s]

Computing scores


  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  y = column_or_1d(y, warn=True)


0.	 score=0.3542983912509887
1.	 score=0.35098936694196436
2.	 score=0.2820011843299428
3.	 score=0.2505899289186875
4.	 score=0.24249346982222852
5.	 score=0.23029311462187319
6.	 score=0.20316779549655406
7.	 score=0.19092276275152154
8.	 score=0.17751281159156984
9.	 score=0.16482008764884615
10.	 score=0.1598183671471256
11.	 score=0.1594203207490792
12.	 score=0.15353287486163314
13.	 score=0.15189823172699013
14.	 score=0.15178684311560153
15.	 score=0.15031093013968855
16.	 score=0.1435997442667385
17.	 score=0.13779239312115177
18.	 score=0.13624398074921018
19.	 score=0.1352705260992848
20.	 score=0.13504974687850546
21.	 score=0.13392176375052212
22.	 score=0.12216685199561073
23.	 score=0.11869188252064111
24.	 score=0.11724643907519772
25.	 score=0.11059527651227041
26.	 score=0.11044512927388794
27.	 score=0.11018669326545161
28.	 score=0.10428626362972793
29.	 score=0.10063497047843484
30.	 score=0.09862067594943458
31.	 score=0.09331265064140903
32.	 score=0.092207256036

  0%|          | 0/1410006 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

Collecting distances from 50


  0%|          | 0/50 [00:00<?, ?it/s]

CPU times: user 1min 2s, sys: 21.8 s, total: 1min 24s
Wall time: 2h 42min 50s


In [45]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree

def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                results['mean_test_score'][candidate],
                results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

from sklearn.model_selection import GridSearchCV
parameters = {'n_estimators': range(300, 2000, 300),
              'criterion':["entropy", "gini"],
              'max_depth': range(2, 30, 5)}

clf = GridSearchCV(RandomForestClassifier(n_jobs=12), parameters, cv=5, n_jobs=2, verbose=3, scoring="accuracy")

clf.fit(X_train, y_train)

report(clf.cv_results_, n_top=3)

Fitting 5 folds for each of 72 candidates, totalling 360 fits
[CV 1/5] END criterion=entropy, max_depth=2, n_estimators=300;, score=0.833 total time=   0.4s
[CV 3/5] END criterion=entropy, max_depth=2, n_estimators=300;, score=0.838 total time=   0.4s
[CV 5/5] END criterion=entropy, max_depth=2, n_estimators=300;, score=0.838 total time=   0.4s
[CV 2/5] END criterion=entropy, max_depth=2, n_estimators=600;, score=0.857 total time=   0.8s
[CV 4/5] END criterion=entropy, max_depth=2, n_estimators=600;, score=0.856 total time=   0.8s
[CV 1/5] END criterion=entropy, max_depth=2, n_estimators=900;, score=0.833 total time=   1.2s
[CV 3/5] END criterion=entropy, max_depth=2, n_estimators=900;, score=0.844 total time=   1.1s
[CV 5/5] END criterion=entropy, max_depth=2, n_estimators=900;, score=0.838 total time=   1.2s
[CV 2/5] END criterion=entropy, max_depth=2, n_estimators=1200;, score=0.851 total time=   1.8s
[CV 4/5] END criterion=entropy, max_depth=2, n_estimators=1200;, score=0.856 total

[CV 2/5] END criterion=entropy, max_depth=2, n_estimators=300;, score=0.845 total time=   0.5s
[CV 4/5] END criterion=entropy, max_depth=2, n_estimators=300;, score=0.850 total time=   0.4s
[CV 1/5] END criterion=entropy, max_depth=2, n_estimators=600;, score=0.833 total time=   0.9s
[CV 3/5] END criterion=entropy, max_depth=2, n_estimators=600;, score=0.838 total time=   0.8s
[CV 5/5] END criterion=entropy, max_depth=2, n_estimators=600;, score=0.832 total time=   0.8s
[CV 2/5] END criterion=entropy, max_depth=2, n_estimators=900;, score=0.851 total time=   1.2s
[CV 4/5] END criterion=entropy, max_depth=2, n_estimators=900;, score=0.856 total time=   1.2s
[CV 1/5] END criterion=entropy, max_depth=2, n_estimators=1200;, score=0.827 total time=   1.6s
[CV 3/5] END criterion=entropy, max_depth=2, n_estimators=1200;, score=0.844 total time=   1.6s
[CV 5/5] END criterion=entropy, max_depth=2, n_estimators=1200;, score=0.832 total time=   1.5s
[CV 2/5] END criterion=entropy, max_depth=2, n_

[CV 3/5] END criterion=entropy, max_depth=27, n_estimators=1200;, score=0.886 total time=   1.6s
[CV 5/5] END criterion=entropy, max_depth=27, n_estimators=1200;, score=0.886 total time=   1.6s
[CV 2/5] END criterion=entropy, max_depth=27, n_estimators=1500;, score=0.899 total time=   2.0s
[CV 4/5] END criterion=entropy, max_depth=27, n_estimators=1500;, score=0.868 total time=   2.0s
[CV 1/5] END criterion=entropy, max_depth=27, n_estimators=1800;, score=0.875 total time=   2.5s
[CV 3/5] END criterion=entropy, max_depth=27, n_estimators=1800;, score=0.880 total time=   2.4s
[CV 5/5] END criterion=entropy, max_depth=27, n_estimators=1800;, score=0.886 total time=   2.4s
[CV 4/5] END criterion=gini, max_depth=2, n_estimators=300;, score=0.850 total time=   0.4s
[CV 1/5] END criterion=gini, max_depth=2, n_estimators=600;, score=0.827 total time=   0.8s
[CV 3/5] END criterion=gini, max_depth=2, n_estimators=600;, score=0.844 total time=   0.8s
[CV 5/5] END criterion=gini, max_depth=2, n_e

In [56]:
rf = RandomForestClassifier(n_jobs=12, criterion='gini', max_depth= 15, n_estimators=400)#clf.best_estimator_

dist_np_df = pd.DataFrame(dist_np)
X = dist_np_df.drop(columns=[0]).values
y = dist_np_df[0].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, stratify=y, random_state=3)

rf.fit(X_train, y_train)

from sklearn.metrics import classification_report

y_pred = rf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

     private       0.83      0.88      0.85       173
      public       0.89      0.83      0.86       186

    accuracy                           0.86       359
   macro avg       0.86      0.86      0.86       359
weighted avg       0.86      0.86      0.86       359

[CV 2/5] END criterion=gini, max_depth=27, n_estimators=1800;, score=0.893 total time=   2.4s
[CV 4/5] END criterion=gini, max_depth=27, n_estimators=1800;, score=0.874 total time=   2.6s
[CV 4/5] END criterion=gini, max_depth=27, n_estimators=300;, score=0.874 total time=   0.4s
[CV 1/5] END criterion=gini, max_depth=27, n_estimators=600;, score=0.881 total time=   0.9s
[CV 3/5] END criterion=gini, max_depth=27, n_estimators=600;, score=0.868 total time=   0.8s
[CV 5/5] END criterion=gini, max_depth=27, n_estimators=600;, score=0.886 total time=   0.8s
[CV 2/5] END criterion=gini, max_depth=27, n_estimators=900;, score=0.887 total time=   1.2s
[CV 4/5] END criteri