In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import math
import glob
from datetime import datetime

In [2]:
df = pd.read_csv("data/seabird_prepared.csv")
df.head()

Unnamed: 0,lat,lon,alt,tid,bird,species,year,date_time,max_depth.m,colony2
0,56.095451,-6.233089,-23.059999,1340627854,1,tCOGU,t2012,2012-06-25 13:37:34,-2.172046,1
1,56.095408,-6.23352,-2.983077,1340627954,1,tCOGU,t2012,2012-06-25 13:39:14,-1.152306,1
2,56.095437,-6.234275,3.470286,1340628054,1,tCOGU,t2012,2012-06-25 13:40:54,-2.172046,1
3,56.095635,-6.234815,1.902667,1340628154,1,tCOGU,t2012,2012-06-25 13:42:34,-2.172046,1
4,56.095821,-6.235293,2.824952,1340628254,1,tCOGU,t2012,2012-06-25 13:44:14,-2.172046,1


In [3]:
df.date_time = df.date_time.apply(lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S').timestamp())

df.head()

Unnamed: 0,lat,lon,alt,tid,bird,species,year,date_time,max_depth.m,colony2
0,56.095451,-6.233089,-23.059999,1340627854,1,tCOGU,t2012,1340624000.0,-2.172046,1
1,56.095408,-6.23352,-2.983077,1340627954,1,tCOGU,t2012,1340624000.0,-1.152306,1
2,56.095437,-6.234275,3.470286,1340628054,1,tCOGU,t2012,1340624000.0,-2.172046,1
3,56.095635,-6.234815,1.902667,1340628154,1,tCOGU,t2012,1340625000.0,-2.172046,1
4,56.095821,-6.235293,2.824952,1340628254,1,tCOGU,t2012,1340625000.0,-2.172046,1


In [4]:
df0 = df[["bird", "species", "lat", "lon", "date_time"]]\
        .rename(columns={"lat": "c1", "lon": "c2", "date_time": "t", "species": "class", "bird": "tid"})

df0.head()

Unnamed: 0,tid,class,c1,c2,t
0,1,tCOGU,56.095451,-6.233089,1340624000.0
1,1,tCOGU,56.095408,-6.23352,1340624000.0
2,1,tCOGU,56.095437,-6.234275,1340624000.0
3,1,tCOGU,56.095635,-6.234815,1340625000.0
4,1,tCOGU,56.095821,-6.235293,1340625000.0


## Rocket

In [5]:
t_rocket = 0

start = datetime.now()

df0["pos"] = df0.groupby(['tid']).cumcount()

t_rocket = (datetime.now() - start).total_seconds() * 1000 #ms

In [6]:
start = datetime.now()


def padLastValue(df):
    array = df.values
    for i in tqdm(range(len(array))):
        for j in range(len(array[i])):
            if math.isnan(array[i][j]):
                array[i][j] =array[i][j-1]
    return pd.DataFrame(array)

percentile=.9

print("LAT")
df_lat = pd.DataFrame()
df_lat = df0.groupby(['tid','pos'])['c1'].max().unstack()
df_lat = padLastValue(df_lat)

print("LON")
df_lon = pd.DataFrame()
df_lon = df0.groupby(['tid','pos'])['c2'].max().unstack()
df_lon = padLastValue(df_lon)

print("TIME")
df_time = pd.DataFrame()
df_time = df0.groupby(['tid','pos'])['t'].max().unstack()
df_time = padLastValue(df_time)

t_rocket = (datetime.now() - start).total_seconds() * 1000 #ms

LAT


  0%|          | 0/108 [00:00<?, ?it/s]

LON


  0%|          | 0/108 [00:00<?, ?it/s]

TIME


  0%|          | 0/108 [00:00<?, ?it/s]

In [7]:
start = datetime.now()

from sktime.transformations.panel.rocket import Rocket
from sktime.datatypes._panel._convert import from_2d_array_to_nested

nested_lat = from_2d_array_to_nested(df_lat).rename(columns={0: "dim_0"})
nested_lon = from_2d_array_to_nested(df_lon).rename(columns={0: "dim_0"})
nested_time = from_2d_array_to_nested(df_time).rename(columns={0: "dim_0"})


nested = nested_lat
#nested = nested_UTMGridNorth

nested["dim_1"] = nested_lon["dim_0"]
nested["dim_2"] = nested_time["dim_0"]

nested.head()

y = df0.groupby(['tid'])['class'].max()

t_rocket = (datetime.now() - start).total_seconds() * 1000 #ms

In [8]:
start = datetime.now()

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(nested, y, test_size=.3, stratify=y)

rocket = Rocket(n_jobs=-1, num_kernels=10000)  # by default, ROCKET uses 10,000 kernels
rocket.fit(X_train)
X_train_transform = rocket.transform(X_train)
X_test_transform = rocket.transform(X_test)

t_rocket = (datetime.now() - start).total_seconds() * 1000 #ms

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree

def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                results['mean_test_score'][candidate],
                results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

from sklearn.model_selection import GridSearchCV
parameters = {'n_estimators': range(300, 1500, 300),
              'criterion':["entropy", "gini"],
              'max_depth': range(2, 20, 5)}

clf = GridSearchCV(RandomForestClassifier(n_jobs=1), parameters, cv=10, n_jobs=24, verbose=3, scoring="balanced_accuracy")

clf.fit(X_train_transform, y_train)

report(clf.cv_results_, n_top=3)

Fitting 10 folds for each of 32 candidates, totalling 320 fits
Model with rank: 1
Mean validation score: 0.583 (std: 0.172)
Parameters: {'criterion': 'entropy', 'max_depth': 7, 'n_estimators': 600}

Model with rank: 2
Mean validation score: 0.578 (std: 0.170)
Parameters: {'criterion': 'entropy', 'max_depth': 7, 'n_estimators': 900}

Model with rank: 3
Mean validation score: 0.576 (std: 0.266)
Parameters: {'criterion': 'gini', 'max_depth': 7, 'n_estimators': 600}



In [10]:
rf = clf.best_estimator_

rf.fit(X_train_transform, y_train)

from sklearn import metrics
y_pred=rf.predict(X_test_transform)


from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       tCOGU       0.57      0.44      0.50         9
       tEUSH       0.00      0.00      0.00         5
       tRAZO       0.62      0.84      0.71        19

    accuracy                           0.61        33
   macro avg       0.40      0.43      0.40        33
weighted avg       0.51      0.61      0.55        33



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [12]:
t_rocket

51808.375

## Movelets

In [None]:
import random
import os


def prepareForShapelet(df=pd.DataFrame, trajectoryIDAttribute= None, targetAttribute=None,
                       timestampAttribute=None, latAttr=None, lonAttr=None, trainPerc=.80, trajPerc=1.0, maxPoints=None):
    attributes = [timestampAttribute, "latLon"]+ [x for x in list(df.columns) if x not in [trajectoryIDAttribute, targetAttribute, timestampAttribute, latAttr, lonAttr]] #mi assicuro di avere il timestam come primo elemento
    if(os.path.exists("train") | os.path.exists("test")):
        print("Le cartelle train e test esistono già!")
        return

    df["latLon"] = df[latAttr].astype(str) + " " + df[lonAttr].astype(str)

    ids = list(df[trajectoryIDAttribute].unique())
    #ids = random.sample(ids, round(len(ids)*trajPerc))
    ids = ids[:round(len(ids)*trajPerc)]
    ids_train = random.sample(ids, round(trainPerc*len(ids)))
    ids_test = [x for x in ids if x not in ids_train]

    os.makedirs("train")
    os.makedirs("test")

    for id in tqdm(ids_train):
        df_id = df[df[trajectoryIDAttribute] == id]
        classe = df_id[targetAttribute].iloc[0]
        if maxPoints is not None:
            df[df[trajectoryIDAttribute] == id][attributes].head(maxPoints).to_csv(F"train/{id} s{id} c{classe}.r2", index=False, header=False)
        else:
            df[df[trajectoryIDAttribute] == id][attributes].to_csv(F"train/{id} s{id} c{classe}.r2", index=False, header=False)

    for id in tqdm(ids_test):
        df_id = df[df[trajectoryIDAttribute] == id]
        if maxPoints is not None:
            df_id = df_id.head(maxPoints)
        classe = df_id[targetAttribute].iloc[0]
        if maxPoints is not None:
            df[df[trajectoryIDAttribute] == id][attributes].head(maxPoints).to_csv(F"test/{id} s{id} c{classe}.r2", index=False, header=False)
        else:
            df[df[trajectoryIDAttribute] == id][attributes].to_csv(F"test/{id} s{id} c{classe}.r2", index=False, header=False)

prepareForShapelet(df0, "tid", "class", "t", "c1", "c2", trainPerc=.7, trajPerc=1)

In [65]:
movelet_train_df = pd.read_csv("data/Movelet_output/train.csv")
movelet_test_df = pd.read_csv("data/Movelet_output/test.csv")

movelet_train_df.head()

Unnamed: 0,sh_TID37_START1484_SIZE51_CLASStCOGU,sh_TID17_START2799_SIZE18_CLASStCOGU,sh_TID8_START813_SIZE95_CLASStCOGU,sh_TID37_START658_SIZE86_CLASStCOGU,sh_TID1_START2014_SIZE7_CLASStCOGU,sh_TID32_START90_SIZE27_CLASStCOGU,sh_TID84_START987_SIZE2_CLASStRAZO,sh_TID103_START2212_SIZE2_CLASStRAZO,sh_TID81_START223_SIZE39_CLASStRAZO,sh_TID15_START2340_SIZE18_CLASStCOGU,sh_TID23_START1837_SIZE83_CLASStCOGU,sh_TID39_START193_SIZE96_CLASStRAZO,sh_TID17_START435_SIZE18_CLASStCOGU,sh_TID46_START981_SIZE13_CLASStRAZO,sh_TID40_START749_SIZE13_CLASStRAZO,sh_TID9_START1301_SIZE92_CLASStCOGU,sh_TID17_START3076_SIZE19_CLASStCOGU,class
0,1.625842,0.985065,0.9874,1.627654,0.981097,0.906283,1.560816,1.496875,1.570457,0.664531,0.987076,1.612939,0.947763,1.600724,1.603072,0.972232,0.98827,tCOGU
1,1.62561,1.004802,1.015268,1.6274,1.018796,0.985898,1.56076,1.496705,1.570197,1.001187,1.009078,1.612765,0.922504,1.60048,1.602819,1.009958,1.008859,tEUSH
2,1.945494,1.776285,1.770864,1.943541,1.7906,1.782295,1.972289,0.075381,1.958965,1.773291,1.772221,1.961741,1.778535,1.971062,1.968936,1.775907,1.778065,tRAZO
3,1.943944,1.774224,1.768818,1.941982,1.78859,1.780203,1.970683,0.072461,1.95726,1.771228,1.770183,1.960246,1.776458,1.969596,1.967466,1.773868,1.776,tRAZO
4,1.94623,1.773533,1.772168,1.945589,1.787647,1.779918,1.97063,0.071628,1.958964,1.770542,1.772909,1.963817,1.775789,1.969943,1.967828,1.777048,1.775346,tRAZO


In [66]:
movelet_train_df.values[:, -1]

array(['tCOGU', 'tEUSH', 'tRAZO', 'tRAZO', 'tRAZO', 'tCOGU', 'tCOGU',
       'tCOGU', 'tEUSH', 'tRAZO', 'tRAZO', 'tCOGU', 'tCOGU', 'tCOGU',
       'tRAZO', 'tRAZO', 'tRAZO', 'tCOGU', 'tCOGU', 'tRAZO', 'tRAZO',
       'tRAZO', 'tRAZO', 'tCOGU', 'tRAZO', 'tCOGU', 'tRAZO', 'tRAZO',
       'tRAZO', 'tRAZO', 'tRAZO', 'tRAZO', 'tCOGU', 'tCOGU', 'tCOGU',
       'tCOGU', 'tCOGU', 'tCOGU', 'tCOGU', 'tEUSH', 'tEUSH', 'tRAZO',
       'tEUSH', 'tRAZO', 'tRAZO', 'tRAZO', 'tEUSH', 'tEUSH', 'tRAZO',
       'tRAZO', 'tCOGU', 'tCOGU', 'tEUSH', 'tRAZO', 'tEUSH', 'tRAZO',
       'tRAZO', 'tCOGU', 'tRAZO', 'tRAZO', 'tRAZO', 'tRAZO', 'tRAZO',
       'tRAZO', 'tRAZO', 'tEUSH', 'tRAZO', 'tCOGU', 'tEUSH', 'tRAZO',
       'tRAZO', 'tEUSH', 'tRAZO', 'tRAZO', 'tRAZO', 'tRAZO'], dtype=object)

In [78]:
X_train = movelet_train_df.values[:, :-1]
y_train = movelet_train_df.values[:, -1]

X_test = movelet_test_df.values[:, :-1]
y_test = movelet_test_df.values[:, -1]

In [80]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree

def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                results['mean_test_score'][candidate],
                results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

from sklearn.model_selection import GridSearchCV
parameters = {'n_estimators': range(300, 1500, 300),
              'criterion':["entropy", "gini"],
              'max_depth': range(2, 20, 5)}

clf = GridSearchCV(RandomForestClassifier(n_jobs=1), parameters, cv=10, n_jobs=24, verbose=3, scoring="accuracy")

clf.fit(X_train, y_train)

report(clf.cv_results_, n_top=3)

Fitting 10 folds for each of 32 candidates, totalling 320 fits
Model with rank: 1
Mean validation score: 0.777 (std: 0.165)
Parameters: {'criterion': 'entropy', 'max_depth': 12, 'n_estimators': 300}

Model with rank: 2
Mean validation score: 0.764 (std: 0.171)
Parameters: {'criterion': 'entropy', 'max_depth': 7, 'n_estimators': 300}

Model with rank: 3
Mean validation score: 0.764 (std: 0.193)
Parameters: {'criterion': 'entropy', 'max_depth': 7, 'n_estimators': 900}

Model with rank: 3
Mean validation score: 0.764 (std: 0.193)
Parameters: {'criterion': 'entropy', 'max_depth': 17, 'n_estimators': 600}

Model with rank: 3
Mean validation score: 0.764 (std: 0.193)
Parameters: {'criterion': 'gini', 'max_depth': 7, 'n_estimators': 1200}



In [81]:
clf = RandomForestClassifier(max_depth=2, criterion='gini', random_state=5, n_jobs=-1, n_estimators=1200)
#clf = tree.DecisionTreeClassifier(max_depth=2)

clf.fit(X_train, y_train)

from sklearn import metrics
y_pred=clf.predict(X_test)


from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       tCOGU       0.71      0.56      0.63         9
       tEUSH       0.00      0.00      0.00         3
       tRAZO       0.72      0.90      0.80        20

    accuracy                           0.72        32
   macro avg       0.48      0.49      0.48        32
weighted avg       0.65      0.72      0.68        32



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[CV 10/10] END criterion=entropy, max_depth=2, n_estimators=300;, score=0.571 total time=   0.5s
[CV 2/10] END criterion=entropy, max_depth=2, n_estimators=1200;, score=0.625 total time=   1.8s
[CV 7/10] END criterion=entropy, max_depth=7, n_estimators=900;, score=1.000 total time=   1.4s
[CV 2/10] END criterion=entropy, max_depth=12, n_estimators=900;, score=0.625 total time=   1.4s
[CV 9/10] END criterion=entropy, max_depth=17, n_estimators=300;, score=0.571 total time=   0.5s
[CV 9/10] END criterion=entropy, max_depth=17, n_estimators=600;, score=0.571 total time=   0.9s
[CV 3/10] END criterion=gini, max_depth=2, n_estimators=300;, score=0.625 total time=   0.4s
[CV 5/10] END criterion=gini, max_depth=2, n_estimators=300;, score=0.750 total time=   0.4s
[CV 3/10] END criterion=gini, max_depth=2, n_estimators=600;, score=0.750 total time=   0.9s
[CV 9/10] END criterion=gini, max_depth=2, n_estimators=900;, score=0.571 total time=   1.3s
[CV 3/10] END criterion=gini, max_depth=7, n_es

[CV 6/10] END criterion=entropy, max_depth=2, n_estimators=600;, score=0.875 total time=   0.9s
[CV 9/10] END criterion=entropy, max_depth=2, n_estimators=1200;, score=0.571 total time=   1.8s
[CV 6/10] END criterion=entropy, max_depth=7, n_estimators=1200;, score=1.000 total time=   1.9s
[CV 4/10] END criterion=entropy, max_depth=12, n_estimators=1200;, score=0.625 total time=   1.9s
[CV 1/10] END criterion=entropy, max_depth=17, n_estimators=1200;, score=0.875 total time=   1.9s
[CV 2/10] END criterion=gini, max_depth=2, n_estimators=1200;, score=0.625 total time=   1.8s
[CV 10/10] END criterion=gini, max_depth=7, n_estimators=900;, score=0.571 total time=   1.4s
[CV 4/10] END criterion=gini, max_depth=12, n_estimators=900;, score=0.625 total time=   1.3s
[CV 1/10] END criterion=gini, max_depth=17, n_estimators=600;, score=0.875 total time=   0.9s
[CV 5/10] END criterion=gini, max_depth=17, n_estimators=900;, score=0.875 total time=   1.0s
[CV 7/10] END criterion=entropy, max_depth=2

[CV 5/10] END criterion=entropy, max_depth=2, n_estimators=300;, score=0.750 total time=   0.5s
[CV 3/10] END criterion=entropy, max_depth=2, n_estimators=1200;, score=0.625 total time=   1.9s
[CV 9/10] END criterion=entropy, max_depth=7, n_estimators=900;, score=0.571 total time=   1.4s
[CV 3/10] END criterion=entropy, max_depth=12, n_estimators=900;, score=0.500 total time=   1.4s
[CV 1/10] END criterion=entropy, max_depth=17, n_estimators=600;, score=0.875 total time=   0.9s
[CV 5/10] END criterion=entropy, max_depth=17, n_estimators=900;, score=0.875 total time=   1.4s
[CV 7/10] END criterion=gini, max_depth=2, n_estimators=600;, score=1.000 total time=   0.9s
[CV 1/10] END criterion=gini, max_depth=7, n_estimators=300;, score=0.875 total time=   0.5s
[CV 6/10] END criterion=gini, max_depth=7, n_estimators=300;, score=1.000 total time=   0.5s
[CV 4/10] END criterion=gini, max_depth=7, n_estimators=600;, score=0.625 total time=   0.9s
[CV 9/10] END criterion=gini, max_depth=7, n_est

## Geolet

In [13]:
from cri98tj.distancers.Euclidean_distancer import Euclidean_distancer
from cri98tj.distancers.DTW_distancer import DTW_distancer
from cri98tj.distancers.InterpolatedRootDistance_distancer import InterpolatedRootDistance_distancer, \
    InterpolatedRootDistanceBestFitting
from cri98tj.normalizers.FirstPoint_normalizer import FirstPoint_normalizer
from cri98tj.normalizers.normalizer_utils import dataframe_pivot
from cri98tj.partitioners.Geohash_partitioner import Geohash_partitioner
from cri98tj.partitioners.Voronoi_partitioner import Voronoi_partitioner
from cri98tj.selectors.RandomInformationGain_selector import RandomInformationGain_selector
from cri98tj.selectors.Random_selector import Random_selector
from cri98tj.selectors.RandomOrderline_selector import RandomOrderline_selector
from cri98tj.TrajectoryTransformer import TrajectoryTransformer
from sklearn.ensemble import RandomForestClassifier

  from tqdm.autonotebook import tqdm


In [46]:
df_geo = df0[["tid", "class", "c1", "c2", "t"]].copy()

In [38]:
%%time

from sklearn.model_selection import train_test_split

spatioTemporalCols = ["c1", "c2", "t"]
tid_train, tid_test, _, _ = train_test_split(df_geo.groupby(by=["tid"]).max().reset_index()["tid"],
                                                            df_geo.groupby(by=["tid"]).max().reset_index()["class"],
                                                            test_size=.3,
                                                            stratify=df_geo.groupby(by=["tid"]).max().reset_index()["class"],
                                                            random_state=3)

partitioner = Geohash_partitioner(precision=5, spatioTemporalColumns=spatioTemporalCols)


normalizer = FirstPoint_normalizer(spatioTemporalColumns=spatioTemporalCols, fillna=None)
#selector = Random_selector(movelets_per_class=20, normalizer=normalizer,
#                                   spatioTemporalColumns=spatioTemporalCols)
selector = RandomInformationGain_selector(top_k=50, bestFittingMeasure=InterpolatedRootDistanceBestFitting,
                                                  movelets_per_class=500, trajectories_for_orderline=50, n_jobs=24,
                                                  spatioTemporalColumns=spatioTemporalCols, normalizer=normalizer)
distancer = InterpolatedRootDistance_distancer(normalizer=normalizer, spatioTemporalColumns=spatioTemporalCols, n_jobs=24)

part = partitioner.fit_transform(df_geo[df_geo.tid.isin(tid_train)].values)
shapelets = selector.fit_transform(part)
best_is, dist_np = distancer.fit_transform((df_geo.values, shapelets))

Encoding 177660 points with precision 5


  0%|          | 0/177660 [00:00<?, ?it/s]

Cutting sub-trajectories length at 1834.0 over 2728
Pivoting tables


  0%|          | 0/24959 [00:00<?, ?it/s]

Cutting sub-trajectories length at 3195.550000000003 over 5210
Pivoting tables


  0%|          | 0/88929 [00:00<?, ?it/s]

Computing scores


  0%|          | 0/471 [00:00<?, ?it/s]

  0%|          | 0/471 [00:00<?, ?it/s]

[CV 3/5] END criterion=entropy, max_depth=2, n_estimators=1500;, score=0.667 total time=   2.3s
[CV 4/5] END criterion=entropy, max_depth=7, n_estimators=1800;, score=0.733 total time=   2.8s
[CV 3/5] END criterion=entropy, max_depth=17, n_estimators=600;, score=0.667 total time=   0.9s
[CV 2/5] END criterion=entropy, max_depth=17, n_estimators=1500;, score=0.867 total time=   2.3s
[CV 1/5] END criterion=gini, max_depth=2, n_estimators=1800;, score=0.667 total time=   2.7s
[CV 5/5] END criterion=gini, max_depth=12, n_estimators=300;, score=0.733 total time=   0.5s
[CV 2/5] END criterion=gini, max_depth=12, n_estimators=1200;, score=0.867 total time=   1.8s
[CV 3/5] END criterion=gini, max_depth=17, n_estimators=900;, score=0.667 total time=   1.4s
[CV 2/5] END criterion=entropy, max_depth=2, n_estimators=1200;, score=0.733 total time=   1.8s
[CV 5/5] END criterion=entropy, max_depth=7, n_estimators=900;, score=0.800 total time=   1.4s
[CV 4/5] END criterion=entropy, max_depth=12, n_est

[CV 1/5] END criterion=entropy, max_depth=2, n_estimators=1500;, score=0.667 total time=   2.3s
[CV 2/5] END criterion=entropy, max_depth=7, n_estimators=1800;, score=0.867 total time=   2.7s
[CV 5/5] END criterion=entropy, max_depth=17, n_estimators=300;, score=0.733 total time=   0.5s
[CV 5/5] END criterion=entropy, max_depth=17, n_estimators=900;, score=0.800 total time=   1.4s
[CV 4/5] END criterion=gini, max_depth=2, n_estimators=300;, score=0.667 total time=   0.5s
[CV 1/5] END criterion=gini, max_depth=2, n_estimators=900;, score=0.533 total time=   1.3s
[CV 5/5] END criterion=gini, max_depth=2, n_estimators=1800;, score=0.667 total time=   2.7s
[CV 3/5] END criterion=gini, max_depth=12, n_estimators=900;, score=0.667 total time=   1.3s
[CV 5/5] END criterion=gini, max_depth=12, n_estimators=1800;, score=0.800 total time=   2.7s
[CV 4/5] END criterion=entropy, max_depth=2, n_estimators=900;, score=0.667 total time=   1.4s
[CV 2/5] END criterion=entropy, max_depth=7, n_estimators

[CV 2/5] END criterion=entropy, max_depth=2, n_estimators=600;, score=0.800 total time=   0.9s
[CV 2/5] END criterion=entropy, max_depth=7, n_estimators=300;, score=0.800 total time=   0.5s
[CV 2/5] END criterion=entropy, max_depth=7, n_estimators=600;, score=0.867 total time=   0.9s
[CV 5/5] END criterion=entropy, max_depth=7, n_estimators=1200;, score=0.800 total time=   1.9s
[CV 5/5] END criterion=entropy, max_depth=12, n_estimators=1200;, score=0.800 total time=   1.9s
[CV 4/5] END criterion=entropy, max_depth=17, n_estimators=1200;, score=0.733 total time=   1.9s
[CV 3/5] END criterion=gini, max_depth=2, n_estimators=1200;, score=0.667 total time=   1.8s
[CV 3/5] END criterion=gini, max_depth=7, n_estimators=1200;, score=0.667 total time=   1.8s
[CV 1/5] END criterion=gini, max_depth=12, n_estimators=1200;, score=0.600 total time=   1.8s
[CV 1/5] END criterion=gini, max_depth=17, n_estimators=900;, score=0.600 total time=   1.4s
[CV 4/5] END criterion=gini, max_depth=17, n_estimat

  y = column_or_1d(y, warn=True)


0.	 score=0.4319367808044403
1.	 score=0.4252323139535297
2.	 score=0.34182318594367556
3.	 score=0.34046096679507065
4.	 score=0.33238401875167867
5.	 score=0.32530379607658655
6.	 score=0.3241678340148897
7.	 score=0.3158376835842165
8.	 score=0.3132302609929365
9.	 score=0.31065953636793653
10.	 score=0.31053782191449564
11.	 score=0.3100447157242421
12.	 score=0.3065042020282702
13.	 score=0.29899831624482776
14.	 score=0.29620844206743246
15.	 score=0.2876961338352062
16.	 score=0.28725182691561235
17.	 score=0.27797293064493833
18.	 score=0.2763972048341927
19.	 score=0.27496120368010923
20.	 score=0.26621738875615897
21.	 score=0.26348001124268694
22.	 score=0.2626025105272969
23.	 score=0.26060879865508757
24.	 score=0.2587872502999258
25.	 score=0.25462170204160905
26.	 score=0.2510706565262777
27.	 score=0.24844067428770278
28.	 score=0.24795806458175695
29.	 score=0.24714005291320373
30.	 score=0.24630127742674324
31.	 score=0.24314994448404836
32.	 score=0.2422065539706264


  0%|          | 0/263718 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

Collecting distances from 50


  0%|          | 0/50 [00:00<?, ?it/s]

CPU times: user 27.6 s, sys: 1.39 s, total: 29 s
Wall time: 48min 2s


In [39]:
dist_np_df = pd.DataFrame(dist_np)
X = dist_np_df.drop(columns=[0]).values
y = dist_np_df[0].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, stratify=y, random_state=3)

In [40]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree

def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                results['mean_test_score'][candidate],
                results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

from sklearn.model_selection import GridSearchCV
parameters = {'n_estimators': range(300, 2000, 300),
              'criterion':["entropy", "gini"],
              'max_depth': range(2, 20, 5)}

clf = GridSearchCV(RandomForestClassifier(n_jobs=1), parameters, cv=5, n_jobs=24, verbose=3, scoring="accuracy")

clf.fit(X_train, y_train)

report(clf.cv_results_, n_top=3)

Fitting 5 folds for each of 48 candidates, totalling 240 fits
Model with rank: 1
Mean validation score: 0.773 (std: 0.131)
Parameters: {'criterion': 'gini', 'max_depth': 7, 'n_estimators': 900}

Model with rank: 1
Mean validation score: 0.773 (std: 0.131)
Parameters: {'criterion': 'gini', 'max_depth': 12, 'n_estimators': 900}

Model with rank: 1
Mean validation score: 0.773 (std: 0.131)
Parameters: {'criterion': 'gini', 'max_depth': 17, 'n_estimators': 1200}

Model with rank: 1
Mean validation score: 0.773 (std: 0.131)
Parameters: {'criterion': 'gini', 'max_depth': 17, 'n_estimators': 1800}



In [41]:
clf = clf.best_estimator_

clf.fit(X_train, y_train)

from sklearn.metrics import classification_report

y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       tCOGU       0.67      0.67      0.67         9
       tEUSH       0.75      0.60      0.67         5
       tRAZO       0.75      0.79      0.77        19

    accuracy                           0.73        33
   macro avg       0.72      0.69      0.70        33
weighted avg       0.73      0.73      0.73        33

