# Vehicles

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import math
import glob
from datetime import datetime

In [2]:
# Load the prepared vehicle trajectories, ordered by trajectory id and timestamp.
df_original = pd.read_csv('data/vehicles_preapred.zip').sort_values(by=["tid", "t"])
# Rescale both coordinates by 1e5 (presumably they are stored as scaled integers -- TODO confirm units).
df_original["c1"] = df_original.c1/100000
df_original["c2"] = df_original.c2/100000

df = df_original[["tid", "class", "c1", "c2", "t"]].copy()

# Independent working copy: later cells add columns to df0, and writing into a
# non-copied column selection triggers pandas' SettingWithCopyWarning.
df0 = df[["tid", "class", "c1", "c2", "t"]].copy()

df.head()

Unnamed: 0,tid,class,c1,c2,t
0,30901,B,42.07716,4.738411,0
1,30901,B,42.077246,4.739088,30
2,30901,B,42.077259,4.739096,60
3,30901,B,42.077369,4.739158,90
4,30901,B,42.077635,4.739343,120


## Rocket

In [6]:
# ROCKET timer in milliseconds; reset here, in the first cell of the ROCKET section.
t_rocket = 0

start = datetime.now()

# Index of every sample inside its own trajectory (0, 1, 2, ...). `assign` builds a
# new frame, so nothing is written into a slice of `df` (no SettingWithCopyWarning).
df0 = df0.assign(pos=df0.groupby(['tid']).cumcount())

t_rocket += (datetime.now() - start).total_seconds() * 1000 #ms

In [7]:
# Start the wall-clock timer for this cell.
start = datetime.now()


def padLastValue(df):
    """Forward-fill missing values along each row and return a fresh DataFrame.

    Every NaN is replaced by the nearest non-NaN value to its left in the same
    row. A NaN with no valid value to its left (for example in the first
    column) stays NaN rather than wrapping around to the row's last value.

    The input frame is not modified. The result always has a default integer
    index and default integer column labels, whatever the labels of ``df`` are.
    The fill is vectorised, so no progress bar is displayed.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric table whose rows are padded independently of each other.

    Returns
    -------
    pandas.DataFrame
        Table of the same shape as ``df`` with the gaps filled.
    """
    filled = df.ffill(axis=1)
    # Rebuild from the raw values so index and columns are reset to 0..n-1.
    return pd.DataFrame(filled.values)

percentile=.9

# Pivot each attribute into a wide table with one row per trajectory (tid) and one
# column per position, then fill the gaps left by shorter trajectories.
print("LAT")
df_lat = df0.groupby(['tid','pos'])['c1'].max().unstack()
df_lat = padLastValue(df_lat)

print("LON")
df_lon = df0.groupby(['tid','pos'])['c2'].max().unstack()
df_lon = padLastValue(df_lon)

print("TIME")
df_time = df0.groupby(['tid','pos'])['t'].max().unstack()
df_time = padLastValue(df_time)

# Add this cell's elapsed time (ms) to the ROCKET timer instead of overwriting it.
t_rocket += (datetime.now() - start).total_seconds() * 1000 #ms

LAT


  0%|          | 0/381 [00:00<?, ?it/s]

LON


  0%|          | 0/381 [00:00<?, ?it/s]

TIME


  0%|          | 0/381 [00:00<?, ?it/s]

In [8]:
start = datetime.now()

from sktime.transformations.panel.rocket import Rocket
from sktime.datatypes._panel._convert import from_2d_array_to_nested

# Convert each wide table to sktime's nested layout; the resulting column 0 is renamed to dim_0.
nested_lat = from_2d_array_to_nested(df_lat).rename(columns={0: "dim_0"})
nested_lon = from_2d_array_to_nested(df_lon).rename(columns={0: "dim_0"})
nested_time = from_2d_array_to_nested(df_time).rename(columns={0: "dim_0"})

# Assemble the three-dimensional panel. `nested` is the same object as `nested_lat`,
# so `nested_lat` also gains the dim_1 / dim_2 columns.
nested = nested_lat
nested["dim_1"] = nested_lon["dim_0"]
nested["dim_2"] = nested_time["dim_0"]

# One label per trajectory, ordered by tid like the rows of the pivoted tables.
y = df0.groupby(['tid'])['class'].max()

# Add this cell's elapsed time (ms) to the ROCKET timer instead of overwriting it.
t_rocket += (datetime.now() - start).total_seconds() * 1000 #ms

In [9]:
start = datetime.now()

from sklearn.model_selection import train_test_split
# Both the split and the ROCKET kernels are random; fixed seeds make the
# train/test partition and the extracted features reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(nested, y, test_size=.3, stratify=y, random_state=3)

rocket = Rocket(n_jobs=-1, num_kernels=10000, random_state=3)  # by default, ROCKET uses 10,000 kernels
rocket.fit(X_train)
X_train_transform = rocket.transform(X_train)
X_test_transform = rocket.transform(X_test)

# Add this cell's elapsed time (ms) to the ROCKET timer instead of overwriting it.
t_rocket += (datetime.now() - start).total_seconds() * 1000 #ms

In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree

def report(results, n_top=3):
    """Print every candidate whose test rank is between 1 and ``n_top``.

    ``results`` is indexed like a grid-search ``cv_results_`` mapping: it must
    provide ``rank_test_score``, ``mean_test_score``, ``std_test_score`` and
    ``params``. Candidates tied on a rank are all printed, in index order.
    """
    score_line = "Mean validation score: {0:.3f} (std: {1:.3f})"
    for rank in range(1, n_top + 1):
        for idx in np.flatnonzero(results['rank_test_score'] == rank):
            print("Model with rank: {0}".format(rank))
            mean_score = results['mean_test_score'][idx]
            std_score = results['std_test_score'][idx]
            print(score_line.format(mean_score, std_score))
            print("Parameters: {0}".format(results['params'][idx]))
            print("")

from sklearn.model_selection import GridSearchCV
# 4 forest sizes x 2 split criteria x 4 depths = 32 candidates.
parameters = {'n_estimators': range(300, 1500, 300),
              'criterion':["entropy", "gini"],
              'max_depth': range(2, 20, 5)}

# The forest is seeded so that CV scores and the resulting ranking are reproducible.
# NOTE(review): n_jobs=24 is tied to the machine this was run on.
clf = GridSearchCV(RandomForestClassifier(n_jobs=1, random_state=3), parameters, cv=10, n_jobs=24, verbose=3, scoring="balanced_accuracy")

clf.fit(X_train_transform, y_train)

report(clf.cv_results_, n_top=3)

Fitting 10 folds for each of 32 candidates, totalling 320 fits
Model with rank: 1
Mean validation score: 0.881 (std: 0.088)
Parameters: {'criterion': 'gini', 'max_depth': 12, 'n_estimators': 300}

Model with rank: 2
Mean validation score: 0.872 (std: 0.087)
Parameters: {'criterion': 'gini', 'max_depth': 12, 'n_estimators': 600}

Model with rank: 3
Mean validation score: 0.869 (std: 0.070)
Parameters: {'criterion': 'entropy', 'max_depth': 12, 'n_estimators': 600}

Model with rank: 3
Mean validation score: 0.869 (std: 0.080)
Parameters: {'criterion': 'gini', 'max_depth': 7, 'n_estimators': 600}

Model with rank: 3
Mean validation score: 0.869 (std: 0.088)
Parameters: {'criterion': 'gini', 'max_depth': 17, 'n_estimators': 300}

Model with rank: 3
Mean validation score: 0.869 (std: 0.088)
Parameters: {'criterion': 'gini', 'max_depth': 17, 'n_estimators': 1200}



In [11]:
from sklearn import metrics
from sklearn.metrics import classification_report

# Random forest with fixed hyper-parameters, trained on the ROCKET features of the
# training split and evaluated on the ROCKET features of the test split.
clf = RandomForestClassifier(
    n_estimators=300,
    criterion='gini',
    max_depth=12,
    n_jobs=-1,
    random_state=5,
)
clf.fit(X_train_transform, y_train)

y_pred = clf.predict(X_test_transform)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           B       0.97      0.88      0.92        33
           T       0.95      0.99      0.97        82

    accuracy                           0.96       115
   macro avg       0.96      0.93      0.95       115
weighted avg       0.96      0.96      0.96       115



In [12]:
# Show the ROCKET timer value (milliseconds) set by the timing statements in the cells above.
t_rocket

31445.109

## Movelets

In [13]:
import random
import os


def prepareForShapelet(df=pd.DataFrame, trajectoryIDAttribute= None, targetAttribute=None,
                       timestampAttribute=None, latAttr=None, lonAttr=None, trainPerc=.80, trajPerc=1.0, maxPoints=None,
                       seed=None):
    """Write every trajectory of ``df`` to its own ``.r2`` file under ``train/`` or ``test/``.

    Each file is a header-less, index-less CSV whose columns are: the timestamp,
    a combined ``"<lat> <lon>"`` string, then every remaining column of ``df``
    (everything except the id, target, timestamp, latitude and longitude
    columns). Files are named ``"<id> s<id> c<class>.r2"``, where the class is
    the target value of the trajectory's first row.

    If a ``train`` or ``test`` directory already exists in the working
    directory, a message is printed and nothing is written.

    The caller's frame is left untouched: the combined column is built on an
    internal copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Point table; must be supplied (the default is only a placeholder).
    trajectoryIDAttribute, targetAttribute, timestampAttribute, latAttr, lonAttr : str
        Names of the trajectory-id, class, timestamp, latitude and longitude columns.
    trainPerc : float
        Fraction of the retained trajectory ids sampled into ``train/``.
    trajPerc : float
        Fraction of the trajectory ids to keep, taken from the start of the
        id list in order of first appearance.
    maxPoints : int or None
        If given, only the first ``maxPoints`` rows of each trajectory are written.
    seed : int or None
        Seed for the train/test sampling. ``None`` (default) uses the global
        ``random`` module state, as before.

    Returns
    -------
    None
    """
    if os.path.exists("train") or os.path.exists("test"):
        # Message (Italian): "The train and test folders already exist!"
        print("Le cartelle train e test esistono già!")
        return

    # Timestamp must come first, followed by the combined position and the extras.
    # "latLon" is excluded from the extras so a frame that already carries such a
    # column does not end up selecting it twice.
    reserved = {trajectoryIDAttribute, targetAttribute, timestampAttribute, latAttr, lonAttr, "latLon"}
    attributes = [timestampAttribute, "latLon"] + [col for col in df.columns if col not in reserved]

    data = df.copy()
    data["latLon"] = data[latAttr].astype(str) + " " + data[lonAttr].astype(str)

    ids = list(data[trajectoryIDAttribute].unique())
    ids = ids[:round(len(ids)*trajPerc)]
    sampler = random if seed is None else random.Random(seed)
    ids_train = sampler.sample(ids, round(trainPerc*len(ids)))
    train_lookup = set(ids_train)  # O(1) membership test for the complement
    ids_test = [x for x in ids if x not in train_lookup]

    os.makedirs("train")
    os.makedirs("test")

    def _export(trajectory_ids, folder):
        # Write one .r2 file per trajectory id into `folder`.
        for traj_id in tqdm(trajectory_ids):
            traj = data[data[trajectoryIDAttribute] == traj_id]
            classe = traj[targetAttribute].iloc[0]
            if maxPoints is not None:
                traj = traj.head(maxPoints)
            traj[attributes].to_csv(F"{folder}/{traj_id} s{traj_id} c{classe}.r2", index=False, header=False)

    _export(ids_train, "train")
    _export(ids_test, "test")

# Write df0's trajectories to train/ (70% of the ids) and test/ (the rest); does nothing if either folder already exists.
prepareForShapelet(df0, "tid", "class", "t", "c1", "c2", trainPerc=.7, trajPerc=1)

  0%|          | 0/267 [00:00<?, ?it/s]

  0%|          | 0/114 [00:00<?, ?it/s]

[CV 8/10] END criterion=entropy, max_depth=2, n_estimators=300;, score=0.857 total time=   2.2s
[CV 9/10] END criterion=entropy, max_depth=2, n_estimators=900;, score=0.714 total time=   5.9s
[CV 7/10] END criterion=entropy, max_depth=7, n_estimators=600;, score=0.831 total time=   6.6s
[CV 10/10] END criterion=entropy, max_depth=7, n_estimators=1200;, score=0.929 total time=  12.7s
[CV 8/10] END criterion=entropy, max_depth=12, n_estimators=1200;, score=0.929 total time=  12.7s
[CV 2/10] END criterion=entropy, max_depth=17, n_estimators=1200;, score=0.875 total time=  12.7s
[CV 7/10] END criterion=gini, max_depth=2, n_estimators=1200;, score=0.786 total time=   5.5s
[CV 9/10] END criterion=gini, max_depth=7, n_estimators=900;, score=0.786 total time=   8.5s
[CV 8/10] END criterion=gini, max_depth=12, n_estimators=600;, score=1.000 total time=   6.1s
[CV 1/10] END criterion=gini, max_depth=17, n_estimators=300;, score=0.689 total time=   3.2s
[CV 5/10] END criterion=gini, max_depth=17,

[CV 10/10] END criterion=entropy, max_depth=2, n_estimators=300;, score=0.857 total time=   2.2s
[CV 2/10] END criterion=entropy, max_depth=2, n_estimators=1200;, score=0.849 total time=   7.7s
[CV 5/10] END criterion=entropy, max_depth=7, n_estimators=900;, score=0.875 total time=   9.4s
[CV 9/10] END criterion=entropy, max_depth=12, n_estimators=300;, score=0.786 total time=   3.5s
[CV 2/10] END criterion=entropy, max_depth=12, n_estimators=900;, score=0.875 total time=   9.6s
[CV 9/10] END criterion=entropy, max_depth=17, n_estimators=300;, score=0.786 total time=   3.4s
[CV 3/10] END criterion=entropy, max_depth=17, n_estimators=900;, score=0.849 total time=   9.7s
[CV 3/10] END criterion=gini, max_depth=2, n_estimators=600;, score=0.750 total time=   3.0s
[CV 2/10] END criterion=gini, max_depth=2, n_estimators=900;, score=0.849 total time=   4.3s
[CV 1/10] END criterion=gini, max_depth=7, n_estimators=300;, score=0.761 total time=   3.1s
[CV 8/10] END criterion=gini, max_depth=7, 

[CV 7/10] END criterion=entropy, max_depth=2, n_estimators=300;, score=0.786 total time=   2.3s
[CV 1/10] END criterion=entropy, max_depth=2, n_estimators=1200;, score=0.618 total time=   7.6s
[CV 2/10] END criterion=entropy, max_depth=7, n_estimators=900;, score=0.875 total time=   9.6s
[CV 10/10] END criterion=entropy, max_depth=12, n_estimators=300;, score=0.929 total time=   3.6s
[CV 4/10] END criterion=entropy, max_depth=12, n_estimators=900;, score=0.812 total time=   9.6s
[CV 10/10] END criterion=entropy, max_depth=17, n_estimators=300;, score=0.929 total time=   3.5s
[CV 4/10] END criterion=entropy, max_depth=17, n_estimators=900;, score=0.812 total time=   9.5s
[CV 1/10] END criterion=gini, max_depth=2, n_estimators=600;, score=0.618 total time=   3.0s
[CV 10/10] END criterion=gini, max_depth=2, n_estimators=600;, score=0.857 total time=   3.0s
[CV 4/10] END criterion=gini, max_depth=2, n_estimators=1200;, score=0.812 total time=   5.5s
[CV 8/10] END criterion=gini, max_depth=

[CV 4/10] END criterion=entropy, max_depth=2, n_estimators=600;, score=0.812 total time=   4.2s
[CV 2/10] END criterion=entropy, max_depth=7, n_estimators=300;, score=0.875 total time=   3.4s
[CV 10/10] END criterion=entropy, max_depth=7, n_estimators=300;, score=0.929 total time=   3.4s
[CV 8/10] END criterion=entropy, max_depth=7, n_estimators=900;, score=0.929 total time=   9.7s
[CV 6/10] END criterion=entropy, max_depth=12, n_estimators=600;, score=1.000 total time=   6.7s
[CV 10/10] END criterion=entropy, max_depth=12, n_estimators=1200;, score=0.929 total time=  12.7s
[CV 4/10] END criterion=entropy, max_depth=17, n_estimators=1200;, score=0.812 total time=  12.5s
[CV 5/10] END criterion=gini, max_depth=2, n_estimators=1200;, score=0.688 total time=   5.6s
[CV 10/10] END criterion=gini, max_depth=7, n_estimators=900;, score=0.929 total time=   8.4s
[CV 7/10] END criterion=gini, max_depth=12, n_estimators=600;, score=0.831 total time=   6.1s
[CV 10/10] END criterion=gini, max_dept

In [None]:
# 8514242 ms

In [15]:
# Load the train/test feature tables from data/Movelet_output
# (presumably produced by an external Movelets run -- TODO confirm provenance).
movelet_train_df = pd.read_csv("data/Movelet_output/train.csv")
movelet_test_df = pd.read_csv("data/Movelet_output/test.csv")

# Preview the training table.
movelet_train_df.head()

Unnamed: 0,sh_TID42018_START0_SIZE412_CLASSB,sh_TID90604_START152_SIZE9_CLASST,sh_TID86203_START40_SIZE72_CLASST,sh_TID92006_START422_SIZE1_CLASST,sh_TID42094_START863_SIZE6_CLASSB,sh_TID42095_START47_SIZE1_CLASSB,sh_TID86704_START501_SIZE72_CLASST,sh_TID42080_START345_SIZE3_CLASSB,sh_TID42005_START60_SIZE3_CLASSB,class
0,2.0,1.153911,1.132315,1.189271,0.586965,0.273311,1.101939,0.239223,0.267243,B
1,2.0,0.973091,1.126933,0.688177,1.165543,1.151932,1.103792,1.144756,1.148418,B
2,2.0,1.100519,1.098091,1.137493,0.891989,0.880479,1.061744,0.839402,0.868396,B
3,0.968084,1.130059,1.118578,1.164891,0.668426,0.544227,1.085558,0.485539,0.518222,B
4,0.97476,1.221041,1.168284,1.247253,1.125599,0.79898,1.147143,0.973117,0.940765,B


In [16]:
# Preview the test table.
movelet_test_df.head()

Unnamed: 0,sh_TID42018_START0_SIZE412_CLASSB,sh_TID90604_START152_SIZE9_CLASST,sh_TID86203_START40_SIZE72_CLASST,sh_TID92006_START422_SIZE1_CLASST,sh_TID42094_START863_SIZE6_CLASSB,sh_TID42095_START47_SIZE1_CLASSB,sh_TID86704_START501_SIZE72_CLASST,sh_TID42080_START345_SIZE3_CLASSB,sh_TID42005_START60_SIZE3_CLASSB,class
0,2.0,1.093716,1.094384,1.137229,0.94127,0.884245,1.063279,0.863177,0.892008,B
1,0.97384,1.221526,1.167599,1.247132,1.126041,0.801725,1.145457,0.976549,0.944174,B
2,0.977193,1.152266,1.092259,1.202596,0.382538,0.772183,1.063461,0.720976,0.730579,B
3,0.973676,1.159018,1.101053,1.203392,0.256079,0.334016,1.073879,0.242644,0.260859,B
4,2.0,1.098748,1.128672,1.137881,0.905996,0.879337,1.089535,0.849313,0.878094,B


In [17]:
# The last column holds the label; everything before it is a feature.
# Slicing with iloc *before* taking .values keeps the feature matrix in its own
# dtype; calling .values on the whole mixed-type frame first would upcast every
# feature to `object` because of the string label column.
X_train = movelet_train_df.iloc[:, :-1].values
y_train = movelet_train_df.iloc[:, -1].values

X_test = movelet_test_df.iloc[:, :-1].values
y_test = movelet_test_df.iloc[:, -1].values

In [18]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree

def report(results, n_top=3):
    """Print every candidate whose test rank is between 1 and ``n_top``.

    ``results`` is indexed like a grid-search ``cv_results_`` mapping: it must
    provide ``rank_test_score``, ``mean_test_score``, ``std_test_score`` and
    ``params``. Candidates tied on a rank are all printed, in index order.
    """
    score_line = "Mean validation score: {0:.3f} (std: {1:.3f})"
    for rank in range(1, n_top + 1):
        for idx in np.flatnonzero(results['rank_test_score'] == rank):
            print("Model with rank: {0}".format(rank))
            mean_score = results['mean_test_score'][idx]
            std_score = results['std_test_score'][idx]
            print(score_line.format(mean_score, std_score))
            print("Parameters: {0}".format(results['params'][idx]))
            print("")

from sklearn.model_selection import GridSearchCV
# 4 forest sizes x 2 split criteria x 4 depths = 32 candidates.
parameters = {'n_estimators': range(300, 1500, 300),
              'criterion':["entropy", "gini"],
              'max_depth': range(2, 20, 5)}

# The forest is seeded so that CV scores and the resulting ranking are reproducible.
# NOTE(review): n_jobs=24 is tied to the machine this was run on.
clf = GridSearchCV(RandomForestClassifier(n_jobs=1, random_state=3), parameters, cv=3, n_jobs=24, verbose=0, scoring="accuracy")

clf.fit(X_train, y_train)

report(clf.cv_results_, n_top=3)

Fitting 3 folds for each of 32 candidates, totalling 96 fits
Model with rank: 1
Mean validation score: 0.891 (std: 0.011)
Parameters: {'criterion': 'entropy', 'max_depth': 2, 'n_estimators': 600}

Model with rank: 1
Mean validation score: 0.891 (std: 0.011)
Parameters: {'criterion': 'entropy', 'max_depth': 2, 'n_estimators': 900}

Model with rank: 1
Mean validation score: 0.891 (std: 0.011)
Parameters: {'criterion': 'entropy', 'max_depth': 2, 'n_estimators': 1200}

Model with rank: 1
Mean validation score: 0.891 (std: 0.011)
Parameters: {'criterion': 'gini', 'max_depth': 2, 'n_estimators': 600}

Model with rank: 1
Mean validation score: 0.891 (std: 0.011)
Parameters: {'criterion': 'gini', 'max_depth': 2, 'n_estimators': 1200}



In [19]:
from sklearn import metrics
from sklearn.metrics import classification_report

# GridSearchCV refits the best configuration on the whole of X_train by default
# (refit=True), so best_estimator_ is already trained; fitting it again would
# only replace it with a different random forest than the one the search produced.
rf = clf.best_estimator_

y_pred=rf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           B       0.83      0.86      0.85        29
           T       0.95      0.94      0.95        85

    accuracy                           0.92       114
   macro avg       0.89      0.90      0.90       114
weighted avg       0.92      0.92      0.92       114



In [20]:
# Display the estimator selected by the grid search held in `clf`.
clf.best_estimator_

## Geolet

In [3]:
# Display df0 (the notebook truncates the rendering of long frames).
df0

Unnamed: 0,tid,class,c1,c2,t
0,30901,B,42.077160,4.738411,0
1,30901,B,42.077246,4.739088,30
2,30901,B,42.077259,4.739096,60
3,30901,B,42.077369,4.739158,90
4,30901,B,42.077635,4.739343,120
...,...,...,...,...,...
178294,420106,B,42.054656,4.903373,20700
178295,420106,B,42.053411,4.904073,20730
178296,420106,B,42.053166,4.903783,20760
178297,420106,B,42.053166,4.903783,20790


In [4]:
from cri98tj.distancers.Euclidean_distancer import Euclidean_distancer
from cri98tj.distancers.DTW_distancer import DTW_distancer
from cri98tj.distancers.InterpolatedRootDistance_distancer import InterpolatedRootDistance_distancer, \
    InterpolatedRootDistanceBestFitting
from cri98tj.normalizers.FirstPoint_normalizer import FirstPoint_normalizer
from cri98tj.normalizers.normalizer_utils import dataframe_pivot
from cri98tj.partitioners.Geohash_partitioner import Geohash_partitioner
from cri98tj.partitioners.Voronoi_partitioner import Voronoi_partitioner
from cri98tj.selectors.RandomInformationGain_selector import RandomInformationGain_selector
from cri98tj.selectors.Random_selector import Random_selector
from cri98tj.selectors.RandomOrderline_selector import RandomOrderline_selector
from cri98tj.TrajectoryTransformer import TrajectoryTransformer
from sklearn.ensemble import RandomForestClassifier

  from tqdm.autonotebook import tqdm


In [25]:
# Independent copy of the five raw columns, used by the Geolet section below.
df_geo = df0.loc[:, ["tid", "class", "c1", "c2", "t"]].copy()

df_geo.head()

Unnamed: 0,tid,class,c1,c2,t
0,30901,B,42.07716,4.738411,0
1,30901,B,42.077246,4.739088,30
2,30901,B,42.077259,4.739096,60
3,30901,B,42.077369,4.739158,90
4,30901,B,42.077635,4.739343,120


In [35]:
%%time

from sklearn.model_selection import train_test_split

spatioTemporalCols = ["c1", "c2", "t"]

# One row per trajectory (column-wise max), computed once and reused for the ids,
# the labels and the stratification of the split.
tid_summary = df_geo.groupby(by=["tid"]).max().reset_index()
tid_train, tid_test, _, _ = train_test_split(
    tid_summary["tid"],
    tid_summary["class"],
    test_size=.3,
    stratify=tid_summary["class"],
    random_state=3,
)

partitioner = Geohash_partitioner(precision=6, spatioTemporalColumns=spatioTemporalCols)

normalizer = FirstPoint_normalizer(spatioTemporalColumns=spatioTemporalCols, fillna=None)
selector = RandomInformationGain_selector(
    top_k=20,
    bestFittingMeasure=InterpolatedRootDistanceBestFitting,
    movelets_per_class=300,
    trajectories_for_orderline=50,
    n_jobs=24,
    spatioTemporalColumns=spatioTemporalCols,
    normalizer=normalizer,
)
distancer = InterpolatedRootDistance_distancer(
    normalizer=normalizer,
    spatioTemporalColumns=spatioTemporalCols,
    n_jobs=24,
)

# Only the training trajectories are partitioned and offered to the selector;
# the distancer then receives the full table together with the selected shapelets.
train_points = df_geo[df_geo.tid.isin(tid_train)].values
part = partitioner.fit_transform(train_points)
shapelets = selector.fit_transform(part)
best_is, dist_np = distancer.fit_transform((df_geo.values, shapelets))

Encoding 125348 points with precision 6


  0%|          | 0/125348 [00:00<?, ?it/s]

Cutting sub-trajectories length at 9.0 over 18
Pivoting tables


  0%|          | 0/883 [00:00<?, ?it/s]

Cutting sub-trajectories length at 736.0 over 1001
Pivoting tables


  0%|          | 0/24425 [00:00<?, ?it/s]

Computing scores


  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

  y = column_or_1d(y, warn=True)


0.	 score=0.47337294096579274
1.	 score=0.4037380071906047
2.	 score=0.3208375167281947
3.	 score=0.31077890660766494
4.	 score=0.2829206897471266
5.	 score=0.2824099720506501
6.	 score=0.2503027161314746
7.	 score=0.24304159249419022
8.	 score=0.23487991670867547
9.	 score=0.2320115865524195
10.	 score=0.21004170143237966
11.	 score=0.20338135971011817
12.	 score=0.20128612161488024
13.	 score=0.19745400928276768
14.	 score=0.19050906533782386
15.	 score=0.18967636209335592
16.	 score=0.18230105112980954
17.	 score=0.18150112882988712
18.	 score=0.1785408113695699
19.	 score=0.17476542421802188
Pivoting tables


  0%|          | 0/178299 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

Collecting distances from 20


  0%|          | 0/20 [00:00<?, ?it/s]

CPU times: user 7.03 s, sys: 645 ms, total: 7.68 s
Wall time: 50.1 s


In [36]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree

# Column 0 of the distance table is used as the label; all other columns are features.
dist_np_df = pd.DataFrame(dist_np)
y = dist_np_df[0].values
X = dist_np_df.drop(columns=[0]).values

# NOTE(review): this split is drawn independently of tid_train/tid_test used for
# shapelet selection. It reproduces that partition only if the rows of dist_np
# follow the same trajectory order -- confirm, otherwise test rows may belong to
# trajectories the shapelets were selected on.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=.3,
    stratify=y,
    random_state=3,
)


def report(results, n_top=3):
    """Print every candidate whose test rank is between 1 and ``n_top``.

    ``results`` is indexed like a grid-search ``cv_results_`` mapping: it must
    provide ``rank_test_score``, ``mean_test_score``, ``std_test_score`` and
    ``params``. Candidates tied on a rank are all printed, in index order.
    """
    score_line = "Mean validation score: {0:.3f} (std: {1:.3f})"
    for rank in range(1, n_top + 1):
        for idx in np.flatnonzero(results['rank_test_score'] == rank):
            print("Model with rank: {0}".format(rank))
            mean_score = results['mean_test_score'][idx]
            std_score = results['std_test_score'][idx]
            print(score_line.format(mean_score, std_score))
            print("Parameters: {0}".format(results['params'][idx]))
            print("")

from sklearn.model_selection import GridSearchCV
# 4 forest sizes x 2 split criteria x 4 depths = 32 candidates.
parameters = {'n_estimators': range(300, 1500, 300),
              'criterion':["entropy", "gini"],
              'max_depth': range(2, 20, 5)}

# The forest is seeded so that CV scores and the resulting ranking are reproducible.
# NOTE(review): n_jobs=24 is tied to the machine this was run on.
clf = GridSearchCV(RandomForestClassifier(n_jobs=1, random_state=3), parameters, cv=10, n_jobs=24, verbose=0, scoring="accuracy")

clf.fit(X_train, y_train)

report(clf.cv_results_, n_top=3)

Model with rank: 1
Mean validation score: 0.963 (std: 0.023)
Parameters: {'criterion': 'entropy', 'max_depth': 12, 'n_estimators': 600}

Model with rank: 1
Mean validation score: 0.963 (std: 0.023)
Parameters: {'criterion': 'gini', 'max_depth': 12, 'n_estimators': 300}

Model with rank: 3
Mean validation score: 0.963 (std: 0.023)
Parameters: {'criterion': 'entropy', 'max_depth': 7, 'n_estimators': 300}

Model with rank: 3
Mean validation score: 0.963 (std: 0.023)
Parameters: {'criterion': 'entropy', 'max_depth': 7, 'n_estimators': 1200}

Model with rank: 3
Mean validation score: 0.963 (std: 0.023)
Parameters: {'criterion': 'entropy', 'max_depth': 12, 'n_estimators': 1200}

Model with rank: 3
Mean validation score: 0.963 (std: 0.023)
Parameters: {'criterion': 'entropy', 'max_depth': 17, 'n_estimators': 300}

Model with rank: 3
Mean validation score: 0.963 (std: 0.023)
Parameters: {'criterion': 'entropy', 'max_depth': 17, 'n_estimators': 1200}



In [37]:
from sklearn.metrics import classification_report

# GridSearchCV refits the best configuration on the whole of X_train by default
# (refit=True), so best_estimator_ is already trained; fitting it again would
# only replace it with a different random forest than the one the search produced.
rf = clf.best_estimator_

y_pred = rf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           B       1.00      0.85      0.92        33
           T       0.94      1.00      0.97        82

    accuracy                           0.96       115
   macro avg       0.97      0.92      0.94       115
weighted avg       0.96      0.96      0.96       115

