In [1]:
import math
import warnings
from datetime import datetime
from functools import partial
from typing import Literal

import numpy as np
import pandas as pd
from hyperopt import Trials, fmin, hp, space_eval, tpe
from sklearn import metrics
warnings.filterwarnings("ignore")

In [2]:
# Directory (relative to the working directory) holding the ground-truth CSVs
# that the cells below read: stop_train.csv and stop_test.csv.
dataset_path = './groundTruthGenerator/groundTruth'

In [48]:
from datetime import datetime
class SpeedHampelFilter:
    '''
        Flag the slow segments of one trajectory as stops.

        The speed of every segment Pi -> Pi+1 is computed and a segment is flagged
        when its speed is at most max(mean - sigma_n * std, 0), with mean and std
        taken over all segment speeds of the trajectory.

        NOTE(review): the threshold is built from mean/std; a classical Hampel
        filter uses median/MAD - confirm the class name matches the intent.
        NOTE(review): predict() passes the `lat` column as the first coordinate of
        each point, and compute_speeds() hands that first coordinate to
        haversine_distance() as the longitude. With 'haversine' the `lat`/`lng`
        column names therefore act swapped ('euclidean' is symmetric and
        unaffected) - confirm against the dataset before relying on it.

        Parameters:
        - sigma_n: number of standard deviations subtracted from the mean speed
        - distance_metric: 'haversine' (kilometers, inputs in degrees) or 'euclidean'
        - lat, lng, t: names of the coordinate and time columns read by predict()
    '''
    def __init__(self,
                sigma_n=3, 
                distance_metric:Literal['euclidean', 'haversine']= 'haversine',
                lat='x',
                lng='y',
                t='timestep'
                    ):
        self.sigma_n = sigma_n
        self.distance_metric = distance_metric
        self.lat = lat
        self.lng = lng
        self.t = t

    def haversine_distance(self, lat1:float, lon1:float, lat2:float, lon2:float):
        '''
            Great-circle distance between two points given in degrees.
            Returns:
            - distance: the distance in kilometers (Earth radius of 6371 km)
        '''
        # Radius of the Earth in kilometers
        earth_radius = 6371
        # Convert latitude and longitude from degrees to radians
        lat1 = math.radians(lat1)
        lon1 = math.radians(lon1)
        lat2 = math.radians(lat2)
        lon2 = math.radians(lon2)
        # Haversine formula
        dlat = lat2 - lat1
        dlon = lon2 - lon1
        a = math.sin(dlat / 2)**2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon / 2)**2
        # Rounding can push `a` a hair above 1 for near-antipodal points, which
        # would make sqrt(1 - a) fail with a math domain error.
        a = min(1.0, a)
        c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
        # Calculate the distance
        return earth_radius * c
    
    def euclidean_distance(self, x1:float, y1:float, x2:float, y2:float):
        '''
            Straight-line distance between (x1, y1) and (x2, y2), in the unit of the inputs.
        '''
        return math.sqrt((x1 - x2)**2 + (y1 - y2)**2)

    def delta_duration(self, d1, d2):
        '''
            Elapsed time from d1 to d2.
            Returns:
            - d2 - d1 converted to seconds when the difference is a timedelta-like
              object (pd.Timestamp / datetime inputs), otherwise d2 - d1 unchanged
              (plain numeric timesteps)
        '''
        delta = d2 - d1
        # Duck-typed so that pd.Timedelta and datetime.timedelta are both converted.
        total_seconds = getattr(delta, 'total_seconds', None)
        if callable(total_seconds):
            return total_seconds()
        return delta

    def compute_speeds(self, points:[[float, float, float]]) -> pd.Series:
        '''
            Compute speed for each Pi to Pi+1
            Parameters:
            - points: a list of point in format [[longitude, latitude, datetime], ...]
            Returns:
            - speeds: A float64 pandas.Series with one speed per segment
              (len(points) - 1 entries, empty for fewer than two points).
              A segment with a zero time gap gets NaN instead of raising / inf.
            Raises:
            - ValueError: if distance_metric is neither 'haversine' nor 'euclidean'
        '''
        if self.distance_metric == 'haversine':
            compute_distance = self.haversine_distance
        elif self.distance_metric == 'euclidean':
            compute_distance = self.euclidean_distance
        else:
            raise ValueError(
                f"distance_metric must be 'euclidean' or 'haversine', got {self.distance_metric!r}"
            )
        # Collect into a list and build the Series once: enlarging a Series one
        # label at a time copies it on every step.
        speeds = []
        for p1, p2 in zip(points[:-1], points[1:]):
            distance = compute_distance(p1[1], p1[0], p2[1], p2[0])
            duration = self.delta_duration(p1[2], p2[2])
            # Repeated timestamps carry no speed information; NaN is skipped by
            # the pandas mean/std used in predict() and never compares <= threshold.
            speeds.append(distance / duration if duration != 0 else math.nan)
        return pd.Series(speeds, dtype='float64')

    def predict(self, df:pd.DataFrame):
        '''
            Classify every row of a single trajectory as stop / not stop.
            Parameters:
            - df: rows of one trajectory, in travel order, holding the columns
              named by lat, lng and t
            Returns:
            - y_pred: a list of len(df) booleans; entry i is True when the speed
              of the segment from row i to row i+1 is at or below the threshold.
              The last row has no outgoing segment and is always False.
        '''
        points = df[[self.lat, self.lng, self.t]].values
        if len(points) == 0:
            return []
        speeds = self.compute_speeds(points)
        speeds_mean = speeds.mean()
        # ddof=0 (population std), which is what np.std computes for a Series.
        speeds_std = speeds.std(ddof=0)
        threshold = max(speeds_mean - self.sigma_n * speeds_std, 0) # there is no negative speed
        return (speeds <= threshold).tolist() + [False]

### Hyperparameter Optimization - Hyperopt

In [19]:
def optimize(params, data):
    '''
        Hyperopt objective: negated mean per-vehicle F1 score of a SpeedHampelFilter.
        Parameters:
        - params: keyword arguments forwarded to SpeedHampelFilter
        - data: a DataFrame with an 'id' column (one trajectory per id), a 'stop'
          column used as ground truth, and the columns the filter reads
        Returns:
        - the mean of the per-trajectory F1 scores multiplied by -1.0, so that
          minimizing the returned value maximizes F1
    '''
    model = SpeedHampelFilter(**params)
    # groupby splits the frame in one pass instead of re-scanning every row with
    # a boolean mask per vehicle; sort=False keeps the first-appearance order of ids.
    f1s = [
        metrics.f1_score(
            trajectory['stop'],
            model.predict(trajectory.drop(columns=['stop'])),
        )
        for _, trajectory in data.groupby('id', sort=False)
    ]
    return -1.0 * np.mean(f1s)

In [20]:
# Search space handed to Hyperopt; the keys match SpeedHampelFilter's keyword arguments.
param_space = dict(
    sigma_n=hp.uniform('sigma_n', 0, 5),
    distance_metric=hp.choice('distance_metric', ['euclidean', 'haversine']),
)

In [21]:
# Training trajectories; the row index is also stored as an explicit 'index' column.
move_stop_train = (
    pd.read_csv(f'{dataset_path}/stop_train.csv')
    .assign(index=lambda frame: frame.index)
)

In [22]:
# Bind the training frame so that Hyperopt only has to supply `params`.
optimization_function = partial(optimize, data=move_stop_train)

In [23]:
# NOTE: no `rstate` is passed to fmin, so the search is not seeded here.
trials = Trials()
best_params = fmin(
    fn=optimization_function,
    space=param_space,
    algo=tpe.suggest,
    max_evals=15,
    trials=trials,
)
print(best_params)
# fmin reports hp.choice parameters by their index in the option list
# (e.g. 'distance_metric': 0); space_eval maps the result back to the actual
# values, which is the form SpeedHampelFilter(**kwargs) accepts.
print(space_eval(param_space, best_params))

100%|██████████| 15/15 [34:22<00:00, 137.47s/trial, best loss: -0.39838979808110014]
{'distance_metric': 0, 'sigma_n': 2.171828076464357}


### Test / Validation

In [28]:
# Held-out trajectories; the row index is also stored as an explicit 'index' column.
move_stop_test = (
    pd.read_csv(f'{dataset_path}/stop_test.csv')
    .assign(index=lambda frame: frame.index)
)
# One entry per vehicle id, in order of first appearance.
veh_id_unique = move_stop_test['id'].unique()

In [29]:
ac_list = []
pr_list = []
re_list = []
f1_list = []
# Parameters copied by hand from the Hyperopt search output: 'distance_metric': 0
# is the first hp.choice option, i.e. 'euclidean'.
model = SpeedHampelFilter(sigma_n=2.171828076464357, distance_metric='euclidean')
data = move_stop_test
# groupby splits the frame in one pass instead of masking the whole frame once
# per vehicle; sort=False keeps the first-appearance order of the ids.
for veh_id, trajectory in data.groupby('id', sort=False):
    y_true = trajectory['stop']
    y_pred = model.predict(trajectory.drop(columns=['stop']))

    ac_list.append(metrics.accuracy_score(y_true, y_pred))
    pr_list.append(metrics.precision_score(y_true, y_pred))
    re_list.append(metrics.recall_score(y_true, y_pred))
    f1_list.append(metrics.f1_score(y_true, y_pred))

# Macro averages: every vehicle weighs the same regardless of trajectory length.
ac_mean = np.mean(ac_list)
pr_mean = np.mean(pr_list)
re_mean = np.mean(re_list)
f1_mean = np.mean(f1_list)

print('Accuracy mean:', ac_mean)
print('Precision mean:', pr_mean)
print('Recall mean:', re_mean)
print('F1 mean:', f1_mean)

Accuracy mean: 0.9659137020547024
Precision mean: 0.37914336446444397
Recall mean: 0.60346203238686
F1 mean: 0.43862938089235554
