In [2]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
plt.style.use("ggplot")

from conect2py.models import TEDADetect
from conect2py.models import TEDACloud

# Use case

## Pre-processing the data

In [4]:
df_honda = pd.read_csv("./dataset_freematics.csv")
df_honda.head()

Unnamed: 0,engine_load,throttle,speed,rpm,latitude,longitude
0,32.0,15.0,16.0,781.0,,
1,32.0,15.0,16.0,784.0,-5.843676,-35.198215
2,30.0,15.0,16.0,803.0,-5.843672,-35.198257
3,32.0,15.0,15.0,786.0,-5.84367,-35.198288
4,29.0,15.0,15.0,770.0,-5.843665,-35.198341


In [6]:
df_honda = df_honda.rename({"engine_load": "engineLoad", "speed":"speedOBD", "throttle": "throttlePosManifold", "rpm": "engineRPM"}, axis=1)

In [7]:
def calculate_radar_area(data_area_radar):
    data_area_radar['engineRPM'] = data_area_radar['engineRPM'] / 100
    area_values = []

    for i in data_area_radar.itertuples():
        rpm = i.engineRPM
        speed = i.speedOBD
        throttle = i.throttlePosManifold
        engine = i.engineLoad

        values_normalized = [rpm, speed, throttle, engine]
        area = 0.5 * np.abs(np.dot(values_normalized, np.roll(values_normalized, 1)) * np.sin(2 * np.pi / len(values_normalized)))
        area_values.append(area)

    return area_values


In [8]:
area_radar_dataset = df_honda.copy()
area_radar_dataset['areas'] = calculate_radar_area(area_radar_dataset)
area_radar_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 531 entries, 0 to 530
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   engineLoad           531 non-null    float64
 1   throttlePosManifold  531 non-null    float64
 2   speedOBD             531 non-null    float64
 3   engineRPM            531 non-null    float64
 4   latitude             369 non-null    float64
 5   longitude            369 non-null    float64
 6   areas                531 non-null    float64
dtypes: float64(7)
memory usage: 29.2 KB


## Initialising the models

The code below initialises the models and the counters.

Here the two models are used for identifying the driver classification.



In [None]:
# initialize the algorithms
teda = TEDADetect()
autocloud = TEDACloud()
outlier_count = 0
outlier_window = 4
k = 1

total_outliers = 0
outliers_resets = 0
teda_flag = []
resets = []

This code is part of a real-time data processing pipeline where TEDA is used for online outlier detection, and AutoCloud is used for clustering the data points. The reset mechanism helps control the window size for outlier detection. The resulting dataset includes an additional column, cloud_index, indicating the cluster index assigned by AutoCloud. The code also keeps track of various counters and flags for analysis and evaluation purposes.

In [10]:
# iterate over the data
for i, row in area_radar_dataset.iterrows():
    # run teda to verify if the point is an outlier
    outlier = teda.run_online(row[['areas']])

    if outlier:
        outlier_count += 1
        total_outliers += 1
        teda_flag.append(1)
    else:
        teda_flag.append(0)
    
    if outlier_count == outlier_window or not outlier:
        # run autocloud to cluster the points
        cloud_index = autocloud.run_online(row[['areas','engineLoad']], k, 2, is_outlier=outlier)
        
        # get the dataset until the current point
        tmp = area_radar_dataset.iloc[:i+1]
        # autocloud.plot_graph(tmp[['areas','speedOBD']], k)
        k += 1

        # add the index to the row
        area_radar_dataset.loc[i, 'cloud_index'] = cloud_index
    
    if outlier_count == outlier_window:
        outlier_count = 0
        outliers_resets += 1
        resets.append(1)
    else:
        resets.append(0)

autocloud.display_metrics()


Number of DataClouds: 3
DataCloud 1:
Number of points: 76
Points: {1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 389, 143, 391, 392, 399, 400, 402, 403, 404, 23, 405, 25, 26, 27, 28, 406, 407, 289, 290, 291, 292, 419, 420, 421, 422, 423, 424, 390, 425, 46, 47, 430, 433, 434, 435, 439, 312, 313, 314, 315, 316, 334, 335, 336, 87, 88, 89, 221, 222, 223, 352, 353, 354, 249, 250, 110, 239, 240, 251, 246, 247, 248, 121, 122, 123}
Mean: [565.44361842  29.90789474]
Variance: 7492.14269136095
----------------------------------
DataCloud 2:
Number of points: 48
Points: {257, 2, 258, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 261, 143, 264, 265, 23, 25, 26, 27, 28, 260, 300, 301, 46, 47, 302, 303, 321, 87, 88, 89, 343, 346, 221, 222, 223, 349, 229, 110, 121, 122, 123}
Mean: [4094.28   94.  ]
Variance: 6219.14707140805
----------------------------------
DataCloud 3:
Number of points: 25
Points: {3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 17, 18, 29, 30, 32, 33, 34, 49, 51, 52}
Mean: [7599.375   90.   ]
Variance: 3396.014

In [11]:
area_radar_dataset["teda_flag"] = teda_flag
area_radar_dataset["resets"] = resets
area_radar_dataset

Unnamed: 0,engineLoad,throttlePosManifold,speedOBD,engineRPM,latitude,longitude,areas,cloud_index,teda_flag,resets
0,32.0,15.0,16.0,7.81,,,547.440,0.0,0,0
1,32.0,15.0,16.0,7.84,-5.843676,-35.198215,548.160,0.0,0,0
2,30.0,15.0,16.0,8.03,-5.843672,-35.198257,529.690,,1,0
3,32.0,15.0,15.0,7.86,-5.843670,-35.198288,537.210,2.0,0,0
4,29.0,15.0,15.0,7.70,-5.843665,-35.198341,499.400,,1,0
...,...,...,...,...,...,...,...,...,...,...
526,29.0,14.0,0.0,7.75,,,315.375,0.0,0,0
527,29.0,14.0,0.0,7.67,-5.843152,-35.197624,314.215,0.0,0,0
528,29.0,14.0,0.0,7.86,-5.843153,-35.197624,316.970,0.0,0,0
529,29.0,14.0,0.0,7.72,-5.843153,-35.197624,314.940,0.0,0,0


In [12]:
total_outliers

111

In [13]:
area_radar_dataset.cloud_index.isna().sum()

84

In [14]:
outliers_resets

27

In [15]:
area_radar_dataset.cloud_index.value_counts()

1.0    241
0.0    176
2.0     30
Name: cloud_index, dtype: int64

In [16]:
area_radar_dataset.resets.value_counts()

0    504
1     27
Name: resets, dtype: int64

In [21]:
# get the sum of areas for each cluster
area_radar_dataset[area_radar_dataset["teda_flag"] == 0].groupby('cloud_index')['areas'].sum()

cloud_index
0.0    187517.975000
1.0    375789.194403
2.0     27462.965000
Name: areas, dtype: float64

In [37]:
tmp = area_radar_dataset[area_radar_dataset["teda_flag"] == 0]

In [39]:
tmp.cloud_index.value_counts()

1.0    227
0.0    176
2.0     17
Name: cloud_index, dtype: int64

In [40]:
tmp.groupby('cloud_index')['areas'].sum()

cloud_index
0.0    187517.975000
1.0    375789.194403
2.0     27462.965000
Name: areas, dtype: float64