In [1]:
from dotenv import load_dotenv
import os
import numpy as np
import pandas as pd
from concurrent.futures import ProcessPoolExecutor
from dtaidistance.dtw import distance_matrix
from sqlalchemy import create_engine
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns

load_dotenv()

True

In [4]:
db_url = f"postgresql://{os.getenv('DB_USER')}:{os.getenv('DB_PASSWORD')}@{os.getenv('DB_HOST')}:{os.getenv('DB_PORT')}/postgres"
engine = create_engine(db_url)


sql_query = """
    SELECT
        time, olr, speed, speeduncapped, freeflow, jamfactor, traversability, day_of_week, is_peak_hour, time_to_traverse, congestion_level, is_anomaly
    FROM diu.traffic_data WHERE day_of_week IS NOT NULL AND name IS NOT NULL;
"""
traffic_data = pd.read_sql(sql_query, engine)

In [5]:
traffic_data

Unnamed: 0,time,olr,speed,speeduncapped,freeflow,jamfactor,traversability,day_of_week,is_peak_hour,time_to_traverse,congestion_level,is_anomaly
0,2024-12-19 07:44:21.632000+00:00,CD0BEAA5OAYYECTyQQAJBQQCAuQACgUEAqx8APcBANcACQ...,13.888889,13.888889,16.388890,1.4,open,4,True,551.519996,Low,False
1,2024-12-27 20:00:48.505000+00:00,CCoBEAAmJQYWPiT7GQAJBQQBAk8ACgUEAZ8uABVt/NUACQ...,14.444445,14.444445,14.722222,0.1,open,5,False,245.007683,Low,False
2,2024-12-27 17:00:48.359000+00:00,CCkBEAAlJAYGeiT1zQAJBQQAAdYACgUEAKhjAO2iDaEACQ...,35.277780,35.277780,29.166668,0.0,open,5,True,148.166920,Low,False
3,2024-12-27 17:00:48.359000+00:00,CCgBEAAkIwYTOyUAYQAJBQQBAwAACgQDAQsAAAAACgAJBQ...,6.388889,6.388889,10.555556,3.0,open,5,True,1.721739,Moderate,True
4,2024-12-28 02:20:48.509000+00:00,CCkBEAAlJAYXkiTxsQAJBQQAASEACgUEALcaABT0EnkACQ...,23.333334,23.333334,28.611113,2.7,open,6,False,303.042848,Low,False
...,...,...,...,...,...,...,...,...,...,...,...,...
839487,2025-01-07 11:58:48.451000+00:00,CCkBEAAlJAYByST9QwAJBQQCAhoACgUEAq11ABuA/+sACQ...,12.777778,12.777778,11.944445,0.0,open,2,False,462.286948,Low,False
839488,2025-01-07 11:58:48.451000+00:00,CGMBEABfXgYWqiURlgAJBQQCA6AACgUEAoZqAP9n/yMACQ...,13.611112,13.611112,13.888889,0.1,open,2,False,1114.457070,Low,False
839489,2025-01-07 11:58:48.451000+00:00,CD0BEAA5OAYlZiUFFQAJBQQCA44ACgUEApkiAOQ1/U0ACQ...,11.944445,11.944445,13.611112,1.2,open,2,False,717.237176,Low,False
839490,2025-01-07 11:58:48.451000+00:00,CD0BEAA5OAYTOyUAZQAJBQQCAwQACgUEAphOAAl0BOsACQ...,10.833334,10.833334,11.944445,0.8,open,2,False,515.261507,Low,False


In [7]:
tmp_df = traffic_data.copy()
value_counts = tmp_df['olr'].value_counts()
tmp_df = tmp_df[tmp_df['olr'].isin(value_counts[value_counts >= 100].index)]
tmp_df['is_peak_hour'] = tmp_df['is_peak_hour'].astype(int)
tmp_df['is_anomaly'] = tmp_df['is_anomaly'].astype(int)
tmp_df['congestion_level'] = tmp_df['congestion_level'].map({'High': 2, 'Moderate': 1, 'Low': 0})
tmp_df['traversability'] = tmp_df['traversability'].map({'closed': 1, 'open': 0})
tmp_df.loc[tmp_df['traversability'] == 1, 'speed'] = tmp_df.loc[tmp_df['traversability'] == 1, 'speed'].fillna(0)
tmp_df.loc[tmp_df['traversability'] == 1, 'speeduncapped'] = tmp_df.loc[tmp_df['traversability'] == 1, 'speeduncapped'].fillna(0)
tmp_df.loc[tmp_df['traversability'] == 1, 'time_to_traverse'] = tmp_df.loc[tmp_df['traversability'] == 1, 'time_to_traverse'].fillna(0)

tmp_df

Unnamed: 0,time,olr,speed,speeduncapped,freeflow,jamfactor,traversability,day_of_week,is_peak_hour,time_to_traverse,congestion_level,is_anomaly
0,2024-12-19 07:44:21.632000+00:00,CD0BEAA5OAYYECTyQQAJBQQCAuQACgUEAqx8APcBANcACQ...,13.888889,13.888889,16.388890,1.4,0,4,1,551.519996,0,0
1,2024-12-27 20:00:48.505000+00:00,CCoBEAAmJQYWPiT7GQAJBQQBAk8ACgUEAZ8uABVt/NUACQ...,14.444445,14.444445,14.722222,0.1,0,5,0,245.007683,0,0
2,2024-12-27 17:00:48.359000+00:00,CCkBEAAlJAYGeiT1zQAJBQQAAdYACgUEAKhjAO2iDaEACQ...,35.277780,35.277780,29.166668,0.0,0,5,1,148.166920,0,0
3,2024-12-27 17:00:48.359000+00:00,CCgBEAAkIwYTOyUAYQAJBQQBAwAACgQDAQsAAAAACgAJBQ...,6.388889,6.388889,10.555556,3.0,0,5,1,1.721739,1,1
4,2024-12-28 02:20:48.509000+00:00,CCkBEAAlJAYXkiTxsQAJBQQAASEACgUEALcaABT0EnkACQ...,23.333334,23.333334,28.611113,2.7,0,6,0,303.042848,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
839487,2025-01-07 11:58:48.451000+00:00,CCkBEAAlJAYByST9QwAJBQQCAhoACgUEAq11ABuA/+sACQ...,12.777778,12.777778,11.944445,0.0,0,2,0,462.286948,0,0
839488,2025-01-07 11:58:48.451000+00:00,CGMBEABfXgYWqiURlgAJBQQCA6AACgUEAoZqAP9n/yMACQ...,13.611112,13.611112,13.888889,0.1,0,2,0,1114.457070,0,0
839489,2025-01-07 11:58:48.451000+00:00,CD0BEAA5OAYlZiUFFQAJBQQCA44ACgUEApkiAOQ1/U0ACQ...,11.944445,11.944445,13.611112,1.2,0,2,0,717.237176,0,0
839490,2025-01-07 11:58:48.451000+00:00,CD0BEAA5OAYTOyUAZQAJBQQCAwQACgUEAphOAAl0BOsACQ...,10.833334,10.833334,11.944445,0.8,0,2,0,515.261507,0,0


In [8]:
value_counts

olr
CD0BEAA5OAYYECTyQQAJBQQCAuQACgUEAqx8APcBANcACQUEAQI2AHAB57UIgQAJBQQCAu4ACgUEAo5SAAAA                                                        14474
CCkBEAAlJAYJBST1CgAJBQQAAVQACgUEAJ9gAA1Q9UIACQUEAAHKADAAAA==                                                                                14474
CCgBEAAkIwYQICT+eQAJBQQBAlcACgQDAUMAAFL/3wAJBQQBAtcAMAAA                                                                                    14474
CCkBEAAlJAYTTyT/5gAJBQQBAq4ACgUEAYUcAPyFAI8ACQUEAQJSADAAFw==                                                                                14474
CGIBEABeXQX7NyUHiwAJBQQCAykACgQDAmIABGwEYQAJBQQCA6AAcAMAbwA2AAkFBAIDFAAKBQQCvSEAIngJtwAJBQQCA28ACgUEA6ooABOTB0AACQUEAwNHAAoFBAOOUAAAAA==    14474
CGMBEABfXgYWqiURlgAJBQQCA6AACgUEAoZqAP9n/yMACQUEAgMpAHAD/lz9MQAJBQQCA44ACgUEA7I8AOlz91kACQUEAgPmAAoFBAK7JgDd5PbFAAkFBAIDkAAKBQQCgiIAAAA=    14474
CCkBEAAlJAYQbyT+4QAJBQQBAhAACgUEAYRSAAEJAfsACQUEAQKNADAAAA==                                                            

In [9]:
time_series_data = tmp_df.pivot_table(
    index="time",
    columns="olr",
    values=["speed", "speeduncapped", "freeflow", "jamfactor", "traversability", "time_to_traverse", "congestion_level"],
    aggfunc="mean",
).fillna(method="ffill")

In [10]:
scaler = StandardScaler()
time_series_data_normalized = scaler.fit_transform(time_series_data.values.reshape(-1, time_series_data.shape[1]))

time_series_data_normalized = pd.DataFrame(time_series_data_normalized, columns=time_series_data.columns, index=time_series_data.index)
time_series_list = time_series_data_normalized.values.T.tolist()

In [12]:
dtw_dist_matrix = distance_matrix(time_series_data_normalized)

Exception in thread QueueManagerThread:
Traceback (most recent call last):
  File "C:\Users\dgojn\anaconda3\envs\Datenanalyse-in-Unternehmen\lib\threading.py", line 932, in _bootstrap_inner
    self.run()
  File "C:\Users\dgojn\anaconda3\envs\Datenanalyse-in-Unternehmen\lib\site-packages\ipykernel\ipkernel.py", line 766, in run_closure
    _threading_Thread_run(self)
  File "C:\Users\dgojn\anaconda3\envs\Datenanalyse-in-Unternehmen\lib\threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "C:\Users\dgojn\anaconda3\envs\Datenanalyse-in-Unternehmen\lib\concurrent\futures\process.py", line 394, in _queue_management_worker
    work_item.future.set_exception(bpe)
  File "C:\Users\dgojn\anaconda3\envs\Datenanalyse-in-Unternehmen\lib\concurrent\futures\_base.py", line 547, in set_exception
    raise InvalidStateError('{}: {!r}'.format(self._state, self))
concurrent.futures._base.InvalidStateError: CANCELLED: <Future at 0x1e3065bdc10 state=cancelled>


BrokenProcessPool: A process in the process pool was terminated abruptly while the future was running or pending.

In [59]:
df = pd.DataFrame(X_scaled)
df

Unnamed: 0,0,1,2,3,4,5,6
0,-0.238263,-0.253093,-0.091942,0.086120,-0.231911,1.299512,-0.362058
1,-0.172121,-0.188737,-0.362923,-0.449705,-0.231911,0.044129,-0.362058
2,2.308206,2.224589,1.985572,-0.490922,-0.231911,-0.352502,-0.362058
3,-1.131181,-1.121890,-1.040373,0.745596,-0.231911,-0.952298,1.973527
4,0.886152,0.840949,1.895246,0.621945,-0.231911,0.281823,-0.362058
...,...,...,...,...,...,...,...
851405,-0.370547,-0.381804,-0.814556,-0.490922,-0.231911,0.934040,-0.362058
851406,-0.271334,-0.285271,-0.498413,-0.449705,-0.231911,3.605134,-0.362058
851407,-0.469760,-0.478337,-0.543576,0.003685,-0.231911,1.978240,-0.362058
851408,-0.602044,-0.607047,-0.814556,-0.161184,-0.231911,1.151008,-0.362058


In [60]:
# 4. Clustering mit k-Means
kmeans = KMeans(n_clusters=5, random_state=42)
tmp_df['Cluster'] = kmeans.fit_predict(X_scaled)

# 5. Ergebnisse visualisieren
sns.pairplot(tmp_df, hue='Cluster', vars=features)
plt.show()

  super()._check_params_vs_input(X, default_n_init=10)


KeyboardInterrupt: 

Error in callback <function _draw_all_if_interactive at 0x000002863FE33D30> (for post_execute):


KeyboardInterrupt: 

Error in callback <function flush_figures at 0x00000286B3434B80> (for post_execute):


KeyboardInterrupt: 