In [None]:
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
from pytz import timezone

In [None]:
# 1. CSV-Daten laden
file_path = "../data/processed/charging_sessions_cleaned.csv"
#file_path = "../data/raw/charging_sessions.csv"
df = pd.read_csv(file_path, parse_dates=["connectionTime", "disconnectTime", 'doneChargingTime'])
df.head()

In [None]:
# 2. Zeitzonenanpassung
local_timezone = timezone("America/Los_Angeles") #TODO zeitzone as dataframe auslesen timezone(df['timezone'].to_string())

if df['connectionTime'].dt.tz is None:
    df['connectionTime'] = df['connectionTime'].dt.tz_localize('UTC').dt.tz_convert(local_timezone)
else:
    df['connectionTime'] = df['connectionTime'].dt.tz_convert(local_timezone)

if df['disconnectTime'].dt.tz is None:
    df['disconnectTime'] = df['disconnectTime'].dt.tz_localize('UTC').dt.tz_convert(local_timezone)
else:
    df['disconnectTime'] = df['disconnectTime'].dt.tz_convert(local_timezone)

if df['doneChargingTime'].dt.tz is None:
    df['doneChargingTime'] = df['doneChargingTime'].dt.tz_localize('UTC').dt.tz_convert(local_timezone)
else:
    df['doneChargingTime'] = df['doneChargingTime'].dt.tz_convert(local_timezone)

# 3. Feature-Engineering
df['duration'] = (df['disconnectTime'] - df['connectionTime']).dt.total_seconds() / 3600 #duration in Stunden
df['ConnectionHourOfDay'] = df['connectionTime'].dt.hour
#features = df[['kWhDelivered', 'duration', 'ConnectionHourOfDay', 'siteID']].dropna()
#features = df.dropna()

features = df.drop(columns=['Unnamed: 0','id','sessionID','stationID','timezone'])
# 4. Daten normalisieren
#scaler = StandardScaler()
#scaled_features = scaler.fit_transform(features)

features.info()

In [None]:
features.head()

In [None]:
sns.pairplot(data=features)


In [None]:
droppedfeatures = features.drop(columns=['userInputs','connectionTime','disconnectTime','doneChargingTime']).dropna()
droppedfeatures.info()

In [None]:
scaler = StandardScaler()
scaled_features = scaler.fit_transform(droppedfeatures)


# 5. Elbow-Methode zur Bestimmung der optimalen Anzahl von Clustern
inertia = []
cluster_range = range(1, 11)

for k in cluster_range:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(scaled_features)
    inertia.append(kmeans.inertia_)

# Elbow-Plot erstellen
plt.figure(figsize=(8, 6))
plt.plot(cluster_range, inertia, marker='o', linestyle='--')
plt.title("Elbow-Methode zur Bestimmung der optimalen Clusteranzahl")
plt.xlabel("Anzahl der Cluster (k)")
plt.ylabel("Inertia (Summe der quadrierten Abstände)")
plt.grid(True)
plt.show()

In [None]:
# 6. Clusteranalyse mit der optimalen Anzahl von Clustern
# Benutzer kann die optimale Anzahl der Cluster auswählen
optimal_k = int(input("Gib die optimale Anzahl der Cluster (k) ein: "))
kmeans = KMeans(n_clusters=optimal_k, random_state=42)
droppedfeatures['cluster'] = kmeans.fit_predict(scaled_features)

# Cluster-Zentren zurückskalieren
cluster_centers = scaler.inverse_transform(kmeans.cluster_centers_)

# Cluster-Zentren ausgeben
print(f"\nCluster-Zentren ({optimal_k} Cluster):")
for i, center in enumerate(cluster_centers):
    print(f"Cluster {i + 1}:")
    print(f"  Gelieferte Energie = {center[0]:.2f} kWh")
    print(f"  Ladezeit = {center[1]:.2f} Stunden")
    print(f"  Stunde des Tages = {center[2]:.2f}")
    print(f"  Standort-ID = {center[3]:.2f}")
    print()

In [None]:
score = silhouette_score(scaled_features,droppedfeatures['cluster'])
print(f"Silhouetten-Score: {score:.2f}")

In [None]:
# 7. Grafische Darstellung der Clusteranalyse
fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(111, projection='3d')

scatter = ax.scatter(
    droppedfeatures['kWhDelivered'],
    droppedfeatures['duration'],
    droppedfeatures['ConnectionHourOfDay'],
    c=droppedfeatures['cluster'], cmap='viridis', s=50, alpha=0.6
)

# Achsen beschriften
ax.set_xlabel("Gelieferte Energie (kWh)")
ax.set_ylabel("Ladezeit (Stunden)")
ax.set_zlabel("Stunde des Tages")
ax.set_title(f"Clusteranalyse mit {optimal_k} Clustern")

# Farbskala hinzufügen
cbar = plt.colorbar(scatter, pad=0.1, ax=ax)
cbar.set_label("Cluster")

plt.show()

In [3]:
cluster_size = scaled_features.groupby('cluster').size()
cluster_size

AttributeError: 'numpy.ndarray' object has no attribute 'groupby'