In [2]:
# In notebooks/06_Clustering_Analysis.ipynb
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import warnings
warnings.filterwarnings('ignore')

# Read the processed battery CSV into a DataFrame; the path is relative to
# the notebook's working directory.
input_path = '../data/processed_battery_data.csv'
print("Loading processed battery data...")
df = pd.read_csv(input_path)

# --- Feature Engineering for Clustering ---
# Collapse the rows of `df` into one summary row per battery_id with two
# features: the mean of temp_mean and a capacity fade rate.
def _capacity_fade_rate(capacity):
    """Return (first capacity - last capacity) / number of rows for one battery.

    NOTE(review): "first" and "last" are positional, so this presumably
    expects each battery's rows to already be in cycle order -- confirm
    against the step that writes processed_battery_data.csv.
    """
    start_capacity = capacity.iloc[0]
    end_capacity = capacity.iloc[-1]
    return (start_capacity - end_capacity) / len(capacity)


per_battery = df.groupby('battery_id')
fleet_summary = per_battery.agg(
    avg_temp=('temp_mean', 'mean'),
    fade_rate=('capacity', _capacity_fade_rate),
)
# Turn battery_id back into an ordinary column instead of the index.
fleet_summary = fleet_summary.reset_index()

print("Engineered features for fleet summary:")
print(fleet_summary)

# --- K-Means Clustering ---
# Number of groups to split the fleet into. K-Means cannot fit more clusters
# than there are samples, so this must not exceed the number of batteries in
# fleet_summary.
N_CLUSTERS = 2

# Fail early with a message phrased in terms of batteries rather than letting
# the estimator raise later on.
if len(fleet_summary) < N_CLUSTERS:
    raise ValueError(
        f"Cannot form {N_CLUSTERS} clusters from {len(fleet_summary)} "
        "battery summaries; lower N_CLUSTERS or load more batteries."
    )

features_for_clustering = fleet_summary[['avg_temp', 'fade_rate']]

# Standardise each feature (zero mean, unit variance) so that neither one
# dominates the distance computation purely because of its units.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features_for_clustering)

# Partition the batteries into N_CLUSTERS groups. random_state fixes the
# centroid initialisation so reruns give the same labels; n_init=10 keeps the
# best of ten initialisations. The integer labels are arbitrary identifiers,
# not a ranking.
kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=42, n_init=10)
fleet_summary['cluster'] = kmeans.fit_predict(scaled_features)

print("\nClustering complete. Here are the results:")
print(fleet_summary)

# --- Save the results ---
# Write the per-battery summary (features plus cluster label) to CSV;
# index=False keeps the positional row index out of the file.
output_path = '../data/clustered_fleet_data.csv'
fleet_summary.to_csv(path_or_buf=output_path, index=False)
print(f"\n✅ Clustered fleet data saved to {output_path}")

Loading processed battery data...
Engineered features for fleet summary:
  battery_id   avg_temp  fade_rate
0      B0053  12.238551   0.019092
1      B0054  13.512857   0.007184

Clustering complete. Here are the results:
  battery_id   avg_temp  fade_rate  cluster
0      B0053  12.238551   0.019092        0
1      B0054  13.512857   0.007184        1

✅ Clustered fleet data saved to ../data/clustered_fleet_data.csv
