# Cloud Expenditure Optimization – Notebook Suite
This set of notebooks follows the architecture: ETL → Database → ML (Failure, Cost) → Dashboards.

**Data input**: `../data/sample_reports_100.csv` (or `../data/sample_reports.csv`)

**Outputs**: cleaned data and artifacts in `../results/`.

## 04 – Workload Clustering (KMeans)
Cluster workloads on standardized response time, CPU usage, memory usage, and cost to spot high-cost patterns.

In [None]:
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

# Candidate dataset locations, tried in order: the cleaned file before the
# sample file, and the relative ../data copies before the /mnt/data copies.
possible_paths = [
    '../data/cleaned_reports.csv',
    '../data/sample_reports_100.csv',
    '/mnt/data/cleaned_reports.csv',
    '/mnt/data/sample_reports_100.csv',
]
# First candidate that exists on disk, or None when none of them do.
data_path = next((p for p in possible_paths if Path(p).exists()), None)
# Raise explicitly instead of using `assert`: assertions are stripped under
# `python -O`, which would let a None path slip through and fail later with
# a less helpful error.
if data_path is None:
    raise FileNotFoundError(f'Could not find dataset. Checked: {possible_paths}')
print('Using data:', data_path)

# Load the reports; the 'timestamp' column is parsed into datetimes on read.
df = pd.read_csv(data_path, parse_dates=['timestamp'])

# Columns used as clustering features.
feature_cols = ['response_time_ms', 'cpu_usage', 'memory_usage', 'cost_usd']

# KMeans rejects NaN input, and the fallback dataset is not the cleaned file.
# Drop incomplete rows from df itself (not only from X) so that labels
# computed from Xs stay row-aligned with df.
n_before = len(df)
df = df.dropna(subset=feature_cols)
if len(df) < n_before:
    print(f'Dropped {n_before - len(df)} rows with missing feature values')
X = df[feature_cols].copy()

# Standardize each feature to zero mean / unit variance so that no single
# column dominates the Euclidean distances KMeans relies on.
scaler = StandardScaler()
Xs = scaler.fit_transform(X)

# Elbow method (quick): fit KMeans for k = 2..6 on the scaled features and
# collect each model's inertia (within-cluster sum of squared distances).
candidate_ks = range(2, 7)
inertias = []
for k in candidate_ks:
    # fit() returns the estimator itself, so km is the fitted model.
    km = KMeans(n_clusters=k, n_init=10, random_state=42).fit(Xs)
    inertias += [km.inertia_]

# Bare expression so the notebook displays the collected values.
inertias


In [None]:
# Plot elbow: inertia against the number of clusters
import matplotlib.pyplot as plt

# x-axis values; these must match the k range swept when `inertias` was built.
ks = list(range(2,7))

# Explicit Figure/Axes API instead of the implicit pyplot state machine.
fig, ax = plt.subplots()
ax.plot(ks, inertias, marker='o')
ax.set_title('Elbow Plot')
ax.set_xlabel('k')
ax.set_ylabel('Inertia')
fig.tight_layout()
plt.show()


In [None]:
# Fit final model with k=3 (example) and attach labels
final_k = 3
km = KMeans(n_clusters=final_k, n_init=10, random_state=42)
labels = km.fit_predict(Xs)
# One label per row of Xs; assigned positionally as a new column on df.
df['cluster'] = labels

# Cluster profiling: per-cluster mean of each clustering feature, 2 decimals
profile = df.groupby('cluster')[['response_time_ms','cpu_usage','memory_usage','cost_usd']].mean().round(2)
print(profile)

# Save labeled data for dashboards
from pathlib import Path
out_path = Path('../results/workload_clusters.csv')
# to_csv does not create missing directories, so make sure the output
# folder exists first (a no-op when it is already there).
out_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(out_path, index=False)
print('Saved clustered data to', out_path)
