# Cloud Expenditure Optimization – Notebook Suite
This set of notebooks follows the architecture: ETL → Database → ML (Failure, Cost) → Dashboards.

**Data input**: `../data/sample_reports_100.csv` (or `../data/sample_reports.csv`)

**Outputs**: cleaned data and artifacts in `../results/`.

## 02 – Failure Prediction (Logistic Regression & Random Forest)
Predict whether a report's status is FAILED (target = 1) or anything else, e.g. SUCCESS (target = 0).

In [None]:
from pathlib import Path

import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# Locate the dataset: prefer the cleaned file, fall back to the raw sample.
# Candidates are tried in order and the first path that exists wins.
# NOTE(review): the notebook header says cleaned data is written to
# ../results/, but no ../results/ path is checked here — confirm where the
# ETL notebook actually writes cleaned_reports.csv.
possible_paths = [
    '../data/cleaned_reports.csv',
    '../data/sample_reports_100.csv',
    '/mnt/data/cleaned_reports.csv',
    '/mnt/data/sample_reports_100.csv',
]
data_path = next((p for p in possible_paths if Path(p).exists()), None)
# Raise explicitly rather than `assert`: assertions are stripped under
# `python -O`, which would let a missing dataset slip through to read_csv
# and fail there with a far less helpful message.
if data_path is None:
    raise FileNotFoundError(f'Could not find dataset. Checked: {possible_paths}')
print('Using data:', data_path)

# Read the report data; the 'timestamp' column is parsed as datetimes on load.
df = pd.read_csv(data_path, parse_dates=['timestamp'])

# Binary label: 1 when the status is FAILED (case-insensitive), 0 otherwise.
is_failed = df['status'].str.upper().eq('FAILED')
df['target'] = is_failed.astype(int)

# Model inputs, grouped by how they are preprocessed.
cat_features = ['system_name', 'error_code']
num_features = ['response_time_ms', 'cpu_usage', 'memory_usage', 'cost_usd']
features = cat_features + num_features

X = df[features]
y = df['target']

# Categorical columns are one-hot encoded (categories not seen during fit are
# ignored at transform time); numeric columns are passed through unchanged.
preprocess = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features),
        ('num', 'passthrough', num_features),
    ]
)

# 75/25 split with a fixed seed, stratified on the label so both partitions
# keep the same FAILED / non-FAILED proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y,
)

# Logistic Regression
# The pipeline gets its own copy of the preprocessor: Pipeline.fit fits its
# steps in place, so putting the shared `preprocess` instance into several
# pipelines would let fitting one pipeline silently refit the preprocessor
# that another (already fitted) pipeline holds.
# NOTE(review): numeric features reach the classifier unscaled; if the solver
# reports a ConvergenceWarning at max_iter=200, consider scaling them.
logreg = Pipeline(steps=[
    ('prep', clone(preprocess)),
    ('clf', LogisticRegression(max_iter=200)),
])
logreg.fit(X_train, y_train)
y_pred_lr = logreg.predict(X_test)
# LogisticRegression always exposes predict_proba, so call it directly instead
# of going through a getattr fallback that could never trigger.
y_proba_lr = logreg.predict_proba(X_test)
# Column 1 is the predicted probability of the positive class (target == 1).
auc_lr = roc_auc_score(y_test, y_proba_lr[:, 1])

print("=== Logistic Regression ===")
print(confusion_matrix(y_test, y_pred_lr))
print(classification_report(y_test, y_pred_lr))
print("ROC AUC:", auc_lr)

# Random Forest
# The pipeline gets its own copy of the preprocessor: Pipeline.fit fits its
# steps in place, so reusing the shared `preprocess` instance here would refit
# the very object that any other pipeline built from it is holding.
rf = Pipeline(steps=[
    ('prep', clone(preprocess)),
    ('clf', RandomForestClassifier(n_estimators=200, random_state=42)),
])
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
# RandomForestClassifier always exposes predict_proba, so call it directly
# instead of going through a getattr fallback that could never trigger.
y_proba_rf = rf.predict_proba(X_test)
# Column 1 is the predicted probability of the positive class (target == 1).
auc_rf = roc_auc_score(y_test, y_proba_rf[:, 1])

print("\n=== Random Forest ===")
print(confusion_matrix(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))
print("ROC AUC:", auc_rf)

# Persist both fitted pipelines as joblib artifacts (optional demo)
import os
import joblib

art_dir = Path('../results')
# Create the output directory (and any missing parents) if it is not there yet.
art_dir.mkdir(parents=True, exist_ok=True)
for artifact_name, fitted_model in (
    ('model_logreg.joblib', logreg),
    ('model_random_forest.joblib', rf),
):
    joblib.dump(fitted_model, art_dir / artifact_name)
print('Saved models to ../results/')
