# Cloud Expenditure Optimization – Notebook Suite
This set of notebooks follows the architecture: ETL → Database → ML (Failure, Cost) → Dashboards.

**Data input**: `../data/sample_reports_100.csv` (or `../data/sample_reports.csv`)

**Outputs**: cleaned data and artifacts in `../results/`.

## 03 – Cost Forecasting (ARIMA baseline)
Aggregate costs into 15-minute intervals and forecast the next 8 intervals, using ARIMA(1,1,1) with a rolling-mean fallback.

In [None]:
import pandas as pd
from pathlib import Path

# Locate the input dataset. The first candidate path that exists is used, so
# list order is the priority order.
possible_paths = [
    '../data/cleaned_reports.csv',
    '../data/sample_reports_100.csv',
    '/mnt/data/cleaned_reports.csv',
    '/mnt/data/sample_reports_100.csv',
]
data_path = next((p for p in possible_paths if Path(p).exists()), None)
# Raise explicitly rather than `assert`: assertions are stripped under
# `python -O`, which would let `None` reach `pd.read_csv`.
if data_path is None:
    raise FileNotFoundError(f'Could not find dataset. Checked: {possible_paths}')
print('Using data:', data_path)

# Parse `timestamp` on load, order rows chronologically, then total
# `cost_usd` into 15-minute buckets. `fillna(0.0)` guards against any NaN
# left in the aggregated series.
df = pd.read_csv(data_path, parse_dates=['timestamp'])
df = df.sort_values('timestamp')
ts = df.set_index('timestamp')['cost_usd'].resample('15min').sum().fillna(0.0)

# Notebook display of the first few aggregated intervals.
ts.head()


In [None]:
# Draw the resampled cost series as one line chart with default styling.
import matplotlib.pyplot as plt

fig, ax = plt.subplots()
ts.plot(ax=ax)
ax.set_title('Resampled Cloud Cost (15-min)')
fig.tight_layout()
plt.show()


In [None]:
# Forecast the next `horizon` intervals of `ts`.
# ARIMA(1,1,1) is tried first. If statsmodels cannot be imported, or the
# fit/forecast raises for any reason, fall back to a flat forecast at the
# rolling mean of the most recent (up to 5) observations.
forecast_df = None
horizon = 8  # forecast 8 intervals

try:
    from statsmodels.tsa.arima.model import ARIMA
    model = ARIMA(ts, order=(1, 1, 1))
    model_fit = model.fit()
    fc = model_fit.forecast(steps=horizon)
    forecast_df = fc.to_frame(name='forecast_cost_usd')
except Exception as e:
    # Deliberately broad: this is a best-effort baseline, so any ARIMA
    # problem (missing package, fitting error) degrades to the naive forecast
    # instead of stopping the notebook.
    print('ARIMA unavailable, falling back to rolling mean:', e)
    rm = ts.rolling(window=5, min_periods=1).mean()
    last_val = rm.iloc[-1]
    forecast_df = pd.DataFrame({'forecast_cost_usd': [last_val] * horizon})

# Give both code paths the same future index. The step is taken from the
# series' own frequency so it stays aligned with however `ts` was resampled;
# 15 minutes is the fallback when the index carries no frequency.
step = getattr(ts.index, 'freq', None)
if step is None:
    step = pd.Timedelta(minutes=15)
forecast_df.index = pd.date_range(ts.index[-1] + step, periods=horizon, freq=step)
forecast_df.head()


In [None]:
# Plot historical + forecast on one chart, then persist the forecast.
import matplotlib.pyplot as plt

plt.figure()
ts.plot(label='history')
forecast_df['forecast_cost_usd'].plot(label='forecast')
plt.title('Cloud Cost Forecast (baseline)')
plt.legend()
plt.tight_layout()
plt.show()

# Save to results for dashboard use
out_path = Path('../results/cost_forecast.csv')
# Create the results directory if it is missing (e.g. on a fresh checkout);
# `to_csv` does not create parent directories and would fail otherwise.
out_path.parent.mkdir(parents=True, exist_ok=True)
forecast_df.to_csv(out_path)
print('Saved forecast to', out_path)
