In [0]:
%pip install pmdarima statsmodels prophet


In [0]:
# --------------------------------------------
# 📌 SECTION 1 — Load Data from Databricks
# --------------------------------------------
# Reads the feature table, keeps the five columns this notebook uses, orders
# the rows by timestamp in Spark, and collects the result into pandas.

print("=" * 80)
print("SECTION 1 — LOAD DATA FROM DATABRICKS")
print("=" * 80)

from pyspark.sql import functions as F
import pandas as pd

# `spark` is not created in this notebook — presumably the session object that
# the Databricks runtime injects into every notebook.
df_spark = (
    spark.table("workspace.default.train_imputed_timebins_lags")
    .select(
        "timestamp", "country", "grid_stress_score",
        "mean_temperature_c", "Actual_Load",
    )
    .orderBy("timestamp")
)

# toPandas() collects the whole selection onto the driver.
df = df_spark.toPandas()
# Ensure the column is a pandas datetime regardless of how it arrived from Spark.
df["timestamp"] = pd.to_datetime(df["timestamp"])

print(df.head())
print(df.shape)


#### ⭐ For Classical Time-Series (ARIMA / SARIMA / Prophet)
You **DO NOT** load a validation dataset separately.  

Why?  

Because:  
📌 Classical time-series models are validated with a chronological train/test split of the time series itself.  

There is only ONE continuous historical time series, and validation happens like this:  

train part = first 90% of the timeline  
test part  = last 10% (or last N hours)  

ARIMA uses the past → to predict the future, so you do NOT feed it a separate validation table as you would for a typical ML model.  

In [0]:
# --------------------------------------------
# 📌 SECTION 2 — Select Country & Prepare TS
# --------------------------------------------
# Filters the loaded frame to one country and exposes its grid_stress_score
# column as a Series indexed by timestamp (`ts`), which the later cells model.

print("=" * 80)
print("SECTION 2 — PREPARE TIME SERIES (PER COUNTRY)")
print("=" * 80)

country = "DE"   # ← change if needed

df_country = (
    df[df["country"] == country]
    .copy()
    .sort_values("timestamp")
    .set_index("timestamp")
)

ts = df_country["grid_stress_score"]

# Fail here with a clear message instead of deep inside the ADF / ARIMA code
# when the country code matches no rows.
if len(ts) == 0:
    raise ValueError(f"No rows found for country {country!r}")

print(ts.head())
print("Length:", len(ts))


In [0]:
# --------------------------------------------
# 📌 SECTION 3 — Stationarity (ADF Test)
# --------------------------------------------
# Runs the Augmented Dickey-Fuller test on `ts` and prints a verdict using a
# 0.05 p-value threshold.

print("=" * 80)
print("SECTION 3 — STATIONARITY TEST (ADF)")
print("=" * 80)

from statsmodels.tsa.stattools import adfuller

# The test is run on the non-missing observations only.
adf = adfuller(ts.dropna())

# Element 0 of the result is the test statistic, element 1 the p-value.
print("ADF Statistic:", adf[0])
print("p-value:", adf[1])

if adf[1] < 0.05:
    print("✓ Time series is stationary → d = 0")
else:
    print("✗ Time series NOT stationary → differencing needed (d = 1)")


In [0]:
# --------------------------------------------
# 📌 SECTION 4 — Auto-ARIMA (Parameter Selection)
# --------------------------------------------
# Lets pmdarima search for a non-seasonal (p, d, q) order and stores the
# result in `p`, `d`, `q` for the ARIMA fit in the next section.

print("=" * 80)
print("SECTION 4 — AUTO ARIMA PARAMETER SEARCH")
print("=" * 80)

from pmdarima import auto_arima

# pmdarima requires an input without NaN values, so drop missing observations
# exactly as the ADF cell does. This is a no-op when the series is complete.
auto_model = auto_arima(
    ts.dropna(),
    seasonal=False,
    trace=True,
    stepwise=True,
    suppress_warnings=True,
)

print(auto_model.summary())

p, d, q = auto_model.order
print(f"Selected order: (p,d,q) = ({p},{d},{q})")


In [0]:
# --------------------------------------------
# 📌 SECTION 5 — Fit ARIMA Model
# --------------------------------------------
# Fits a statsmodels ARIMA on `ts` with the (p, d, q) order chosen by the
# auto-ARIMA search; the fitted result is kept in `arima_fit` for forecasting.

print("=" * 80)
print("SECTION 5 — FIT ARIMA MODEL")
print("=" * 80)

from statsmodels.tsa.arima.model import ARIMA

# p, d, q come from the auto-ARIMA cell, so that cell must have been run first.
arima = ARIMA(ts, order=(p, d, q))
arima_fit = arima.fit()

print(arima_fit.summary())


In [0]:
# --------------------------------------------
# 📌 SECTION 6 — Forecast Next 6 Hours
# --------------------------------------------
# Produces an out-of-sample forecast from the fitted ARIMA model.

# Forecast horizon in steps; the banner below is derived from it so the two
# cannot drift apart when the horizon is changed.
forecast_steps = 6

print("=" * 80)
print(f"SECTION 6 — FORECAST NEXT {forecast_steps} HOURS (ARIMA)")
print("=" * 80)

arima_forecast = arima_fit.forecast(steps=forecast_steps)

print(arima_forecast)


In [0]:
# --------------------------------------------
# 📌 SECTION 7 — Plot Forecast
# --------------------------------------------
# Draws the tail of the historical series together with the ARIMA forecast.

print("=" * 80)
print("SECTION 7 — PLOT RESULTS")
print("=" * 80)

import matplotlib.pyplot as plt

# Positional slice of the last (up to) 200 observations; .iloc makes the
# positional intent explicit on a timestamp-indexed Series.
history = ts.iloc[-200:]

fig, ax = plt.subplots(figsize=(12, 5))
# Labels are derived from the data so they stay correct for short series or a
# different forecast horizon.
ax.plot(history, label=f"Historical ({len(history)} points)")
ax.plot(arima_forecast, label=f"Forecast (next {len(arima_forecast)} hours)")

ax.set_title(f"ARIMA Forecast for {country}")
ax.set_xlabel("timestamp")
ax.set_ylabel("grid_stress_score")
ax.legend()
plt.show()


In [0]:
# --------------------------------------------
# 📌 SECTION 8 — SARIMA (Seasonal ARIMA)
# --------------------------------------------

# Disabled for now due to memory issues; skip this cell.
# When enabled it searches seasonal orders with auto_arima (m=24), refits the
# selected orders with SARIMAX, and forecasts 6 steps ahead.
# NOTE(review): re-enabling this cell rebinds the globals p, d, q (also used by
# the ARIMA fit cell) and binds m (the name the Prophet cell uses for its model).

#print("="*80)
#print("SECTION 8 — SARIMA MODEL (OPTIONAL)")
#print("="*80)

#sarima_auto = auto_arima(
#    ts,
#    seasonal=True,
#    m=24,  # 24-hour seasonality
#    trace=True,
#    suppress_warnings=True
#)

#P, D, Q, m = sarima_auto.seasonal_order
#p, d, q = sarima_auto.order

#print("SARIMA order:", (p, d, q))
#print("Seasonal order:", (P, D, Q, m))

#from statsmodels.tsa.statespace.sarimax import SARIMAX

#sarima = SARIMAX(ts, order=(p,d,q), seasonal_order=(P,D,Q,m))
#sarima_fit = sarima.fit()

#sarima_fc = sarima_fit.forecast(steps=6)
#print(sarima_fc)


In [0]:
# PLOT SARIMA FORECAST
# Disabled together with the SARIMA cell: `sarima_fc` only exists once that
# cell has been re-enabled and run.
#plt.figure(figsize=(12,5))
#plt.plot(ts[-200:], label="Historical (last 200 points)")
#plt.plot(sarima_fc, label="SARIMA Forecast (next 6 hours)", color="orange")
#plt.legend()
#plt.title(f"SARIMA Forecast for {country}")
#plt.show()


In [0]:
# --------------------------------------------
# 📌 SECTION 9 — Prophet Forecasting
# --------------------------------------------
# Fits Prophet on the same per-country series and predicts 6 periods past the
# end of the history.

print("=" * 80)
print("SECTION 9 — PROPHET FORECASTING")
print("=" * 80)

from prophet import Prophet

# Prophet expects the timestamp column to be named "ds" and the target "y".
df_prophet = df_country.reset_index()[["timestamp", "grid_stress_score"]]
df_prophet.columns = ["ds", "y"]

m = Prophet(daily_seasonality=True)
m.fit(df_prophet)

# NOTE(review): pandas >= 2.2 deprecates the "H" alias in favour of "h";
# switch once the pandas version on the cluster is confirmed.
future = m.make_future_dataframe(periods=6, freq="H")
forecast = m.predict(future)

# Last expression of the cell, so the notebook displays it.
forecast.tail(10)


In [0]:
# Plot the Prophet forecast (`m` and `forecast` come from the Prophet cell),
# then resize the returned figure to 12x5 inches.
fig = m.plot(forecast)
fig.set_size_inches(12, 5)

# Plot the forecast components; as the cell's last expression, the returned
# object is also what the notebook displays.
m.plot_components(forecast)


In [0]:
# --------------------------------------------
# 📌 SECTION 10 — Reusable Forecast Function
# --------------------------------------------

# Not in use yet, so the whole cell is commented out.
# The function below repeats Sections 2, 4, 5 and 6 for an arbitrary country
# and returns (historical series, forecast). It relies on `auto_arima` and
# `ARIMA` having been imported by the earlier cells.

#print("="*80)
#print("SECTION 10 — REUSABLE FORECAST FUNCTION")
#print("="*80)

#def forecast_arima_pipeline(df, country, steps=6):
#    df_c = df[df["country"] == country].copy()
#    df_c = df_c.sort_values("timestamp").set_index("timestamp")
    
#    ts = df_c["grid_stress_score"]
    
#    auto_model = auto_arima(ts, seasonal=False, suppress_warnings=True)
#    p, d, q = auto_model.order
    
#    model = ARIMA(ts, order=(p,d,q))
#    fit = model.fit()
    
#    fc = fit.forecast(steps=steps)
#    return ts, fc

# NOTE(review): "ALL" is used as the country here while the rest of the notebook
# uses "DE" — confirm that an "ALL" value exists in df["country"].
#ts_hist, fc_next = forecast_arima_pipeline(df, "ALL")
#print(fc_next)


#### 🎉 ARIMA Pipeline

This notebook:  

✔ Loads your Databricks tables  
✔ Converts to Pandas time series  
✔ Performs all classical time-series steps  
✔ Fits ARIMA and Prophet (the SARIMA / SARIMAX cell is included but commented out because of memory issues)  
✔ Forecasts next 6 hours  
✔ Plots results  
✔ Provides a reusable forecasting function (currently commented out)  