In [10]:
import pandas as pd
import numpy as np
import warnings
import sys
import os
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from patsy import dmatrices
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import statsmodels.api as sm

try: 
    import holidays
except:
    !pip install holidays
    import holidays
    
try:
    from prophet import Prophet
except: 
    !pip install Prophet
    from prophet import Prophet
    
try:
    from pmdarima.arima import auto_arima
except:
    !pip install pmdarima
    from pmdarima.arima import auto_arima

from prophet.diagnostics import cross_validation
from prophet.diagnostics import performance_metrics

%matplotlib inline

%precision 4   


warnings.filterwarnings("ignore")



In [None]:
# Load the data
file_path = "data.csv"  # Update this if needed
df = pd.read_csv(file_path, parse_dates=['ts'], index_col='ts')

In [None]:
# Step 1: Aggregate Data for EDA
agg_df = df.resample('D').agg({
    'v1': ['mean', 'std', 'min', 'max'],
    'v2': ['mean', 'std', 'min', 'max']
})

# Flatten column names
agg_df.columns = ['_'.join(col).strip() for col in agg_df.columns]
agg_df.reset_index(inplace=True)

In [None]:
# Generate Time-Based Features
df["year"] = df.index.year
df["quarter"] = df.index.quarter
df["month"] = df.index.month
df["day"] = df.index.day
df["day_of_week"] = df.index.dayofweek + 1
df['weekend'] = (df['day_of_week'] >= 5).astype(int)


In [28]:

# Define the holdout set (last two months of 2024: Sept & Oct)
df_holdout = df[(df['year'] == 2024) & (df['month'] >= 9)]

# Define the work set (everything before November 2024)
df_work = df[(df['year'] < 2024) | ((df['year'] == 2024) & (df['month'] < 11))]

print(f"- Work set size: {df_work.shape[0]} rows")
print(f"- Holdout set size: {df_holdout.shape[0]} rows")

- Work set size: 24568 rows
- Holdout set size: 1440 rows


In [None]:

# # Step 3: Generate Lagged Values and Moving Averages
# lags = [1, 3, 6, 12, 24]  # Lag intervals in hours
# for lag in lags:
#     df[f'v1_lag_{lag}'] = df['v1'].shift(lag)
#     df[f'v2_lag_{lag}'] = df['v2'].shift(lag)

# # Moving Averages
# windows = [3, 6, 12, 24]  # Moving average windows in hours
# for window in windows:
#     df[f'v1_ma_{window}'] = df['v1'].rolling(window=window).mean()
#     df[f'v2_ma_{window}'] = df['v2'].rolling(window=window).mean()

# # Drop rows with NaN values caused by shifting
# df.dropna(inplace=True)

In [19]:
df.head()

Unnamed: 0_level_0,v1,v2,year,quarter,month,day,day_of_week,weekend,v1_lag_1,v2_lag_1,...,v1_lag_24,v2_lag_24,v1_ma_3,v2_ma_3,v1_ma_6,v2_ma_6,v1_ma_12,v2_ma_12,v1_ma_24,v2_ma_24
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-01-02 00:00:00,0.458599,0.172392,2022,1,1,2,7,1,1.428871,0.545715,...,0.999036,0.318587,1.232123,0.473084,1.360514,0.520717,1.159526,0.480125,0.811549,0.314868
2022-01-02 01:00:00,0.397861,0.141158,2022,1,1,2,7,1,0.458599,0.172392,...,0.695036,0.265836,0.761777,0.286422,1.144681,0.432287,1.126329,0.468727,0.799167,0.309673
2022-01-02 02:00:00,0.290278,0.087576,2022,1,1,2,7,1,0.397861,0.141158,...,0.764995,0.320261,0.382246,0.133709,0.903649,0.338882,1.072462,0.445056,0.779387,0.299978
2022-01-02 03:00:00,0.277386,0.086935,2022,1,1,2,7,1,0.290278,0.087576,...,0.297017,0.064074,0.321842,0.105223,0.776982,0.289153,1.017786,0.4137,0.778569,0.300931
2022-01-02 04:00:00,0.187479,0.043659,2022,1,1,2,7,1,0.277386,0.086935,...,0.366184,0.083961,0.251714,0.072723,0.506746,0.179573,0.942258,0.377972,0.771123,0.299252


In [4]:
df.head()

Unnamed: 0_level_0,v1,v2,hour,day_of_week,month,is_weekend,v1_lag_1,v2_lag_1,v1_lag_3,v2_lag_3,...,v1_lag_24,v2_lag_24,v1_ma_3,v2_ma_3,v1_ma_6,v2_ma_6,v1_ma_12,v2_ma_12,v1_ma_24,v2_ma_24
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-01-02 00:00:00,0.458599,0.172392,0,6,1,1,1.428871,0.545715,1.037385,0.385307,...,0.999036,0.318587,1.232123,0.473084,1.360514,0.520717,1.159526,0.480125,0.811549,0.314868
2022-01-02 01:00:00,0.397861,0.141158,1,6,1,1,0.458599,0.172392,1.808898,0.701144,...,0.695036,0.265836,0.761777,0.286422,1.144681,0.432287,1.126329,0.468727,0.799167,0.309673
2022-01-02 02:00:00,0.290278,0.087576,2,6,1,1,0.397861,0.141158,1.428871,0.545715,...,0.764995,0.320261,0.382246,0.133709,0.903649,0.338882,1.072462,0.445056,0.779387,0.299978
2022-01-02 03:00:00,0.277386,0.086935,3,6,1,1,0.290278,0.087576,0.458599,0.172392,...,0.297017,0.064074,0.321842,0.105223,0.776982,0.289153,1.017786,0.4137,0.778569,0.300931
2022-01-02 04:00:00,0.187479,0.043659,4,6,1,1,0.277386,0.086935,0.397861,0.141158,...,0.366184,0.083961,0.251714,0.072723,0.506746,0.179573,0.942258,0.377972,0.771123,0.299252


In [7]:
agg_df.head()

Unnamed: 0,ts,v1_mean,v1_std,v1_min,v1_max,v2_mean,v2_std,v2_min,v2_max
0,2022-01-01,0.834067,0.469176,0.282241,1.808898,0.32096,0.207638,0.063496,0.701144
1,2022-01-02,0.697316,0.407514,0.185973,1.730854,0.261147,0.18567,0.032356,0.795432
2,2022-01-03,1.154373,0.68847,0.188276,2.201807,0.372436,0.193573,0.049149,0.63572
3,2022-01-04,1.153662,0.701012,0.158614,2.561231,0.368689,0.192976,0.036944,0.698055
4,2022-01-05,1.132566,0.653181,0.181847,2.188276,0.34668,0.18815,0.057175,0.622978


In [8]:
df.is_weekend.value_counts()

is_weekend
0    17570
1     6998
Name: count, dtype: int64