#Load dataset

In [None]:
# Clone the SAMoSSA repository; the electricity dataset loaded later in this
# notebook is read from its datasets/electricity/ directory.
!git clone https://github.com/AbdullahO/SAMoSSA.git


In [None]:
# Switch the working directory to the cloned repository.
# NOTE(review): the /content prefix presumably means this runs on Google Colab - confirm.
%cd /content/SAMoSSA

In [None]:
import numpy as np
# Load the electricity array from the cloned repository.
# NOTE(review): `encoding` only matters when np.load has to unpickle Python 2
# object data; for a plain numeric .npy file it presumably has no effect -
# confirm before removing it.
dataset = np.load('/content/SAMoSSA/datasets/electricity/electricity.npy', encoding='bytes')


- The dataset is a 2-D array of shape (26304, 370): it contains 26304 rows (one per time step), each of length 370. Each user is a column, so every user has 26304 data points.

#Training stage
- Grid search for hyperparameters
- Use the validation set to find the optimal parameters (only 50 users are used for validation to speed up the process)
- Retrain the model on both the training and validation sets for all users



In [None]:
training_set = dataset[25800:25848]      # rows 25800..25847 (0-based): 48 consecutive time steps, all users
validation_set = dataset[25848:25872] # rows 25848..25871 (0-based): the 24 time steps that follow the training slice

In [None]:
# The 50 users kept for the validation experiment. The values are compared
# against the 1-based 'UserID' column of the long-format frames.
sampled_user_ids = [
    58, 53, 84, 274, 164, 365, 340, 225, 281, 48,
    42, 298, 334, 63, 3, 229, 262, 104, 64, 27,
    133, 61, 245, 2, 67, 337, 127, 248, 218, 217,
    317, 280, 243, 76, 219, 250, 305, 75, 350, 49,
    95, 224, 162, 367, 73, 161, 238, 324, 29, 154,
]

In [None]:
import pandas as pd
import numpy as np

# Rows of training_set are time steps, columns are users.
number_of_hours, num_users = training_set.shape

# One timestamp per row of the training slice.
# NOTE(review): the row count is named "hours" but freq='D' spaces the
# timestamps one day apart - confirm that daily spacing is intended.
date_range = pd.date_range(start='01/01/2011 00:00', periods=number_of_hours, freq='D')

# Long format: one [timestamp, load, 1-based user id] record per
# (user, time step) pair, ordered user by user.
data = [
    [timestamp, load, user_id]
    for user_id, user_column in enumerate(training_set.T, start=1)
    for timestamp, load in zip(date_range, user_column)
]

# Create DataFrame
df_train = pd.DataFrame(data, columns=['Date', 'Load', 'UserID'])


In [None]:
# Bare expression: the notebook renders the training frame as the cell output.
df_train

In [None]:
import pandas as pd
import numpy as np

# Rows of validation_set are time steps, columns are users.
number_of_hours, num_users = validation_set.shape

# One timestamp per row of the validation slice, starting on 18 February 2011.
# The start is written in ISO 8601 form: the previous day-first string
# '18/02/2011' only resolved to the right date because 18 cannot be a month.
# NOTE(review): the row count is named "hours" but freq='D' spaces the
# timestamps one day apart - confirm that daily spacing is intended.
date_range = pd.date_range(start='2011-02-18 00:00', periods=number_of_hours, freq='D')

# Long format: one [timestamp, load, 1-based user id] record per
# (user, time step) pair, ordered user by user.
data = [
    [timestamp, load, user_id]
    for user_id, user_column in enumerate(validation_set.T, start=1)
    for timestamp, load in zip(date_range, user_column)
]

# Create DataFrame
df_validation = pd.DataFrame(data, columns=['Date', 'Load', 'UserID'])

In [None]:
# Bare expression: the notebook renders the validation frame as the cell output.
df_validation

In [None]:
# Restrict both long-format frames to the rows whose UserID is listed in
# sampled_user_ids.
train_is_sampled = df_train['UserID'].isin(sampled_user_ids)
validation_is_sampled = df_validation['UserID'].isin(sampled_user_ids)

df_train = df_train[train_is_sampled]
df_validation = df_validation[validation_is_sampled]

In [None]:
# Install Prophet, the forecasting library imported in the model-fitting cell.
!pip install prophet

#Testing Stage
- Fit the model on the training and validation sets for each user, using the hyperparameters found (changepoint prior scale: 0.001, seasonality prior scale: 0.01)
- Make predictions
- Calculate the SMAPE between the predictions and the true values

In [None]:
#code for small dataset
from prophet import Prophet

# Every UserID that occurs in df_train.
user_ids = df_train['UserID'].unique()

# Fitted Prophet model per user, keyed by UserID.
models = {}

for user_id in user_ids:
    # This user's history in the ds/y column layout passed to Prophet.
    # dropna() discards any row with a missing value in either column.
    df_user = (
        df_train.loc[df_train['UserID'] == user_id, ['Date', 'Load']]
        .rename(columns={'Date': 'ds', 'Load': 'y'})
        .dropna()
    )

    # Same fixed hyperparameters for every user.
    model = Prophet(changepoint_prior_scale=0.001, seasonality_prior_scale=0.01)
    model.fit(df_user)
    models[user_id] = model

In [None]:
# One merged forecast/true-value frame per user is collected here.
predictions = []

for user_id in df_validation['UserID'].unique():
    # Users without a fitted model are skipped.
    if user_id not in models:
        continue

    # Validation rows of this user and the dates to forecast ('ds' column).
    user_test_data = df_validation[df_validation['UserID'] == user_id]
    future_dates = user_test_data[['Date']].rename(columns={'Date': 'ds'}).dropna()

    # Forecast those dates and tag the result with the user it belongs to.
    forecast = models[user_id].predict(future_dates)
    forecast['UserID'] = user_id

    # Keep only the date, the point forecast and the user id, then attach the
    # validation rows (including the true 'Load') by user and date.
    merged_forecast = pd.merge(
        forecast[['ds', 'yhat', 'UserID']],
        user_test_data,
        left_on=['UserID', 'ds'],
        right_on=['UserID', 'Date'],
        how='left',
    )
    predictions.append(merged_forecast)

# Combine all predictions and true values into a single DataFrame
all_predictions_with_true = pd.concat(predictions)

# all_predictions_with_true now holds, for each user, the point forecast
# ('yhat') next to the true value ('Load'); the forecast's other columns were
# dropped before the merge.

In [None]:
# Bare expression: the notebook renders the combined frame as the cell output.
all_predictions_with_true

In [None]:
import pandas as pd
# Frame handed to calculate_smape; it needs the 'UserID', 'Load' and 'yhat' columns.
df = pd.DataFrame(all_predictions_with_true)

# Function to calculate SMAPE
def calculate_smape(df):
    """Return the SMAPE (in percent) of ``yhat`` against ``Load`` for each user.

    For every row the error term is ``|Load - yhat| / (|Load| + |yhat|)``.
    The terms are averaged per ``UserID`` and multiplied by 100, so each
    score lies between 0 and 100.

    Args:
        df: DataFrame with the columns ``UserID``, ``Load`` (true values) and
            ``yhat`` (predictions).

    Returns:
        Series of SMAPE percentages indexed by ``UserID`` (sorted).
    """
    y_true = df['Load']
    y_pred = df['yhat']

    # Sum of magnitudes, not magnitude of the sum: |y_true + y_pred| shrinks
    # towards zero when the two values have opposite signs, which inflates
    # the term and can even divide by zero.
    denominator = y_true.abs() + y_pred.abs()
    error = (y_true - y_pred).abs() / denominator

    # Both values are zero: an exact prediction counts as zero error rather
    # than 0/0 = NaN. Rows with a missing Load or yhat stay NaN and are
    # skipped by mean().
    error = error.mask(denominator == 0, 0.0)

    # Group on the raw UserID values so duplicate index labels (the frame is
    # a concatenation of per-user frames) cannot affect row-to-group matching.
    per_user = error.groupby(df['UserID'].to_numpy()).mean()
    return (100 * per_user).rename_axis('UserID')

# Per-user SMAPE, then the unweighted mean of those per-user scores.
smape_results = calculate_smape(df)
overall_mean_smape = smape_results.mean()
# Bare expression: the notebook shows the value as the cell output.
overall_mean_smape