# Load dataset

In [None]:
# Fetch the SAMoSSA repository; the electricity dataset loaded below is read from its datasets/ folder.
!git clone https://github.com/AbdullahO/SAMoSSA.git


In [None]:
# Switch the working directory to the cloned repository.
# NOTE(review): the absolute /content path presumably targets a Colab runtime — confirm.
%cd /content/SAMoSSA

In [None]:
import numpy as np

# Read the electricity array shipped with the cloned SAMoSSA repository.
dataset = np.load(
    '/content/SAMoSSA/datasets/electricity/electricity.npy',
    encoding='bytes',
)


- The dataset contains 26304 arrays (rows), each of length 370: each column is one user, so every user has 26304 data points.

# Training stage
- Use the validation set to find the optimal hyperparameters of the DeepAR model (number of layers, epochs, dropout).

In [None]:
# NOTE(review): the earlier comments on these lines ("Arrays 1 to 25824" / "Arrays 25825 to 25872")
# did not match the slices; the comments below describe what the code actually selects — confirm the intended windows.
training_set = dataset[25800:25848]      # rows 25800-25847 (0-based): 48 rows
validation_set = dataset[25848:25872] # rows 25848-25871 (0-based): the 24 rows right after the training window

In [None]:
import pandas as pd
import numpy as np

number_of_hours, num_users = training_set.shape

# One hourly timestamp per row of the training window
date_range = pd.date_range(start='01/01/2011 00:00', periods=number_of_hours, freq='H')

# Long format: one [timestamp, load, user id] record per (user, hour),
# grouped by user; user ids are 1-based column positions.
data = [
    [date_range[hour], value, user_id]
    for user_id in range(1, num_users + 1)
    for hour, value in enumerate(training_set[:, user_id - 1])
]

# Assemble the training DataFrame
df_train = pd.DataFrame(data, columns=['Date', 'Load', 'UserID'])


In [None]:
import pandas as pd
import numpy as np

number_of_hours, num_users = validation_set.shape

# Generate date range.
# The validation window follows the 48-hour training window that is stamped from
# 2011-01-01 00:00, so it starts on 3 January 2011. The date is written in ISO format
# because pandas parses '03/01/2011' month-first, i.e. as 1 March 2011.
date_range = pd.date_range(start='2011-01-03 00:00', periods=number_of_hours, freq='H')

# Long format: one [timestamp, load, user id] record per (user, hour),
# grouped by user; user ids are 1-based column positions.
data = [
    [date_range[hour], value, user_id]
    for user_id in range(1, num_users + 1)
    for hour, value in enumerate(validation_set[:, user_id - 1])
]

# Create DataFrame
df_valid = pd.DataFrame(data, columns=['Date', 'Load', 'UserID'])


In [None]:
# Fixed subset of 50 user ids (1-based, matching the 'UserID' column) used for the validation experiment.
sampled_user_ids = [
    58, 53, 84, 274, 164, 365, 340, 225, 281, 48,
    42, 298, 334, 63, 3, 229, 262, 104, 64, 27,
    133, 61, 245, 2, 67, 337, 127, 248, 218, 217,
    317, 280, 243, 76, 219, 250, 305, 75, 350, 49,
    95, 224, 162, 367, 73, 161, 238, 324, 29, 154,
]

In [None]:
# Restrict both frames to the rows that belong to the sampled users
df_train = df_train.loc[df_train['UserID'].isin(sampled_user_ids)]
df_valid = df_valid.loc[df_valid['UserID'].isin(sampled_user_ids)]

## Train DeepAR model


In [None]:
# Install the forecasting dependencies; gluonts provides the DeepAREstimator used below.
# NOTE(review): versions are unpinned, and pytorch_forecasting is not imported anywhere
# in the visible notebook — confirm it is still required.
!pip install pytorch-lightning pytorch_forecasting
!pip install gluonts

In [None]:
import torch

# Report the installed PyTorch version for reproducibility.
print(torch.__version__)

In [None]:
from gluonts.dataset.pandas import PandasDataset

# Wrap the long-format training frame as a GluonTS dataset:
# one hourly series per 'UserID', with 'Load' as the target.
train_ds = PandasDataset.from_long_dataframe(
    df_train,
    target='Load',
    item_id='UserID',
    timestamp='Date',
    freq='H',
)

In [None]:
from gluonts.torch.model.deepar import DeepAREstimator

# Model with the selected hyperparameters. The testing stage is meant to reuse them,
# but with the training and validation data together as input.
# NOTE(review): the testing-stage estimator uses max_epochs=50 and prediction_length=48 — confirm which values are intended.
estimator = DeepAREstimator(
    freq='H',
    prediction_length=24,
    num_layers=7,
    dropout_rate=0.1,
    trainer_kwargs={'accelerator': 'gpu', 'max_epochs':100},
)
predictor = estimator.train(train_ds, num_workers=2)

In [None]:
# Run the trained predictor on the training dataset and materialise the returned forecasts as a list.
pred = list(predictor.predict(train_ds))

In [None]:
# Turn every forecast into a small long-format frame (timestamp, mean prediction, user id)
# and stack the frames into one DataFrame.
per_user_frames = []
for forecast in pred:
    mean_path = forecast.samples.mean(axis=0)  # average over the sample axis
    horizon_dates = pd.date_range(
        start=forecast.start_date.to_timestamp(),
        periods=len(mean_path),
        freq='H',
    )
    per_user_frames.append(
        pd.DataFrame({'Date': horizon_dates, 'pred': mean_path, 'UserID': forecast.item_id})
    )
all_preds = pd.concat(per_user_frames, ignore_index=True)


In [None]:
# Give both frames a fresh 0..n-1 index so they can be combined by row position
df_valid = df_valid.reset_index(drop=True)
all_preds = all_preds.reset_index(drop=True)

In [None]:
# Attach the actual validation loads next to the predictions.
# The assignment is aligned on the index only, so it assumes both frames list the same users
# in the same order with the same number of rows per user — TODO confirm.
all_preds['Load'] = df_valid['Load']

In [None]:
# Display the predictions together with the actual loads.
all_preds

In [None]:
all_preds_df = pd.DataFrame(all_preds)

# Drop the rows whose actual 'Load' is 0 before scoring
non_zero_time_df = all_preds_df.loc[all_preds_df['Load'] != 0]

# Per-user SMAPE (%): 100 * sum|pred - Load| / sum(|pred| + |Load|)
smape_values = []
for user_id, group in non_zero_time_df.groupby('UserID'):
    abs_error_total = np.abs(group['pred'] - group['Load']).sum()
    magnitude_total = (np.abs(group['pred']) + np.abs(group['Load'])).sum()
    smape = 100 * abs_error_total / magnitude_total
    smape_values.append(smape)

smape_values

In [None]:
# Mean of the per-user SMAPE values computed on the validation window.
average_smape = np.mean(smape_values)
print("SMAPE for DeepAR model across all users (%):",average_smape)

# Testing stage - Calculate SMAPE on the test set
- Combine the training and validation sets, retrain the model, and calculate SMAPE on the test set.

In [None]:
# Every row of the dataset before index 25872.
# NOTE(review): this is wider than training_set + validation_set as sliced earlier
# (dataset[25800:25872]) — confirm that retraining on the full history is intended.
train_val_set = dataset[:25872]

In [None]:
import pandas as pd
import numpy as np

number_of_hours, num_users = train_val_set.shape

# Generate date range
date_range = pd.date_range(start='01/01/2011 00:00', periods=number_of_hours, freq='H')

# Build the long-format frame column-wise instead of appending one Python list per
# (user, hour) pair: with ~25.9k hours x 370 users the nested loop creates millions of
# small objects. Row order is unchanged: all hours of user 1, then user 2, ...
df_train_val = pd.DataFrame({
    # the same timestamps repeated once per user
    'Date': np.tile(date_range.to_numpy(), num_users),
    # column-major flattening walks down each user's column in turn
    'Load': np.asarray(train_val_set).ravel(order='F'),
    # 1-based user id, held constant for that user's block of hours
    'UserID': np.repeat(np.arange(1, num_users + 1, dtype='int64'), number_of_hours),
})


In [None]:
# Display the combined training + validation frame.
df_train_val

In [None]:
from gluonts.dataset.pandas import PandasDataset

# Wrap the combined long-format frame as a GluonTS dataset:
# one hourly series per 'UserID', with 'Load' as the target.
train_val_ds = PandasDataset.from_long_dataframe(
    df_train_val,
    target='Load',
    item_id='UserID',
    timestamp='Date',
    freq='H',
)

In [None]:
from gluonts.torch.model.deepar import DeepAREstimator

# Retrain DeepAR on the combined data with a 48-step forecast horizon.
# NOTE(review): the validation-stage estimator uses max_epochs=100 and prediction_length=24 — confirm the difference is intended.
estimator = DeepAREstimator(
    freq='H',
    prediction_length=48,
    num_layers=7,
    dropout_rate=0.1,
    trainer_kwargs={'accelerator': 'gpu', 'max_epochs':50},
)

predictor = estimator.train(train_val_ds, num_workers=2)

In [None]:
# Run the retrained predictor on the combined dataset and materialise the returned forecasts as a list.
pred_test = list(predictor.predict(train_val_ds))

In [None]:
# Turn every test forecast into a long-format frame (timestamp, mean prediction, user id).
# NOTE(review): `all_preds` is rebound to this list of frames here, replacing the
# validation-stage DataFrame of the same name.
all_preds = [
    pd.DataFrame({
        'Date': pd.date_range(
            start=item.start_date.to_timestamp(),
            periods=len(item.samples.mean(axis=0)),
            freq='H',
        ),
        'pred': item.samples.mean(axis=0),  # average over the sample axis
        'UserID': item.item_id,
    })
    for item in pred_test
]
all_preds_test = pd.concat(all_preds, ignore_index=True)

In [None]:
import pandas as pd
import numpy as np

# `testing_set` was not defined anywhere in the notebook, so this cell failed on a fresh kernel.
# Take the 48 rows that directly follow train_val_set (dataset[:25872]): 48 matches the
# prediction_length of the retrained model, which the row-position join of predictions
# and actual loads further down relies on.
# NOTE(review): confirm this is the intended test window.
testing_set = dataset[25872:25920]

number_of_hours, num_users = testing_set.shape

# Generate date range.
# 14 December 2013 is 25872 hours after 2011-01-01 00:00, i.e. the first hour after
# train_val_set. ISO format avoids the day-first/month-first ambiguity of '14/12/2013'.
date_range = pd.date_range(start='2013-12-14 00:00', periods=number_of_hours, freq='H')

# Long format: one [timestamp, load, user id] record per (user, hour),
# grouped by user; user ids are 1-based column positions.
data = [
    [date_range[hour], value, user_id]
    for user_id in range(1, num_users + 1)
    for hour, value in enumerate(testing_set[:, user_id - 1])
]

# Create DataFrame
df_test = pd.DataFrame(data, columns=['Date', 'Load', 'UserID'])

In [None]:
# Display the long-format test frame.
df_test

In [None]:
# Give both frames a fresh 0..n-1 index so they can be combined by row position
df_test = df_test.reset_index(drop=True)
all_preds_test = all_preds_test.reset_index(drop=True)

In [None]:
# Attach the actual test loads next to the predictions.
# The assignment is aligned on the index only, so it assumes both frames list the same users
# in the same order with the same number of rows per user — TODO confirm.
all_preds_test['Load'] = df_test['Load']

In [None]:
all_preds_test_df = pd.DataFrame(all_preds_test)


# Calculate SMAPE for each user:
#   100 * sum|pred - Load| / sum(|pred| + |Load| + eps)
# The absolute value is taken of `pred` and `Load` separately; wrapping the whole sum in
# np.abs (as before) shrinks the denominator whenever a prediction is negative.
# The small eps keeps the denominator non-zero when both values are 0.
smape_values_test = []
for user_id, group in all_preds_test_df.groupby('UserID'):
    abs_error_total = np.abs(group['pred'] - group['Load']).sum()
    magnitude_total = (np.abs(group['pred']) + np.abs(group['Load']) + 0.0000001).sum()
    smape = 100 * abs_error_total / magnitude_total
    smape_values_test.append(smape)

smape_values_test

In [None]:
# Mean of the per-user SMAPE values computed on the test window.
average_smape_test= np.mean(smape_values_test)
print("SMAPE for DeepAR model across all users (%):",average_smape_test)