In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import matplotlib.pyplot as mlt
import seaborn as sp
from torch.autograd import Variable
from torch import autograd
from datetime import datetime
import matplotlib.pyplot as plt
from datetime import timedelta
import sys
import os
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_squared_log_error, r2_score
from scipy.stats import ks_2samp
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.tsa.seasonal import seasonal_decompose
import json
from types import SimpleNamespace
from math import floor

In [29]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [3]:
# Column layouts used when rebuilding DataFrames from raw arrays/tensors.
# 'Energy' sits at position 1 in both lists; later cells address it by that
# positional index (e.g. `[:, 1]`).
# columns = ['Day of week', 'Energy', 'Week Day', 'Year', 'Month', 'Day of month', 'Hour', 'Sum', 'Seasonal', 'Trend', 'Residual']
columns = ['Day of week', 'Energy', 'stations', 'Week Day', 'Year', 'Month', 'Day of month', 'Hour']
# columns_extended = columns + ['Seasonal', 'Trend', 'Residual']
# Same features as `columns`, but with 'Year' moved to the last position.
columns_extended = ['Day of week', 'Energy', 'stations', 'Week Day', 'Month', 'Day of month', 'Hour', 'Year']

In [4]:
%run ../Pre_process/Data_preprocess.ipynb
%run ../Pre_process/Data_postprocess.ipynb
%run Model/ResiDualNet.ipynb
%run Model/ConvGan.ipynb
%run Model/AutoEncoder.ipynb
%run Model/Mean_imputation.ipynb
%run Model/KNN_imputer.ipynb
%run train.ipynb
%run wrapper.ipynb
%run helper.ipynb
%run ../visualize.ipynb
%run test.ipynb
%run ../validation.ipynb

In [5]:
# torch.cuda.empty_cache()

In [6]:
#pre_process_dataset("Data/Raw/boulder_2021.csv", 'boulder')
# pre_process_dataset("../Data/Raw/acn.csv", 'acn2')

In [7]:
# Parse the dataset configuration into nested SimpleNamespace objects so the
# settings can be read with attribute access (e.g. `data.caltech.train.start`).
with open("config/config_data.json") as json_data:
    data = json.load(
        json_data,
        object_hook=lambda mapping: SimpleNamespace(**mapping),
    )

# One settings namespace per dataset entry in the configuration file.
parameters_acn = data.acn
parameters_boulder = data.boulder
parameters_paloalto = data.paloalto
parameters_sap = data.sap
parameters_perth = data.perth
parameters_dundee = data.dundee
parameters_caltech = data.caltech
parameters_jpl = data.jpl
parameters_office = data.office

In [8]:
# Parse the model configuration into nested SimpleNamespace objects.
with open("config/config_model.json") as json_data:
    data = json.load(
        json_data,
        object_hook=lambda mapping: SimpleNamespace(**mapping),
    )

# Pick the seq2seq hyper-parameters from the `impute_40` entry and attach the
# compute device chosen earlier so the model code can read it from one place.
parameters_seq2seq = data.impute_40.seq2seq
parameters_seq2seq.device = device

In [9]:
parameter_model = parameters_seq2seq
parameter_data = parameters_caltech

In [10]:
missing_ratio = 0.50

In [11]:
dataset = 'caltech2'

In [12]:
# Load the pre-processed data for the selected dataset.
df = pd.read_csv("../Data/Processed/" + dataset + "_data_with_zero.csv")
# Derive the hour of day from the 'Start' timestamp.
df['Hour'] = pd.to_datetime(df['Start']).dt.hour
# df['Sum'] = df.groupby(pd.to_datetime(df['Start']).dt.date)['Energy'].cumsum()
# Move 'Year' to the last column position.
df = df[[col for col in df.columns if col != 'Year'] + ['Year']]
# Restrict to the study window. 'Start' is still the raw CSV column here (it is
# only converted to datetime on the next line), so this is a string comparison.
# NOTE(review): presumably relies on 'Start' being stored as zero-padded
# "YYYY-MM-DD HH:MM:SS" text — confirm against the processed CSV.
df = df.copy().loc[(df['Start'] >= "2018-10-01 00:00:00") & (df['Start'] <= "2020-02-29 23:00:00")].reset_index(drop=True)
df['Start'] = pd.to_datetime(df['Start'])
# df.set_index('Start', inplace=True)

In [13]:
# Training split: rows whose 'Start' lies inside the configured training
# window (both bounds inclusive), re-indexed from 0.
df_train = (
    df.loc[
        (df['Start'] >= parameter_data.train.start)
        & (df['Start'] <= parameter_data.train.end)
    ]
    .copy()
    .reset_index(drop=True)
)

In [14]:
# Test split: rows whose 'Start' lies inside the configured test window
# (both bounds inclusive), re-indexed from 0.
df_test = (
    df.loc[
        (df['Start'] >= parameter_data.test.start)
        & (df['Start'] <= parameter_data.test.end)
    ]
    .copy()
    .reset_index(drop=True)
)

In [15]:
# Fixed seed so the same training rows are selected on every run.
np.random.seed(0)
# Number of training rows to select (missing_ratio of the split, rounded).
ratio = round(len(df_train) * missing_ratio)
# Draw that many distinct row labels from the training index.
random_row_indices_train = np.random.choice(
    a=df_train.index,
    size=ratio,
    replace=False,
)

In [16]:
# Re-seed so the test selection does not depend on how many random draws
# happened before this cell.
np.random.seed(0)
# Number of test rows to select (missing_ratio of the split, rounded).
ratio = round(len(df_test) * missing_ratio)
# Draw that many distinct row labels from the test index.
random_row_indices_test = np.random.choice(
    a=df_test.index,
    size=ratio,
    replace=False,
)

In [17]:
# Build the corrupted copies of both splits together with their missing-value masks.
# NOTE(review): `random_index_noise` comes from a %run notebook; presumably it
# sets values at the given row indices to NaN — confirm.
df_train_missing = random_index_noise(df_train.copy(), random_row_indices_train)
# Boolean frame, True wherever the corrupted copy holds NaN.
df_train_mask = np.isnan(df_train_missing)
# Missing entries are zero-filled before being used downstream.
df_train_missing = df_train_missing.replace(np.nan, 0)

df_test_missing = random_index_noise(df_test.copy(), random_row_indices_test)
df_test_mask = np.isnan(df_test_missing)
df_test_missing = df_test_missing.replace(np.nan, 0)

In [18]:
# Index the real and corrupted frames by their 'Start' timestamp (in place).
for _frame in (df_train, df_train_missing, df_test, df_test_missing):
    _frame.set_index('Start', inplace=True)

In [19]:
# Run `get_data_decomposition` (defined in a %run notebook) on the corrupted
# and the real version of each split.
# NOTE(review): judging by the column names used at the bottom of this cell it
# presumably adds 'Residual', 'Trend' and 'Seasonal' columns — confirm.
# df_train_missing['mean'] = df_train_missing['Energy'].expanding().mean()
# df_train_missing['std'] = df_train_missing['Energy'].expanding().std().fillna(0)
df_train_missing = get_data_decomposition(df_train_missing)

# df_test_missing['mean'] = df_test_missing['Energy'].expanding().mean()
# df_test_missing['std'] = df_test_missing['Energy'].expanding().std().fillna(0)
df_test_missing = get_data_decomposition(df_test_missing)

# df_train['mean'] = df_train['Energy'].expanding().mean()
# df_train['std'] = df_train['Energy'].expanding().std().fillna(0)
df_train = get_data_decomposition(df_train)

# df_test['mean'] = df_test['Energy'].expanding().mean()
# df_test['std'] = df_test['Energy'].expanding().std().fillna(0)
df_test = get_data_decomposition(df_test)

# df_train_mask[['mean']] = df_train_missing[['mean']].isnull()
# df_test_mask[['mean']] = df_test_missing[['mean']].isnull()

# Extend the masks with the null pattern of the decomposition columns.
# NOTE(review): the mask frames were created before 'Start' became the index of
# the *_missing frames, so unless `get_data_decomposition` resets the index this
# assignment aligns on mismatched indexes — confirm. These three columns are not
# in `columns_extended`, so they are dropped by the later column selection.
df_train_mask[['Residual', 'Trend', 'Seasonal']] = df_train_missing[['Residual', 'Trend', 'Seasonal']].isnull()
df_test_mask[['Residual', 'Trend', 'Seasonal']] = df_test_missing[['Residual', 'Trend', 'Seasonal']].isnull()

In [20]:
# Discard the current index of the real and corrupted frames so they are
# addressed by plain row position again.
for _frame in (df_train, df_test, df_train_missing, df_test_missing):
    _frame.reset_index(drop=True, inplace=True)

# The masks still carry 'Start' as an ordinary column; remove it.
for _mask in (df_train_mask, df_test_mask):
    _mask.drop(columns=['Start'], inplace=True)

In [21]:
# Keep only the `columns_extended` features, in that order, in every frame so
# data, corrupted data and masks share one column layout.
(
    df_train,
    df_test,
    df_train_missing,
    df_test_missing,
    df_train_mask,
    df_test_mask,
) = (
    frame[columns_extended]
    for frame in (
        df_train,
        df_test,
        df_train_missing,
        df_test_missing,
        df_train_mask,
        df_test_mask,
    )
)

In [4636]:
# first_column = df.iloc[:, 0]

In [4185]:
# Scale every feature of the real data to [0, 1]. The scaler is fitted on the
# training split only and then applied unchanged to the test split.
scaler_train_real = MinMaxScaler(feature_range=(0,1))
df_train = pd.DataFrame(
    scaler_train_real.fit_transform(df_train),
    columns=columns_extended,
)
df_test = pd.DataFrame(
    scaler_train_real.transform(df_test),
    columns=columns_extended,
)

# Same procedure for the corrupted data, with its own scaler fitted on the
# corrupted training split.
scaler_train_missing = MinMaxScaler(feature_range=(0,1))
df_train_missing = pd.DataFrame(
    scaler_train_missing.fit_transform(df_train_missing),
    columns=columns_extended,
)
df_test_missing = pd.DataFrame(
    scaler_train_missing.transform(df_test_missing),
    columns=columns_extended,
)
# df = pd.concat([first_column, df], axis=1)

In [22]:
missing_train, real_train, mask_train = get_train_test_dataset_imputation(df_train, df_train_missing, df_train_mask, 0, parameter_model.lag_size, random_row_indices_train)

In [23]:
missing_test, real_test, mask_test = get_train_test_dataset_imputation(df_test, df_test_missing, df_test_mask, 0, parameter_model.lag_size, random_row_indices_test)

In [24]:
# Number of full mini-batches per training epoch. The batch size is read from
# the model configuration, as the test sections do, instead of relying on a
# bare `batch_size` name that this notebook never defines.
step_per_epoch = len(missing_train) // parameter_model.batch_size

# Converting to tensor
# `torch.as_tensor` accepts NumPy arrays and tensors alike, so this cell can be
# re-run after the variables have already been converted.
real_train = torch.as_tensor(real_train).float().to(device)
missing_train = torch.as_tensor(missing_train).float().to(device)
mask_train = torch.as_tensor(mask_train).float().to(device)

# GAN Model

In [25]:
wrapper = ModelTrain(parameter_model)
helper = ModelHelper(parameter_model)

In [26]:
# Instantiate the GAN's two networks on the selected device; both are built
# from the same model configuration object.
discriminator = Discriminator(parameter_model).to(device)
generator = Generator(parameter_model).to(device)
# One Adam optimizer per network, sharing the configured learning rate and an
# L2 weight decay of 0.05.
optimizer_discriminator = torch.optim.Adam(discriminator.parameters(), lr = parameter_model.learning_rate, weight_decay=0.05)
optimizer_generator = torch.optim.Adam(generator.parameters(), lr = parameter_model.learning_rate, weight_decay=0.05)

In [27]:
loss_function_MSE = nn.MSELoss()
loss_function = nn.BCELoss()

In [None]:
real_dataset, gen_dataset, mask_data, errors_generator, errors_discriminator = wrapper.train_Gan(generator, discriminator, optimizer_discriminator, optimizer_generator, loss_function, loss_function_MSE, real_train, missing_train, mask_train, step_per_epoch, helper)

In [None]:
mlt.suptitle('Loss')
mlt.plot(errors_discriminator, label='d_loss')
mlt.plot(errors_generator, label='g_loss')
mlt.legend()
#mlt.savefig('foo1.png')
mlt.show()

In [None]:
plot_full_dataset(real_dataset.detach().cpu().numpy(), gen_dataset.detach().cpu().numpy(), 00, 300, 'Results/test_9-19.png', 1)

In [4109]:
# real_dataset_train_seq2seq = pd.DataFrame(real_dataset.detach().cpu().numpy(), columns=columns_extended)
# real_dataset_train_seq2seq = scaler_train_real.inverse_transform(real_dataset_train_seq2seq)
# real_dataset_train_seq2seq = torch.tensor(real_dataset_train_seq2seq)
# #real_dataset_train_seq2seq = pd.DataFrame(real_dataset_train_seq2seq.detach().cpu().numpy(), columns=columns)

In [4110]:
# gen_dataset_train_seq2seq = pd.DataFrame(gen_dataset.detach().cpu().numpy(), columns=columns_extended)
# gen_dataset_train_seq2seq = scaler_train_real.inverse_transform(gen_dataset_train_seq2seq)
# gen_dataset_train_seq2seq = torch.tensor(gen_dataset_train_seq2seq)
# #gen_dataset_train_seq2seq = pd.DataFrame(gen_dataset_train_seq2seq.detach().cpu().numpy(), columns=columns)

### Testing..........................................

In [4111]:
# Drop the trailing rows that do not fill a complete batch.
# The end index is computed explicitly: the previous `x[: -(n % batch)]` form
# evaluates to `x[:0]` (an empty array) whenever n is already a multiple of the
# batch size.
real_test = real_test[: real_test.shape[0] - (real_test.shape[0] % parameter_model.batch_size)]
missing_test = missing_test[: missing_test.shape[0] - (missing_test.shape[0] % parameter_model.batch_size)]
mask_test = mask_test[: mask_test.shape[0] - (mask_test.shape[0] % parameter_model.batch_size)]

In [4112]:
step_per_epoch = len(missing_test) // parameter_model.batch_size

In [4113]:
real_test = torch.from_numpy(real_test).float().to(device)
missing_test = torch.from_numpy(missing_test).float().to(device)
mask_test = torch.from_numpy(mask_test).float().to(device)

In [4114]:
wrapper_test = ModelTest(parameter_model)

In [4115]:
real_dataset_test_gan, imputed_gan, loss, mask_test_result = wrapper_test.test_gan(generator, real_test, missing_test, mask_test, loss_function_MSE, step_per_epoch, helper)

In [4116]:
# Append the last column of the real test data to the generator output as an
# extra column.
# NOTE(review): presumably the generator emits one feature fewer than the real
# data and this restores the final column — confirm against Generator.
# This cell is not idempotent: re-running it appends another column.
imputed_gan = torch.cat((imputed_gan, real_dataset_test_gan[:, -1].unsqueeze(1)), dim=1)

In [4117]:
imputed_dataset_final_gan = ((1 - mask_test_result) * real_dataset_test_gan) + (mask_test_result * imputed_gan)

In [None]:
imputed_dataset_final_gan[0]

In [4120]:
dataset_name = dataset + '_9-19'
save_imputed_data(real_dataset_test_gan, imputed_dataset_final_gan, "../Data/Imputed/50_percent/Gan/" + dataset_name + ".csv", columns_extended, None)

In [3738]:
# real_dataset_test_seq2seq = pd.DataFrame(real_dataset_test_seq2seq.detach().cpu().numpy(), columns=columns_extended)
# real_dataset_test_seq2seq = scaler_train_real.inverse_transform(real_dataset_test_seq2seq)
real_dataset_test_gan = torch.tensor(real_dataset_test_gan.detach().cpu().numpy())
#real_dataset_test_seq2seq = pd.DataFrame(real_dataset_test_seq2seq.detach().cpu().numpy(), columns=columns)
# imputed_dataset = pd.DataFrame(imputed_dataset.detach().cpu().numpy(), columns=columns_extended)
# imputed_dataset = scaler_train_real.inverse_transform(imputed_dataset)
imputed_gan = torch.tensor(imputed_gan.detach().cpu().numpy())
#imputed_dataset = pd.DataFrame(imputed_dataset.detach().cpu().numpy(), columns=columns)

In [3739]:
indices = torch.nonzero(mask_test_result[:, 1] == 1).view(-1).to('cpu')

In [None]:
plot_full_dataset(real_dataset_test_gan.detach().cpu().numpy(), imputed_gan.detach().cpu().numpy(), 00, 300, 'Results/test_9-19.png', 1)

In [3665]:
indices = torch.nonzero(mask_test_result[:, 1] == 1).view(-1).to('cpu')

In [None]:
validation_matrix_forecasting(real_dataset_test_gan[indices, 1], imputed_gan[indices, 1], 1)

In [None]:
ks_statistic, p_value = ks_2samp(real_dataset_test_gan[:, 1], imputed_gan[:, 1])

# Print the results
print("KS Statistic:", ks_statistic)
print("P-value:", p_value)

# ResiDualNet

Training------------------------------------------------------------------------------------------------------------------------------

In [4189]:
model = ResiDualNet(parameter_model).to(device)
#model = Seq2SeqAttention(input_size, hidden_size, input_size).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr = parameter_model.learning_rate, weight_decay = 0.005)
loss_function_seq2seq = nn.L1Loss()


In [4190]:
wrapper = ModelTrain(parameter_model)
helper = ModelHelper(parameter_model)

In [None]:
real_dataset, gen_dataset, errors_generator, mask_data = wrapper.train_Seq2Seq(model, optimizer, loss_function_seq2seq, real_train, missing_train, mask_train, step_per_epoch, helper)

In [None]:
# Training-loss curve of the ResiDualNet model. `errors_generator` holds the
# loss history returned by `train_Seq2Seq`; there is no discriminator in this
# section, so the curve is labelled as the training loss rather than 'd_loss'.
mlt.suptitle('Loss')
mlt.plot(errors_generator, label='train_loss')
mlt.legend()

#mlt.savefig('foo1.png')
mlt.show()

In [None]:
plot_imputation_results_two(real_dataset.detach().cpu().numpy(), gen_dataset.detach().cpu().numpy(), mask_data.detach().cpu().numpy(), 300, 450, 'Results/test_9-19.png')

In [None]:
plot_full_dataset(real_dataset.detach().cpu().numpy(), gen_dataset.detach().cpu().numpy(), 2500, 3000, 'Results/test_9-19.png', 1)

In [4195]:
real_dataset_train_seq2seq = pd.DataFrame(real_dataset.detach().cpu().numpy(), columns=columns_extended)
real_dataset_train_seq2seq = scaler_train_real.inverse_transform(real_dataset_train_seq2seq)
real_dataset_train_seq2seq = torch.tensor(real_dataset_train_seq2seq)
#real_dataset_train_seq2seq = pd.DataFrame(real_dataset_train_seq2seq.detach().cpu().numpy(), columns=columns)

In [4196]:
gen_dataset_train_seq2seq = pd.DataFrame(gen_dataset.detach().cpu().numpy(), columns=columns_extended)
gen_dataset_train_seq2seq = scaler_train_real.inverse_transform(gen_dataset_train_seq2seq)
gen_dataset_train_seq2seq = torch.tensor(gen_dataset_train_seq2seq)
#gen_dataset_train_seq2seq = pd.DataFrame(gen_dataset_train_seq2seq.detach().cpu().numpy(), columns=columns)

Testing-------------------------------------------------------------------------------------------------------------------------------

In [4197]:
# Drop the trailing rows that do not fill a complete batch.
# The end index is computed explicitly: the previous `x[: -(n % batch)]` form
# evaluates to `x[:0]` when n is already a multiple of the batch size, which is
# exactly the situation after the GAN section has trimmed these same variables.
real_test = real_test[: real_test.shape[0] - (real_test.shape[0] % parameter_model.batch_size)]
missing_test = missing_test[: missing_test.shape[0] - (missing_test.shape[0] % parameter_model.batch_size)]
mask_test = mask_test[: mask_test.shape[0] - (mask_test.shape[0] % parameter_model.batch_size)]

In [4198]:
step_per_epoch = len(missing_test) // parameter_model.batch_size

In [4199]:
# tr = df['Energy']
# mlt.figure(figsize=(20, 6))
# mlt.suptitle('Gan prediction on test dataset')
# mlt.ylabel('Energy Consumption in Kwh')
# mlt.plot(tr, label='real')
# mlt.legend()
# mlt.show()

In [4200]:
# Move the test arrays to the compute device as float32 tensors.
# `torch.as_tensor` is used instead of `torch.from_numpy` because these
# variables are already tensors if the GAN test section ran first;
# `from_numpy` raises a TypeError for non-ndarray input.
real_test = torch.as_tensor(real_test).float().to(device)
missing_test = torch.as_tensor(missing_test).float().to(device)
mask_test = torch.as_tensor(mask_test).float().to(device)

In [4201]:
wrapper_test = ModelTest(parameters_seq2seq)

In [4202]:
real_dataset_test_seq2seq, imputed_dataset, loss, mask_test_result = wrapper_test.test_model(model, real_test, missing_test, mask_test, loss_function_seq2seq, step_per_epoch, helper)

In [4203]:
imputed_dataset_temp = ((1 - mask_test_result) * real_dataset_test_seq2seq) + (mask_test_result * imputed_dataset)

In [4204]:
imputed_dataset_final_seq2seq = scaler_train_real.inverse_transform(imputed_dataset_temp.detach().cpu().numpy())

In [4205]:
real_dataset_test_seq2seq_temp = scaler_train_real.inverse_transform(real_dataset_test_seq2seq.detach().cpu().numpy())

In [4206]:
real_dataset_test_seq2seq_temp = pd.DataFrame(real_dataset_test_seq2seq_temp, columns=columns_extended)

In [4207]:
dataset_name = dataset + '_9-19'

In [4208]:
# Save the Seq2Seq imputation for the test split.
# The output folder is derived from `missing_ratio` so the file lands in the
# directory matching the missing rate actually used (the path was hard-coded to
# "30_percent" although the ratio is 0.50 and the comparison section reads from
# "<ratio>_percent/Seq2Seq/").
# NOTE(review): `save_imputed_data` comes from a %run notebook; presumably it
# inverse-transforms with the given scaler before writing the CSV — confirm.
_seq2seq_output_dir = f"../Data/Imputed/{round(missing_ratio * 100)}_percent/Seq2Seq/"
os.makedirs(_seq2seq_output_dir, exist_ok=True)
save_imputed_data(real_dataset_test_seq2seq, imputed_dataset_temp, _seq2seq_output_dir + dataset_name + ".csv", columns_extended, scaler_train_real)

In [4209]:
real_dataset_test_seq2seq = pd.DataFrame(real_dataset_test_seq2seq.detach().cpu().numpy(), columns=columns_extended)
real_dataset_test_seq2seq = scaler_train_real.inverse_transform(real_dataset_test_seq2seq)
real_dataset_test_seq2seq = torch.tensor(real_dataset_test_seq2seq)
#real_dataset_test_seq2seq = pd.DataFrame(real_dataset_test_seq2seq.detach().cpu().numpy(), columns=columns)

In [4210]:
imputed_dataset_seq2seq = pd.DataFrame(imputed_dataset.detach().cpu().numpy(), columns=columns_extended)
imputed_dataset_seq2seq = scaler_train_real.inverse_transform(imputed_dataset_seq2seq)
imputed_dataset_seq2seq = torch.tensor(imputed_dataset_seq2seq)
#imputed_dataset = pd.DataFrame(imputed_dataset.detach().cpu().numpy(), columns=columns)

In [4211]:
indices = torch.nonzero(mask_test_result[:, 1] == 1).view(-1).to('cpu')

In [None]:
plot_full_dataset(real_dataset_test_seq2seq.detach().cpu().numpy(), imputed_dataset_seq2seq.detach().cpu().numpy(), 00, 3000, 'Results/test_9-19.png', 1)

In [None]:
mlt.suptitle('Loss')
#mlt.plot(errors_generator, label='train_loss')
mlt.plot(loss[:], label='test_loss')
mlt.legend()
#mlt.savefig('foo1.png')
mlt.show()

Evaluation----------------------------------------------------------------------------------------------------------------

In [4214]:
indices = torch.nonzero(mask_test_result[:, 1] == 1).view(-1).to('cpu')

In [None]:
indices

In [None]:
validation_matrix_forecasting(real_dataset_test_seq2seq[indices, 1], imputed_dataset_seq2seq[indices, 1], 1)

In [None]:
# Imputation metrics on original-unit data.
# `real_dataset_test_seq2seq` was inverse-transformed to original units in an
# earlier cell, so it is compared against `imputed_dataset_seq2seq` (the
# inverse-transformed CPU copy of the model output) rather than the raw
# `imputed_dataset`, which is still min-max scaled and may live on the GPU.
validation_matrix_imputation(real_dataset_test_seq2seq.numpy(), imputed_dataset_seq2seq.numpy(), mask_test_result.detach().cpu().numpy(), 1)

In [None]:
ks_statistic, p_value = ks_2samp(real_dataset_test_seq2seq[:, 1], imputed_dataset_seq2seq[:, 1])

# Print the results
print("KS Statistic:", ks_statistic)
print("P-value:", p_value)

# Conv Gan

In [None]:
mean = 0  # Mean of the distribution
std_dev = 1  # Standard deviation of the distribution

# Generate random data from a normal distribution
random_data = np.random.normal(loc=mean, scale=std_dev, size=(real_train.size()))
#random_data = np.clip(random_data, 0, 1)
random_data = torch.tensor(random_data,dtype=torch.float32, requires_grad=True).to(device)

In [None]:
# Build the convolutional GAN: BCE adversarial loss, generator/discriminator
# pair, and one RMSprop optimizer per network.
# NOTE(review): `input_size`, `hidden_size` and `learning_rate` are not defined
# anywhere in the visible notebook (other sections read hyper-parameters from
# `parameter_model`); they must come from a %run notebook or stale kernel
# state — confirm before relying on this section.
loss_function = nn.BCELoss()
generator = ConvGenerator(input_size, hidden_size, input_size).to(device)
discriminator = ConvDiscriminator(input_size, hidden_size).to(device)
optimizer_discriminator = torch.optim.RMSprop(discriminator.parameters(), lr = learning_rate)
optimizer_generator = torch.optim.RMSprop(generator.parameters(), lr = learning_rate)

In [None]:
real_dataset, gen_dataset, errors_generator, errors_discriminator, mask_results = train_ConvGan(generator, discriminator, optimizer_discriminator, optimizer_generator, loss_function, real_train, missing_train, mask_train, step_per_epoch, random_data)

In [None]:
plot_imputation_results(real_dataset, gen_dataset, mask_results,100,300)

In [None]:
plot_full_dataset(real_dataset, gen_dataset, 300, 600, 3)

In [None]:
tr = real_dataset[:, 7, 0]
te = gen_dataset[:, 7, 0]
ks_statistic, p_value = ks_2samp(tr.detach().cpu().numpy(), te.detach().cpu().numpy())

# Print the results
print("KS Statistic:", ks_statistic)
print("P-value:", p_value)

In [None]:
mlt.suptitle('Loss')
mlt.plot(errors_discriminator, label='d_loss')
mlt.plot(errors_generator, label='g_loss')
mlt.legend()
#mlt.savefig('foo1.png')
mlt.show()

In [None]:
# Error metrics between the real (`tr`) and generated (`te`) training series.
# RMSE is computed as sqrt(MSE): the `squared=False` keyword of
# `mean_squared_error` is deprecated and removed in recent scikit-learn
# releases, whereas this form works on every version.
mse = mean_squared_error(tr.detach().cpu().numpy(), te.detach().cpu().numpy())
RMSE = np.sqrt(mse)
print(f'RMSE:{RMSE}')

mae = mean_absolute_error(tr.detach().cpu().numpy(), te.detach().cpu().numpy())
print("MAE:", mae)

# Mean Squared Error (MSE)
print("MSE:", mse)

In [None]:
random_noise_test = torch.tensor(np.random.randn(real_test.shape[0], lag_size, input_size), dtype=torch.float32, requires_grad=True).to(device)

In [None]:
real_test = torch.from_numpy(real_test).float().to(device)

In [None]:
generator.eval()

In [None]:
real_data_test_conv, real_label_test_conv = gen_real_batch(real_test.shape[0], 0, real_test)

In [None]:
test_res = generator(real_data_test_conv, random_noise_test)

In [None]:
tr_test = real_data_test_conv[:, -1, 0]
te_test = test_res[:, -1, 0]
ks_statistic_test, p_value_test = ks_2samp(tr_test.detach().cpu().numpy(), te_test.detach().cpu().numpy())

# Print the results
print("KS Statistic:", ks_statistic_test)
print("P-value:", p_value_test)

In [None]:
plot_full_dataset(real_data_test_conv, test_res, 300, 600, -1)

In [None]:
# Error metrics between the real (`tr_test`) and generated (`te_test`) test series.
# RMSE is computed as sqrt(MSE): the `squared=False` keyword of
# `mean_squared_error` is deprecated and removed in recent scikit-learn
# releases, whereas this form works on every version.
mse = mean_squared_error(tr_test.detach().cpu().numpy(), te_test.detach().cpu().numpy())
RMSE = np.sqrt(mse)
print(f'RMSE:{RMSE}')

mae = mean_absolute_error(tr_test.detach().cpu().numpy(), te_test.detach().cpu().numpy())
print("MAE:", mae)

# Mean Squared Error (MSE)
print("MSE:", mse)

# Auto Encoder Model

In [2490]:
wrapper = ModelTrain(parameter_model)
helper = ModelHelper(parameter_model)

In [2491]:
model = VAE(parameter_model).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr = parameter_model.learning_rate)
loss_function_autoencoder = nn.MSELoss()

In [None]:
real_dataset, gen_dataset, errors_generator, mask_data = wrapper.train_Vae(model, optimizer, loss_function_autoencoder, real_train, missing_train, mask_train, step_per_epoch, helper)

In [None]:
# Training-loss curve of the VAE. `errors_generator` holds the loss history
# returned by `train_Vae`; there is no discriminator in this section, so the
# curve is labelled as the training loss rather than 'd_loss'.
mlt.suptitle('Loss')
mlt.plot(errors_generator, label='train_loss')
mlt.legend()
#mlt.savefig('foo1.png')
mlt.show()

In [None]:
plot_full_dataset(real_dataset.detach().cpu().numpy(), gen_dataset.detach().cpu().numpy(), 2500, 3000, 'Results/test_9-19.png', 1)

In [2495]:
real_dataset_train_vae = pd.DataFrame(real_dataset.detach().cpu().numpy(), columns=columns_extended)
real_dataset_train_vae = scaler_train_real.inverse_transform(real_dataset_train_vae)
real_dataset_train_vae = torch.tensor(real_dataset_train_vae)
#real_dataset_train_seq2seq = pd.DataFrame(real_dataset_train_seq2seq.detach().cpu().numpy(), columns=columns)

gen_dataset_train_vae = pd.DataFrame(gen_dataset.detach().cpu().numpy(), columns=columns_extended)
gen_dataset_train_vae = scaler_train_real.inverse_transform(gen_dataset_train_vae)
gen_dataset_train_vae = torch.tensor(gen_dataset_train_vae)
#gen_dataset_train_seq2seq = pd.DataFrame(gen_dataset_train_seq2seq.detach().cpu().numpy(), columns=columns)

### Testing ...............................

In [2496]:
# Drop the trailing rows that do not fill a complete batch.
# The end index is computed explicitly: `x[: -(n % batch)]` evaluates to `x[:0]`
# (empty) when n is already a multiple of the batch size, which is the case
# once an earlier test section has trimmed these same variables.
real_test = real_test[: real_test.shape[0] - (real_test.shape[0] % parameter_model.batch_size)]
missing_test = missing_test[: missing_test.shape[0] - (missing_test.shape[0] % parameter_model.batch_size)]
mask_test = mask_test[: mask_test.shape[0] - (mask_test.shape[0] % parameter_model.batch_size)]

# Number of full batches in the trimmed test set.
step_per_epoch = len(missing_test) // parameter_model.batch_size

# `torch.as_tensor` accepts NumPy arrays and tensors alike; `torch.from_numpy`
# raises if an earlier section already converted these variables.
real_test = torch.as_tensor(real_test).float().to(device)
missing_test = torch.as_tensor(missing_test).float().to(device)
mask_test = torch.as_tensor(mask_test).float().to(device)

In [2497]:
wrapper_test = ModelTest(parameters_seq2seq)

real_dataset_test_vae, imputed_dataset_vae, loss, mask_test_result = wrapper_test.test_Vae(model, real_test, missing_test, mask_test, loss_function_autoencoder, step_per_epoch, helper)

In [2498]:
imputed_dataset_temp = ((1 - mask_test_result) * real_dataset_test_vae) + (mask_test_result * imputed_dataset_vae)

imputed_dataset_final_vae = scaler_train_real.inverse_transform(imputed_dataset_temp.detach().cpu().numpy())

real_dataset_test_vae_temp = scaler_train_real.inverse_transform(real_dataset_test_vae.detach().cpu().numpy())

In [2499]:
dataset_name = dataset + '_9-19'

# Save the VAE imputation for the test split.
# The output folder is derived from `missing_ratio` so the file lands in the
# directory matching the missing rate actually used (the path was hard-coded to
# "30_percent" although the ratio is 0.50).
# NOTE(review): `save_imputed_data` comes from a %run notebook; presumably it
# inverse-transforms with the given scaler before writing the CSV — confirm.
_vae_output_dir = f"../Data/Imputed/{round(missing_ratio * 100)}_percent/vae/"
os.makedirs(_vae_output_dir, exist_ok=True)
save_imputed_data(real_dataset_test_vae, imputed_dataset_temp, _vae_output_dir + dataset_name + ".csv", columns_extended, scaler_train_real)


In [2500]:
real_dataset_test_vae = pd.DataFrame(real_dataset_test_vae.detach().cpu().numpy(), columns=columns_extended)
real_dataset_test_vae = scaler_train_real.inverse_transform(real_dataset_test_vae)
real_dataset_test_vae = torch.tensor(real_dataset_test_vae)
#real_dataset_test_seq2seq = pd.DataFrame(real_dataset_test_seq2seq.detach().cpu().numpy(), columns=columns)

imputed_dataset_vae = pd.DataFrame(imputed_dataset_vae.detach().cpu().numpy(), columns=columns_extended)
imputed_dataset_vae = scaler_train_real.inverse_transform(imputed_dataset_vae)
imputed_dataset_vae = torch.tensor(imputed_dataset_vae)
#imputed_dataset = pd.DataFrame(imputed_dataset.detach().cpu().numpy(), columns=columns)

In [None]:
indices = torch.nonzero(mask_test_result[:, 1] == 1).view(-1).to('cpu')

plot_full_dataset(real_dataset_test_vae.detach().cpu().numpy(), imputed_dataset_vae.detach().cpu().numpy(), 00, 3000, 'Results/test_9-19.png', 1)

In [None]:
mlt.suptitle('Loss')
#mlt.plot(errors_generator, label='train_loss')
mlt.plot(loss[:], label='test_loss')
mlt.legend()
#mlt.savefig('foo1.png')
mlt.show()

In [None]:
indices = torch.nonzero(mask_test_result[:, 1] == 1).view(-1).to('cpu')

validation_matrix_forecasting(real_dataset_test_vae[indices, 1], imputed_dataset_vae[indices, 1], 1)

# Mean Imputation

In [4564]:
missing_ratio = 0.50

with open("config/config_data.json") as json_data:
    data = json.load(json_data, object_hook=lambda d: SimpleNamespace(**d))
    parameters_acn = data.acn
    parameters_boulder = data.boulder
    parameters_paloalto = data.paloalto
    parameters_sap = data.sap
    parameters_perth = data.perth
    parameters_dundee = data.dundee
    parameters_caltech = data.caltech
    parameters_jpl = data.jpl
    parameters_office = data.office

parameter_data = parameters_office

In [4565]:
dataset_name = "office2"

In [4566]:
# Load the processed data for the mean-imputation baseline and add the hour of day.
df = pd.read_csv("../Data/Processed/" + dataset_name + "_data_with_zero.csv")
df['Hour'] = pd.to_datetime(df['Start']).dt.hour
# df['Sum'] = df.groupby(pd.to_datetime(df['Start']).dt.date)['Energy'].cumsum()
# 'Start' is still text here, so the date filters below are string comparisons.
df = df.copy().loc[df['Start'] <= "2020-12-31 23:00:00"].reset_index(drop=True)

# Test split: rows inside the configured test window (bounds inclusive).
df_test = df.copy().loc[(df['Start'] >= parameter_data.test.start) & (df['Start'] <= parameter_data.test.end)].reset_index(drop=True)

# df_test['Start'] = pd.to_datetime(df_test['Start'])
# df_test.set_index('Start', inplace=True)
# result = seasonal_decompose(df_test['Energy'], model='additive', extrapolate_trend='freq')
# df_test['Seasonal'] = result.seasonal
# df_test['Trend'] = result.trend
# df_test['Residual'] = result.resid
# df_test.reset_index(inplace=True)

df_test.drop(columns=['Start'], inplace=True)

# Fixed seed so the same rows are selected on every run.
np.random.seed(0)
ratio = round(missing_ratio * len(df_test))
random_row_indices_test = np.random.choice(df_test.index, size=ratio, replace=False)

# NOTE(review): `random_index_noise` comes from a %run notebook; presumably it
# sets values at the given row indices to NaN — confirm.
missing_data = random_index_noise(df_test.copy(), random_row_indices_test)

# 1 where a value is missing, 0 elsewhere. An explicit integer cast replaces
# `replace({True: 1, False: 0})`, whose result dtype relies on pandas' silent
# downcasting (deprecated) and would otherwise end up as object dtype, which
# `torch.tensor` cannot convert.
mask = np.isnan(missing_data).astype('int64')

In [None]:
import time

model = MeanImputation()
start = time.time()
imputed_data_mean = model(missing_data, 'Energy')
end = time.time()
print("Time taken for mean imputation in minutes:", (end-start) / 60)

In [4568]:
imputed_data_mean = torch.tensor(imputed_data_mean.values)
df_test = torch.tensor(df_test.values)
mask = torch.tensor(mask.values)

In [4569]:
imputed_data_mean = ((1 - mask) * df_test) + (mask * imputed_data_mean)

In [4570]:
indices_knn = torch.nonzero(mask[:, 1] == 1).view(-1).to('cpu')

In [None]:
plot_full_dataset(df_test.detach().cpu().numpy(), imputed_data_mean.detach().cpu().numpy(), 000, 500, 'Results/test3.png', 1)

In [4572]:
random_row_indices_test.sort()

In [None]:
random_row_indices_test.shape

In [None]:
validation_matrix_forecasting(df_test[indices_knn, 1], imputed_data_mean[indices_knn, 1], 1)

In [3576]:
# Rebuild a labelled DataFrame from the mean-imputed result.
# The tensor is converted to a NumPy array first: handing a torch tensor
# straight to `pd.DataFrame` does not reliably yield a numeric frame.
# NOTE(review): assumes the column order of the test frame matches `columns` —
# confirm against the processed CSV.
imputed_data = pd.DataFrame(np.asarray(imputed_data_mean), columns=columns)

In [3577]:
imputed_data['Start'] = imputed_data.apply(lambda row: datetime(int(round(row['Year'])), int(round(row['Month'])), int(round(row['Day of month'])), int(round(row['Hour']))), axis=1)    
# imputed_data['Sum'] = imputed_data.groupby(pd.to_datetime(imputed_data['Start']).dt.date)['Energy'].cumsum()
imputed_data.drop(columns=['Start'], inplace=True)

In [3578]:
imputed_data.to_csv("../Data/Imputed/50_percent/Mean/" + dataset_name + "9_19.csv", index=False)

# KNN Imputer

In [3371]:
missing_ratio = 0.50

In [3372]:
with open("config/config_data.json") as json_data:
    data = json.load(json_data, object_hook=lambda d: SimpleNamespace(**d))
    parameters_acn = data.acn
    parameters_boulder = data.boulder
    parameters_paloalto = data.paloalto
    parameters_sap = data.sap
    parameters_perth = data.perth
    parameters_dundee = data.dundee
    parameters_caltech = data.caltech
    parameters_jpl = data.jpl
    parameters_office = data.office

In [3373]:
parameter_data = parameters_paloalto

In [3374]:
dataset_name  = 'palo_alto2'

In [3375]:
# Load the processed data for the KNN baseline and add the hour of day.
df = pd.read_csv("../Data/Processed/" + dataset_name + "_data_with_zero.csv")
df['Hour'] = pd.to_datetime(df['Start']).dt.hour
# df['Sum'] = df.groupby(pd.to_datetime(df['Start']).dt.date)['Energy'].cumsum()
# 'Start' is still text here, so the date filters below are string comparisons.
df = df.copy().loc[df['Start'] <= "2020-12-31 23:00:00"].reset_index(drop=True)

# Test split: rows inside the configured test window (bounds inclusive).
df_test = df.copy().loc[(df['Start'] >= parameter_data.test.start) & (df['Start'] <= parameter_data.test.end)].reset_index(drop=True)

# df_test['Start'] = pd.to_datetime(df_test['Start'])
# df_test.set_index('Start', inplace=True)
# result = seasonal_decompose(df_test['Energy'], model='additive', extrapolate_trend='freq')
# df_test['Seasonal'] = result.seasonal
# df_test['Trend'] = result.trend
# df_test['Residual'] = result.resid
# df_test.reset_index(inplace=True)

df_test.drop(columns=['Start'], inplace=True)

# Fixed seed so the same rows are selected on every run.
np.random.seed(0)
ratio = round(missing_ratio * len(df_test))
random_row_indices_test = np.random.choice(df_test.index, size=ratio, replace=False)

# NOTE(review): `random_index_noise` comes from a %run notebook; presumably it
# sets values at the given row indices to NaN — confirm.
missing_data = random_index_noise(df_test.copy(), random_row_indices_test)

# 1 where a value is missing, 0 elsewhere. An explicit integer cast replaces
# `replace({True: 1, False: 0})`, whose result dtype relies on pandas' silent
# downcasting (deprecated) and would otherwise end up as object dtype, which
# `torch.tensor` cannot convert.
mask = np.isnan(missing_data).astype('int64')

In [3376]:
random_row_indices_test.sort()

In [None]:
random_row_indices_test

In [3378]:
# k_neighbour = get_optimum_k(df, missing_data)
k_neighbour = 4

In [3379]:
model = KnnImputer()
imputed_data = model(missing_data, k_neighbour)

In [3380]:
imputed_data = torch.tensor(imputed_data)
df_test = torch.tensor(df_test.values)
mask = torch.tensor(mask.values)

In [3381]:
indices_knn = torch.nonzero(mask[:, 1] == 1).view(-1).to('cpu')

In [3382]:
# result_knn = set_null_at_indices(imputed_data['Energy'].values.tolist(), indices_knn)

In [None]:
plot_full_dataset(df_test.detach().cpu().numpy(), imputed_data.detach().cpu().numpy(), 000, 1000, 'Results/test3.png', 1)

In [None]:
validation_matrix_forecasting(df_test[indices_knn, 1], imputed_data[indices_knn, 1], 1)

In [None]:
validation_matrix_imputation(df_test.detach().cpu().numpy(), imputed_data.detach().cpu().numpy(), mask.detach().cpu().numpy(), 1)

In [3386]:
# Rebuild a labelled DataFrame from the KNN-imputed result.
# `np.asarray` turns the tensor into a NumPy array first (handing a torch
# tensor straight to `pd.DataFrame` does not reliably yield a numeric frame)
# and also accepts a DataFrame, so the cell can be re-run.
# NOTE(review): assumes the column order of the test frame matches `columns` —
# confirm against the processed CSV.
imputed_data = pd.DataFrame(np.asarray(imputed_data), columns=columns)

In [3387]:
imputed_data['Start'] = imputed_data.apply(lambda row: datetime(int(round(row['Year'])), int(round(row['Month'])), int(round(row['Day of month'])), int(round(row['Hour']))), axis=1)    


In [3388]:
# imputed_data['Sum'] = imputed_data.groupby(pd.to_datetime(imputed_data['Start']).dt.date)['Energy'].cumsum()
imputed_data.drop(columns=['Start'], inplace=True)

In [3389]:
imputed_data.to_csv("../Data/Imputed/50_percent/Knn/" + dataset_name + "_9_19.csv", index=False)

In [None]:
# Make detached tensor copies of the Seq2Seq results.
# `torch.tensor(existing_tensor)` emits a UserWarning and is discouraged;
# `as_tensor(...).detach().clone()` produces the same detached copy and also
# accepts NumPy input.
imputed_dataset = torch.as_tensor(imputed_dataset).detach().clone()
real_dataset_test_seq2seq = torch.as_tensor(real_dataset_test_seq2seq).detach().clone()

# Compare All

In [4502]:
dataset = 'caltech2'
imputation = '50'

In [4503]:
# Remove the timestamp column before the numeric comparison.
# Written without `inplace` and with errors='ignore' so that re-running the
# cell (or running it on a frame whose 'Start' was already removed) does not
# raise a KeyError.
# NOTE(review): this cell expects `df_test` to be a pandas DataFrame for the
# dataset selected above; the baseline sections rebind `df_test` to a
# torch tensor, so it must be reloaded first — confirm the intended run order.
df_test = df_test.drop(columns=['Start'], errors='ignore')

In [4504]:
imputed_dataset_final_seq2seq = pd.read_csv("../Data/Imputed/" + imputation + "_percent/Seq2Seq/" + dataset + "_9-19.csv")
imputed_data_mean = pd.read_csv("../Data/Imputed/" + imputation + "_percent/Mean/" + dataset + "9_19.csv")
imputed_data_knn = pd.read_csv("../Data/Imputed/" + imputation + "_percent/Knn/" + dataset + "_9_19.csv")
imputed_dataset_final_gan = pd.read_csv("../Data/Imputed/" + imputation + "_percent/Gan/" + dataset + "_9-19.csv")

In [None]:
compare_predicted_dataset(torch.tensor(df_test.values), torch.tensor(imputed_dataset_final_seq2seq.values), torch.tensor(imputed_data_knn.values), torch.tensor(imputed_data_mean.values), torch.tensor(imputed_dataset_final_gan.values), imputed_dataset_final_vae, 720, 960, 'Results/test_caltech_50_9_19.png', 1, 'Caltech')