In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import matplotlib.pyplot as mlt
import seaborn as sp
from torch.autograd import Variable
from torch import autograd
from datetime import datetime
import matplotlib.pyplot as plt
from datetime import timedelta
import sys
import os
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_squared_log_error, r2_score
from scipy.stats import ks_2samp
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.tsa.seasonal import seasonal_decompose
import json
from types import SimpleNamespace
from math import floor

In [None]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [None]:
# Names of the feature columns, in the order used to label scaled arrays and
# model outputs throughout this notebook.
# A wider variant additionally listed 'Seasonal', 'Trend' and 'Residual'.
columns = [
    'Day of week',
    'Energy',
    'Week Day',
    'Year',
    'Month',
    'Day of month',
    'Hour',
    'Sum',
]

In [None]:
# Execute the project's helper notebooks inside this kernel. Presumably they
# provide the pre/post-processing helpers, the models, the train/test wrappers
# and the plotting/validation utilities called further down — confirm against
# each notebook.
# NOTE(review): every path is relative, so this cell depends on the kernel's
# current working directory.
%run ../Pre_process/Data_preprocess.ipynb
%run ../Pre_process/Data_postprocess.ipynb
%run Model/ResiDualNet.ipynb
%run Model/ConvGan.ipynb
%run Model/AutoEncoder.ipynb
%run Model/Mean_imputation.ipynb
%run Model/KNN_imputer.ipynb
%run train.ipynb
%run wrapper.ipynb
%run helper.ipynb
%run ../visualize.ipynb
%run test.ipynb
%run ../validation.ipynb

In [None]:
# Ask PyTorch to release its cached, currently unused GPU memory before the
# models are built.
torch.cuda.empty_cache()

In [None]:
#pre_process_dataset("Data/Raw/boulder_2021.csv", 'boulder')
#pre_process_dataset("../Data/Raw/boulder_2021.csv", 'boulder2')

In [None]:
# Parse the dataset configuration; every JSON object becomes a SimpleNamespace
# so nested settings can be read with attribute access.
with open("config/config_data.json") as json_data:
    data = json.load(json_data, object_hook=lambda d: SimpleNamespace(**d))

# One namespace per dataset entry in the configuration file.
parameters_acn = data.acn
parameters_boulder = data.boulder
parameters_paloalto = data.paloalto
parameters_sap = data.sap
parameters_perth = data.perth
parameters_dundee = data.dundee
parameters_caltech = data.caltech
parameters_jpl = data.jpl
parameters_office = data.office

In [None]:
# Parse the model configuration the same way as the data configuration
# (JSON objects -> SimpleNamespace).
with open("config/config_model.json") as json_data:
    data = json.load(json_data, object_hook=lambda d: SimpleNamespace(**d))

# NOTE(review): the "impute_40" entry is selected while `missing_ratio` is set
# to 0.50 further down — confirm this is the intended hyper-parameter set.
parameters_seq2seq = data.impute_40.seq2seq
# Attach the runtime device so the model code can read it from the same object.
parameters_seq2seq.device = device

In [None]:
# Select the model hyper-parameters and the dataset configuration used for
# the rest of the run.
parameter_model = parameters_seq2seq
parameter_data = parameters_office

In [None]:
# Fraction of rows to mark as missing; it sizes the random index samples
# drawn below.
missing_ratio = 0.50

In [None]:
# Load the processed office data and derive the two extra features.
raw_office = pd.read_csv("../Data/Processed/office_data_with_zero.csv")
start_ts = pd.to_datetime(raw_office['Start'])
raw_office['Hour'] = start_ts.dt.hour
# 'Sum' is the running total of 'Energy' within each calendar day.
raw_office['Sum'] = raw_office.groupby(start_ts.dt.date)['Energy'].cumsum()

# Restrict to the study window. The comparison is done on the raw 'Start'
# strings, exactly as stored in the CSV.
in_window = (raw_office['Start'] >= "2018-10-01 00:00:00") & (raw_office['Start'] <= "2020-02-29 23:00:00")
df = raw_office.copy().loc[in_window].reset_index(drop=True)

# Index the frame by timestamp (needed by the seasonal decomposition below).
df['Start'] = pd.to_datetime(df['Start'])
df = df.set_index('Start')

In [None]:
# Sanity check: preview the last rows of the filtered frame.
df.tail()

In [None]:
# (rows, columns) of the filtered frame.
df.shape

In [None]:
# Additive seasonal decomposition of the 'Energy' series (the frame is indexed
# by 'Start' at this point).
result = seasonal_decompose(df['Energy'], model='additive', extrapolate_trend='freq')

In [None]:
# Store the three components as extra columns.
# NOTE(review): these names are not in the active `columns` list (only in its
# commented-out variant) — confirm they are meant to reach the scaling step.
df['Seasonal'] = result.seasonal
df['Trend'] = result.trend
df['Residual'] = result.resid

In [None]:
# Turn the 'Start' index back into a regular column (in place).
df.reset_index(inplace=True)

In [None]:
# Keep the first column (the restored 'Start' column) aside so it can be
# re-attached after scaling.
first_column = df.iloc[:, 0]

In [None]:
# Min-max scale the model features to [0, 1].
# Only the columns named in `columns` are scaled. Selecting "everything after
# the first column" would also pick up the 'Seasonal'/'Trend'/'Residual'
# columns added above, and the resulting array could no longer be labelled
# with `columns` (column-count mismatch).
# NOTE(review): the scaler is fitted on the full frame (train and test rows
# together) — confirm this is acceptable for the evaluation.
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_values = scaler.fit_transform(df[columns])
scaled_frame = pd.DataFrame(scaled_values, columns=columns)
# Re-attach the unscaled timestamp column in front of the scaled features.
df = pd.concat([first_column, scaled_frame], axis=1)

In [None]:
# input_size = df.shape[1]
# hidden_size = input_size * 2

In [None]:
# Training split: rows whose timestamp lies inside the configured training
# window (both ends inclusive), without the timestamp column.
in_train_window = df['Start'].between(parameter_data.train.start, parameter_data.train.end)
df_train = df.loc[in_train_window].reset_index(drop=True).drop(columns=['Start'])

In [None]:
# Test split: rows whose timestamp lies inside the configured test window
# (both ends inclusive), without the timestamp column.
in_test_window = df['Start'].between(parameter_data.test.start, parameter_data.test.end)
df_test = df.loc[in_test_window].reset_index(drop=True).drop(columns=['Start'])

In [None]:
# Preview the last rows of the scaled frame.
df.tail()

In [None]:
# df_train = df.copy().loc[(df['Start'] >= parameter_data.train.start)].reset_index(drop=True)
# train_ratio = df.copy().loc[(df['Start'] >= parameter_data.train.start) & (df['Start'] <= parameter_data.train.end)].reset_index(drop=True).shape[0]
# Drop the timestamp column in place.
# NOTE(review): not idempotent — re-running this cell raises a KeyError
# because 'Start' is already gone.
df.drop(columns=['Start'], inplace=True)
# train_ratio = round(train_ratio / df.shape[0], 2)

In [None]:
# scaler = MinMaxScaler(feature_range=(0,1))
# df = scaler.fit_transform(df.iloc[:, 1:])
# df = pd.DataFrame(df, columns=columns)
#df = pd.DataFrame(df, columns=['Energy'])

In [None]:
# Pick the training rows to mask: a seeded (0) sample, without replacement,
# of `missing_ratio` of the row labels.
np.random.seed(0)
ratio = round(missing_ratio * len(df_train))
random_row_indices_train = np.random.choice(df_train.index, size=ratio, replace=False)

In [None]:
# Same procedure for the test split. The seed is reset so this draw does not
# depend on the previous cell having been run.
np.random.seed(0)
ratio = round(missing_ratio * len(df_test))
random_row_indices_test = np.random.choice(df_test.index, size=ratio, replace=False)

In [None]:
# Sort the test indices in place (the training indices stay in draw order).
random_row_indices_test.sort()

In [None]:
# Build the training arrays with a helper that is not defined in this
# notebook. Judging by the names it returns the data with values removed, the
# ground truth and the missing-value mask — TODO confirm against its definition.
missing_train, real_train, mask_train = get_train_test_dataset_imputation(df_train, 0, parameter_model.lag_size, random_row_indices_train)

In [None]:
# Same for the test split.
missing_test, real_test, mask_test = get_train_test_dataset_imputation(df_test, 0, parameter_model.lag_size, random_row_indices_test)

In [None]:
# Number of full batches per epoch. The batch size is read from the model
# configuration (as the test section does) instead of a bare `batch_size`
# name that this notebook never defines.
step_per_epoch = len(missing_train) // parameter_model.batch_size

# Converting to tensor
# `torch.as_tensor` accepts both NumPy arrays and tensors, so the cell can be
# re-run without failing on values that were already converted.
real_train = torch.as_tensor(real_train, dtype=torch.float32).to(device)
missing_train = torch.as_tensor(missing_train, dtype=torch.float32).to(device)
mask_train = torch.as_tensor(mask_train, dtype=torch.float32).to(device)

# GAN Model

In [None]:
# Learning rate shared by the GAN, Conv-GAN and auto-encoder optimizers below.
learning_rate = 0.001

In [None]:
# Build the GAN pair and its optimizers (SGD for the discriminator, Adam for
# the generator).
# NOTE(review): `input_size` and `hidden_size` are only defined in a
# commented-out cell of this notebook — confirm they come from a %run notebook.
discriminator = Discriminator(input_size, hidden_size, 1).to(device)
generator = Generator(input_size, hidden_size, input_size).to(device)
optimizer_discriminator = torch.optim.SGD(discriminator.parameters(), lr = learning_rate)
optimizer_generator = torch.optim.Adam(generator.parameters(), lr = learning_rate)

In [None]:
# Both losses are passed to `train_Gan` in the next cell.
loss_function_MSE = nn.MSELoss()
loss_function = nn.BCELoss()

In [None]:
# Train the GAN with a helper defined outside this notebook; it returns the
# real and generated data plus the per-step generator/discriminator errors.
real_dataset, gen_dataset, errors_generator, errors_discriminator = train_Gan(generator, discriminator, optimizer_discriminator, optimizer_generator, loss_function, loss_function_MSE, real_train, missing_train, mask_train, step_per_epoch)

In [None]:
# Discriminator and generator loss curves from GAN training.
mlt.suptitle('Loss')
for loss_curve, curve_label in (
    (errors_discriminator, 'd_loss'),
    (errors_generator, 'g_loss'),
):
    mlt.plot(loss_curve, label=curve_label)
mlt.legend()
#mlt.savefig('foo1.png')
mlt.show()

In [None]:
# test_gen_dataset = gen_dataset
tr = real_dataset[:200, -1, 1].view(-1)
te = gen_dataset[:200, -1, 1].view(-1)
mlt.figure(figsize=(20, 6))
mlt.suptitle('Gan prediction on training dataset')
plt.ylabel('Energy Consumption in Kwh')
mlt.plot(tr.detach().cpu().numpy(), label='real')
mlt.plot(te.detach().cpu().numpy(), label='gen')
mlt.legend()
#mlt.savefig('./Results/train_res_impute.png')
mlt.show()

In [None]:
# Two-sample Kolmogorov-Smirnov test between the real and generated training
# values selected in the previous cell.
ks_statistic, p_value = ks_2samp(tr.detach().cpu().numpy(), te.detach().cpu().numpy())

# Print the results
print("KS Statistic:", ks_statistic)
print("P-value:", p_value)

In [None]:
# Switch the generator to evaluation mode.
generator.eval()

In [None]:
mean = 0  # Mean of the distribution
std_dev = 1  # Standard deviation of the distribution

# Generate random data from a normal distribution
# NOTE(review): bare `batch_size` and `lag_size` are not defined in this
# notebook (elsewhere they are read from `parameter_model`) — confirm.
random_data = np.random.normal(loc=mean, scale=std_dev, size=(batch_size, lag_size, 1))
#random_data = np.clip(random_data, 0, 1)
random_data = torch.tensor(random_data,dtype=torch.float32, requires_grad=True).to(device)

In [None]:
# Convert the test arrays to float tensors on the selected device.
# NOTE(review): `torch.from_numpy` only accepts NumPy arrays, so this cell
# cannot be re-run once the names hold tensors.
real_test = torch.from_numpy(real_test).float().to(device)
missing_test = torch.from_numpy(missing_test).float().to(device)
mask_test = torch.from_numpy(mask_test).float().to(device)

In [None]:
# Batch helpers defined outside this notebook.
real_data, real_label = gen_real_batch(real_test.shape[0], 0, real_test)
z_input, mask_input = gen_z_input(batch_size, 0, missing_test, mask_test)

In [None]:
# Add the Gaussian noise to the generator input (the trailing dimension of
# size 1 in `random_data` broadcasts over the features).
input_test = z_input + random_data

In [None]:
# Generator output for the noisy test input.
ttt = generator(input_test)

In [None]:
# Last time step, feature index 0 of the generator input and output.
# NOTE(review): the series labelled 'real' is taken from `z_input` (built from
# `missing_test`), not from `real_data`, and feature index 0 is used here while
# the training plot uses index 1 — confirm both are intended.
tr2 = z_input[:, -1, 0].view(-1)
te2 = ttt[:, -1, 0].view(-1)
# for i in range(len(te)):
#     te[i] = max(0, te[i])
#     if te[i] > 0:
#         te[i] = te[i] + 0.2
mlt.figure(figsize=(20, 6))
mlt.suptitle('Gan prediction on testing dataset')
mlt.plot(tr2.detach().cpu().numpy(), label='real')
mlt.plot(te2.detach().cpu().numpy(), label='gen')
plt.ylabel('Energy Consumption in Kwh')
mlt.legend()
#mlt.savefig('./Results/test_res_inpute.png')
mlt.show()

In [None]:
# Error metrics between the generator input (`tr2`) and output (`te2`)
# selected in the previous cell.
y_true = tr2.detach().cpu().numpy()
y_pred = te2.detach().cpu().numpy()

# Mean Squared Error (MSE)
mse = mean_squared_error(y_true, y_pred)

# The `squared=False` argument of `mean_squared_error` was deprecated and then
# removed in recent scikit-learn releases; taking the square root of the MSE
# gives the same RMSE on every version.
RMSE = np.sqrt(mse)
print(f'RMSE:{RMSE}')

mae = mean_absolute_error(y_true, y_pred)
print("MAE:", mae)

print("MSE:", mse)

In [None]:
# Error metrics on the GAN *training* output: `tr`/`te` were sliced from the
# datasets returned by `train_Gan`, so the label says "Train", not "Test".
y_true = tr.detach().cpu().numpy()
y_pred = te.detach().cpu().numpy()

# Mean Squared Error (MSE)
mse = mean_squared_error(y_true, y_pred)

# The `squared=False` argument of `mean_squared_error` was deprecated and then
# removed in recent scikit-learn releases; derive the RMSE from the MSE.
RMSE = np.sqrt(mse)
print(f'Train dataset RMSE:{RMSE}')

mae = mean_absolute_error(y_true, y_pred)
print("MAE:", mae)

print("MSE:", mse)

In [None]:
# Two-sample Kolmogorov-Smirnov test between the `tr2` and `te2` series.
ks_statistic, p_value = ks_2samp(tr2.detach().cpu().numpy(), te2.detach().cpu().numpy())

# Print the results
print("KS Statistic:", ks_statistic)
print("P-value:", p_value)

In [None]:
# Significance level for the KS test decision below.
alpha = 0.05  
if p_value < alpha:
    print("Reject the null hypothesis: The samples come from different distributions.")
else:
    print("Fail to reject the null hypothesis: There is no significant difference between the samples.")

# ResiDualNet

Training------------------------------------------------------------------------------------------------------------------------------

In [None]:
# Build the ResiDualNet model from the selected hyper-parameters, with an Adam
# optimizer (L2 weight decay 0.005) and an MSE loss.
model = ResiDualNet(parameter_model).to(device)
#model = Seq2SeqAttention(input_size, hidden_size, input_size).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr = parameter_model.learning_rate, weight_decay = 0.005)
loss_function_seq2seq = nn.MSELoss()

In [None]:
# Training wrapper and helper objects (defined outside this notebook), both
# configured with the same hyper-parameters.
wrapper = ModelTrain(parameter_model)
helper = ModelHelper(parameter_model)

In [None]:
# Train the model. `errors_generator` holds the loss history plotted in the
# next cell.
real_dataset, gen_dataset, errors_generator, mask_data = wrapper.train_Seq2Seq(model, optimizer, loss_function_seq2seq, real_train, missing_train, mask_train, step_per_epoch, helper)

In [None]:
# Training loss of the ResiDualNet model. The curve is the loss history
# returned by `train_Seq2Seq`, so it is labelled as a training loss rather
# than as a discriminator loss.
mlt.suptitle('Loss')
mlt.plot(errors_generator, label='train_loss')
mlt.legend()
#mlt.savefig('foo1.png')
mlt.show()

In [None]:
# Plot real vs. generated training data with plotting helpers defined outside
# this notebook. Presumably the two integers are the start/end of the plotted
# range and the string is the output image path — TODO confirm.
plot_imputation_results_two(real_dataset.detach().cpu().numpy(), gen_dataset.detach().cpu().numpy(), mask_data.detach().cpu().numpy(), 300, 450, 'Results/test21.png')

In [None]:
# Presumably the trailing 1 selects the feature to plot (index 1 is 'Energy'
# in `columns`) — TODO confirm.
plot_full_dataset(real_dataset.detach().cpu().numpy(), gen_dataset.detach().cpu().numpy(), 000, 30000, 'Results/test1.png', 1)

In [None]:
# Map the real training data back to the original units: label the array with
# the feature names, undo the min-max scaling, and wrap the result in a tensor.
real_dataset_train_seq2seq = pd.DataFrame(real_dataset.detach().cpu().numpy(), columns=columns)
real_dataset_train_seq2seq = scaler.inverse_transform(real_dataset_train_seq2seq)
real_dataset_train_seq2seq = torch.tensor(real_dataset_train_seq2seq)
#real_dataset_train_seq2seq = pd.DataFrame(real_dataset_train_seq2seq.detach().cpu().numpy(), columns=columns)

In [None]:
# Same inverse scaling for the generated training data.
gen_dataset_train_seq2seq = pd.DataFrame(gen_dataset.detach().cpu().numpy(), columns=columns)
gen_dataset_train_seq2seq = scaler.inverse_transform(gen_dataset_train_seq2seq)
gen_dataset_train_seq2seq = torch.tensor(gen_dataset_train_seq2seq)
#gen_dataset_train_seq2seq = pd.DataFrame(gen_dataset_train_seq2seq.detach().cpu().numpy(), columns=columns)

Testing-------------------------------------------------------------------------------------------------------------------------------

In [None]:
def _trim_to_full_batches(data, batch_size):
    """Return the leading rows of ``data`` that fill whole batches.

    Slicing with ``data[: -(n % batch_size)]`` turns into ``data[:0]`` (an
    empty result) when ``n`` is an exact multiple of ``batch_size``; computing
    the number of usable rows explicitly avoids that edge case.
    """
    usable_rows = data.shape[0] - (data.shape[0] % batch_size)
    return data[:usable_rows]


# Drop the trailing rows that do not fill a complete batch.
real_test = _trim_to_full_batches(real_test, parameter_model.batch_size)
missing_test = _trim_to_full_batches(missing_test, parameter_model.batch_size)
mask_test = _trim_to_full_batches(mask_test, parameter_model.batch_size)

In [None]:
# Number of full batches in the trimmed test data. This overwrites the
# training value of `step_per_epoch`.
step_per_epoch = len(missing_test) // parameter_model.batch_size

In [None]:
# tr = df['Energy']
# mlt.figure(figsize=(20, 6))
# mlt.suptitle('Gan prediction on test dataset')
# mlt.ylabel('Energy Consumption in Kwh')
# mlt.plot(tr, label='real')
# mlt.legend()
# mlt.show()

In [None]:
# Make sure the test data are float tensors on the selected device.
# `torch.from_numpy` raises a TypeError when given a tensor, and these names
# are already tensors if the GAN test cells above were run; `torch.as_tensor`
# accepts both NumPy arrays and tensors.
real_test = torch.as_tensor(real_test, dtype=torch.float32).to(device)
missing_test = torch.as_tensor(missing_test, dtype=torch.float32).to(device)
mask_test = torch.as_tensor(mask_test, dtype=torch.float32).to(device)

In [None]:
# Test wrapper (defined outside this notebook). `parameters_seq2seq` is the
# same object as `parameter_model`.
wrapper_test = ModelTest(parameters_seq2seq)

In [None]:
# Run the trained model on the test data. Judging by the names, the call
# returns the real data, the model output, the loss history and the mask —
# TODO confirm against `test_model`.
real_dataset_test_seq2seq, imputed_dataset, loss, mask_test_result = wrapper_test.test_model(model, real_test, missing_test, mask_test, loss_function_seq2seq, step_per_epoch, helper)

In [None]:
# Blend: keep the real value where the mask is 0 and take the model output
# where the mask is 1.
imputed_dataset_temp = ((1 - mask_test_result) * real_dataset_test_seq2seq) + (mask_test_result * imputed_dataset)

In [None]:
# Undo the min-max scaling of the blended data (NumPy result).
imputed_dataset_temp2 = scaler.inverse_transform(imputed_dataset_temp.detach().cpu().numpy())

In [None]:
# Undo the min-max scaling of the real test data (NumPy result).
real_dataset_test_seq2seq_temp = scaler.inverse_transform(real_dataset_test_seq2seq.detach().cpu().numpy())

In [None]:
# Inspect the first de-scaled row.
real_dataset_test_seq2seq_temp[0]

In [None]:
# Base name of the output CSV file.
dataset_name = 'office_final'

In [None]:
# Persist the blended result with a helper defined outside this notebook.
# NOTE(review): the "50_percent" directory is hard-coded; it matches
# `missing_ratio = 0.50` only as long as that value is not changed.
save_imputed_data(real_dataset_test_seq2seq, imputed_dataset_temp, "../Data/Imputed/50_percent/Seq2Seq/" + dataset_name + ".csv", scaler, columns)

In [None]:
# Map the real test data back to the original units and rebind the same name
# to the resulting tensor.
# NOTE(review): the name is overwritten with de-scaled data, so re-running
# this cell applies the inverse transform a second time.
real_dataset_test_seq2seq = pd.DataFrame(real_dataset_test_seq2seq.detach().cpu().numpy(), columns=columns)
real_dataset_test_seq2seq = scaler.inverse_transform(real_dataset_test_seq2seq)
real_dataset_test_seq2seq = torch.tensor(real_dataset_test_seq2seq)
#real_dataset_test_seq2seq = pd.DataFrame(real_dataset_test_seq2seq.detach().cpu().numpy(), columns=columns)

In [None]:
# Same inverse scaling for the raw model output (same re-run caveat).
imputed_dataset = pd.DataFrame(imputed_dataset.detach().cpu().numpy(), columns=columns)
imputed_dataset = scaler.inverse_transform(imputed_dataset)
imputed_dataset = torch.tensor(imputed_dataset)
#imputed_dataset = pd.DataFrame(imputed_dataset.detach().cpu().numpy(), columns=columns)

In [None]:
# Row positions where the mask of feature index 1 ('Energy' in `columns`)
# equals 1, as a flat CPU tensor.
indices = torch.nonzero(mask_test_result[:, 1] == 1).view(-1).to('cpu')

In [None]:
def set_null_at_indices(lst, indices):
    """Keep the values of ``lst`` at the given positions and blank out the rest.

    Despite the name, the positions listed in ``indices`` are the ones that
    are *kept*; every other position is replaced by ``np.nan``.

    Args:
        lst: Sequence of values.
        indices: Positions to keep. May be a list/tuple/range or any object
            with a ``tolist()`` method (e.g. a NumPy array or torch tensor).

    Returns:
        A new list with the same length as ``lst``.
    """
    # Convert once to a set of plain Python numbers: membership tests against
    # a tensor or list scan it linearly, which makes the comprehension
    # quadratic on long series.
    positions = indices.tolist() if hasattr(indices, "tolist") else list(indices)
    if not isinstance(positions, list):
        # A 0-d array/tensor yields a bare scalar from tolist().
        positions = [positions]
    keep = set(positions)
    return [val if i in keep else np.nan for i, val in enumerate(lst)]

# usage
# `imputed_dataset` is a torch tensor at this point (rebuilt with
# `torch.tensor` after the inverse scaling), and tensors cannot be indexed by
# column name. Select the 'Energy' column by its position in `columns`.
energy_col = columns.index('Energy')
imputed_dataset_list = imputed_dataset[:, energy_col].tolist()
# Keep only the imputed positions; everything else becomes NaN.
result_seq2seq = set_null_at_indices(imputed_dataset_list, indices)


In [None]:
# Plot de-scaled real vs. model output on the test data (plotting helper
# defined outside this notebook).
plot_full_dataset(real_dataset_test_seq2seq.detach().cpu().numpy(), imputed_dataset.detach().cpu().numpy(), 0, 10000, 'Results/test3.png', 1)

In [None]:
# Loss history returned by `test_model`.
mlt.suptitle('Loss')
#mlt.plot(errors_generator, label='train_loss')
mlt.plot(loss[:], label='test_loss')
mlt.legend()
#mlt.savefig('foo1.png')
mlt.show()

Evaluation----------------------------------------------------------------------------------------------------------------

In [None]:
# Row positions where the mask of feature index 1 equals 1 (recomputed so the
# evaluation cells do not depend on the earlier cell).
indices = torch.nonzero(mask_test_result[:, 1] == 1).view(-1).to('cpu')

In [None]:
indices

In [None]:
# Score the model on the masked positions of feature index 1 only
# (validation helper defined outside this notebook).
validation_matrix_forecasting(real_dataset_test_seq2seq[indices, 1], imputed_dataset[indices, 1], 1)

In [None]:
# Score using the full arrays plus the mask.
validation_matrix_imputation(real_dataset_test_seq2seq.numpy(), imputed_dataset.numpy(), mask_test_result.detach().cpu().numpy(), 1)

In [None]:
# Two-sample KS test between the real and model-output values of feature
# index 1 over all test rows.
ks_statistic, p_value = ks_2samp(real_dataset_test_seq2seq[:, 1], imputed_dataset[:, 1])

# Print the results
print("KS Statistic:", ks_statistic)
print("P-value:", p_value)

# Conv Gan

In [None]:
mean = 0  # Mean of the distribution
std_dev = 1  # Standard deviation of the distribution

# Generate random data from a normal distribution
# The noise has the same shape as `real_train`.
random_data = np.random.normal(loc=mean, scale=std_dev, size=(real_train.size()))
#random_data = np.clip(random_data, 0, 1)
random_data = torch.tensor(random_data,dtype=torch.float32, requires_grad=True).to(device)

In [None]:
# Build the convolutional GAN pair; both networks use RMSprop.
# NOTE(review): `input_size` and `hidden_size` are only defined in a
# commented-out cell of this notebook — confirm they come from a %run notebook.
loss_function = nn.BCELoss()
generator = ConvGenerator(input_size, hidden_size, input_size).to(device)
discriminator = ConvDiscriminator(input_size, hidden_size).to(device)
optimizer_discriminator = torch.optim.RMSprop(discriminator.parameters(), lr = learning_rate)
optimizer_generator = torch.optim.RMSprop(generator.parameters(), lr = learning_rate)

In [None]:
# Train the Conv-GAN with a helper defined outside this notebook.
# NOTE(review): `step_per_epoch` holds the test value at this point if the
# ResiDualNet test section was run before — confirm.
real_dataset, gen_dataset, errors_generator, errors_discriminator, mask_results = train_ConvGan(generator, discriminator, optimizer_discriminator, optimizer_generator, loss_function, real_train, missing_train, mask_train, step_per_epoch, random_data)

In [None]:
# Plot the Conv-GAN training result (plotting helper defined elsewhere).
plot_imputation_results(real_dataset, gen_dataset, mask_results,100,300)

In [None]:
# NOTE(review): other calls to `plot_full_dataset` in this notebook pass an
# output path as the fifth argument and a feature index as the sixth; here
# only five arguments are given — confirm the call matches the helper.
plot_full_dataset(real_dataset, gen_dataset, 300, 600, 3)

In [None]:
# KS test on time step 7, feature index 0 of the real and generated data.
tr = real_dataset[:, 7, 0]
te = gen_dataset[:, 7, 0]
ks_statistic, p_value = ks_2samp(tr.detach().cpu().numpy(), te.detach().cpu().numpy())

# Print the results
print("KS Statistic:", ks_statistic)
print("P-value:", p_value)

In [None]:
# Discriminator and generator loss curves from Conv-GAN training.
mlt.suptitle('Loss')
for loss_curve, curve_label in (
    (errors_discriminator, 'd_loss'),
    (errors_generator, 'g_loss'),
):
    mlt.plot(loss_curve, label=curve_label)
mlt.legend()
#mlt.savefig('foo1.png')
mlt.show()

In [None]:
# Error metrics between the real (`tr`) and generated (`te`) Conv-GAN
# training series selected above.
y_true = tr.detach().cpu().numpy()
y_pred = te.detach().cpu().numpy()

# Mean Squared Error (MSE)
mse = mean_squared_error(y_true, y_pred)

# The `squared=False` argument of `mean_squared_error` was deprecated and then
# removed in recent scikit-learn releases; derive the RMSE from the MSE.
RMSE = np.sqrt(mse)
print(f'RMSE:{RMSE}')

mae = mean_absolute_error(y_true, y_pred)
print("MAE:", mae)

print("MSE:", mse)

In [None]:
# Standard-normal noise with one entry per test window, time step and feature.
# NOTE(review): bare `lag_size` and `input_size` are not defined in this
# notebook — confirm they come from a %run notebook.
random_noise_test = torch.tensor(np.random.randn(real_test.shape[0], lag_size, input_size), dtype=torch.float32, requires_grad=True).to(device)

In [None]:
# Make sure `real_test` is a float tensor on the selected device.
# `torch.from_numpy` raises a TypeError on tensors, and `real_test` is already
# a tensor if an earlier section converted it; `torch.as_tensor` accepts both
# NumPy arrays and tensors.
real_test = torch.as_tensor(real_test, dtype=torch.float32).to(device)

In [None]:
# Switch the Conv-GAN generator to evaluation mode.
generator.eval()

In [None]:
# Batch helper defined outside this notebook.
real_data_test_conv, real_label_test_conv = gen_real_batch(real_test.shape[0], 0, real_test)

In [None]:
# Generator output for the test batch and the test noise.
test_res = generator(real_data_test_conv, random_noise_test)

In [None]:
# KS test on the last time step, feature index 0 of the real and generated
# test data.
tr_test = real_data_test_conv[:, -1, 0]
te_test = test_res[:, -1, 0]
ks_statistic_test, p_value_test = ks_2samp(tr_test.detach().cpu().numpy(), te_test.detach().cpu().numpy())

# Print the results
print("KS Statistic:", ks_statistic_test)
print("P-value:", p_value_test)

In [None]:
# NOTE(review): other calls to `plot_full_dataset` in this notebook pass an
# output path as the fifth argument; here it is -1 — confirm the call matches
# the helper.
plot_full_dataset(real_data_test_conv, test_res, 300, 600, -1)

In [None]:
# Error metrics between the real (`tr_test`) and generated (`te_test`)
# Conv-GAN test series selected above.
y_true = tr_test.detach().cpu().numpy()
y_pred = te_test.detach().cpu().numpy()

# Mean Squared Error (MSE)
mse = mean_squared_error(y_true, y_pred)

# The `squared=False` argument of `mean_squared_error` was deprecated and then
# removed in recent scikit-learn releases; derive the RMSE from the MSE.
RMSE = np.sqrt(mse)
print(f'RMSE:{RMSE}')

mae = mean_absolute_error(y_true, y_pred)
print("MAE:", mae)

print("MSE:", mse)

# Auto Encoder Model

In [None]:
# Build the auto-encoder with an Adam optimizer and an MSE loss.
# NOTE(review): this rebinds `model` and `optimizer`, replacing the
# ResiDualNet objects; the meaning of the literal 14 is not visible here
# (presumably a latent size — confirm against the VAE definition).
model = VAE(input_size, hidden_size, 14).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr = learning_rate)
loss_function_autoencoder = nn.MSELoss()

In [None]:
# Train with a helper defined outside this notebook.
real_dataset, gen_dataset, errors_generator, mask_data = train_autoEncoder(model, optimizer, loss_function_autoencoder, real_train, missing_train, mask_train, step_per_epoch)

In [None]:
# Training loss of the auto-encoder. The curve is the loss history returned by
# `train_autoEncoder`, so it is labelled as a training loss rather than as a
# discriminator loss.
mlt.suptitle('Loss')
mlt.plot(errors_generator, label='train_loss')
mlt.legend()
#mlt.savefig('foo1.png')
mlt.show()

In [None]:
# Plot the last time step of every window, real vs. reconstructed.
# NOTE(review): other calls pass a sixth (feature index) argument; here it is
# omitted — confirm the helper has a default for it.
plot_full_dataset(real_dataset[:, -1, :].detach().cpu().numpy(), gen_dataset[:, -1, :].detach().cpu().numpy(), 200, 600, 'Results/test1.png')

# Mean Imputation

In [None]:
# Fraction of test rows to mask for the mean-imputation baseline.
missing_ratio = 0.30

# Re-read the dataset configuration (JSON objects -> SimpleNamespace) so this
# section can be run on its own.
with open("config/config_data.json") as json_data:
    data = json.load(json_data, object_hook=lambda d: SimpleNamespace(**d))

# One namespace per dataset entry in the configuration file.
parameters_acn = data.acn
parameters_boulder = data.boulder
parameters_paloalto = data.paloalto
parameters_sap = data.sap
parameters_perth = data.perth
parameters_dundee = data.dundee
parameters_caltech = data.caltech
parameters_jpl = data.jpl
parameters_office = data.office

# Dataset used by this section.
parameter_data = parameters_office

In [None]:
# Dataset key used to build the input and output CSV paths in this section.
dataset_name = "office"

In [None]:
# Load the processed data and derive the two extra features.
df = pd.read_csv("../Data/Processed/" + dataset_name + "_data_with_zero.csv")
start_ts = pd.to_datetime(df['Start'])  # parse the timestamps once
df['Hour'] = start_ts.dt.hour
# 'Sum' is the running total of 'Energy' within each calendar day.
df['Sum'] = df.groupby(start_ts.dt.date)['Energy'].cumsum()
# Upper bound on the raw 'Start' strings, as stored in the CSV.
df = df.copy().loc[df['Start'] <= "2020-12-31 23:00:00"].reset_index(drop=True)

# Test split: rows inside the configured test window (both ends inclusive).
df_test = df.copy().loc[(df['Start'] >= parameter_data.test.start) & (df['Start'] <= parameter_data.test.end)].reset_index(drop=True)

# df_test['Start'] = pd.to_datetime(df_test['Start'])
# df_test.set_index('Start', inplace=True)
# result = seasonal_decompose(df_test['Energy'], model='additive', extrapolate_trend='freq')
# df_test['Seasonal'] = result.seasonal
# df_test['Trend'] = result.trend
# df_test['Residual'] = result.resid
# df_test.reset_index(inplace=True)

df_test.drop(columns=['Start'], inplace=True)

# Seeded sample (without replacement) of the rows to mask.
np.random.seed(0)
ratio = round(missing_ratio * len(df_test))
random_row_indices_test = np.random.choice(df_test.index, size=ratio, replace=False)

# Helper defined outside this notebook; from its use below it returns a
# DataFrame with NaN at the removed positions — TODO confirm.
missing_data = random_index_noise(df_test.copy(), random_row_indices_test)

# 1 where a value is missing, 0 elsewhere. An explicit integer cast replaces
# the bool->int `replace`, which relies on silent downcasting that pandas has
# deprecated.
mask = missing_data.isna().astype(int)

In [None]:
# Mean-imputation baseline (class defined outside this notebook).
# NOTE(review): rebinds `model`, replacing any previously trained network.
model = MeanImputation()
imputed_data_mean = model(missing_data, 'Energy')

In [None]:
# Convert the imputed data, the ground truth and the mask to tensors.
# NOTE(review): `df_test` is rebound from a DataFrame to a tensor here.
imputed_data_mean = torch.tensor(imputed_data_mean.values)
df_test = torch.tensor(df_test.values)
mask = torch.tensor(mask.values)

In [None]:
# Blend: real value where the mask is 0, imputed value where the mask is 1.
imputed_data_mean = ((1 - mask) * df_test) + (mask * imputed_data_mean)

In [None]:
# Row positions where the mask of feature index 1 equals 1.
# NOTE(review): named `indices_knn` although this is the mean-imputation
# section; the KNN section later overwrites it.
indices_knn = torch.nonzero(mask[:, 1] == 1).view(-1).to('cpu')

In [None]:
# Plot real vs. mean-imputed data (plotting helper defined elsewhere).
plot_full_dataset(df_test.detach().cpu().numpy(), imputed_data_mean.detach().cpu().numpy(), 000, 500, 'Results/test3.png', 1)

In [None]:
# Sort the masked row indices in place.
random_row_indices_test.sort()

In [None]:
# Number of masked rows.
random_row_indices_test.shape

In [None]:
# Score the baseline on the masked positions of feature index 1 only.
validation_matrix_forecasting(df_test[indices_knn, 1], imputed_data_mean[indices_knn, 1], 1)

In [None]:
# Score using the full arrays plus the mask.
validation_matrix_imputation(df_test.detach().cpu().numpy(), imputed_data_mean.detach().cpu().numpy(), mask.detach().cpu().numpy(), 1)

In [None]:
# Label the blended tensor with the feature names.
imputed_data = pd.DataFrame(imputed_data_mean, columns=columns)

In [None]:
# Rebuild a per-day timestamp from Year / Month / Day of month, recompute
# 'Sum' as the within-day running total of 'Energy', then drop the helper
# column again.
imputed_data['Start'] = imputed_data.apply(lambda row: datetime(int(row['Year']), int(row['Month']), int(row['Day of month'])), axis=1)
imputed_data['Sum'] = imputed_data.groupby(pd.to_datetime(imputed_data['Start']).dt.date)['Energy'].cumsum()
imputed_data.drop(columns=['Start'], inplace=True)

In [None]:
# NOTE(review): this section sets `missing_ratio = 0.30`, but the file is
# written into the "40_percent" directory — confirm which one is intended.
imputed_data.to_csv("../Data/Imputed/40_percent/Mean/" + dataset_name + ".csv", index=False)

# KNN Imputer

In [None]:
# Fraction of test rows to mask for the KNN baseline.
missing_ratio = 0.50

In [None]:
# Re-read the dataset configuration (JSON objects -> SimpleNamespace) so this
# section can be run on its own.
with open("config/config_data.json") as json_data:
    data = json.load(json_data, object_hook=lambda d: SimpleNamespace(**d))

# One namespace per dataset entry in the configuration file.
parameters_acn = data.acn
parameters_boulder = data.boulder
parameters_paloalto = data.paloalto
parameters_sap = data.sap
parameters_perth = data.perth
parameters_dundee = data.dundee
parameters_caltech = data.caltech
parameters_jpl = data.jpl
parameters_office = data.office

In [None]:
# Dataset configuration used by the KNN section.
parameter_data = parameters_office

In [None]:
# Dataset key used to build the input and output CSV paths in this section.
dataset_name  = 'office'

In [None]:
# Load the processed data and derive the two extra features.
df = pd.read_csv("../Data/Processed/" + dataset_name + "_data_with_zero.csv")
start_ts = pd.to_datetime(df['Start'])  # parse the timestamps once
df['Hour'] = start_ts.dt.hour
# 'Sum' is the running total of 'Energy' within each calendar day.
df['Sum'] = df.groupby(start_ts.dt.date)['Energy'].cumsum()
# Upper bound on the raw 'Start' strings, as stored in the CSV.
df = df.copy().loc[df['Start'] <= "2020-12-31 23:00:00"].reset_index(drop=True)

# Test split: rows inside the configured test window (both ends inclusive).
df_test = df.copy().loc[(df['Start'] >= parameter_data.test.start) & (df['Start'] <= parameter_data.test.end)].reset_index(drop=True)

# df_test['Start'] = pd.to_datetime(df_test['Start'])
# df_test.set_index('Start', inplace=True)
# result = seasonal_decompose(df_test['Energy'], model='additive', extrapolate_trend='freq')
# df_test['Seasonal'] = result.seasonal
# df_test['Trend'] = result.trend
# df_test['Residual'] = result.resid
# df_test.reset_index(inplace=True)

df_test.drop(columns=['Start'], inplace=True)

# Seeded sample (without replacement) of the rows to mask.
np.random.seed(0)
ratio = round(missing_ratio * len(df_test))
random_row_indices_test = np.random.choice(df_test.index, size=ratio, replace=False)

# Helper defined outside this notebook; from its use below it returns a
# DataFrame with NaN at the removed positions — TODO confirm.
missing_data = random_index_noise(df_test.copy(), random_row_indices_test)

# 1 where a value is missing, 0 elsewhere. An explicit integer cast replaces
# the bool->int `replace`, which relies on silent downcasting that pandas has
# deprecated.
mask = missing_data.isna().astype(int)

In [None]:
# Sort the masked row indices in place.
random_row_indices_test.sort()

In [None]:
random_row_indices_test

In [None]:
# Number of neighbours passed to the KNN imputer. A helper call that would
# search for the value is left commented out.
# k_neighbour = get_optimum_k(df, missing_data)
k_neighbour = 4

In [None]:
# KNN-imputation baseline (class defined outside this notebook).
# NOTE(review): rebinds `model`, replacing any previously built model.
model = KnnImputer()
imputed_data = model(missing_data, k_neighbour)

In [None]:
# Convert the imputed data, the ground truth and the mask to tensors.
# NOTE(review): `df_test` is rebound from a DataFrame to a tensor here.
imputed_data = torch.tensor(imputed_data)
df_test = torch.tensor(df_test.values)
mask = torch.tensor(mask.values)

In [None]:
# Row positions where the mask of feature index 1 equals 1.
indices_knn = torch.nonzero(mask[:, 1] == 1).view(-1).to('cpu')

In [None]:
# `imputed_data` is a torch tensor at this point, and tensors cannot be
# indexed by column name; select the 'Energy' column by its position in
# `columns`. Only the imputed positions are kept, the rest become NaN.
result_knn = set_null_at_indices(imputed_data[:, columns.index('Energy')].tolist(), indices_knn)

In [None]:
# Plot real vs. KNN-imputed data (plotting helper defined elsewhere).
plot_full_dataset(df_test.detach().cpu().numpy(), imputed_data.detach().cpu().numpy(), 000, 100, 'Results/test3.png', 1)

In [None]:
# Score the baseline on the masked positions of feature index 1 only.
validation_matrix_forecasting(df_test[indices_knn, 1], imputed_data[indices_knn, 1], 1)

In [None]:
# Score using the full arrays plus the mask.
validation_matrix_imputation(df_test.detach().cpu().numpy(), imputed_data.detach().cpu().numpy(), mask.detach().cpu().numpy(), 1)

In [None]:
# Label the imputed tensor with the feature names.
imputed_data = pd.DataFrame(imputed_data, columns=columns)

In [None]:
# Rebuild a per-day timestamp from Year / Month / Day of month.
imputed_data['Start'] = imputed_data.apply(lambda row: datetime(int(row['Year']), int(row['Month']), int(row['Day of month'])), axis=1)

In [None]:
# Recompute 'Sum' as the within-day running total of 'Energy', then drop the
# helper column again.
imputed_data['Sum'] = imputed_data.groupby(pd.to_datetime(imputed_data['Start']).dt.date)['Energy'].cumsum()
imputed_data.drop(columns=['Start'], inplace=True)

In [None]:
# Write the KNN result next to the other 50-percent results.
imputed_data.to_csv("../Data/Imputed/50_percent/Knn/" + dataset_name + "_final.csv", index=False)

In [None]:
# Re-wrap the ResiDualNet results as tensors.
# NOTE(review): both names are already tensors if the ResiDualNet testing
# section ran; these values are not passed to the comparison plot at the end
# of the notebook — confirm this cell is still needed.
imputed_dataset = torch.tensor(imputed_dataset)
real_dataset_test_seq2seq = torch.tensor(real_dataset_test_seq2seq)

In [None]:
# `imputed_data` is a pandas DataFrame after the KNN save cells, and
# `torch.tensor` cannot build a tensor directly from a DataFrame; go through
# its NumPy values. Any other array-like input is still accepted, so the cell
# can be re-run.
if isinstance(imputed_data, pd.DataFrame):
    imputed_data = torch.tensor(imputed_data.to_numpy())
else:
    imputed_data = torch.as_tensor(imputed_data)

In [None]:
# Side-by-side comparison of the de-scaled real data, the ResiDualNet result
# and the KNN / mean baselines (helper defined outside this notebook).
# NOTE(review): the baselines are offset by 7 rows (`[7:]`), presumably to
# align them with the windowed model output — confirm. The output file name
# says "palo" while the title argument is 'Office'.
compare_predicted_dataset(real_dataset_test_seq2seq_temp, imputed_dataset_temp2, imputed_data[7:], imputed_data_mean[7:], 640, 750, 'Results/test_palo_last.png', 1, 'Office')