In [None]:
# Work from the project directory (presumably where the local `preprocessing` and `usad` modules imported below live).
# NOTE(review): absolute, machine-specific path — adjust it when running on another machine.
%cd /nfs/home/medoro/Unsupervised_Anomaly_Detection_thesis

In [None]:
from preprocessing import *
import preprocessing as prp
import pandas as pd
import torch
import torch.nn as nn
import torch.utils.data as data_utils
from usad import *
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, roc_auc_score
import plotly.graph_objects as go

import warnings
warnings.filterwarnings('ignore')

In [None]:
dataframe = pd.read_csv(r"/nfs/home/medoro/Unsupervised_Anomaly_Detection_thesis/data/train.csv")
dataframe.shape

In [None]:
df=dataframe[['building_id','primary_use', 'timestamp', 'meter_reading', 'sea_level_pressure', 'is_holiday','anomaly']]
df

In [None]:
imputed_df = impute_nulls(df)
imputed_df

Now that we have imputed the missing values in the column containing the energy consumption measurements, we can proceed by adding a couple more features and then imputing the missing dates for each time series in the dataset.

In [None]:
# Add extra features to the imputed frame (the weekday_x / weekday_y columns scaled later presumably come from here — TODO confirm).
# NOTE: this rebinds `df`; the later merges on ['timestamp', 'building_id'] read this version of `df`.
df = add_trigonometric_features(imputed_df)
df

In [None]:
# Fill in the missing dates; the result is used below as a mapping whose values are DataFrames.
dfs_dict = impute_missing_dates(df)

In [None]:
# Stack the per-key frames back into one DataFrame.
df1 = pd.concat(dfs_dict.values())
df1

Let's now obtain the train, validation and test sets. We are going to split the dataset into three sets, according to the building id.

In [None]:
# Split into train / validation / test; each result is used as a mapping of DataFrames (see the `.values()` calls).
dfs_train, dfs_val, dfs_test = train_val_test_split(df1)
# Concatenate the frames of each split into a single DataFrame.
train = pd.concat(dfs_train.values())

In [None]:
val = pd.concat(dfs_val.values())

In [None]:
test = pd.concat(dfs_test.values())

In [None]:
# Display the three splits for a visual sanity check.
train

In [None]:
val

In [None]:
test

# Training the model

In [None]:
# Window length (number of consecutive rows per sequence) passed to the sequence builders below.
train_window = 72

In [None]:
# Build the training windows with the project helper.
X_train, y_train = create_multivariate_train_eval_sequences(train, train_window)

In [None]:
X_train, y_train

In [None]:
X_train.shape, y_train.shape

In [None]:
BATCH_SIZE =  128
N_EPOCHS = 40
# Despite its name this is a ratio: the latent size is w_size * hidden_size (next cell).
hidden_size = 1/8

In [None]:
# Size of one flattened window: product of the 2nd and 3rd dimensions of X_train.
w_size = X_train.shape[1] * X_train.shape[2]
z_size = w_size * hidden_size 
w_size, z_size

In [None]:
# Cast the latent size to an int (truncating) before it is handed to UsadModel.
z_size = int(z_size)

In [None]:
z_size

In [None]:
import torch.utils.data as data_utils

In [None]:
# Flatten every training window into one vector of length w_size and serve it in
# fixed-order batches (shuffle=False).
train_loader = data_utils.DataLoader(
    data_utils.TensorDataset(
        torch.from_numpy(X_train).float().contiguous().view(X_train.shape[0], w_size)
    ),
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=0,
)

In [None]:
X_val, y_val = create_multivariate_train_eval_sequences(val, train_window)

In [None]:
X_val.shape, y_val.shape

In [None]:
val_loader = torch.utils.data.DataLoader(data_utils.TensorDataset(torch.from_numpy(X_val).float().contiguous().view(([X_val.shape[0],w_size]))) , batch_size=BATCH_SIZE, shuffle=False, num_workers=0) #.view(([X_val.shape[0],w_size]))

In [None]:
!export CUDA_VISIBLE_DEVICES=2

In [None]:
# Pick the compute device through the project helper.
device = get_default_device()
device

In [None]:
# Build the model from the flattened window size and the latent size, then move it to the device.
model = UsadModel(w_size, z_size)
model = to_device(model,device)

In [None]:
print(device)

In [None]:
# Train for N_EPOCHS epochs using the train and validation loaders.
history = training(N_EPOCHS,model,train_loader,val_loader) # about 2.15 min per epoch

In [None]:
# Plot the returned training history with the project helper.
plot_history(history)

In [None]:
print(model)

In [None]:
# Persist the weights of the encoder and of both decoders in a single checkpoint file.
# NOTE(review): this path starts with /home/medoro while the data path and the %cd cell use
# /nfs/home/medoro — confirm both resolve to the same location.
torch.save({
            'encoder': model.encoder.state_dict(),
            'decoder1': model.decoder1.state_dict(),
            'decoder2': model.decoder2.state_dict()
            }, "/home/medoro/Unsupervised_Anomaly_Detection_thesis/checkpoints/model_40epochs_multivariate.pth")

# Testing

In [None]:
# Reload the saved weights into the existing model.
# map_location remaps the stored tensors onto the device selected above, so a checkpoint
# written on a GPU can also be restored in a session that runs on another device
# (assumes `device` is a torch.device or device string — TODO confirm against get_default_device).
checkpoint = torch.load(
    "/home/medoro/Unsupervised_Anomaly_Detection_thesis/checkpoints/model_40epochs_multivariate.pth",
    map_location=device,
)

model.encoder.load_state_dict(checkpoint['encoder'])
model.decoder1.load_state_dict(checkpoint['decoder1'])
model.decoder2.load_state_dict(checkpoint['decoder2'])

In [None]:
X_test, y_test = create_multivariate_train_eval_sequences(test, train_window)

In [None]:
X_test.shape, y_test.shape

In [None]:
test_loader = torch.utils.data.DataLoader(data_utils.TensorDataset(
    torch.from_numpy(X_test).float().contiguous().view(([X_test.shape[0],w_size]))
) , batch_size=BATCH_SIZE, shuffle=False, num_workers=0)

## Testing (anomaly score-based)

In [None]:
# Run the model on the test loader; `results` is iterated below as batches of per-window scores.
results=testing(model,test_loader) # Try it on the test set

Let's create the dataset to perform predictions.

In [None]:
scaler = MinMaxScaler(feature_range=(0,1))

In [None]:
dfs_dict_1 = {}
for building_id, gdf in test.groupby("building_id"):
  gdf[['meter_reading', 'sea_level_pressure', 'weekday_x', 'weekday_y']]=scaler.fit_transform(gdf[['meter_reading', 'sea_level_pressure', 'weekday_x', 'weekday_y']])
  dfs_dict_1[building_id] = gdf[train_window:]
predicted_df = pd.concat(dfs_dict_1.values())

In [None]:
lista = []
for el in results:
  for el2 in el:
    lista.append(el2.cpu().item())

In [None]:
lista

In [None]:
plt.hist(lista, bins=50)
plt.xlabel("Anomaly score")
plt.ylabel("No of samples")
plt.show()

In [None]:
# Attach one anomaly score per row; this requires len(lista) == len(predicted_df).
predicted_df['predictions'] = lista

In [None]:
predicted_df

In [None]:
predicted_df.predictions.min(), predicted_df.predictions.max()

In [None]:
# Global threshold: the 90th percentile of the scores, so roughly 10% of the rows lie above it.
perc = 90
threshold = (np.percentile(predicted_df.predictions.values, perc))
threshold

In [None]:
predicted_df['threshold'] = threshold

In [None]:
predicted_df['predicted_anomaly'] = predicted_df.predictions > predicted_df['threshold']
predicted_df['predicted_anomaly']=predicted_df['predicted_anomaly'].replace(False,0)
predicted_df['predicted_anomaly']=predicted_df['predicted_anomaly'].replace(True,1)

In [None]:
predicted_df

In [None]:
predicted_df.predicted_anomaly.unique()

In [None]:
len(predicted_df[predicted_df.predicted_anomaly == 1])/len(predicted_df)

In [None]:
# Name the index 'timestamp' and turn it into a regular column so it can be used as a merge key.
predicted_df.index.names=['timestamp']
predicted_df= predicted_df.reset_index()

In [None]:
# Inner join (pd.merge default) with the (timestamp, building_id) pairs of `df`: rows whose pair
# does not occur in `df` are dropped (presumably the rows created when imputing missing dates — TODO confirm).
predicted_df = pd.merge(predicted_df, df[['timestamp','building_id']], on=['timestamp','building_id'])

In [None]:
print(classification_report(predicted_df.anomaly, predicted_df.predicted_anomaly))

In [None]:
roc_auc_score(predicted_df['anomaly'], predicted_df['predicted_anomaly'])

## Testing (reconstruction-based)

For this we are going to consider non-overlapping windows.

In [None]:
X_test, y_test = create_multivariate_test_sequences(test, train_window)

In [None]:
X_test.shape, y_test.shape

In [None]:
test_loader = torch.utils.data.DataLoader(data_utils.TensorDataset(
    torch.from_numpy(X_test).float().contiguous().view(([X_test.shape[0],w_size]))
) , batch_size=BATCH_SIZE, shuffle=False, num_workers=0)

In [None]:
# Besides the scores, testing_prova returns w1 and w2; they are used below as sequences of
# per-batch 2-D tensors (presumably the reconstructions of the two decoders — TODO confirm).
results, w1, w2 = testing_prova(model, test_loader)

In [None]:
w1

In [None]:
w2

In [None]:
# Number of batches in each sequence.
len(w1)

In [None]:
len(w2)

In [None]:
# Size of the second batch versus the last one (the last batch may be smaller).
# NOTE: index 1 requires at least two batches.
len(w1[1]), len(w1[-1])

In [None]:
len(w2[1]), len(w2[-1])

In [None]:
# Reshape every batch from [batch, w_size] back to [batch, train_window, n_features]
# (with the current settings: [batch, 360] -> [batch, 72, 5]). The sizes come from
# train_window instead of the literals 5 and 72, so the cell keeps working if the
# window length or the number of features changes.
reshaped_w1 = [torch.reshape(w1_el, (w1_el.size(0), train_window, -1)) for w1_el in w1]
# Merge the batch and time axes: [batch, train_window, n_features] -> [batch * train_window, n_features].
reshaped_w1_try = [torch.reshape(w1_el, (-1, w1_el.size(2))) for w1_el in reshaped_w1]

In [None]:
# Number of batches, and the shapes of the first and last reshaped batch.
len(reshaped_w1_try), reshaped_w1_try[0].size(), reshaped_w1_try[-1].size()

In [None]:
# Stack every batch except the last one (torch.stack needs equally shaped tensors, and the last
# batch may be smaller). NOTE: with a single batch the list is empty and torch.stack raises.
stacked = torch.stack(reshaped_w1_try[:-1])
stacked.shape

In [None]:
# Merge the first two axes of the stacked tensor so that each row is one timestep.
stacked_reshaped = torch.reshape(stacked, (stacked.size()[0] * stacked.size()[1], stacked.size()[2]))
stacked_reshaped.size()

In [None]:
stacked_array = stacked_reshaped.cpu().numpy()
stacked_array.shape

In [None]:
# The last batch is converted on its own and appended afterwards.
last_array = reshaped_w1_try[-1].cpu().numpy()
last_array.shape

In [None]:
# All w1 rows, in loader order.
total1 = np.concatenate([stacked_array, last_array])
total1.shape

Let's do the same for w2.

In [None]:
# Reshape every batch from [batch, w_size] back to [batch, train_window, n_features]
# (with the current settings: [batch, 360] -> [batch, 72, 5]). The sizes come from
# train_window instead of the literals 5 and 72, so the cell keeps working if the
# window length or the number of features changes.
reshaped_w2 = [torch.reshape(w2_el, (w2_el.size(0), train_window, -1)) for w2_el in w2]
# Merge the batch and time axes: [batch, train_window, n_features] -> [batch * train_window, n_features].
reshaped_w2_try = [torch.reshape(w2_el, (-1, w2_el.size(2))) for w2_el in reshaped_w2]

In [None]:
stacked2 = torch.stack(reshaped_w2_try[:-1])
stacked2.shape

In [None]:
stacked_reshaped2 = torch.reshape(stacked2, (stacked2.size()[0] * stacked2.size()[1], stacked2.size()[2]))
stacked_reshaped2.size()

In [None]:
stacked_array = stacked_reshaped2.cpu().numpy()
stacked_array

In [None]:
last_array2 = reshaped_w2_try[-1].cpu().numpy()
last_array2

In [None]:
total2 = np.concatenate([stacked_array, last_array2])

In [None]:
total2.shape

Let's now try to see how this refers to the reconstructed time series.

In [None]:
# Wrap the w1 array in a DataFrame (one row per timestep, integer column labels).
w1_reco = pd.DataFrame(total1)
w1_reco

In [None]:
# Same for the w2 array.
w2_reco = pd.DataFrame(total2)
w2_reco

The first column is the one we are interested in, as it refers to the meter reading. Let's create the final dataset and concatenate the reconstructions to it.

In [None]:
scaler = MinMaxScaler(feature_range=(0,1))

In [None]:
dfs_dict_1 = {}
for building_id, gdf in test.groupby("building_id"):
  gdf[['meter_reading', 'sea_level_pressure', 'weekday_x', 'weekday_y']]=scaler.fit_transform(gdf[['meter_reading', 'sea_level_pressure', 'weekday_x', 'weekday_y']])
  dfs_dict_1[building_id] = gdf
predicted_df_test = pd.concat(dfs_dict_1.values())

In [None]:
# Column 0 of each reconstruction frame is attached positionally (via .values), so the number of
# reconstructed rows must equal len(predicted_df_test) and be in the same order.
predicted_df_test['reconstruction'] = w1_reco.loc[:, 0].values

In [None]:
predicted_df_test['reconstruction2'] = w2_reco.loc[:, 0].values

In [None]:
predicted_df_test

In [None]:
# Value ranges of both reconstructions and of the scaled meter reading, for comparison.
predicted_df_test.reconstruction2.min(), predicted_df_test.reconstruction2.max()

In [None]:
predicted_df_test.reconstruction.min(), predicted_df_test.reconstruction.max()

In [None]:
predicted_df_test.meter_reading.min(), predicted_df_test.meter_reading.max()

In [None]:
predicted_df_test['relative_loss'] = np.abs((predicted_df_test['reconstruction']-predicted_df_test['meter_reading'])/predicted_df_test['reconstruction'])

In [None]:
predicted_df_test['relative_loss2'] = np.abs((predicted_df_test['reconstruction2']-predicted_df_test['meter_reading'])/predicted_df_test['reconstruction2'])

In [None]:
# Per-building threshold on the relative loss: upper IQR fence, Q3 + 1.5 * (Q3 - Q1).
# NOTE(review): the original comment said "only on val", but the quartiles are computed on
# predicted_df_test, which is built from the test split.
# groupby().transform broadcasts each building's threshold back onto that building's rows in
# the frame's own row order, so the result no longer depends on the rows being sorted by
# building_id (and avoids growing an array with np.append inside the loop).
thresholds = (
    predicted_df_test
    .groupby("building_id")["relative_loss"]
    .transform(
        lambda loss: np.percentile(loss.to_numpy(), 75)
        + 1.5 * (np.percentile(loss.to_numpy(), 75) - np.percentile(loss.to_numpy(), 25))
    )
    .to_numpy()
)
print(thresholds.shape)
predicted_df_test['threshold']= thresholds

In [None]:
# Per-building threshold on the second relative loss: upper IQR fence, Q3 + 1.5 * (Q3 - Q1).
# NOTE(review): the original comment said "only on val", but the quartiles are computed on
# predicted_df_test, which is built from the test split.
# groupby().transform broadcasts each building's threshold back onto that building's rows in
# the frame's own row order, so the result no longer depends on the rows being sorted by
# building_id (and avoids growing an array with np.append inside the loop).
thresholds = (
    predicted_df_test
    .groupby("building_id")["relative_loss2"]
    .transform(
        lambda loss: np.percentile(loss.to_numpy(), 75)
        + 1.5 * (np.percentile(loss.to_numpy(), 75) - np.percentile(loss.to_numpy(), 25))
    )
    .to_numpy()
)
print(thresholds.shape)
predicted_df_test['threshold2']= thresholds

In [None]:
predicted_df_test

In [None]:
predicted_df_test['predicted_anomaly'] = predicted_df_test['relative_loss'] > predicted_df_test['threshold']
predicted_df_test['predicted_anomaly']=predicted_df_test['predicted_anomaly'].replace(False,0)
predicted_df_test['predicted_anomaly']=predicted_df_test['predicted_anomaly'].replace(True,1)

In [None]:
predicted_df_test['predicted_anomaly2'] = predicted_df_test['relative_loss2'] > predicted_df_test['threshold2']
predicted_df_test['predicted_anomaly2']=predicted_df_test['predicted_anomaly2'].replace(False,0)
predicted_df_test['predicted_anomaly2']=predicted_df_test['predicted_anomaly2'].replace(True,1)

In [None]:
# Name the index 'timestamp' and turn it into a regular column so it can be used as a merge key.
predicted_df_test.index.names=['timestamp']
predicted_df_test= predicted_df_test.reset_index()

In [None]:
predicted_df_test.predicted_anomaly.unique()

In [None]:
predicted_df_test.predicted_anomaly2.unique()

In [None]:
# Row subsets over all buildings. NOTE: `predicted_anomalies` is not referenced again in this
# notebook, and `predicted_anomalies2` / `true_anomalies` are rebound in the visualisation cells.
predicted_anomalies = predicted_df_test.loc[predicted_df_test['predicted_anomaly'] == 1]
predicted_anomalies2 = predicted_df_test.loc[predicted_df_test['predicted_anomaly2'] == 1]
true_anomalies = predicted_df_test.loc[predicted_df_test['anomaly'] == 1]

In [None]:
# Inner join (pd.merge default) with the (timestamp, building_id) pairs of `df`: rows whose pair
# does not occur in `df` are dropped (presumably the rows created when imputing missing dates — TODO confirm).
predicted_df_test = pd.merge(predicted_df_test, df[['timestamp','building_id']], on=['timestamp','building_id'])

In [None]:
# Classification metrics of the flags derived from the first reconstruction.
print(classification_report(predicted_df_test['anomaly'], predicted_df_test['predicted_anomaly']))

In [None]:
# Classification metrics of the flags derived from the second reconstruction.
print(classification_report(predicted_df_test['anomaly'], predicted_df_test['predicted_anomaly2']))

In [None]:
# NOTE(review): roc_auc_score receives the 0/1 flags here, so it evaluates a single operating
# point rather than a ranking; a continuous score (e.g. the relative loss) would be needed for a
# proper ROC AUC, after confirming it contains no inf/NaN values.
roc_auc_score(predicted_df_test['anomaly'], predicted_df_test['predicted_anomaly'])

In [None]:
roc_auc_score(predicted_df_test['anomaly'], predicted_df_test['predicted_anomaly2'])

In [None]:
# Building ids available after the merge.
predicted_df_test.building_id.unique()

In [None]:
# Inspect a single building (the id 1319 is hard-coded).
visualizations = predicted_df_test[predicted_df_test.building_id == 1319]
visualizations

In [None]:
# Overlay the scaled meter reading and both reconstructions for that building.
plt.plot(visualizations.meter_reading, label = "meter reading")
plt.plot(visualizations.reconstruction, label = "w1_reconstruction")
plt.plot(visualizations.reconstruction2, label = "w2_reconstruction")
plt.legend()
plt.show()

In [None]:
predicted_anomalies1 = visualizations.loc[visualizations['predicted_anomaly'] == 1]
predicted_anomalies2 = visualizations.loc[visualizations['predicted_anomaly2'] == 1]
true_anomalies = visualizations.loc[visualizations['anomaly'] == 1]

In [None]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=visualizations.index, y=visualizations['meter_reading'], name='meter readings'))
fig.add_trace(go.Scatter(x=visualizations.index, y=visualizations['reconstruction'], name='meter readings reconstructed'))
fig.add_trace(go.Scatter(x=visualizations.index, y=visualizations['reconstruction2'], name='meter readings reconstructed'))
fig.add_trace(go.Scatter(x=true_anomalies.index, y=true_anomalies['meter_reading'], mode='markers', marker=dict(color='forestgreen'), name='True_Anomaly'))
fig.add_trace(go.Scatter(x=predicted_anomalies1.index, y=predicted_anomalies1['meter_reading'], mode='markers', marker=dict(color='yellow'), name='True_Anomaly'))
fig.add_trace(go.Scatter(x=predicted_anomalies2.index, y=predicted_anomalies2['meter_reading'], mode='markers', marker=dict(color='orange'), name='True_Anomaly'))
fig.update_layout(showlegend=True, title='meter readings predicted and anomalies - val')
fig.show()