In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from sklearn.preprocessing import MinMaxScaler, Normalizer, RobustScaler, StandardScaler,MaxAbsScaler,PowerTransformer,QuantileTransformer
import statsmodels.api as sm
from get_model import get_model
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def prepare_dataset(df):
    """Turn an hourly meter-reading table into one row per complete day.

    Steps, in order:
      1. Parse ``timestamp`` and, if any hour between the first and last
         observation is missing, insert it and linearly interpolate
         ``meter_reading`` (inserted rows get ``filled == 1``).
      2. Min-max scale ``meter_reading`` to [0, 1] over the whole series.
      3. Group by calendar date and collect the hourly values into lists.
      4. Keep only days with exactly 24 non-missing readings and add the
         Hodrick-Prescott decomposition and calendar features.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``timestamp``, ``meter_reading`` and
        ``anomaly``. The frame passed in is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per retained day with columns ``date``, ``readings``,
        ``anomalies``, ``filled`` (lists of hourly values), ``length``,
        ``no_anomalies``, ``cycle``, ``trend`` (results of
        ``sm.tsa.filters.hpfilter(readings, 2)``), ``months`` and ``weekday``
        (as strings) and ``weekend`` (1 when ``weekday`` is "5" or "6", else 0).
        The index is that of the per-day table before filtering, so it may
        have gaps.
    """
    # Work on a copy so the caller's frame is left untouched.
    df = df.copy()
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df['filled'] = 0

    # NOTE(review): recent pandas releases prefer the lowercase 'h' alias;
    # 'H' is kept here to match the original behaviour - confirm against the
    # pandas version in use.
    existing_hours = df['timestamp'].dt.floor('H').unique()

    # floor() also clears sub-second components, which replace(minute=0,
    # second=0) would leave in place and thereby shift the whole hourly grid.
    start_date = df['timestamp'].min().floor('H')
    end_date = df['timestamp'].max().floor('H')
    date_range = pd.date_range(start=start_date, end=end_date, freq='H')

    # Vectorised membership test instead of a Python-level scan per hour.
    all_hours_present = bool(date_range.isin(existing_hours).all())
    if not all_hours_present:
        complete_df = pd.DataFrame({'timestamp': date_range})
        df = complete_df.merge(df, on='timestamp', how='left')
        df['filled'] = df['filled'].fillna(1)
        df['meter_reading'] = df['meter_reading'].interpolate(method='linear', limit_direction='both')
        # NOTE(review): 'anomaly' is not filled for inserted hours, so it is
        # NaN there and such days end up with no_anomalies == False - confirm
        # this is intended.

    # Apply min-max scaling over the full series.
    scaler = MinMaxScaler(feature_range=(0, 1))
    df['meter_reading'] = scaler.fit_transform(df['meter_reading'].values.reshape(-1, 1)).ravel()

    # Aggregate the hourly values into one list per calendar day.
    aggregated_df = (
        df.groupby(df['timestamp'].dt.date)
        .agg({'meter_reading': list, 'anomaly': list, 'filled': list})
        .reset_index()
    )
    aggregated_df.columns = ['date', 'readings', 'anomalies', 'filled']
    aggregated_df = aggregated_df.sort_values(by='date')

    aggregated_df["length"] = aggregated_df["readings"].apply(
        lambda lst: len([x for x in lst if not pd.isna(x)])
    )
    aggregated_df["no_anomalies"] = aggregated_df["anomalies"].apply(
        lambda x: all(val == 0 for val in x)
    )

    # .copy() makes the filtered frame independent, so the column assignments
    # below do not write into a slice of aggregated_df.
    full_days = aggregated_df[aggregated_df["length"] == 24].copy()

    # Run the HP filter once per day and split the (cycle, trend) pair.
    decomposition = full_days["readings"].apply(lambda x: sm.tsa.filters.hpfilter(x, 2))
    full_days['cycle'] = decomposition.apply(lambda parts: parts[0])
    full_days['trend'] = decomposition.apply(lambda parts: parts[1])

    full_days["months"] = full_days["date"].apply(lambda x: str(x.month))
    full_days["weekday"] = full_days["date"].apply(lambda x: str(x.weekday()))
    full_days["weekend"] = full_days["weekday"].apply(lambda x: 1 if x in ["5", "6"] else 0)
    return full_days

def generate_data(df, column, n_steps=24):
    """Stack the per-row sequences of ``df[column]`` into one 3-D array.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``column`` holds one sequence of ``n_steps`` values per row.
    column : str
        Name of the column to stack.
    n_steps : int, default 24
        Length of every per-row sequence.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(1, len(df), n_steps)``. An empty frame yields an
        empty array of shape ``(1, 0, n_steps)``.

    Raises
    ------
    ValueError
        If the concatenated values cannot be reshaped to
        ``(1, len(df), n_steps)``.
    """
    # Convert to a plain list first: handing the Series itself to
    # np.concatenate makes numpy index it by label (series[0], series[1], ...),
    # which fails whenever the frame's index is not 0..n-1.
    rows = df[column].tolist()
    if not rows:
        return np.empty((1, 0, n_steps))

    return np.concatenate(rows).reshape((1, len(rows), n_steps))

def _stack_lines(columns, n_steps=24):
    """Stack equally long sequences into an array of shape (1, len(columns), n_steps)."""
    stacked = np.concatenate(
        [np.asarray(column).reshape(1, 1, n_steps) for column in columns]
    )
    # After concatenation the axes are (channel, 1, step); swap the first two
    # so the result is (1, channel, step).
    return np.transpose(stacked, (1, 0, 2))


def prepare_line(col1, col2, col3):
    """Combine three 24-value sequences into one array of shape (1, 3, 24).

    Each argument is an array-like with exactly 24 values; the output keeps
    the argument order along axis 1.

    Raises
    ------
    ValueError
        If any argument cannot be reshaped to 24 values.
    """
    return _stack_lines([col1, col2, col3])


def prepare_line0(col1, col2):
    """Combine two 24-value sequences into one array of shape (1, 2, 24).

    Two-channel counterpart of ``prepare_line``; the output keeps the
    argument order along axis 1.

    Raises
    ------
    ValueError
        If any argument cannot be reshaped to 24 values.
    """
    return _stack_lines([col1, col2])

In [3]:
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
def get_cluster_labels(df, column, n_clusters=3, n_components=2, random_state=None):
    """Label each row 0 if it falls in the largest KMeans cluster, else 1.

    The sequences in ``df[column]`` are stacked into a matrix, projected with
    PCA and clustered with KMeans; membership of the most populated cluster is
    then encoded as 0 and everything else as 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``column`` holds one equally long numeric sequence per row.
    column : str
        Name of the column to cluster on.
    n_clusters : int, default 3
        Number of KMeans clusters.
    n_components : int, default 2
        Number of PCA components (capped at the dimensions of the data).
    random_state : int or None, default None
        Passed to KMeans. The default leaves the clustering unseeded, so
        labels can differ between runs; pass an int for reproducible labels.

    Returns
    -------
    list of int
        One 0/1 label per row, in row order. When the frame has fewer rows
        than ``n_clusters`` no clustering is possible and every row is
        labelled 0.
    """
    X = np.array(df[column].tolist())
    n_samples = X.shape[0]

    # KMeans needs at least n_clusters samples; with fewer there is no
    # meaningful minority, so treat every row as belonging to the majority.
    if n_samples < n_clusters:
        return [0] * n_samples

    # Project onto the leading principal components before clustering.
    reduced = PCA(n_components=min(n_components, *X.shape)).fit_transform(X)

    cluster_ids = KMeans(n_clusters=n_clusters, random_state=random_state).fit(reduced).labels_

    # Most frequent cluster id; ties resolve to the lowest id.
    majority = np.bincount(cluster_ids).argmax()
    return [0 if cluster_id == majority else 1 for cluster_id in cluster_ids]


In [None]:
#files=os.listdir('dataset/train')[0:5]
import io
import sys
from contextlib import redirect_stdout
from tqdm import tqdm
from IPython.display import clear_output
# Experiment driver: for every model and every building file, cluster the
# days of each (month, weekend) group, train the model on the majority-cluster
# days, reconstruct all days and write the per-hour reconstruction error of
# each day to its own CSV under experiment_2/csv/.
captured_output = io.StringIO()
files=["118.csv","246.csv","1245.csv","1311.csv","1141.csv"]
models=['AE',
 'VAE',
 'BETA',
 'VAE_LinNF',
 'VAE_IAF',
 'DBVAE',
 'IWVAE',
 'MIWAE',
 'CIWAE',
 'WAE',
 'INFOVAE',
 'VAMP',
 'SVAE',
 'PVAE',
 'VQVAE',
 'HVAE',
 'RAE_GP',
 'RHVAE']
train_columns = ["readings", "cycle", "trend"]
columns_cluster = ["cycle", "trend"]


def _reconstruct(model, sample):
    """Pass one preprocessed sample through ``model.reconstruct`` and return it as a NumPy array."""
    return model.reconstruct(torch.from_numpy(sample.astype(np.float32))).detach().numpy()


# Loop state that the error report below prints. Pre-initialised so that a
# failure early in the run cannot turn into a NameError inside the handler.
model_name = file = month = day = ind = column = None
train_data = eval_data = None

pbar = tqdm(total=len(models))
try:
    for model_name in models:
        # Everything printed while a model is processed goes to
        # captured_output; the context manager restores sys.stdout on every
        # exit path, including interrupts.
        with redirect_stdout(captured_output):
            for file in files:
                # NOTE(review): loading and clustering do not depend on
                # model_name but are repeated for every model; since the
                # clustering appears to be unseeded, each model may get a
                # different train/test split - confirm this is intended.
                dataset = pd.read_csv('dataset/train/' + file)
                dataset = prepare_dataset(dataset)

                # Cluster-label columns start as NaN and are filled group by group.
                for column_cluster in columns_cluster:
                    dataset['kmeans_{}'.format(column_cluster)] = np.nan
                for month in dataset["months"].unique():
                    for day in dataset["weekend"].unique():
                        df = dataset[(dataset["months"] == month) & (dataset["weekend"] == day)].copy()
                        if df.empty:
                            # This (month, weekend) combination has no days.
                            continue
                        for column_cluster in columns_cluster:
                            df['kmeans_{}'.format(column_cluster)] = get_cluster_labels(df, column_cluster)
                        dataset.update(df)

                for column_cluster in columns_cluster:
                    os.makedirs("experiment_2/csv/{}".format(column_cluster), exist_ok=True)
                    output_dir = "experiment_2/csv/{}/latent_dim_16/{}".format(column_cluster, model_name)

                    # Label 0 = majority cluster (training), label 1 = the rest (evaluation).
                    train = dataset[dataset['kmeans_{}'.format(column_cluster)] == 0.0]
                    test = dataset[dataset['kmeans_{}'.format(column_cluster)] == 1.0]
                    for month in dataset["months"].unique():
                        for day in dataset["weekend"].unique():
                            # reset_index() returns a fresh frame (keeping the
                            # old index as an 'index' column), so the column
                            # assignments below never write into a slice.
                            train_data = train[(train["months"] == month) & (train["weekend"] == day)].reset_index()
                            test_data = test[(test["months"] == month) & (test["weekend"] == day)].reset_index()

                            if train_data.shape[0] == 0:
                                # No majority-cluster days: train on the first
                                # half of the remaining days, evaluate on the rest.
                                split = 0.5
                                cut = int(split * test_data.shape[0])
                                train_data = test_data[:cut].copy()
                                eval_data = test_data[cut:].copy()
                            elif test_data.shape[0] == 0:
                                eval_data = None
                            else:
                                eval_data = test_data

                            if train_data.shape[0] == 0:
                                # Nothing to train on for this group; skip it
                                # rather than aborting the whole experiment.
                                continue
                            has_eval = eval_data is not None and eval_data.shape[0] != 0

                            train_numpy = np.concatenate([generate_data(train_data, col) for col in train_columns])
                            train_numpy = train_numpy.transpose((1, 0, 2))

                            pipeline = get_model(model_name, dim=len(train_columns), train_batch=train_numpy.shape[0])
                            pipeline(
                                train_data=train_numpy  # must be torch.Tensor, np.array or torch datasets
                            )
                            print("Model ready!")
                            my_vae_model = pipeline.model

                            train_data["preprocessed"] = train_data[train_columns].apply(lambda row: prepare_line(*row), axis=1)
                            train_data = train_data.reset_index(drop=True)
                            train_data["reconstruction"] = train_data["preprocessed"].apply(
                                lambda sample: _reconstruct(my_vae_model, sample)
                            )

                            if has_eval:
                                eval_data["preprocessed"] = eval_data[train_columns].apply(lambda row: prepare_line(*row), axis=1)
                                eval_data["reconstruction"] = eval_data["preprocessed"].apply(
                                    lambda sample: _reconstruct(my_vae_model, sample)
                                )
                                eval_data = eval_data.reset_index(drop=True)
                                dataset_final = pd.concat([train_data, eval_data])
                            else:
                                dataset_final = train_data

                            dataset_final["reconstruction"] = dataset_final["reconstruction"].apply(
                                lambda x: x.reshape((1, len(train_columns), 24))
                            )
                            dataset_final['difference'] = dataset_final['preprocessed'] - dataset_final['reconstruction']

                            # Per-hour reconstruction error of each channel.
                            for i, column in enumerate(train_columns):
                                dataset_final['difference_norm_{}'.format(column)] = dataset_final['difference'].apply(
                                    lambda x, i=i: np.linalg.norm(x[:, i, :], axis=(0))
                                )
                            dataset_final = dataset_final.reset_index(drop=True)

                            os.makedirs(output_dir, exist_ok=True)
                            for ind in dataset_final.index:
                                # One CSV per day: 24 hourly rows.
                                tmp_df = pd.DataFrame({
                                    "anomalies": dataset_final.loc[ind, "anomalies"],
                                    "filled": dataset_final.loc[ind, "filled"],
                                })
                                date = dataset_final.loc[ind, "date"]
                                tmp_df['datetime'] = pd.to_datetime(date) + pd.to_timedelta(tmp_df.index, unit='h')

                                for column in train_columns:
                                    tmp_df["difference_norm_{}".format(column)] = dataset_final.loc[ind, "difference_norm_{}".format(column)]

                                # NOTE(review): `column` is whatever the loop
                                # above left behind (the last entry of
                                # train_columns), so the "column_" part of the
                                # file name is constant. Kept unchanged so
                                # existing output paths stay valid.
                                tmp_df.to_csv("experiment_2/csv/{}/latent_dim_16/{}/building_{}_month_{}_weekend_{}_{}_column_{}.csv".format(column_cluster,model_name,file,month,day,ind,column),index=False)
                    clear_output(wait=False)
        pbar.update(1)
except Exception as e:
    # sys.stdout is already restored here because the redirect context has exited.
    n_train = train_data.shape[0] if train_data is not None else 0
    n_test = eval_data.shape[0] if eval_data is not None else 0
    print(e)
    print("problem with this case:")
    print("model: {}, building: {}, month: {}, day: {}, index: {}, column: {}".format(model_name,file,month,day,ind,column))
    print("training samples: {}, testing samples: {}".format(n_train,n_test))
    print("try removing the model from the model list and debug it separately")
finally:
    pbar.close()