In [1]:
import holidays
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.calibration import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import torch
import pywt

class DatasetPreprocess:
    """Preprocessing helpers for a household power-consumption DataFrame.

    The methods operate on (and mutate) the DataFrame they are given and
    return that same object so calls can be chained.
    """

    def __init__(self):
        # Use the GPU when torch reports one, otherwise fall back to the CPU.
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    def wavelet_denoising(self, signal, wavelet='db5', threshold=0.04):
        """Denoise ``signal`` by soft-thresholding its wavelet coefficients.

        Args:
            signal: Array-like of samples, decomposed along its last axis.
            wavelet: Wavelet name passed to ``pywt.wavedec`` / ``pywt.waverec``.
            threshold: Soft-threshold value applied to every coefficient
                array returned by the decomposition (approximation included).

        Returns:
            numpy.ndarray with the same number of samples as ``signal``.
        """
        # Multilevel discrete wavelet transform (DWT).
        coeffs = pywt.wavedec(signal, wavelet)

        coeffs_thresholded = [pywt.threshold(c, threshold, mode='soft') for c in coeffs]

        # Rebuild the signal from the thresholded coefficients.
        reconstructed_signal = pywt.waverec(coeffs_thresholded, wavelet)

        # The inverse transform can be one sample longer than the input when
        # the input length is odd; trim so the result can be assigned straight
        # back onto the column it came from.
        n_samples = np.shape(signal)[-1]
        return reconstructed_signal[..., :n_samples]

    def refine_ev_charging_values(self, reconstructed):
        """Zero out low ``ev_car`` readings and add a binary ``ev_label`` column.

        A histogram of ``ev_car`` (``'doane'`` bin rule) is built and the left
        edge of the second most populated bin is used as the cut-off: readings
        below it become 0. Presumably the most populated bin is the idle level
        and the runner-up is the charging level -- TODO confirm.

        Args:
            reconstructed: DataFrame with an ``ev_car`` column. Modified in place.

        Returns:
            The same DataFrame, with ``ev_car`` thresholded and ``ev_label``
            set to 1 where ``ev_car > 0`` and 0 elsewhere.
        """
        counts, bin_edges = np.histogram(reconstructed['ev_car'], bins='doane')

        # A constant (e.g. all-zero) or very short column yields a single bin,
        # so there is no runner-up bin to derive a cut-off from; in that case
        # the readings are left as they are instead of raising IndexError.
        if counts.size >= 2:
            # Bin indices ordered from most to least populated.
            bins_by_count = np.argsort(counts)[::-1]

            # Left edge of the second most populated bin.
            charging_bin_start = bin_edges[bins_by_count[1]]

            reconstructed['ev_car'] = np.where(reconstructed['ev_car'] >= charging_bin_start, reconstructed['ev_car'], 0)

        reconstructed['ev_label'] = np.where(reconstructed['ev_car'] > 0, 1, 0)
        return reconstructed

    def process_faulty_values(self, dt):
        """Return ``dt`` unchanged (the correction below is currently disabled)."""
        # Replacing the total_power_consumption values by the sum of the individual appliances that have smaller value than actually consumed
        # dt['total_power_consumption'] = np.where(dt['total_power_consumption'] < dt['total_usage'], dt['total_usage'], dt['total_power_consumption'])
        return dt

    def preprocess_data(self, data):
        """Refine EV readings and rebuild ``total_power_consumption``.

        Args:
            data: DataFrame with the columns ``ev_car``, ``clotheswasher1``,
                ``dishwasher1``, ``drye1``, ``microwave1`` and
                ``refrigerator1``. Modified in place.

        Returns:
            The same DataFrame with ``ev_car`` thresholded, ``ev_label`` added
            and ``total_power_consumption`` set to the sum of the six
            appliance columns.
        """
        # Wavelet denoising and feature extraction are currently disabled:
        # data['total_power_consumption'] = self.wavelet_denoising(data['total_power_consumption'])
        # data = self.extract_features(data)

        # Refine the electric vehicle charging values
        data = self.refine_ev_charging_values(data)
        # `+` (rather than a skip-NaN sum) keeps NaN in any appliance visible in the total.
        data['total_power_consumption'] = data['ev_car'] + data['clotheswasher1'] + data['dishwasher1'] + data['drye1'] + data['microwave1'] + data['refrigerator1']
        data = self.process_faulty_values(data)
        return data

    def extract_features(self, df):
        """Add calendar features derived from the ``local_15min`` column.

        Adds ``Month``, ``Day_of_week`` (pandas ``day_of_week`` plus one),
        ``Day``, ``Hour``, ``holiday`` (1 when the timestamp is in
        ``holidays.US()``, else 0) and ``minute``.

        Args:
            df: DataFrame with a ``local_15min`` column parseable by
                ``pd.to_datetime``. Modified in place.

        Returns:
            The same DataFrame with the feature columns added.
        """
        # NOTE(review): utc=True converts the timestamps to UTC, so every
        # feature below is computed on UTC time even though the column is
        # named "local" -- confirm this is intended.
        df['local_15min'] = pd.to_datetime(df['local_15min'], utc=True)
        df['Month']=df['local_15min'].dt.month
        # df['Quarter_of_year']=df['local_15min'].dt.quarter
        df['Day_of_week']=df['local_15min'].dt.day_of_week + 1
        df['Day']=df['local_15min'].dt.day
        df['Hour']=df['local_15min'].dt.hour
        # Build the holiday calendar once instead of once per row.
        us_holidays = holidays.US()
        df['holiday'] = df['local_15min'].apply(lambda x: 1 if x in us_holidays else 0)
        df['minute'] = df['local_15min'].dt.minute
        # Disabled running-statistics features:
        # df['mean'] = df['total_power_consumption'].expanding().mean()
        # df['mean_diff'] = df['mean'].diff().fillna(0)
        # df['std'] = df['total_power_consumption'].expanding().std().fillna(0)
        # df['std_diff'] = df['std'].diff().fillna(0)
        # df.reset_index(drop=True, inplace=True)
        # Disabled label features:
        # df['dataid_label'] = df['dataid'].apply(lambda x: 0 if x == 661 else 1 if x == 1642 else 2)
        # df['is_ev'] = df['ev_car'].apply(lambda x: 1 if x > 3 else 0)
        return df
    
    

In [2]:
class QuantileDataPreprocess:
    """Builds standardized mains/appliance arrays and sliding-window quantile filters."""

    def __init__(self, config):
        # Only `config.data_path` is read by this class (prefix of the input CSV path).
        self.config = config

    @staticmethod
    def _standardize(values, mean, std):
        """Return ``(values - mean) / std``, dividing by 1 when ``std`` is unusable."""
        # A constant column has std == 0 and a one-row column has std == NaN;
        # dividing by either would fill the saved array with NaN/inf.
        scale = std if np.isfinite(std) and std > 0 else 1.0
        return (values - mean) / scale

    def pre_process_data_for_unetnilm(self, dataport_appliance_data, dataset_name, columns, data_type="training",
                                      output_dir="Dataset/pecan_street/austin/npy_dataset"):
        """Standardize mains and appliance power and save them as ``.npy`` files.

        Reads ``<config.data_path><dataset_name>_compressed.csv``, z-scores the
        ``total_usage`` column and every appliance column named by
        ``dataport_appliance_data``, prints each appliance's summary
        statistics, and writes two arrays into ``output_dir``:

        * ``<dataset_name>_input_with_ev_unet.npy``  -- standardized mains, shape ``(n_rows,)``
        * ``<dataset_name>_target_with_ev_unet.npy`` -- standardized appliances,
          shape ``(n_rows, n_appliances)`` in the key order of ``dataport_appliance_data``

        Args:
            dataport_appliance_data: Mapping whose keys are appliance column
                names in the CSV. The values are not used here.
            dataset_name: File-name stem of the input CSV and of the outputs.
            columns: Unused; kept for interface compatibility.
            data_type: Unused; kept for interface compatibility.
            output_dir: Existing directory that receives the ``.npy`` files.

        Raises:
            ValueError: If ``dataport_appliance_data`` names no appliances.
        """
        data = pd.read_csv(self.config.data_path + dataset_name + '_compressed.csv')

        appliances = list(dataport_appliance_data.keys())
        if not appliances:
            raise ValueError("dataport_appliance_data must name at least one appliance column")

        targets = []
        for app in appliances:
            app_series = data[app]
            app_mean = app_series.mean()
            app_std = app_series.std()
            app_min = app_series.min()
            app_max = app_series.max()
            print(app, app_mean, app_std, app_min, app_max)
            # z-score scaling (min-max scaling and quantile filtering were tried and disabled).
            targets.append(self._standardize(app_series.values, app_mean, app_std))

        mains = self._standardize(data.total_usage.values, data.total_usage.mean(), data.total_usage.std())

        # One column per appliance.
        targets = np.stack(targets).T

        np.save(f"{output_dir}/{dataset_name}_input_with_ev_unet", mains)
        np.save(f"{output_dir}/{dataset_name}_target_with_ev_unet", targets)

    def get_percentile(self, data, p=50):
        """Return the ``p``-th percentile of every row of ``data``.

        Arguments:
            data -- 2-D array-like; the percentile is taken along axis 1.
            p -- percentile in [0, 100] (default 50, i.e. the median).

        Returns:
            numpy.ndarray with one value per row, chosen with the "nearest"
            method so each result is an element of its row.
        """
        return np.percentile(data, p, axis=1, method="nearest")

    def generate_sequences(self, sequence_length, data):
        """Return every full sliding window over ``data`` (stride 1, no padding).

        An even ``sequence_length`` is reduced by one so windows have an odd
        length.

        Args:
            sequence_length: Requested window length.
            data: Array-like, windowed along its first axis.

        Returns:
            numpy.ndarray of shape ``(len(data) - window + 1, window, ...)``;
            the first dimension is 0 when ``data`` is shorter than the window.

        Raises:
            ValueError: If the effective window length is smaller than 1.
        """
        window = sequence_length - 1 if sequence_length % 2 == 0 else sequence_length
        if window < 1:
            raise ValueError("sequence_length must be at least 1")

        series = np.asarray(data)
        n_windows = len(series) - window + 1
        if n_windows <= 0:
            # Keep the window axis so axis=1 reductions downstream still work.
            return np.empty((0, window) + series.shape[1:], dtype=series.dtype)
        return np.array([series[start:start + window] for start in range(n_windows)])

    def quantile_filter(self, sequence_length, data, p=50):
        """Apply a sliding-window percentile filter to ``data``.

        Args:
            sequence_length: Window length (even values are reduced by one).
            data: Array-like of samples.
            p: Percentile taken within each window (default 50).

        Returns:
            numpy.ndarray with one value per window; because no padding is
            applied it is ``window - 1`` samples shorter than ``data``.
        """
        windows = self.generate_sequences(sequence_length, data)
        return self.get_percentile(windows, p)

In [3]:
# Requires the imports cell (which defines `pd`) to have been run in this kernel first.
house_661_csv = 'Dataset/pecan_street/austin/house_energy_compressed/661_compressed.csv'
data = pd.read_csv(house_661_csv)

NameError: name 'pd' is not defined