# In [88]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from sklearn.preprocessing import MinMaxScaler
import json
from types import SimpleNamespace
import sys
from torch.utils.data import DataLoader, TensorDataset, Dataset

class CustomDataset(Dataset):
    """Pair each entry of ``data`` with the entry of ``targets`` at the same index."""

    def __init__(self, data, targets):
        """Keep references to the indexable ``data`` and ``targets`` containers."""
        self.data = data
        self.targets = targets

    def __len__(self):
        """Return the number of entries in ``data``."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the ``(sample, target)`` tuple found at position ``idx``."""
        return self.data[idx], self.targets[idx]

class PrepareData:
    """Load per-house CSV files and turn them into sliding-window tensors.

    The ``config`` object is read for the attributes ``data_path``,
    ``device``, ``batch_size``, ``lag_size``, ``train``, ``test`` and
    ``split_date``.
    """

    def __init__(self, config):
        self.config = config

    def load_data(self, num_of_houses):
        """Read one CSV per house id and stack them row-wise.

        Each file is looked up at
        ``config.data_path + str(house_id) + '_compressed.csv'``.

        Args:
            num_of_houses: Sequence of house ids to load.

        Returns:
            A single DataFrame with the rows of every file, in the order the
            ids were given. Each file's original row index is preserved.

        Raises:
            ValueError: If ``num_of_houses`` is empty.
        """
        if len(num_of_houses) == 0:
            raise ValueError("num_of_houses must contain at least one house id")

        # Read everything first and concatenate once; concatenating inside
        # the loop re-copies the accumulated frame on every iteration.
        frames = [
            pd.read_csv(f"{self.config.data_path}{house_id}_compressed.csv")
            for house_id in num_of_houses
        ]
        return pd.concat(frames, axis=0)

    def normalize_data(self, data):
        """Convert ``data`` to a NumPy array and return it with a scaler.

        The fit/transform step is currently disabled, so the values are
        returned unscaled and the ``MinMaxScaler`` is returned unfitted.

        Args:
            data: DataFrame holding the columns to convert.

        Returns:
            Tuple ``(array, scaler)``.
        """
        scaler = MinMaxScaler()
        data = data.to_numpy()
        return data, scaler

    def segment_ev_load_data(self, data, window_length, num_of_houses):
        """Cut each house's rows into overlapping windows with stride 1.

        Column 0 of ``data`` is the house id, column 1 is the ground truth
        and the remaining columns are the input features.

        Args:
            data: 2-D NumPy array laid out as described above.
            window_length: Number of consecutive rows in each window.
            num_of_houses: House ids to segment; windows never span houses.

        Returns:
            Tuple ``(segmented_data, ground_truth)`` of tensors on
            ``config.device`` with shapes ``(n, window_length, n_features)``
            and ``(n, window_length, 1)``. ``n`` is the window count rounded
            down to a multiple of ``config.batch_size``.
        """
        # NOTE: house ids are compared after the float32 cast below, so ids
        # larger than 2**24 may not compare exactly.
        data = torch.from_numpy(data).to(self.config.device).float()
        segmented_data = []
        ground_truth = []

        for house_id in num_of_houses:
            per_house_data = data[data[:, 0] == house_id]
            # A house with fewer rows than window_length yields no windows
            # (range() of a non-positive number is empty).
            num_segments = len(per_house_data) - window_length + 1
            for start in range(num_segments):
                window = per_house_data[start:start + window_length]
                segmented_data.append(window[:, 2:])
                ground_truth.append(window[:, 1:2])

        segmented_data = torch.stack(segmented_data)
        ground_truth = torch.stack(ground_truth)

        # Drop the trailing partial batch. Slice to an explicit length: the
        # negative form ``[:-remainder]`` becomes ``[:0]`` when the remainder
        # is zero and would throw away every window.
        total = segmented_data.shape[0]
        usable = total - total % self.config.batch_size
        return segmented_data[:usable], ground_truth[:usable]

    def _build_split(self, data_preprocess, columns, house_ids):
        """Load, preprocess and window the data for one list of house ids."""
        data = self.load_data(house_ids)
        data = data_preprocess.preprocess_data(data)
        id_col = data['dataid'].to_numpy().reshape(-1, 1)

        values, scaler = self.normalize_data(data[columns])
        # Prepend the house id so segmentation can group rows per house.
        values = np.concatenate((id_col, values), axis=1)
        windows, ground_truth = self.segment_ev_load_data(
            values, self.config.lag_size, house_ids
        )
        return windows, ground_truth, scaler

    def get_data_loader(self, data_preprocess, columns):
        """Build windowed train and test tensors from the configured houses.

        Training houses come from ``config.train`` and test houses from
        ``config.test``. The tensors are returned directly; they are not
        wrapped in ``DataLoader`` objects.

        Args:
            data_preprocess: Object exposing ``preprocess_data(DataFrame)``;
                its result must contain a ``'dataid'`` column.
            columns: Column names to keep. The first one becomes the ground
                truth and the remaining ones become the input features.

        Returns:
            Tuple ``(train_data, train_ground_truth, test_data,
            test_ground_truth, scaler_train, scaler_test)``.
        """
        train_data, train_ground_truth, scaler_train = self._build_split(
            data_preprocess, columns, self.config.train
        )
        test_data, test_ground_truth, scaler_test = self._build_split(
            data_preprocess, columns, self.config.test
        )
        return (
            train_data,
            train_ground_truth,
            test_data,
            test_ground_truth,
            scaler_train,
            scaler_test,
        )

    def split_train_test(self, data):
        """Split ``data`` on its ``'local_15min'`` column at ``config.split_date``.

        Returns:
            Tuple ``(train_data, test_data)``: rows strictly before the split
            date, and rows at or after it.
        """
        is_train = data['local_15min'] < self.config.split_date
        train_data = data[is_train]
        test_data = data[data['local_15min'] >= self.config.split_date]
        return train_data, test_data