In [2]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import matplotlib.pyplot as mlt
import seaborn as sp
from torch.autograd import Variable
from torch import autograd
from datetime import datetime
import matplotlib.pyplot as plt
import argparse
from datetime import timedelta

In [11]:
class PreProcessDataset(nn.Module):
    """Turn raw charging-session records into an hourly energy time series.

    The pipeline (see ``forward``) reads a CSV of sessions, drops ``Energy``
    outliers, keeps sessions that start on or before a split date, spreads
    each session's energy over the clock hours it covers, sums the result per
    hour and writes a zero-filled hourly series to disk.

    NOTE(review): the class subclasses ``nn.Module`` although nothing in it
    registers parameters or sub-modules. The base class is kept so existing
    callers that invoke the instance (``obj()`` dispatches to ``forward``)
    keep working.
    """

    def __init__(self, raw_data_file_path, dataset_type, dataset_parameters):
        """Store the pipeline configuration.

        Args:
            raw_data_file_path: Path of the raw CSV, passed to ``pd.read_csv``.
            dataset_type: Label used as the prefix of the output file name.
            dataset_parameters: Object exposing ``split_date`` and
                ``resample_frequency`` attributes (both read in ``forward``).
        """
        super(PreProcessDataset, self).__init__()
        self.dataset_type = dataset_type
        self.raw_data_file_path = raw_data_file_path
        self.dataset_parameters = dataset_parameters

    def remove_outliers(self, df, column_name='Energy', iqr_multiplier=1.5):
        """Drop rows whose ``column_name`` value lies outside the IQR fences.

        Args:
            df: Input frame; it is not modified.
            column_name: Column the fences are computed on.
            iqr_multiplier: Fence width as a multiple of the inter-quartile
                range.

        Returns:
            A new DataFrame holding the rows with
            ``Q1 - k*IQR <= value <= Q3 + k*IQR`` (rows with NaN are dropped).
        """
        values = df[column_name]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1

        lower_bound = q1 - iqr_multiplier * iqr
        upper_bound = q3 + iqr_multiplier * iqr

        # ``between`` is inclusive on both ends, like the >= / <= pair it
        # replaces. ``copy`` hands callers an independent frame so later
        # column assignments do not raise SettingWithCopyWarning.
        return df.loc[values.between(lower_bound, upper_bound)].copy()

    def build_dataset(self, df):
        """Parse the ``Start`` and ``End`` columns into pandas datetimes.

        The conversion is written back into ``df`` itself, which is also
        returned.
        """
        for column in ('Start', 'End'):
            df[column] = pd.to_datetime(df[column])
        return df

    def _sessions_until(self, df, split_date):
        """Return a copy of the rows whose parsed ``Start`` is <= ``split_date``."""
        cutoff = pd.to_datetime(split_date)
        # A tz-aware column cannot be compared with a naive cut-off; interpret
        # a naive cut-off in the column's own time zone.
        start_tz = getattr(df['Start'].dtype, 'tz', None)
        if start_tz is not None and cutoff.tzinfo is None:
            cutoff = cutoff.tz_localize(start_tz)
        return df.loc[df['Start'] <= cutoff].copy()

    def resample_dataset(self, df, frequency):
        """Spread every session's energy over the clock hours it spans.

        Energy is assumed to be delivered at a constant rate of
        ``Energy / duration``, where the duration comes from
        ``'Charge.Duration'`` when that column exists and from
        ``'Park.Duration'`` otherwise. Durations are treated as minutes (hour
        gaps are converted with ``total_seconds() / 60``) -- TODO confirm
        against the raw data. Whatever duration is left when the last full
        hour has been emitted is booked on the final hour that was reached.

        Args:
            df: Sessions with datetime ``Start``/``End`` plus ``Energy``,
                ``Day`` and one of the duration columns.
            frequency: Accepted for interface compatibility; currently unused,
                the bucket width is always one hour.

        Returns:
            DataFrame with columns ``Start``, ``End``, ``Day``, ``Energy`` and
            ``Time Duration`` (the charging minutes attributed to the row), one
            row per (session, hour) pair.
        """
        columns = ['Start', 'End', 'Day', 'Energy', 'Time Duration']
        # Which duration column exists is a property of the frame, not of a row.
        if 'Charge.Duration' in df.columns:
            duration_column = 'Charge.Duration'
        else:
            duration_column = 'Park.Duration'
        one_hour = timedelta(hours=1)

        hourly_data = []
        sessions = zip(df['Start'], df['End'], df['Energy'], df[duration_column], df['Day'])
        for plugin_time, plugout_time, total_energy, remaining_minutes, day in sessions:
            # Zero, negative or missing durations carry no usable rate (and a
            # zero would divide by zero below), so such sessions are skipped.
            if not remaining_minutes > 0:
                continue
            energy_per_minute = total_energy / remaining_minutes

            # Initialise the bucket from this session so that a session whose
            # loop never runs (e.g. Start >= End) is booked on its own plug-in
            # hour instead of on a bucket left over from a previous session.
            start_time = plugin_time.replace(minute=0, second=0, microsecond=0)
            next_start_time = start_time + one_hour

            while plugin_time < plugout_time and remaining_minutes > 0:
                # Round down to the hour and look one hour ahead.
                start_time = plugin_time.replace(minute=0, second=0, microsecond=0)
                next_start_time = start_time + one_hour

                # The session ends inside this hour: handled after the loop.
                if next_start_time > plugout_time:
                    break

                minutes_in_hour = (next_start_time - plugin_time).total_seconds() / 60
                minutes_in_hour = min(minutes_in_hour, remaining_minutes)
                remaining_minutes = remaining_minutes - minutes_in_hour

                hourly_data.append({
                    'Start': start_time,
                    'End': next_start_time,
                    'Day': day,
                    'Energy': minutes_in_hour * energy_per_minute,
                    'Time Duration': minutes_in_hour,
                })

                plugin_time = next_start_time

            if remaining_minutes <= 0:
                continue

            # Book the remainder on the last hour reached. 'Time Duration' is
            # the charging time attributed to the row, so that
            # Energy == Time Duration * rate holds for every row.
            hourly_data.append({
                'Start': start_time,
                'End': next_start_time,
                'Day': day,
                'Energy': remaining_minutes * energy_per_minute,
                'Time Duration': remaining_minutes,
            })

        # Passing the columns keeps the schema stable even with no rows.
        hourly_df = pd.DataFrame(hourly_data, columns=columns)
        return hourly_df

    def aggregate_dataset(self, df, frequency):
        """Sum energy per hour and add calendar features on a gap-free grid.

        Args:
            df: Output of ``resample_dataset``.
            frequency: Accepted for interface compatibility; currently unused,
                the grid is always hourly.

        Returns:
            DataFrame with columns ``Start``, ``Day of week`` (Monday=1 ..
            Sunday=7, int32), ``Energy`` (NaN for hours without any session),
            ``Week Day`` (1 for Monday-Friday, else 0), ``Year``, ``Month`` and
            ``Day of month``.
        """
        hourly = (
            df.groupby('Start')['Energy']
            .sum()
            # A Timedelta rule avoids the deprecated 'H' frequency alias.
            .resample(pd.Timedelta(hours=1))
            .asfreq()
            .reset_index()
        )

        start = hourly['Start']
        day_of_week = (start.dt.dayofweek + 1).astype('int32')

        return pd.DataFrame({
            'Start': start,
            'Day of week': day_of_week,
            'Energy': hourly['Energy'],
            'Week Day': (day_of_week <= 5).astype(int),
            'Year': start.dt.year,
            'Month': start.dt.month,
            'Day of month': start.dt.day,
        })

    def interpolate_data(self, df, type=None):
        """Fill the missing ``Energy`` values of ``df`` in place.

        Args:
            df: Frame with ``Energy`` and datetime ``Start`` columns; its
                ``Energy`` column is overwritten.
            type: ``None`` replaces NaN with 0, ``'linear'`` interpolates
                linearly, any other value replaces NaN with 0 and then turns
                ``Energy`` into a running total that restarts every calendar
                day. (The name shadows the builtin but is part of the existing
                signature.)

        Returns:
            The same ``df`` object.
        """
        if type is None:
            df['Energy'] = df['Energy'].fillna(0)
        elif type == 'linear':
            # No ``order`` argument: it only applies to polynomial/spline
            # interpolation and has no effect on the linear method.
            df['Energy'] = df['Energy'].interpolate(method='linear')
        else:
            df['Energy'] = df['Energy'].fillna(0)
            df['Energy'] = df.groupby(df['Start'].dt.date)['Energy'].cumsum()
        return df

    def create_different_dataset(self, df_with_zero, dataset_type, output_dir='../Dataset/Processed/'):
        """Write ``df_with_zero`` to ``<output_dir>/<dataset_type>_data_with_zero.csv``.

        The directory is created when missing so a fresh checkout does not
        fail on the write.
        """
        import os  # local import: keeps this block independent of the import cell

        os.makedirs(output_dir, exist_ok=True)
        target = os.path.join(output_dir, dataset_type + '_data_with_zero.csv')
        df_with_zero.to_csv(target, index=False)

    def forward(self):
        """Run the pipeline and write the zero-filled hourly dataset to disk."""
        df = pd.read_csv(self.raw_data_file_path)

        df = self.remove_outliers(df)

        # Parse the timestamps before applying the cut-off so the comparison
        # is chronological rather than a comparison of raw CSV strings.
        df = self.build_dataset(df)
        df = self._sessions_until(df, self.dataset_parameters.split_date)

        df = self.resample_dataset(df, self.dataset_parameters.resample_frequency)
        df = self.aggregate_dataset(df, self.dataset_parameters.resample_frequency)

        df_with_zero = self.interpolate_data(df.copy())
        self.create_different_dataset(df_with_zero, self.dataset_type)