In [4]:
# Load packages and directories


import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

import os
import pandas as pd
import plotly.graph_objects as go
import plotly.offline as pyo
import plotly.subplots as sp
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import plotly.express as px
import seaborn as sns
import importlib

import preparing_for_bill as pfb

import re
import pickle 
from concurrent.futures import ProcessPoolExecutor
# Root of the EDP data-cleaning project. The original machine-specific path is
# kept as the default; set EDP_BASE_DIRECTORY to run the notebook elsewhere.
base_directory = os.environ.get(
    "EDP_BASE_DIRECTORY",
    "C:/Users/CEEM04/OneDrive-Emily/OneDrive - UNSW/edpdatacleaning",
)

# Raw monthly extracts (edp_data_YYYY_MM.csv)
data_directory = os.path.join(base_directory, 'edp_data')
cleaned_data_directory = os.path.join(base_directory, 'resampled_30min_data')

# Survey table (edp_survey_aircon.csv)
survey_data = pd.read_csv(os.path.join(base_directory, 'survey/edp_survey_aircon.csv'))

# Where figures are written
output_directory = os.path.join(base_directory, 'figures')  # Change this to your output directory

# 30-minute resampled site profiles (defined once; it used to be assigned twice)
resampled_data_directory = os.path.join(base_directory, 'resampled_30min_data_real')

# Per-site sub-directories created by process_site
filtered_data_directory = os.path.join(base_directory, 'archive/filtered_data_real')

# Directory to save the full-resolution site profiles (<site_id>_profile.csv)
site_profiles_directory = os.path.join(base_directory, 'full_profiles_real')

In [7]:
# Load one month of raw data and list every circuit label that occurs in it.
edp_2023_01 = pd.read_csv(
    os.path.join(data_directory, "edp_data_2023_01.csv")
)
unique_circuit_labels = pd.unique(edp_2023_01["circuit_label"])
print(unique_circuit_labels)

['ac_load_net' 'pv_site_net' 'load_other' 'load_air_conditioner'
 'load_stove' 'load_oven' 'load_hot_water' 'load_lighting' 'pv_site'
 'load_powerpoint' 'load_pool' 'load_laundry' 'load_subboard' 'load_spa'
 'load_garage' 'ac_load' 'load_refrigerator' 'load_ev_charger'
 'load_hot_water_solar' 'battery_storage' 'load_tenant' 'load_shed'
 'load_office' 'load_kitchen' 'load_battery' 'load_studio'
 'load_air_compressor' 'pv_site_net_battery']


In [12]:
# Site IDs to process, read from a pickle in the current working directory.
# NOTE(review): pickle.load executes arbitrary code from the file; only load
# a pickle that was produced by this project.
with open("testing_site_ids.pkl", mode="rb") as file:
    testing_site_ids = pickle.load(file)

In [40]:
# Pick up any edits made to preparing_for_bill.py since it was imported.
importlib.reload(pfb)

# Sites to (re)build profiles for. Each ID appears once: a repeated ID makes
# process_site re-scan every raw CSV only to overwrite the same profile file.
to_process_ids = ["W0082", "W0241", "W0321", "W0329", "W0326"]
def process_site(site_id, data_directory, site_filtered_data_directory, site_profiles_directory):
    """
    Build one site's profile from every raw CSV in ``data_directory``.

    Each ``*.csv`` file is passed to ``pfb.process_file(filepath, site_id)`` in
    a worker process. The non-``None`` results are concatenated, sorted by
    ``datetime``, renamed to the profile column names and written to
    ``<site_profiles_directory>/<site_id>_profile.csv``.

    Parameters
    ----------
    site_id : str
        Site identifier; forwarded to ``pfb.process_file`` and used in the
        output file name.
    data_directory : str
        Directory scanned (non-recursively) for ``*.csv`` files.
    site_filtered_data_directory : str
        Created if missing. This function writes nothing into it.
    site_profiles_directory : str
        Destination directory for the profile CSV; created if missing.

    Returns
    -------
    None
        Prints a message and returns early when no file yields data.
    """
    # Ensure both target directories exist before any work is done
    os.makedirs(site_filtered_data_directory, exist_ok=True)
    os.makedirs(site_profiles_directory, exist_ok=True)

    # Sorted so that the processing order does not depend on os.listdir
    filepaths = sorted(
        os.path.join(data_directory, filename)
        for filename in os.listdir(data_directory)
        if filename.endswith('.csv')
    )

    # Per-file results (presumably DataFrames -- whatever pfb.process_file returns)
    dfs = []

    # Only start worker processes when there is something to process
    if filepaths:
        with ProcessPoolExecutor(max_workers=2) as executor:
            futures = [executor.submit(pfb.process_file, filepath, site_id) for filepath in filepaths]

            # Results are gathered in submission order; .result() blocks until
            # that file is done and re-raises any exception from the worker.
            for future in futures:
                result = future.result()
                if result is not None:
                    dfs.append(result)

    # If no data was collected, skip to next site
    if not dfs:
        print(f"No matching data for site {site_id}")
        return

    # Concatenate all dataframes and sort by datetime
    stacked_df = pd.concat(dfs, ignore_index=True)
    stacked_df['datetime'] = pd.to_datetime(stacked_df['datetime'])

    # Rename columns to the desired format
    stacked_df = stacked_df.sort_values(by='datetime').rename(columns={
        'edp_site_id': 'CUSTOMER_ID',
        'datetime': 'TS',
        'pv_site_net': 'PV',
        'ac_load_net': 'kWh',
        'load_air_conditioner': 'Air_Conditioner_Load',
    })

    # Save the final dataframe to a CSV file
    output_filename = os.path.join(site_profiles_directory, f'{site_id}_profile.csv')
    stacked_df.to_csv(output_filename, index=False)
    print(f"Processed data for site {site_id} saved to {output_filename}")



# Build a profile for every requested site; each site gets its own
# sub-directory under filtered_data_directory.
for site_id in to_process_ids:
    site_filtered_data_directory = os.path.join(filtered_data_directory, site_id)
    process_site(
        site_id=site_id,
        data_directory=data_directory,
        site_filtered_data_directory=site_filtered_data_directory,
        site_profiles_directory=site_profiles_directory,
    )

KeyboardInterrupt: 

In [7]:




def process_filtered_data_2(filtered_sample):
    """
    Pivot long-format readings into one row per (datetime, site).

    ``real_energy`` is summed within each
    ('datetime', 'edp_site_id', 'circuit_label') group, then the circuit
    labels are spread out into columns. Returns ``None`` for an empty input;
    otherwise a frame whose first two columns are 'datetime' and
    'edp_site_id', followed by one column per circuit label (NaN where a
    label has no reading for that row).
    """
    if filtered_sample.empty:
        return None

    group_keys = ['datetime', 'edp_site_id', 'circuit_label']
    energy_totals = filtered_sample.groupby(group_keys)['real_energy'].sum()

    # One column per circuit label; the two remaining index levels are turned
    # back into ordinary columns.
    wide = energy_totals.unstack('circuit_label')
    return wide.reset_index()


def process_site_2(site_ids, full_data, site_profiles_directory):
    """
    Write one profile CSV per site from an already-loaded long-format table.

    For every ID in ``site_ids`` the rows of ``full_data`` belonging to that
    site and to one of the relevant circuit labels are aggregated with
    ``process_filtered_data_2``, sorted by ``datetime``, renamed to the profile
    column names and saved as ``<site_profiles_directory>/<site_id>_profile.csv``.
    Sites without matching rows are reported and skipped.

    Parameters
    ----------
    site_ids : iterable of str
        Site identifiers to process, in order.
    full_data : pandas.DataFrame
        Must contain the columns 'edp_site_id', 'datetime', 'circuit_label'
        and 'real_energy'.
    site_profiles_directory : str
        Destination directory; created if missing.

    Returns
    -------
    None
    """
    relevant_labels = [
        'ac_load_net', 'pv_site_net', 'ac_load', 'load_air_conditioner',
        'pv_site', 'pv_site_net_battery', 'battery_storage', 'load_battery'
    ]

    os.makedirs(site_profiles_directory, exist_ok=True)

    # The label test does not depend on the site, so evaluate it once over the
    # whole table instead of once per site.
    label_mask = full_data['circuit_label'].isin(relevant_labels)

    for site_id in site_ids:
        # Filter for the specific site ID and relevant circuit labels
        print("processing: ", site_id)
        filtered_data = full_data[label_mask & (full_data['edp_site_id'] == site_id)]

        if filtered_data.empty:
            print(f"No matching data for site {site_id}")
            continue  # Skip this site and proceed to the next one

        # Aggregate to one row per timestamp, one column per circuit label
        processed_data = process_filtered_data_2(filtered_data)

        # If no data was returned, skip to next site
        if processed_data is None or processed_data.empty:
            print(f"No processed data for site {site_id}")
            continue

        # Ensure datetime is in the correct format, sort, and rename columns
        # to the desired format
        processed_data['datetime'] = pd.to_datetime(processed_data['datetime'])
        processed_data = processed_data.sort_values(by='datetime').rename(columns={
            'edp_site_id': 'CUSTOMER_ID',
            'datetime': 'TS',
            'pv_site_net': 'PV',
            'ac_load_net': 'kWh',
            'load_air_conditioner': 'Air_Conditioner_Load'
        })

        # Save the final dataframe to a CSV file
        output_filename = os.path.join(site_profiles_directory, f'{site_id}_profile.csv')
        processed_data.to_csv(output_filename, index=False)
        print(f"Processed data for site {site_id} saved to {output_filename}")


In [9]:


# Define directories
data_directory = os.path.join(base_directory, "edp_data")
site_profiles_directory = os.path.join(base_directory, "full_profiles_real")

# Only these columns are needed downstream
raw_columns = ['edp_site_id', 'datetime', 'circuit_label', 'real_energy']

# Get list of CSV files (sorted so the load order is the same on every run)
filepaths = sorted(
    os.path.join(data_directory, filename)
    for filename in os.listdir(data_directory)
    if filename.endswith('.csv')
)

# Process files one by one with optimized memory usage
dfs = []
for filepath in filepaths:
    try:
        print("reading ", filepath)
        # Read only required columns and convert `real_energy` to float32.
        # NOTE(review): on_bad_lines='skip' silently drops malformed rows.
        df = pd.read_csv(filepath, usecols=raw_columns,
                         dtype={'real_energy': 'float32'},
                         engine='python', on_bad_lines='skip')

        dfs.append(df)
    except Exception as e:
        # Best effort: report the file and carry on with the rest
        print(f"Error processing {filepath}: {e}")


# Concatenate all processed DataFrames
if dfs:
    full_data = pd.concat(dfs, ignore_index=True)
    # Report the real count: files that failed above are not part of full_data
    print(f"Loaded {len(dfs)} of {len(filepaths)} files into a single DataFrame.")
else:
    # Keep `full_data` defined so later cells report "no matching data"
    # instead of failing with a NameError.
    full_data = pd.DataFrame(columns=raw_columns)
    print("No valid data to process.")



reading  C:/Users/CEEM04/OneDrive-Emily/OneDrive - UNSW/edpdatacleaning\edp_data\edp_data_2023_01.csv
reading  C:/Users/CEEM04/OneDrive-Emily/OneDrive - UNSW/edpdatacleaning\edp_data\edp_data_2023_02.csv
reading  C:/Users/CEEM04/OneDrive-Emily/OneDrive - UNSW/edpdatacleaning\edp_data\edp_data_2023_03.csv
reading  C:/Users/CEEM04/OneDrive-Emily/OneDrive - UNSW/edpdatacleaning\edp_data\edp_data_2023_04.csv
reading  C:/Users/CEEM04/OneDrive-Emily/OneDrive - UNSW/edpdatacleaning\edp_data\edp_data_2023_05.csv
reading  C:/Users/CEEM04/OneDrive-Emily/OneDrive - UNSW/edpdatacleaning\edp_data\edp_data_2023_06.csv
reading  C:/Users/CEEM04/OneDrive-Emily/OneDrive - UNSW/edpdatacleaning\edp_data\edp_data_2023_07.csv
reading  C:/Users/CEEM04/OneDrive-Emily/OneDrive - UNSW/edpdatacleaning\edp_data\edp_data_2023_08.csv
reading  C:/Users/CEEM04/OneDrive-Emily/OneDrive - UNSW/edpdatacleaning\edp_data\edp_data_2023_09.csv
reading  C:/Users/CEEM04/OneDrive-Emily/OneDrive - UNSW/edpdatacleaning\edp_data\e

In [11]:

# Write one full-resolution profile CSV per test site.
process_site_2(
    site_ids=testing_site_ids,
    full_data=full_data,
    site_profiles_directory=site_profiles_directory,
)

processing:  S0576
Processed data for site S0576 saved to C:/Users/CEEM04/OneDrive-Emily/OneDrive - UNSW/edpdatacleaning\full_profiles_real\S0576_profile.csv
processing:  W0236
Processed data for site W0236 saved to C:/Users/CEEM04/OneDrive-Emily/OneDrive - UNSW/edpdatacleaning\full_profiles_real\W0236_profile.csv
processing:  S0177
Processed data for site S0177 saved to C:/Users/CEEM04/OneDrive-Emily/OneDrive - UNSW/edpdatacleaning\full_profiles_real\S0177_profile.csv
processing:  W0087
Processed data for site W0087 saved to C:/Users/CEEM04/OneDrive-Emily/OneDrive - UNSW/edpdatacleaning\full_profiles_real\W0087_profile.csv
processing:  W0058
Processed data for site W0058 saved to C:/Users/CEEM04/OneDrive-Emily/OneDrive - UNSW/edpdatacleaning\full_profiles_real\W0058_profile.csv
processing:  W0334
Processed data for site W0334 saved to C:/Users/CEEM04/OneDrive-Emily/OneDrive - UNSW/edpdatacleaning\full_profiles_real\W0334_profile.csv
processing:  W0031
Processed data for site W0031 sav

In [13]:


def process_site_profiles(
    testing_site_ids,
    input_directory='full_profiles_real',
    output_directory='resampled_30min_data_real',
    start_date='2018-01-01',
    end_date='2023-12-31',
    base_dir=None
):
    """
    Resample each site's profile to 30-minute intervals and rescale it.

    For every site, ``<base>/<input_directory>/<site_id>_profile.csv`` is read,
    restricted to ``start_date``..``end_date``, duplicate timestamps are
    averaged, values are summed per 30-minute bin, multiplied by 2 and divided
    by 1000, and the result is written to
    ``<base>/<output_directory>/<site_id>_real_profile.csv``.

    Parameters:
    - testing_site_ids: List of site IDs to process
    - input_directory: Directory (relative to the base) containing input CSV files
    - output_directory: Directory (relative to the base) for output CSV files;
      created if it does not exist
    - start_date: Start date for filtering data (YYYY-MM-DD)
    - end_date: End date for filtering data (YYYY-MM-DD)
    - base_dir: Root directory for both sub-directories. Defaults to the
      notebook-level ``base_directory`` when omitted.

    Returns None. Per-site failures (missing file, empty file, missing
    required columns, anything else) are printed and do not stop the loop.
    """
    # Define column requirements
    required_columns = {'TS', 'kWh', 'Air_Conditioner_Load'}
    optional_columns = {'PV', 'ac_load', 'pv_site', 'pv_site_net_battery', 'battery_storage', 'load_battery'}

    # Every value column is read as float32
    dtypes = {col: 'float32' for col in (required_columns | optional_columns) - {'TS'}}

    root = base_directory if base_dir is None else base_dir

    # Ensure output directory exists
    output_root = os.path.join(root, output_directory)
    os.makedirs(output_root, exist_ok=True)

    for site_id in testing_site_ids:
        try:
            # Construct file paths
            input_file = os.path.join(root, input_directory, f'{site_id}_profile.csv')
            output_file = os.path.join(output_root, f'{site_id}_real_profile.csv')

            # Read header first to check available columns
            available_columns = pd.read_csv(input_file, nrows=0).columns

            # Verify required columns exist
            missing_required = required_columns - set(available_columns)
            if missing_required:
                raise ValueError(f"Missing required columns: {missing_required}")

            # Required columns plus whichever optional ones this file has
            usable_columns = [
                col for col in available_columns
                if col in required_columns or col in optional_columns
            ]

            # Read CSV with optimized settings
            df = pd.read_csv(
                input_file,
                parse_dates=['TS'],
                index_col='TS',
                dtype={col: dtypes[col] for col in usable_columns if col != 'TS'},
                usecols=usable_columns
            )

            # NOTE(review): x2 turns a per-half-hour sum into an hourly rate and
            # /1000 scales to kilo-units; the original comments described this
            # as W -> Wh -> kWh. Confirm the raw unit of `real_energy`.
            # NOTE(review): bins with no readings sum to 0 rather than NaN.
            processed_df = (df
                          .sort_index()                  # Label slicing needs a sorted index
                          .loc[start_date:end_date]      # Filter dates
                          .groupby(level=0).mean()       # Average duplicate timestamps
                          .resample('30min')             # '30T' is a deprecated alias
                          .sum(numeric_only=True)        # Sum values in each interval
                          .mul(2)
                          .div(1000)                     # Applied to every value column
                         )

            # Save the results
            processed_df.reset_index().to_csv(output_file, index=False)

            print(f"Successfully processed {site_id} with columns: {list(processed_df.columns)}")

        except FileNotFoundError:
            print(f"Error: Input file for site {site_id} not found")
        except pd.errors.EmptyDataError:
            print(f"Error: No data found in file for site {site_id}")
        except ValueError as ve:
            print(f"Error with site {site_id}: {str(ve)}")
        except Exception as e:
            print(f"Error processing site {site_id}: {str(e)}")


# Resample every test site's profile using the default directories and dates.
process_site_profiles(testing_site_ids=testing_site_ids)

Successfully processed S0576 with columns: ['kWh', 'Air_Conditioner_Load', 'PV']
Successfully processed W0236 with columns: ['kWh', 'Air_Conditioner_Load', 'PV']
Successfully processed S0177 with columns: ['kWh', 'Air_Conditioner_Load', 'PV']
Successfully processed W0087 with columns: ['kWh', 'Air_Conditioner_Load', 'PV']
Successfully processed W0058 with columns: ['kWh', 'Air_Conditioner_Load', 'PV']
Successfully processed W0334 with columns: ['kWh', 'Air_Conditioner_Load', 'PV']
Successfully processed W0031 with columns: ['kWh', 'Air_Conditioner_Load', 'PV']
Successfully processed W0142 with columns: ['kWh', 'Air_Conditioner_Load', 'PV']
Successfully processed S0209 with columns: ['kWh', 'Air_Conditioner_Load', 'PV']
Successfully processed W0328 with columns: ['kWh', 'Air_Conditioner_Load', 'PV']
Successfully processed S0463 with columns: ['kWh', 'Air_Conditioner_Load', 'PV']
Successfully processed W0241 with columns: ['kWh', 'Air_Conditioner_Load', 'PV']
Successfully processed S0299

After processing, create a new DataFrame with the bootstrapped gross profile.

In [20]:
# Reload the helper module so that recent edits to it take effect, then run
# its profile-processing step on the resampled 30-minute data.
importlib.reload(pfb)
pfb.process_consumption_profiles_2(
    base_directory,
    bootstrapped_file='bootstrapped_real_merged_df.csv',
    profiles_directory='resampled_30min_data_real',
)


Processing site: S0177
Processed data saved to: resampled_30min_data_real\S0177_processed_profile.csv

Processing site: S0179
Processed data saved to: resampled_30min_data_real\S0179_processed_profile.csv

Processing site: S0209
Found 5 PV values below standby consumption (-0.03kW)
Processed data saved to: resampled_30min_data_real\S0209_processed_profile.csv

Processing site: S0216
Processed data saved to: resampled_30min_data_real\S0216_processed_profile.csv

Processing site: S0227
Processed data saved to: resampled_30min_data_real\S0227_processed_profile.csv

Processing site: S0229
Processed data saved to: resampled_30min_data_real\S0229_processed_profile.csv

Processing site: S0249
Processed data saved to: resampled_30min_data_real\S0249_processed_profile.csv

Processing site: S0250
Processed data saved to: resampled_30min_data_real\S0250_processed_profile.csv

Processing site: S0299
Processed data saved to: resampled_30min_data_real\S0299_processed_profile.csv

Processing site: S

In [None]:
# The bare name `process_consumption_profiles` is never defined or imported in
# this notebook, so the call is routed through the `pfb` module.
# NOTE(review): assumes preparing_for_bill exposes this function alongside
# process_consumption_profiles_2, and that 'bootstrapped_real_merged.csv'
# (not the '..._df.csv' used in the previous cell) is intended -- confirm.
pfb.process_consumption_profiles(
    base_directory,
    profiles_directory='resampled_30min_data_real',
    bootstrapped_file='bootstrapped_real_merged.csv'
)

Figure out what `ac_load` and `pv_site` mean

`pv_site` is gross PV generation.
`ac_load` is gross electricity consumption.

In [27]:
# Which sites report an `ac_load` circuit in January 2023?
is_ac_load = edp_2023_01['circuit_label'] == 'ac_load'
raw_ac_load = edp_2023_01[is_ac_load]
print(pd.unique(raw_ac_load['edp_site_id']))

['S0195' 'S0241' 'S0156' 'S0162']


In [33]:
# List the circuit labels recorded for site S0195.
S0195 = edp_2023_01.loc[edp_2023_01["edp_site_id"].eq("S0195")]
print(pd.unique(S0195["circuit_label"]))


# Only S0162 and S0195 have ac_load, pv_site, and load_air_conditioner

['ac_load' 'pv_site' 'load_air_conditioner']


In [32]:


# `S0162` was never defined in the notebook; select the site explicitly so the
# cell runs on a fresh kernel.
S0162 = edp_2023_01[edp_2023_01["edp_site_id"] == "S0162"]

# Split out the two circuits being compared: pv_site and ac_load
S0162_pv = S0162[S0162["circuit_label"] == "pv_site"]
S0162_ac = S0162[S0162["circuit_label"] == "ac_load"]

# One trace per circuit, named after the circuit so the legend is readable
trace_pv = go.Scatter(x=S0162_pv['datetime'], y=S0162_pv['real_energy'], mode='lines', name='pv_site')
trace_ac = go.Scatter(x=S0162_ac['datetime'], y=S0162_ac['real_energy'], mode='lines', name='ac_load')

# Define layout
layout = go.Layout(
    title='S0162: pv_site and ac_load Over Time',
    xaxis=dict(title='Date'),
    yaxis=dict(title='Energy (W)'),
    legend=dict(x=0.01, y=0.99),
    plot_bgcolor='rgba(0,0,0,0)'
)

# Create figure object combining both traces
fig = go.Figure(data=[trace_pv, trace_ac], layout=layout)

# Display the figure
fig.show()

Next, look at the `pv_site_net_battery` circuit label.

In [15]:
# Which sites report a `pv_site_net_battery` circuit in January 2023?
pv_site_net_battery = edp_2023_01.loc[edp_2023_01['circuit_label'].eq('pv_site_net_battery')]
print(pd.unique(pv_site_net_battery['edp_site_id']))

['W0078' 'W0109' 'W0130' 'W0133' 'W0145' 'W0148' 'W0156' 'W0158' 'W0178'
 'W0192' 'W0200' 'W0203' 'W0213' 'W0217' 'W0227' 'W0228' 'W0235' 'W0247'
 'W0251' 'W0255' 'W0272' 'W0276' 'W0286' 'W0293' 'W0306' 'W0307' 'W0316'
 'W0318' 'W0321' 'W0352']


In [23]:
# The frame is named after the site it actually holds (it used to be called
# W0352 while containing W0148).
W0148 = edp_2023_01[edp_2023_01["edp_site_id"] == "W0148"]
print(W0148["circuit_label"].unique())

# Split out the two circuits being compared: pv_site_net_battery and ac_load_net
W0148_pv = W0148[W0148["circuit_label"] == "pv_site_net_battery"]
W0148_ac = W0148[W0148["circuit_label"] == "ac_load_net"]

# One trace per circuit, named after the circuit so the legend is readable
trace_pv_battery = go.Scatter(x=W0148_pv['datetime'], y=W0148_pv['real_energy'], mode='lines', name='pv_site_net_battery')
trace_load_net = go.Scatter(x=W0148_ac['datetime'], y=W0148_ac['real_energy'], mode='lines', name='ac_load_net')

# Define layout
layout = go.Layout(
    title='W0148: pv_site_net_battery and ac_load_net Over Time',
    xaxis=dict(title='Date'),
    yaxis=dict(title='Energy (W)'),
    legend=dict(x=0.01, y=0.99),
    plot_bgcolor='rgba(0,0,0,0)'
)

# Create figure object combining both traces
fig = go.Figure(data=[trace_pv_battery, trace_load_net], layout=layout)

# Display the figure
fig.show()

['ac_load_net' 'pv_site_net_battery' 'load_powerpoint'
 'load_air_conditioner' 'load_hot_water']


In [26]:
# The frame is named after the site it actually holds (it used to be called
# W0352 while containing W0219).
W0219 = edp_2023_01[edp_2023_01["edp_site_id"] == "W0219"]
print(W0219["circuit_label"].unique())

# Split out the three circuits being compared
W0219_pv = W0219[W0219["circuit_label"] == "pv_site_net"]
W0219_bat = W0219[W0219["circuit_label"] == "load_battery"]
W0219_ac = W0219[W0219["circuit_label"] == "ac_load_net"]

# Each trace takes x and y from the SAME frame. The battery trace previously
# paired ac_load_net timestamps with load_battery values, which misplaces the
# points whenever the two circuits do not share identical timestamps.
trace_pv = go.Scatter(x=W0219_pv['datetime'], y=W0219_pv['real_energy'], mode='lines', name='pv_site_net', line=dict(color='blue'))
trace_battery = go.Scatter(x=W0219_bat['datetime'], y=W0219_bat['real_energy'], mode='lines', name='load_battery', line=dict(color='red'))
trace_load_net = go.Scatter(x=W0219_ac['datetime'], y=W0219_ac['real_energy'], mode='lines', name='ac_load_net', line=dict(color='green'))

# Define layout
layout = go.Layout(
    title='W0219: pv_site_net, load_battery and ac_load_net Over Time',
    xaxis=dict(title='Date'),
    yaxis=dict(title='Energy (W)'),
    legend=dict(x=0.01, y=0.99),
    plot_bgcolor='rgba(0,0,0,0)'
)

# Create figure object combining the three traces
fig = go.Figure(data=[trace_pv, trace_battery, trace_load_net], layout=layout)

# Display the figure
fig.show()

['ac_load_net' 'pv_site_net' 'load_battery' 'load_ev_charger'
 'load_air_conditioner' 'load_powerpoint']
