In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

## Purpose

The purpose of this notebook is to process the load data published as part of the Australian Government's *Smart Grid, Smart City* programme, which was run in the city of Newcastle, NSW. Because the dataset is large, each customer's interval readings are reduced to an average daily profile: the mean of every reading taken at the same time of day.

The original data (a 7z-compressed CSV file) can be downloaded [here](http://datagovau.s3-ap-southeast-2.amazonaws.com/CDINTERVALREADINGALLNOQUOTES.csv.7z).


In [5]:
# Input file and output directory, relative to the notebook's working directory.
file_name = "CD_INTERVAL_READING_ALL_NO_QUOTES.csv"
data_path = Path(f"./in/{file_name}")
out_path = Path("./out")
out_path.mkdir(exist_ok=True, parents=True)

# Dtypes to enforce, keyed by the whitespace-stripped column name.
# The previous mapping named a non-existent "CUSTOMERID" column as float; the
# identifier was therefore loaded as int64, and that dtype is kept here.
expected_dtypes = {
    "GENERAL_SUPPLY_KWH": "float",
    "GROSS_GENERATION_KWH": "float",
    "CUSTOMER_ID": "int64",
}

# A dtype key that is not an exact header name has no effect, and the dtypes
# listing of the loaded frame shows several column names carrying a leading
# space. Read only the header row and map every expected name onto the
# spelling actually used in the file, so the mapping is really applied while
# the column names themselves stay untouched.
header = pd.read_csv(data_path.resolve(), nrows=0).columns
column_dtypes = {
    column: expected_dtypes[column.strip()]
    for column in header
    if column.strip() in expected_dtypes
}

# Fail loudly instead of silently loading with inferred dtypes.
unmatched = set(expected_dtypes) - {column.strip() for column in header}
if unmatched:
    raise KeyError(f"Columns missing from {data_path}: {sorted(unmatched)}")

# Load every column; READING_DATETIME is parsed to datetime64 for the
# time-of-day aggregation performed later in the notebook.
df = pd.read_csv(
    data_path.resolve(),
    parse_dates=["READING_DATETIME"],
    dtype=column_dtypes,
    engine="c",
)

In [6]:
# Show the dtype pandas assigned to each column, to check how the file was parsed.
df.dtypes

CUSTOMER_ID                       int64
READING_DATETIME         datetime64[ns]
 CALENDAR_KEY                     int64
 EVENT_KEY                        int64
 GENERAL_SUPPLY_KWH             float64
 CONTROLLED_LOAD_KWH            float64
 GROSS_GENERATION_KWH           float64
 NET_GENERATION_KWH             float64
 OTHER_KWH                        int64
dtype: object

In [7]:
def aggregate_by_hour(df):
    """Average every column of ``df`` over readings that share a time of day.

    Despite the name, rows are grouped by the exact clock time of
    ``READING_DATETIME`` (``.dt.time``), not by the hour, so the result has
    one row per distinct time of day present in the data.

    The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime-like ``READING_DATETIME`` column.

    Returns
    -------
    pandas.DataFrame
        Indexed by time of day (index name ``"time"``), holding the mean of
        each column of ``df`` within that time-of-day group.
    """
    # A pre-existing "time" column used to be overwritten by the grouping key;
    # leave it out so it is likewise never averaged as data.
    frame = df.drop(columns="time") if "time" in df.columns else df

    # Group by a derived Series rather than writing a helper column into the
    # caller's frame (avoids the side effect and SettingWithCopy warnings when
    # this is used inside groupby.apply).
    time_of_day = frame["READING_DATETIME"].dt.time.rename("time")
    return frame.groupby(time_of_day).mean()

# Compute one average daily profile per customer, then write the combined
# table to the output directory.
per_customer = df.groupby("CUSTOMER_ID")
res = per_customer.apply(aggregate_by_hour)

aggregated_csv = out_path / "aggregated.csv"
res.to_csv(aggregated_csv.resolve())
