In [None]:
import pandas as pd
from datetime import datetime

from src.paths import RAW_DATA_DIR, TRANSFORMED_DATA_DIR


In [None]:
# Read each file from raw dir
#   Concatenate them
#   Save big file to CSV in transformed dir

# Sort the paths so the files are always combined in the same order
# (glob yields entries in arbitrary, filesystem-dependent order).
raw_files = sorted(RAW_DATA_DIR.glob("*.csv"))
if not raw_files:
    # Fail early with a clear message instead of an obscure error when
    # the date range is parsed further down.
    raise FileNotFoundError(f"No CSV files found in {RAW_DATA_DIR}")

# Read every file first and concatenate once: growing a DataFrame inside the
# loop re-copies all previously loaded rows on each iteration.
# pd.read_csv opens and closes each file itself, so no explicit open() is needed.
concat_demand = pd.concat(
    [pd.read_csv(file_path) for file_path in raw_files],
    ignore_index=True,
)

# Keep demand as plain integers before pivoting
concat_demand["demand"] = concat_demand["demand"].astype(int)

# For annotating the file name.
# The dates are "%Y-%m-%d" strings, so their lexicographic min/max are also
# the chronological min/max; each one is parsed only once.
first_day = datetime.strptime(concat_demand["datetime"].min(), "%Y-%m-%d")
last_day = datetime.strptime(concat_demand["datetime"].max(), "%Y-%m-%d")
min_month, min_year = first_day.month, first_day.year
max_month, max_year = last_day.month, last_day.year

# One row per date, one column per BA code.
# NOTE: duplicate (datetime, ba_code) rows are averaged - this is the
# pivot_table default, spelled out here so it is not a silent surprise.
data = pd.pivot_table(
    data=concat_demand,
    values="demand",
    index="datetime",
    columns="ba_code",
    aggfunc="mean",
)
# Resetting column names
data.columns.name = None
data.columns = [f"ba_{ba_code}" for ba_code in data.columns]

# Filling missing values with -1 for now
data = data.fillna(-1)

data = data.sort_index()

data.to_csv(TRANSFORMED_DATA_DIR / f'ts_tabular_{min_year}_{min_month}_to_{max_year}_{max_month}.csv')

In [34]:
# Sanity check on the pivoted table: (rows, columns) and total number of cells
data.shape, data.size

((670, 67), 44890)

In [35]:
# Preview the first rows of the wide table (one column per BA)
data.head()

Unnamed: 0_level_0,ba_AECI,ba_AVA,ba_AZPS,ba_BANC,ba_BPAT,ba_CAL,ba_CAR,ba_CENT,ba_CHPD,ba_CISO,...,ba_TEN,ba_TEPC,ba_TEX,ba_TIDC,ba_TPWR,ba_TVA,ba_US48,ba_WACM,ba_WALC,ba_WAUW
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2023-01-01,59909.0,38266.0,74476.0,39939.0,167930.0,621178.0,478148.0,644349.0,7685.0,512043.0,...,342521.0,31528.0,899540.0,5677.0,14380.0,342521.0,9389054.0,87420.0,18327.0,2469.0
2023-01-02,58056.0,40165.0,76802.0,46635.0,187046.0,692440.0,519117.0,669956.0,8058.0,572456.0,...,354751.0,33047.0,970461.0,6594.0,16639.0,354751.0,9984103.0,93155.0,19378.0,2751.0
2023-01-03,58551.0,43507.0,77934.0,47536.0,185754.0,736465.0,530603.0,719373.0,7708.0,610457.0,...,367694.0,34068.0,965338.0,7051.0,16742.0,367694.0,10451658.0,95885.0,19580.0,2914.0
2023-01-04,73122.0,42674.0,75801.0,47250.0,188878.0,732759.0,524179.0,760561.0,7763.0,606172.0,...,374897.0,32894.0,981661.0,7263.0,16474.0,374897.0,10528590.0,99419.0,18547.0,2835.0
2023-01-05,77401.0,41295.0,77519.0,45100.0,173307.0,718498.0,538507.0,771291.0,7668.0,598329.0,...,429229.0,34291.0,1011497.0,7123.0,14473.0,429229.0,10760439.0,98267.0,18793.0,2829.0


### Plotting data for one or many BAs

In [42]:
from typing import Optional, List
import plotly.express as px

def plot_demand(data: pd.DataFrame, bas: Optional[List[str]] = None):
    """Plot demand over time as a line chart, one line per BA column.

    Args:
        data: Wide table whose index is the time axis and whose columns are
            the series to plot (e.g. ``"ba_AECI"``).
        bas: Column names to plot, e.g. ``["ba_AECI", "ba_AVA"]``. A single
            name may also be given as a plain string. ``None`` or an empty
            list plots every column.

    Returns:
        None. The figure is rendered with ``fig.show()``.
    """
    # A bare string is not list-like and would be rejected by ``isin`` below.
    if isinstance(bas, str):
        bas = [bas]

    # Wrangling data for plotly: wide -> long (time, demand, ba_code).
    df_plot = data.reset_index()
    # Use whatever name the index had instead of assuming "datetime", so the
    # sort / select / plot steps agree with the melt step.
    time_col = df_plot.columns[0]
    df_melted = df_plot.melt(id_vars=time_col, var_name="ba_code", value_name="demand")
    df_melted = df_melted.sort_values(by=["ba_code", time_col])
    df_melted = df_melted[[time_col, "demand", "ba_code"]]

    # Only filter when specific BAs were requested.
    if bas:
        df_melted = df_melted[df_melted["ba_code"].isin(bas)]

    fig = px.line(
        df_melted,
        x=time_col,
        y="demand",
        color="ba_code",
        template="none",
    )

    fig.show()

In [37]:
# Compare a handful of BAs on one chart
plot_demand(data, bas=["ba_AECI", "ba_AVA", "ba_SEC"])

In [None]:
# A series where we know we have missing data
# (missing values show up as -1 because of the fillna(-1) when building `data`)
plot_demand(data, bas=["ba_SEC"])

In [39]:
# Single BA on its own chart
plot_demand(data, bas=["ba_AVA"])

In [40]:
# No `bas` given: plot every BA column
plot_demand(data)

From a quick look, there appears to be semi-annual seasonality (two peaks per year, in January and July), which makes sense: when it's hot people use air conditioning more, and when it's cold they turn on the heating. There is also some weekly and monthly structure.