In [39]:
import pandas as pd
import numpy as np
import scipy
import einops
from thefuzz import process
from tqdm import tqdm
import requests
import json
import pickle

We first download the "Ausgrid average electricity consumption by LGA 2021 excel" file from https://www.ausgrid.com.au/Industry/Our-Research/Data-to-share/Average-electricity-use and save to the data folder

In [40]:
elec_con_data = pd.read_excel(
    "data/Ausgrid average electricity consumption by LGA 2021.xlsx",
    usecols="B:Q",
    skiprows=29,
    nrows=9,
    names=[
        "LGA", "Residential Daily average (kWh)", "Residential General Supply (MWh)", "Residential Off Peak Hot Water (MWh)", "Residential Total (MWh)", "Off Peak Residential Customer number", "Total Residential Customer number",
        "Number of Residential Solar Customers", "Number of Non-Residential Solar Customers", "Residential Generation Capacity (kWp)", "Non-Residential Generation Capacity (kWp)", "Energy exported to the grid (MWh)",
        "Non-Residential Small-Medium Sites MWh", "Number of Non-Residential Small-Medium Sites",
        "Non-Residential Large Sites MWh", "Number of Non-Residential Large Sites",
    ]
)
elec_con_data

Unnamed: 0,LGA,Residential Daily average (kWh),Residential General Supply (MWh),Residential Off Peak Hot Water (MWh),Residential Total (MWh),Off Peak Residential Customer number,Total Residential Customer number,Number of Residential Solar Customers,Number of Non-Residential Solar Customers,Residential Generation Capacity (kWp),Non-Residential Generation Capacity (kWp),Energy exported to the grid (MWh),Non-Residential Small-Medium Sites MWh,Number of Non-Residential Small-Medium Sites,Non-Residential Large Sites MWh,Number of Non-Residential Large Sites
0,CENTRAL COAST*,16.508265,749993.75163,175439.24757,925432.9992,80849.438356,153585.665753,27926,866,129951.235,25729.24,94069.939235,208282.67026,12157.939726,507257.00212,709.723288
1,CESSNOCK,18.009241,145062.19126,22340.42324,167402.6145,10649.386301,25466.775342,6119,269,34436.875,11315.32,32556.795154,38708.8568,1843.106849,73069.18906,135.123288
2,LAKE MACQUARIE,16.913623,433830.86633,106992.88358,540823.74991,53160.884932,87604.465753,19815,539,96355.52,12551.95,71404.042118,118244.94451,6795.572603,189172.40852,391.958904
3,MAITLAND,16.812069,186156.22587,27389.42481,213545.65068,13031.569863,34799.79726,8541,268,46886.14,8038.03,36534.991861,59161.52808,2959.734247,91106.93447,170.60274
4,MUSWELLBROOK,20.195877,43567.83267,10159.80349,53727.63616,4443.449315,7288.567123,1306,94,7703.79,1825.385,6401.202701,28306.77256,1347.082192,19973.42688,52.008219
5,NEWCASTLE,13.670967,311806.04897,53494.28002,365300.32899,29693.865753,73207.90137,11139,561,49787.295,20393.255,46486.583101,156675.75499,8035.923288,263849.58008,515.241096
6,PORT STEPHENS,16.915889,169997.82293,46769.07573,216766.89866,23513.29863,35107.934247,8151,245,40727.0,5935.905,31131.420972,53077.87271,2985.665753,89789.91025,194.471233
7,SINGLETON,21.464937,64190.69011,13978.25635,78168.94646,6087.583562,9977.271233,2447,144,15021.225,5561.59,12941.132405,27270.28062,1676.934247,32676.20314,92.00274
8,UPPER HUNTER,20.126945,34744.97519,7854.87092,42599.84611,3524.008219,5798.789041,1143,109,6751.025,1868.59,5986.931273,20682.52559,1251.331507,8937.4913,26.515068


Allocating residential usages according to five classes

In [41]:
def alloc_mean(x, allocation):
    return (1 / len(allocation)) * sum([x * a for a in allocation])

def f(x, allocation, mu):
    return sum(abs(alloc_mean(x, allocation) - mu))

N = 5
rng = np.random.default_rng(50)
allocation = rng.dirichlet([1.5 for i in range(N)])
mu0 = scipy.optimize.minimize(f, np.repeat(0, len(elec_con_data)), (allocation, elec_con_data.loc[:, "Residential Daily average (kWh)"]))['x']

print(f"{allocation=}, {mu0=}: {alloc_mean(mu0, allocation)=}")
print(f"{np.array([mu0 * a for a in allocation])=}")

allocation=array([0.15889974, 0.18347015, 0.36256272, 0.1651147 , 0.12995269]), mu0=array([ 82.54132513,  90.04620605,  84.56811482,  84.06034257,
       100.97938367,  68.35483605,  84.57944433, 107.32468435,
       100.63472418]): alloc_mean(mu0, allocation)=array([16.50826503, 18.00924121, 16.91362296, 16.81206851, 20.19587673,
       13.67096721, 16.91588887, 21.46493687, 20.12694484])
np.array([mu0 * a for a in allocation])=array([[13.11579506, 14.30831869, 13.43785142, 13.35716654, 16.04559776,
        10.86156564, 13.43965167, 17.05386439, 15.99083146],
       [15.14386933, 16.52079096, 15.51572474, 15.42256369, 18.52670271,
        12.54107205, 15.51780337, 19.69087597, 18.46346798],
       [29.92640729, 32.64739733, 30.66124567, 30.47714639, 36.61135994,
        24.78291523, 30.66535333, 38.9119294 , 36.48639925],
       [13.62878632, 14.8679525 , 13.9634391 , 13.87959843, 16.67318087,
        11.2863884 , 13.96530977, 17.7208833 , 16.61627252],
       [10.72646712, 11.7017465

Now we generate synthetic data based on these statistics

In [50]:
res_X = {}
for (i, elec_con_row), m0, in zip(elec_con_data.iterrows(), mu0):
    res_X[i] = rng.normal(
        loc=np.array([m0 * a for a in allocation]),
        scale=5,
        size=(365, round(elec_con_row["Total Residential Customer number"]) - round(elec_con_row["Number of Residential Solar Customers"]), N)
    ).T

Doing the same with small-medium non-residential but this time with only 2 classes (small and medium)

In [52]:
N = 2
allocation = rng.dirichlet([1.5 for i in range(N)])
non_res_small_med_daily_avg = elec_con_data.loc[:, "Non-Residential Small-Medium Sites MWh"] / 365 * 1000 / elec_con_data.loc[:, "Number of Non-Residential Small-Medium Sites"]
mu0 = scipy.optimize.minimize(f, np.repeat(0, len(elec_con_data)), (allocation, non_res_small_med_daily_avg))['x']

print(f"{allocation=}, {mu0=}: {alloc_mean(mu0, allocation)=}")
print(f"{np.array([mu0 * a for a in allocation])=}")

allocation=array([0.66117258, 0.33882742]), mu0=array([ 93.87074876, 115.07923428,  95.34406326, 109.52765674,
       115.14190003, 106.83243939,  97.41132554,  89.10677058,
        90.56665377]): alloc_mean(mu0, allocation)=array([46.93537438, 57.53961714, 47.67203163, 54.76382837, 57.57095002,
       53.4162197 , 48.70566277, 44.55338529, 45.28332689])
np.array([mu0 * a for a in allocation])=array([[62.06476483, 76.08723386, 63.03887998, 72.41668302, 76.12866673,
        70.63467923, 64.4056971 , 58.9149531 , 59.88018784],
       [31.80598393, 38.99200043, 32.30518328, 37.11097371, 39.0132333 ,
        36.19776017, 33.00562843, 30.19181747, 30.68646594]])


And the synthetic data for the small-medium sites

In [53]:
small_med_non_res_X = {}
for (i, elec_con_row), m0, in zip(elec_con_data.iterrows(), mu0):
    small_med_non_res_X[i] = rng.normal(
        loc=np.array([m0 * a for a in allocation]),
        scale=5,
        size=(365, round(elec_con_row["Number of Non-Residential Small-Medium Sites"]) - round(5 * elec_con_row["Number of Non-Residential Solar Customers"] / 6), N)
    ).T
small_med_non_res_X

{0: array([[[71.61005456, 69.13544554, 60.51494235, ..., 57.25557434,
          71.44333713, 67.64519141],
         [61.6762315 , 62.89907559, 65.74716916, ..., 67.93257326,
          53.42558306, 69.47573083],
         [60.83282556, 61.9730405 , 67.11701734, ..., 63.576038  ,
          70.21234047, 64.02189258],
         ...,
         [72.87756792, 56.60884539, 68.94775888, ..., 63.88463473,
          70.85693396, 63.08144733],
         [62.39762398, 59.47918334, 63.73209922, ..., 60.04198933,
          71.09229023, 68.36558743],
         [61.14882118, 63.1229046 , 59.125683  , ..., 57.96508242,
          54.54872983, 60.81216529]],
 
        [[29.28336495, 47.44799502, 26.72505937, ..., 33.34923564,
          31.62238158, 35.61646087],
         [30.7255659 , 24.44742996, 33.9173428 , ..., 37.68335045,
          25.22784346, 31.201507  ],
         [31.52860891, 31.9428398 , 35.89309777, ..., 33.17335483,
          27.46487689, 40.32334541],
         ...,
         [28.93327394, 39.1077

We will assign a single mean for large non-residential sites

In [56]:
non_res_large_daily_avg = elec_con_data.loc[:, "Non-Residential Large Sites MWh"] / 365 * 1000 / elec_con_data.loc[:, "Number of Non-Residential Large Sites"]
print(f"{non_res_large_daily_avg.tolist()=}")

non_res_large_daily_avg.tolist()=[1958.1507827476653, 1481.5326248986207, 1322.2829379652603, 1463.09514164124, 1052.174412895749, 1402.985063941339, 1264.9673191795107, 973.0562859950568, 923.4853585451541]


And finally, we generate the synthetic data

In [57]:
large_non_res_X = {}
for (i, elec_con_row), avg in zip(elec_con_data.iterrows(), non_res_large_daily_avg):
    large_non_res_X[i] = rng.normal(
        loc=avg,
        scale=5,
        size=(365, round(elec_con_row["Number of Non-Residential Large Sites"]) - round(elec_con_row["Number of Non-Residential Solar Customers"] / 6))
    ).T
large_non_res_X

{0: array([[1962.15327214, 1955.77680335, 1961.10932698, ..., 1955.01360511,
         1967.49938772, 1953.370509  ],
        [1960.32006132, 1961.95589797, 1951.93057749, ..., 1974.33693721,
         1954.36774498, 1961.72619166],
        [1960.77805915, 1951.5915398 , 1952.83765493, ..., 1953.20740795,
         1968.84237615, 1958.47266516],
        ...,
        [1960.51454079, 1963.48899019, 1961.47425819, ..., 1955.1378557 ,
         1954.22303899, 1965.80603254],
        [1961.78311079, 1964.64147144, 1950.70212432, ..., 1959.1660014 ,
         1961.8282335 , 1959.58203799],
        [1961.74557133, 1968.30673232, 1959.83234873, ..., 1956.73113877,
         1953.61591472, 1950.24839353]]),
 1: array([[1477.29419535, 1480.58434751, 1483.60083326, ..., 1485.71159194,
         1472.49132022, 1478.07548192],
        [1485.12327161, 1490.21369135, 1475.63437097, ..., 1486.17324224,
         1479.92238122, 1479.63447994],
        [1479.5578991 , 1485.43929932, 1485.22577333, ..., 1481.154

Finding solar energy exported, by data and split across residential and non-residential

In [58]:
n_res_solar_customers = elec_con_data.loc[:, "Number of Residential Solar Customers"]
n_non_res_solar_customers = elec_con_data.loc[:, "Number of Non-Residential Solar Customers"]

In [59]:
total_solar_generation_capacity = elec_con_data.loc[:, "Residential Generation Capacity (kWp)"] + elec_con_data.loc[0, "Non-Residential Generation Capacity (kWp)"]
p_res_solar_gen_capacity = elec_con_data.loc[:, "Residential Generation Capacity (kWp)"] / total_solar_generation_capacity
p_non_res_solar_gen_capacity = elec_con_data.loc[:, "Non-Residential Generation Capacity (kWp)"] / total_solar_generation_capacity

In [60]:
avg_res_solar_gen = elec_con_data.loc[:, "Energy exported to the grid (MWh)"] * p_res_solar_gen_capacity / 365 * 1000 / n_res_solar_customers
avg_res_solar_gen

0    7.703632
1    8.343343
2    7.792037
3    7.566992
4    3.094242
5    7.538154
6    6.412714
7    5.340947
8    2.982740
dtype: float64

Generate synthetic data

In [61]:
res_solar_X = {}
for (i, elec_con_row), avg in zip(elec_con_data.iterrows(), avg_res_solar_gen):
    res_solar_X[i] = rng.normal(
        loc=-avg,
        scale=5,
        size=(365, round(elec_con_row["Number of Residential Solar Customers"]))
    ).T
res_solar_X

{0: array([[ -4.75892407, -14.96734692, -17.30421382, ...,  -2.93737023,
           4.36344549,  -2.50956013],
        [ -8.29921024,  -7.68318808,  -6.77703515, ...,  -6.69780631,
         -12.05994073,  -2.81123263],
        [  3.26053785, -10.13935389,  -8.95443816, ...,  -1.66317238,
          -7.38839467, -13.72328239],
        ...,
        [-15.0978819 , -11.9150609 , -12.910875  , ...,  -9.26399008,
         -12.47674536,  -0.0203913 ],
        [ -7.71978524,  -6.23063513, -10.01962732, ...,  -5.80398008,
          -5.40053128, -10.43260204],
        [-11.35884342,  -6.19883705,  -6.63359714, ...,  -6.78986017,
           1.27728278,   2.50258893]]),
 1: array([[ -5.83404647, -10.51105468,  -0.38589545, ...,  -9.99871074,
         -13.63480758,  -9.5239177 ],
        [ -4.62562415, -16.56084279,  -7.39916096, ..., -16.30276723,
          -6.63178435,  -3.42063059],
        [-15.3913991 ,  -2.56274081,  -2.66637467, ...,  -5.96250436,
          -8.99634535, -11.93856329],
       

In [63]:
avg_non_res_solar_gen = elec_con_data.loc[:, "Energy exported to the grid (MWh)"] * p_non_res_solar_gen_capacity / 365 * 1000 / n_non_res_solar_customers
avg_non_res_solar_gen

0    49.185027
1    62.360756
2    37.315642
3    41.343036
4    10.186374
5    61.307894
6    31.095044
7    33.603408
8     8.657238
dtype: float64

Split non-residential into three groups

In [64]:
N = 3
allocation = rng.dirichlet([1.5 for i in range(N)])
mu0 = scipy.optimize.minimize(f, np.repeat(0, len(elec_con_data)), (allocation, avg_non_res_solar_gen))['x']

print(f"{allocation=}, {mu0=}: {alloc_mean(mu0, allocation)=}")
print(f"{np.array([mu0 * a for a in allocation])=}")

allocation=array([0.45034447, 0.43151424, 0.1181413 ]), mu0=array([147.55508199, 187.08226883, 111.94692518, 124.02910777,
        30.55912084, 183.92368075,  93.28513103, 100.81022395,
        25.97171458]): alloc_mean(mu0, allocation)=array([49.18502733, 62.36075628, 37.31564173, 41.34303592, 10.18637361,
       61.30789358, 31.09504368, 33.60340798,  8.65723819])
np.array([mu0 * a for a in allocation])=array([[66.45061476, 84.25146465, 50.41467836, 55.85582243, 13.76213099,
        82.82901198, 42.01044262, 45.39932658, 11.69621796],
       [63.6721186 , 80.72866247, 48.30669199, 53.52032579, 13.18669571,
        79.36568674, 40.25386213, 43.50104686, 11.2071646 ],
       [17.43234863, 22.10214172, 13.22555484, 14.65295955,  3.61029414,
        21.72898203, 11.02082629, 11.90985052,  3.06833202]])


Generate synthetic data

In [65]:
non_res_solar_X = {}
for (i, elec_con_row), m0, in zip(elec_con_data.iterrows(), mu0):
    non_res_solar_X[i] = rng.normal(
        loc=np.array([-m0 * a for a in allocation]),
        scale=5,
        size=(365, round(elec_con_row["Number of Non-Residential Solar Customers"]), N)
    ).T
non_res_solar_X

{0: array([[[-60.22913128, -70.76000731, -57.2622391 , ..., -60.43005869,
          -61.62592249, -67.99014011],
         [-66.40704066, -68.54076362, -56.84868875, ..., -68.83599271,
          -67.6227771 , -75.28879601],
         [-69.61063173, -63.39426001, -69.73626203, ..., -67.23650053,
          -60.84646951, -62.24493121],
         ...,
         [-56.8985318 , -70.25393647, -64.31852557, ..., -62.90281253,
          -62.68776919, -72.94557182],
         [-63.57309513, -72.56443409, -72.45823157, ..., -69.18330428,
          -63.36731412, -72.11040276],
         [-65.74083236, -71.02076006, -70.2597085 , ..., -60.42184423,
          -73.32020709, -64.3622283 ]],
 
        [[-64.12688486, -59.76698279, -68.27378716, ..., -65.71754101,
          -69.50603268, -60.54702992],
         [-71.87087597, -70.51017351, -60.25023766, ..., -65.77300971,
          -58.94385695, -74.45147412],
         [-58.26225658, -68.39270376, -68.10127709, ..., -61.18510816,
          -49.38403328, -53.3

And save all synthetic data into a pickle file

In [67]:
X = {
    "residential consumption": res_X,
    "residential generation": res_solar_X,
    "non-residential small-medium site consumption": small_med_non_res_X,
    "non-residential large size consumption": large_non_res_X,
    "non-residential generation": non_res_solar_X,
}
with open("data/electricity_consumption_2020-2021.pkl", 'wb') as f:
    pickle.dump(X, f)

In [70]:
total = elec_con_data["Total Residential Customer number"].round().sum() + elec_con_data["Number of Residential Solar Customers"].round().sum() + elec_con_data["Number of Non-Residential Small-Medium Sites"].round().sum() + elec_con_data["Number of Non-Residential Large Sites"].round().sum()
total * 365

204679955.0

We will save weather data for each of the LGAs

First we get LGA information from https://www.olg.nsw.gov.au/public/local-government-directory/ by downloading the "All NSW Council Contact Details – XLS" file and saving to the data folder. Then we match
LGAs to those in `elec_con_data` and get the postcodes

In [71]:
lga_contacts = pd.read_excel("data/LGDGPALL.xls")
lga_contacts

Unnamed: 0,ABS,ORGNAME,POSTAL_ADD1,POSTAL_ADD2,POSTAL_SUBURB,POSTAL_STATE,POSTAL_PCODE,STREET_ADD1,STREET_ADD2,STREET_SUBURB,...,MAYOR_SAL,MAYOR_FIRST,MAYOR_LAST,MAYOR_AWARD,EMAIL,WEB,AREA,POPULATION,ABN,METRO
0,10050,Albury City Council,PO Box 323,,ALBURY,NSW,2640,553 Kiewa Street,,Albury,...,Clr,Kylie,King,,info@alburycity.nsw.gov.au,http://www.alburycity.nsw.gov.au,306,52949,92 965 474 349,0
1,10180,Armidale Regional Council,PO Box 75A,,ARMIDALE,NSW,2350,135 Rusden Street,,Armidale,...,Clr,Sam,Coupland,,council@armidale.nsw.gov.au,http://www.armidaleregional.nsw.gov.au,8621,30594,39 642 954 203,0
2,10250,Ballina Shire Council,PO Box 450,,BALLINA,NSW,2478,40 Cherry Street,,Ballina,...,Clr,Sharon,Cadwallader,,council@ballina.nsw.gov.au,http://www.ballina.nsw.gov.au,485,43457,53 929 887 369,0
3,10300,Balranald Shire Council,PO Box 120,,BALRANALD,NSW,2715,70 Market Street,,Balranald,...,,,,,council@balranald.nsw.gov.au,http://www.balranald.nsw.gov.au,21691,2341,74 678 751 581,0
4,10470,Bathurst Regional Council,Private Mail Bag 17,,BATHURST,NSW,2795,158 Russell Street,,Bathurst,...,Clr,Robert,Taylor,,council@bathurst.nsw.gov.au,http://www.bathurst.nsw.gov.au,3818,42779,42 173 522 302,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123,18350,Wingecarribee Shire Council,PO Box 141,,MOSS VALE,NSW,2577,Civic Centre,68 Elizabeth Street,Moss Vale,...,,,,,mail@wsc.nsw.gov.au,http://www.wsc.nsw.gov.au,2689,49649,49 546 344 354,0
124,18400,Wollondilly Shire Council,PO Box 21,,PICTON,NSW,2571,62-64 Menangle Street,,Picton,...,Clr,Matthew,Gould,,council@wollondilly.nsw.gov.au,http://www.wollondilly.nsw.gov.au,2555,51002,93 723 245 808,1
125,18450,Wollongong City Council,Locked Bag 8821,,WOLLONGONG DC,NSW,2500,41 Burelli Street,,Wollongong,...,Clr,Gordon,Bradbery,AM,council@wollongong.nsw.gov.au,http://www.wollongong.nsw.gov.au,684,213132,63 139 525 939,0
126,18500,Woollahra Municipal Council,PO Box 61,,DOUBLE BAY,NSW,1360,536 New South Head Road,,Double Bay,...,Clr,Susan,Wynne,,records@woollahra.nsw.gov.au,http://www.woollahra.nsw.gov.au,12,58456,32 218 483 245,1


In [72]:
idx = []
for LGA in elec_con_data['LGA']:
    orgname, score, id = process.extractOne(LGA, lga_contacts['ORGNAME'])
    idx.append(id)
    print(f"{LGA=}: {orgname=}")
idx = np.array(idx)

LGA='CENTRAL COAST*': orgname='Central Coast Council'
LGA='CESSNOCK': orgname='Cessnock City Council'
LGA='LAKE MACQUARIE': orgname='Lake Macquarie City Council'
LGA='MAITLAND': orgname='Maitland City Council'
LGA='MUSWELLBROOK': orgname='Muswellbrook Shire Council'
LGA='NEWCASTLE': orgname='Newcastle City Council'
LGA='PORT STEPHENS': orgname='Port Stephens Council'
LGA='SINGLETON': orgname='Singleton Council'
LGA='UPPER HUNTER': orgname='Upper Hunter Shire Council'


In [73]:
lga_postcodes = lga_contacts.loc[idx, "POSTAL_PCODE"]

postcode_data = pd.read_csv(
    "data/AU.txt",
    sep='\t',
    names=["country code","postal code","place name","admin name1","admin code1","admin name2","admin code2","admin name3","admin code3","latitude","longitude","accuracy"]
)
postcode_data = postcode_data.drop(postcode_data[postcode_data['postal code'].duplicated()].index)  # we will use just one of the postcodes
postcode_data

Unnamed: 0,country code,postal code,place name,admin name1,admin code1,admin name2,admin code2,admin name3,admin code3,latitude,longitude,accuracy
0,AU,200,Australian National University,Australian Capital Territory,ACT,CANBERRA,,,,-35.2777,149.1189,1.0
1,AU,221,Barton,Australian Capital Territory,ACT,,,,,-35.3049,149.1412,4.0
2,AU,2540,Jervis Bay,Australian Capital Territory,ACT,NEW CNTRY WEST,,,,-35.1499,150.6969,4.0
5,AU,2600,Russell,Australian Capital Territory,ACT,CANBERRA,,,,-35.2991,149.1515,4.0
17,AU,2601,Canberra,Australian Capital Territory,ACT,CANBERRA,,,,-35.2835,149.1281,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...
16868,AU,6989,Maddington,Western Australia,WA,TANGNEY,,,,-32.0500,115.9833,4.0
16869,AU,6990,Gosnells,Western Australia,WA,TANGNEY,,,,-32.0810,116.0054,4.0
16870,AU,6991,Kelmscott,Western Australia,WA,TANGNEY,,,,-32.1243,116.0259,4.0
16871,AU,6992,Armadale,Western Australia,WA,TANGNEY,,,,-32.1461,116.0093,4.0


In [74]:
weather_data = {
    "structure": {"postcode": {"day": ["time","temperature_2m_max","temperature_2m_min","sunrise","sunset","precipitation_sum","precipitation_hours","shortwave_radiation_sum"]}},
    "daily units": {"time":"iso8601","temperature_2m_max":"Celcius","temperature_2m_min":"Celcius","sunrise":"iso8601","sunset":"iso8601","precipitation_sum":"mm","precipitation_hours":"h","shortwave_radiation_sum":"MJ/m^2"}
}

for i, postcode in (pbar := tqdm(enumerate(lga_postcodes))):
    pbar.set_postfix_str(f"{postcode=}")
    postcode_row = postcode_data[postcode == postcode_data['postal code']]
    latitude, longitude = postcode_row.latitude.item(), postcode_row.longitude.item()
    r = requests.get(f"https://archive-api.open-meteo.com/v1/era5?latitude={latitude}&longitude={longitude}&start_date=2021-07-01&end_date=2022-06-30&daily=temperature_2m_max,temperature_2m_min,sunrise,sunset,precipitation_sum,precipitation_hours,shortwave_radiation_sum&timezone=Australia%2FSydney")
    if r.ok:
        postcode_weather = r.json()
        # sunset, sunrise datetime.fromisoformat('2021-07-01T16:58').hour * 100 + datetime.fromisoformat('2021-07-01T06:59').minute?
        weather_data[str(i)] = {day: other_data for day, *other_data in zip(*[v for v in postcode_weather['daily'].values()])}
    else:
        tqdm.write(f"Postcode {postcode} weather data request failed")

with open(f"data/central_coast_hunter_weather.json", 'w') as f:
    json.dump(weather_data, f)

9it [00:09,  1.03s/it, postcode=2337]


## Sampling

In [75]:
X['residential consumption'][0].shape

(2, 125660, 365)

In [82]:
X['residential consumption'][0][0, 0, :30]

array([30.7802307 , 28.83134138, 35.45729311, 22.63807845, 28.47036361,
       38.86841037, 22.49974834, 35.27420259, 30.08803284, 29.85172371,
       37.37146585, 27.13440747, 38.58892723, 28.19652753, 37.93482761,
       33.19726599, 25.50426673, 37.2643837 , 35.35167426, 33.64825411,
       25.25335986, 28.10364576, 35.54124667, 37.34611573, 38.15817761,
       36.78707855, 34.51825718, 37.03982018, 42.7515706 , 35.24611405])

In [76]:
weather_data

{'structure': {'postcode': {'day': ['time',
    'temperature_2m_max',
    'temperature_2m_min',
    'sunrise',
    'sunset',
    'precipitation_sum',
    'precipitation_hours',
    'shortwave_radiation_sum']}},
 'daily units': {'time': 'iso8601',
  'temperature_2m_max': 'Celcius',
  'temperature_2m_min': 'Celcius',
  'sunrise': 'iso8601',
  'sunset': 'iso8601',
  'precipitation_sum': 'mm',
  'precipitation_hours': 'h',
  'shortwave_radiation_sum': 'MJ/m^2'},
 '0': {'2021-07-01': [15.5,
   9.0,
   '2021-07-01T06:59',
   '2021-07-01T16:58',
   5.7,
   6.0,
   6.04],
  '2021-07-02': [16.7,
   10.6,
   '2021-07-02T06:59',
   '2021-07-02T16:59',
   0.0,
   0.0,
   9.91],
  '2021-07-03': [16.4,
   7.7,
   '2021-07-03T06:58',
   '2021-07-03T16:59',
   0.0,
   0.0,
   10.78],
  '2021-07-04': [13.3,
   5.4,
   '2021-07-04T06:58',
   '2021-07-04T16:59',
   0.0,
   0.0,
   12.09],
  '2021-07-05': [13.7,
   5.0,
   '2021-07-05T06:58',
   '2021-07-05T17:00',
   0.0,
   0.0,
   11.44],
  '2021-07-06

In [84]:
weather_data['0']['2021-07-01']

[15.5, 9.0, '2021-07-01T06:59', '2021-07-01T16:58', 5.7, 6.0, 6.04]

In [None]:
def get_customer_data(customer=1):
    idx = np.arange(24, len(sorted_full_data[customer]))
    expanded_idx = np.array([np.arange(i - 24, i - 1) for i in idx])
    return sorted_full_data[customer][expanded_idx].reshape(len(sorted_full_data[customer]) - 24, -1), sorted_full_data[customer][idx, 0]