In [1]:
import configparser
import os

# Settings file location. The VARIABLE_INI environment variable overrides the
# original hard-coded default so the notebook can run on other machines.
CONFIG_PATH = os.environ.get("VARIABLE_INI", "C:/KV/variable.ini")

config = configparser.ConfigParser()
# ConfigParser.read() skips files it cannot open and returns the list of files
# it actually parsed; an empty list means nothing was loaded.
if not config.read(CONFIG_PATH):
    raise FileNotFoundError(f"Configuration file not found or unreadable: {CONFIG_PATH}")

# All settings live in the [myvars] section of the ini file.
postgresql_db               = config.get("myvars", "postgresql_db")
AZURE_STORAGE_ACCOUNT_KEY   = config.get("myvars", "AccountKey")
azure_storage_tenant_id     = config.get("myvars", "tenantId")
CONNECTION_STRING           = config.get("myvars", "CONNECTION_STRING")
account_name                = config.get("myvars", "account_name")
table_location              = config.get("myvars", "table_location_azure")

**<mark>Install Package</mark>**

In [2]:
#!pip install -q pyiceberg[adlfs]
#!pip install -q boto3
#!pip install -q getdaft
#!pip install -q sqlalchemy
#!pip install -q psycopg2

In [3]:
from pyiceberg.catalog import load_catalog
from pyiceberg.catalog.sql import SqlCatalog
from pyiceberg.io.fsspec import FsspecFileIO
import duckdb
import os
from   datetime import datetime
import glob
import os
from   psutil import *
import re 
import requests
from   shutil import unpack_archive
from   urllib.request import urlopen

**<mark>Connect to the Catalog</mark>**

In [4]:
def connect_catalog():
    """Create and return the SQL-backed Iceberg catalog named "default".

    Reads the module-level settings: ``postgresql_db`` is used as the catalog
    URI, and the Azure account name, account key and tenant id are passed as
    ``adlfs.*`` properties for the fsspec file IO implementation.
    """
    properties = {
        "uri": postgresql_db,
        "adlfs.account-name": account_name,
        "adlfs.account-key": AZURE_STORAGE_ACCOUNT_KEY,
        "adlfs.tenant-id": azure_storage_tenant_id,
        "py-io-impl": "pyiceberg.io.fsspec.FsspecFileIO",
        "legacy-current-snapshot-id": True,
    }
    return SqlCatalog("default", **properties)

In [5]:
# Notebook-wide catalog handle; get_table_files() and get_Path() read this global.
catalog  = connect_catalog()

In [6]:
# Namespace under which every table in this notebook is created.
db = "azure"
# Safe to re-run: the namespace is only created when it is missing.
catalog.create_namespace_if_not_exists(db)

In [7]:
def get_table_files(db,table):
    """Return the distinct values of the ``file`` column of table ``db.table``.

    The table is loaded through the global ``catalog`` and only the ``file``
    column is scanned.
    """
    iceberg_table = catalog.load_table(f"{db}.{table}")
    # This local must keep the name `zzz`: the SQL below refers to it by name.
    # NOTE(review): relies on DuckDB resolving Python variables named in a query - confirm.
    zzz = iceberg_table.scan(selected_fields=("file", )).to_arrow_batch_reader()
    distinct_files = duckdb.sql("select distinct file  from  zzz ").df()
    return distinct_files['file'].tolist()

In [8]:
def download(url,Path,total_files):
    """Download up to ``total_files`` zip archives listed at ``url`` into ``Path``.

    The page at ``url`` is scanned for ``*.zip`` names. Names already present
    in ``Path`` are skipped, and the remaining ones are fetched in reverse
    name order (presumably newest first, if the names embed a timestamp -
    confirm against the listing).

    Parameters
    ----------
    url : str
        Listing page; each archive is fetched from ``url + name``.
    Path : str
        Target directory, with or without a trailing separator. Created if missing.
    total_files : int
        Maximum number of archives to fetch in this call.

    Returns
    -------
    str
        Always ``"done"``. Archives whose request fails are reported and skipped.
    """
    os.makedirs(Path, exist_ok=True)
    with urlopen(url, timeout=60) as listing_response:
        listing = listing_response.read().decode('utf-8')
    # Escape the dot so only real ".zip" names match (the bare "." matched any character).
    pattern = re.compile(r'[\w.]*\.zip')
    filelist = sorted(set(pattern.findall(listing)), reverse=True)
    current = {os.path.basename(x) for x in glob.glob(os.path.join(Path, '*.zip'))}
    # Filter in list order: a set difference would discard the sort and make
    # the [:total_files] cut pick arbitrary files.
    files_to_upload = [name for name in filelist if name not in current][:total_files]
    print(str(len(files_to_upload)) + ' New File Loaded')
    for x in files_to_upload:
        target = os.path.join(Path, x)
        # Stream into a side file first, so an interrupted transfer never leaves
        # a truncated *.zip that later runs would treat as already downloaded.
        partial = target + '.part'
        with requests.get(url + x, stream=True, timeout=60) as resp:
            if not resp.ok:
                print(f'Skipping {x}: HTTP {resp.status_code}')
                continue
            with open(partial, "wb") as f:
                for chunk in resp.iter_content(chunk_size=4096):
                    f.write(chunk)
        os.replace(partial, target)
    return "done"

In [9]:
def unzip(Source, Destination):
    """Extract every zip in ``Source`` that has no matching CSV in ``Destination``.

    An archive ``NAME.zip`` counts as already extracted when ``NAME.CSV``
    exists in ``Destination``.

    Parameters
    ----------
    Source : str
        Directory holding the ``*.zip`` archives, with or without a trailing separator.
    Destination : str
        Directory receiving the extracted files. Created if missing.

    Returns
    -------
    str
        ``"done"`` when at least one archive was extracted, otherwise
        ``"nothing to see here"``.
    """
    os.makedirs(Destination, exist_ok=True)
    archives = {os.path.basename(x) for x in glob.glob(os.path.join(Source, '*.zip'))}
    # Check which files are already unzipped. Only the extension is swapped, so
    # a ".CSV" occurring elsewhere in the name is left untouched.
    extracted = {
        os.path.splitext(os.path.basename(x))[0] + '.zip'
        for x in glob.glob(os.path.join(Destination, '*.CSV'))
    }
    # Unzip only the delta.
    files_to_upload = sorted(archives - extracted)
    print(str(len(files_to_upload)) + ' New File uncompressed')
    if not files_to_upload:
        return "nothing to see here"
    for x in files_to_upload:
        unpack_archive(os.path.join(Source, x), Destination, 'zip')
    return "done"

**<mark>Check files already Ingested</mark>**

In [10]:
def get_Path(Source,Destination):
    """List the CSV files in ``Source`` not yet recorded in table ``db.Destination``.

    Names already ingested are read with ``get_table_files`` when the table
    exists; otherwise nothing is treated as ingested. The number of known
    files is printed. Returns the pending names, each prefixed with ``Source``.
    """
    table_known = catalog.table_exists(db+"."+Destination)
    existing_files = get_table_files(db,Destination) if table_known else []
    print(len(existing_files))
    csv_names = [os.path.basename(csv_path) for csv_path in glob.glob(Source+'*.CSV')]
    pending = set(csv_names) - set(existing_files)
    return [Source + name for name in pending]

In [11]:
def clean_scada(files_to_upload_full_Path):
    """Parse the DUNIT rows of the given CSV files and return them as Arrow data.

    The files are read with DuckDB ``read_csv`` using an explicit all-VARCHAR
    column list, skipping the first line. Only rows with ``I='D'``,
    ``UNIT='DUNIT'`` and ``VERSION='3'`` are kept. Every column except
    SETTLEMENTDATE, DUID, I, filename and UNIT is cast to DOUBLE. The final
    projection drops I, XX and filename, casts SETTLEMENTDATE to a timestamp,
    and adds ``file`` (from ``parse_filename``) and ``YEAR`` (``isoyear``).

    Parameters
    ----------
    files_to_upload_full_Path : list of str
        CSV paths; the list is interpolated into the SQL text as a list literal.

    Returns
    -------
    The result of ``final.arrow()``.
    """
    raw =duckdb.sql(F"""from read_csv({files_to_upload_full_Path},
    Skip=1,header =0,all_varchar=1,
    columns={{
    'I': 'VARCHAR','UNIT': 'VARCHAR','XX': 'VARCHAR','VERSION': 'VARCHAR','SETTLEMENTDATE': 'VARCHAR','RUNNO': 'VARCHAR',
    'DUID': 'VARCHAR','INTERVENTION': 'VARCHAR','DISPATCHMODE': 'VARCHAR','AGCSTATUS': 'VARCHAR','INITIALMW': 'VARCHAR',
    'TOTALCLEARED': 'VARCHAR','RAMPDOWNRATE': 'VARCHAR','RAMPUPRATE': 'VARCHAR','LOWER5MIN': 'VARCHAR',
    'LOWER60SEC': 'VARCHAR','LOWER6SEC': 'VARCHAR','RAISE5MIN': 'VARCHAR','RAISE60SEC': 'VARCHAR',
    'RAISE6SEC': 'VARCHAR','MARGINAL5MINVALUE': 'VARCHAR','MARGINAL60SECVALUE': 'VARCHAR',
    'MARGINAL6SECVALUE': 'VARCHAR','MARGINALVALUE': 'VARCHAR','VIOLATION5MINDEGREE': 'VARCHAR',
    'VIOLATION60SECDEGREE': 'VARCHAR','VIOLATION6SECDEGREE': 'VARCHAR','VIOLATIONDEGREE': 'VARCHAR',
    'LOWERREG': 'VARCHAR','RAISEREG': 'VARCHAR','AVAILABILITY': 'VARCHAR','RAISE6SECFLAGS': 'VARCHAR',
    'RAISE60SECFLAGS': 'VARCHAR','RAISE5MINFLAGS': 'VARCHAR','RAISEREGFLAGS': 'VARCHAR',
    'LOWER6SECFLAGS': 'VARCHAR','LOWER60SECFLAGS': 'VARCHAR','LOWER5MINFLAGS': 'VARCHAR',
    'LOWERREGFLAGS': 'VARCHAR','RAISEREGAVAILABILITY': 'VARCHAR','RAISEREGENABLEMENTMAX': 'VARCHAR',
    'RAISEREGENABLEMENTMIN': 'VARCHAR','LOWERREGAVAILABILITY': 'VARCHAR','LOWERREGENABLEMENTMAX': 'VARCHAR',
    'LOWERREGENABLEMENTMIN': 'VARCHAR','RAISE6SECACTUALAVAILABILITY': 'VARCHAR',
    'RAISE60SECACTUALAVAILABILITY': 'VARCHAR','RAISE5MINACTUALAVAILABILITY': 'VARCHAR',
    'RAISEREGACTUALAVAILABILITY': 'VARCHAR','LOWER6SECACTUALAVAILABILITY': 'VARCHAR',
    'LOWER60SECACTUALAVAILABILITY': 'VARCHAR','LOWER5MINACTUALAVAILABILITY': 'VARCHAR','LOWERREGACTUALAVAILABILITY': 'VARCHAR'
    }},
    filename =1,null_padding = true,ignore_errors=1,auto_detect=false)
    where I='D' and UNIT ='DUNIT' AND VERSION = '3'                  """)
    key_columns = ['SETTLEMENTDATE','DUID','I','filename','UNIT']
    # Walk raw.columns in declared order. Building this list from a set made the
    # output column order (and so the Arrow schema) differ from run to run.
    exprs = [
        duckdb.ColumnExpression(x).cast(duckdb.typing.DOUBLE).alias(x)
        for x in raw.columns
        if x not in key_columns
    ]
    # The name `rel2` must be kept: the SQL below refers to this local by name.
    rel2 = raw.select(*key_columns, *exprs)
    final=duckdb.sql(""" select *exclude(SETTLEMENTDATE,I,XX,filename),cast (SETTLEMENTDATE as timestamp) as SETTLEMENTDATE,
    parse_filename(filename) as file,isoyear (cast (SETTLEMENTDATE as timestamp)) as YEAR  from rel2  """)
    return final.arrow()

In [12]:
# @title
def clean_price(files_to_upload_full_Path):
    """Parse the DREGION rows of the given CSV files and return them as Arrow data.

    The files are read with DuckDB ``read_csv`` using an explicit all-VARCHAR
    column list, skipping the first line. Only rows with ``I='D'``,
    ``UNIT='DREGION'`` and ``VERSION='3'`` are kept. Every column except
    SETTLEMENTDATE, REGIONID, I, filename and UNIT is cast to DOUBLE. The
    final projection drops I, XX and filename, casts SETTLEMENTDATE to a
    timestamp, and adds ``date``, ``file`` (from ``parse_filename``), a
    constant ``PRIORITY`` of 0 and ``YEAR`` (``isoyear``).

    Parameters
    ----------
    files_to_upload_full_Path : list of str
        CSV paths; the list is interpolated into the SQL text as a list literal.

    Returns
    -------
    The result of ``final.arrow()``.
    """
    # VERSION is declared VARCHAR, so it is compared with the string '3', as in
    # clean_scada, rather than with an integer literal.
    raw =duckdb.sql(F"""from read_csv({files_to_upload_full_Path},
  Skip=1,header =0,all_varchar=1,
  columns={{
  'I': 'VARCHAR','UNIT': 'VARCHAR','XX': 'VARCHAR','VERSION': 'VARCHAR','SETTLEMENTDATE': 'VARCHAR','RUNNO': 'VARCHAR',
  'REGIONID': 'VARCHAR','INTERVENTION': 'VARCHAR','RRP': 'VARCHAR','EEP': 'VARCHAR','ROP': 'VARCHAR','APCFLAG': 'VARCHAR',
  'MARKETSUSPENDEDFLAG': 'VARCHAR','TOTALDEMAND': 'VARCHAR','DEMANDFORECAST': 'VARCHAR','DISPATCHABLEGENERATION': 'VARCHAR',
  'DISPATCHABLELOAD': 'VARCHAR','NETINTERCHANGE': 'VARCHAR','EXCESSGENERATION': 'VARCHAR','LOWER5MINDISPATCH': 'VARCHAR',
  'LOWER5MINIMPORT': 'VARCHAR','LOWER5MINLOCALDISPATCH': 'VARCHAR','LOWER5MINLOCALPRICE': 'VARCHAR','LOWER5MINLOCALREQ': 'VARCHAR',
  'LOWER5MINPRICE': 'VARCHAR','LOWER5MINREQ': 'VARCHAR','LOWER5MINSUPPLYPRICE': 'VARCHAR','LOWER60SECDISPATCH': 'VARCHAR','LOWER60SECIMPORT': 'VARCHAR',
  'LOWER60SECLOCALDISPATCH': 'VARCHAR','LOWER60SECLOCALPRICE': 'VARCHAR','LOWER60SECLOCALREQ': 'VARCHAR','LOWER60SECPRICE': 'VARCHAR',
  'LOWER60SECREQ': 'VARCHAR','LOWER60SECSUPPLYPRICE': 'VARCHAR','LOWER6SECDISPATCH': 'VARCHAR','LOWER6SECIMPORT': 'VARCHAR',
  'LOWER6SECLOCALDISPATCH': 'VARCHAR','LOWER6SECLOCALPRICE': 'VARCHAR','LOWER6SECLOCALREQ': 'VARCHAR','LOWER6SECPRICE': 'VARCHAR',
  'LOWER6SECREQ': 'VARCHAR','LOWER6SECSUPPLYPRICE': 'VARCHAR','RAISE5MINDISPATCH': 'VARCHAR','RAISE5MINIMPORT': 'VARCHAR',
  'RAISE5MINLOCALDISPATCH': 'VARCHAR','RAISE5MINLOCALPRICE': 'VARCHAR','RAISE5MINLOCALREQ': 'VARCHAR','RAISE5MINPRICE': 'VARCHAR',
  'RAISE5MINREQ': 'VARCHAR','RAISE5MINSUPPLYPRICE': 'VARCHAR','RAISE60SECDISPATCH': 'VARCHAR','RAISE60SECIMPORT': 'VARCHAR',
  'RAISE60SECLOCALDISPATCH': 'VARCHAR','RAISE60SECLOCALPRICE': 'VARCHAR','RAISE60SECLOCALREQ': 'VARCHAR','RAISE60SECPRICE': 'VARCHAR',
  'RAISE60SECREQ': 'VARCHAR','RAISE60SECSUPPLYPRICE': 'VARCHAR','RAISE6SECDISPATCH': 'VARCHAR','RAISE6SECIMPORT': 'VARCHAR',
  'RAISE6SECLOCALDISPATCH': 'VARCHAR','RAISE6SECLOCALPRICE': 'VARCHAR','RAISE6SECLOCALREQ': 'VARCHAR','RAISE6SECPRICE': 'VARCHAR',
  'RAISE6SECREQ': 'VARCHAR','RAISE6SECSUPPLYPRICE': 'VARCHAR','AGGREGATEDISPATCHERROR': 'VARCHAR','AVAILABLEGENERATION': 'VARCHAR',
  'AVAILABLELOAD': 'VARCHAR','INITIALSUPPLY': 'VARCHAR','CLEAREDSUPPLY': 'VARCHAR','LOWERREGIMPORT': 'VARCHAR','LOWERREGLOCALDISPATCH': 'VARCHAR',
  'LOWERREGLOCALREQ': 'VARCHAR','LOWERREGREQ': 'VARCHAR','RAISEREGIMPORT': 'VARCHAR','RAISEREGLOCALDISPATCH': 'VARCHAR','RAISEREGLOCALREQ': 'VARCHAR',
  'RAISEREGREQ': 'VARCHAR','RAISE5MINLOCALVIOLATION': 'VARCHAR','RAISEREGLOCALVIOLATION': 'VARCHAR','RAISE60SECLOCALVIOLATION': 'VARCHAR',
  'RAISE6SECLOCALVIOLATION': 'VARCHAR','LOWER5MINLOCALVIOLATION': 'VARCHAR','LOWERREGLOCALVIOLATION': 'VARCHAR','LOWER60SECLOCALVIOLATION': 'VARCHAR',
  'LOWER6SECLOCALVIOLATION': 'VARCHAR','RAISE5MINVIOLATION': 'VARCHAR','RAISEREGVIOLATION': 'VARCHAR','RAISE60SECVIOLATION': 'VARCHAR',
  'RAISE6SECVIOLATION': 'VARCHAR','LOWER5MINVIOLATION': 'VARCHAR','LOWERREGVIOLATION': 'VARCHAR','LOWER60SECVIOLATION': 'VARCHAR',
  'LOWER6SECVIOLATION': 'VARCHAR','RAISE6SECRRP': 'VARCHAR','RAISE6SECROP': 'VARCHAR','RAISE6SECAPCFLAG': 'VARCHAR','RAISE60SECRRP': 'VARCHAR',
  'RAISE60SECROP': 'VARCHAR','RAISE60SECAPCFLAG': 'VARCHAR','RAISE5MINRRP': 'VARCHAR','RAISE5MINROP': 'VARCHAR','RAISE5MINAPCFLAG': 'VARCHAR',
  'RAISEREGRRP': 'VARCHAR','RAISEREGROP': 'VARCHAR','RAISEREGAPCFLAG': 'VARCHAR','LOWER6SECRRP': 'VARCHAR','LOWER6SECROP': 'VARCHAR',
  'LOWER6SECAPCFLAG': 'VARCHAR','LOWER60SECRRP': 'VARCHAR','LOWER60SECROP': 'VARCHAR','LOWER60SECAPCFLAG': 'VARCHAR','LOWER5MINRRP': 'VARCHAR',
  'LOWER5MINROP': 'VARCHAR','LOWER5MINAPCFLAG': 'VARCHAR','LOWERREGRRP': 'VARCHAR','LOWERREGROP': 'VARCHAR','LOWERREGAPCFLAG': 'VARCHAR',
  'RAISE6SECACTUALAVAILABILITY': 'VARCHAR','RAISE60SECACTUALAVAILABILITY': 'VARCHAR','RAISE5MINACTUALAVAILABILITY': 'VARCHAR',
  'RAISEREGACTUALAVAILABILITY': 'VARCHAR','LOWER6SECACTUALAVAILABILITY': 'VARCHAR','LOWER60SECACTUALAVAILABILITY': 'VARCHAR',
  'LOWER5MINACTUALAVAILABILITY': 'VARCHAR','LOWERREGACTUALAVAILABILITY': 'VARCHAR','LORSURPLUS': 'VARCHAR','LRCSURPLUS': 'VARCHAR',
  }},
  filename =1,null_padding = true,ignore_errors=1,auto_detect=false)
  where I='D' and UNIT ='DREGION' AND VERSION = '3'
                  """)
    key_columns = ['SETTLEMENTDATE','REGIONID','I','filename','UNIT']
    # Walk raw.columns in declared order. Building this list from a set made the
    # output column order (and so the Arrow schema) differ from run to run.
    exprs = [
        duckdb.ColumnExpression(x).cast(duckdb.typing.DOUBLE).alias(x)
        for x in raw.columns
        if x not in key_columns
    ]
    # The name `rel2` must be kept: the SQL below refers to this local by name.
    rel2 = raw.select(*key_columns, *exprs)
    # `filename` is written as an identifier in EXCLUDE, matching clean_scada.
    final=duckdb.sql(""" select *exclude(SETTLEMENTDATE,I,XX,filename),cast (SETTLEMENTDATE as timestamp) as SETTLEMENTDATE,
   cast(SETTLEMENTDATE as date) as date,
   parse_filename(filename) as file,
   0 as PRIORITY ,
   isoyear (cast (SETTLEMENTDATE as timestamp)) as YEAR  from rel2  """)
    return final.arrow()

**<mark>Fact Tables</mark>**

In [13]:
# Ingestion driver: fetch a batch of archives, unzip them, append any CSVs not
# yet recorded in each fact table, and repeat until a pass adds nothing.
Nbr_Files_to_Download = 60
Source                = "/lakehouse/default/Files/0_Source/ARCHIVE/Daily_Reports/"
Destination           = "/lakehouse/default/Files/1_Transform/CSV/Daily_Reports/"
# Explicit dispatch table instead of eval(f"clean_{tbl}(...)"): same call,
# without evaluating a string built at runtime.
cleaners = {'scada': clean_scada, 'price': clean_price}
while True:
    download("https://nemweb.com.au/Reports/Current/Daily_Reports/", Source, Nbr_Files_to_Download)
    unzip(Source, Destination)
    processed = False
    for tbl, clean in cleaners.items():
        # Rebinds the global `catalog` that get_Path() reads.
        catalog = connect_catalog()
        files_to_upload_full_Path = get_Path(Destination, tbl)
        if len(files_to_upload_full_Path) > 0:
            df = clean(files_to_upload_full_Path)
            if df.num_rows == 0:
                # Files that yield no rows never show up in the table's `file`
                # column, so they stay pending forever. Counting them as
                # progress would keep this loop running indefinitely.
                print(f'{tbl}: pending files produced no rows, skipped')
                continue
            catalog.create_table_if_not_exists(f'{db}.{tbl}', schema=df.schema, location=table_location + f'/{db}/{tbl}')
            catalog.load_table(f'{db}.{tbl}').append(df)
            print(f'{tbl} updated')
            processed = True
        else:
            print(f'{tbl} loaded already')
    if not processed:
        break

0 New File Loaded
0 New File uncompressed


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

94
scada loaded already
94
price loaded already


**<mark>CALENDAR</mark>**

In [14]:
# @title
# Build the calendar dimension once: one row per day with its year and month.
tbl = f"{db}.calendar"
if catalog.table_exists(tbl):
    print("table exist already")
else:
    calendar_sql = """ SELECT cast(unnest(generate_series(cast ('2018-04-01' as date), cast('2024-12-31' as date), interval 1 day)) as date) as date,
            EXTRACT(year from date) as year,
            EXTRACT(month from date) as month
            """
    df = duckdb.sql(calendar_sql).arrow()
    catalog.create_table(tbl, schema=df.schema, location=f"{table_location}/{db}/calendar")
    catalog.load_table(tbl).overwrite(df)
    print('calendar created')



calendar created


**<mark>DUID</mark>**

In [15]:
# @title
import pathlib

import duckdb
import requests

# Build the DUID dimension from two downloads (the NEM registration workbook
# and the WA facilities CSV) and overwrite the `duid` table with the result.
DUID_Path = "/lakehouse/default/Files/0_Source/Dimensions/DUID/"
pathlib.Path(DUID_Path).mkdir(parents=True, exist_ok=True)

url = "https://www.aemo.com.au/-/media/Files/Electricity/NEM/Participant_Information/NEM-Registration-and-Exemption-List.xls"
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'}
r = requests.get(url, headers=headers, timeout=120)
# Fail here on an HTTP error instead of saving the error page as the workbook.
r.raise_for_status()
with open(DUID_Path+"NEM-Registration-and-Exemption-List.xls", 'wb') as output:
    output.write(r.content)

duckdb.sql(f"""
INSTALL spatial;
LOAD spatial;
create or replace table DUID_raw as SELECT Region,DUID,first("Fuel Source - Descriptor") as FuelSourceDescriptor,first(Participant) as Participant
FROM st_read('{DUID_Path}NEM-Registration-and-Exemption-List.xls', layer = 'PU and Scheduled Loads',open_options = ['HEADERS=FORCE'])
group by all
""")

dls = "https://data.wa.aemo.com.au/datafiles/post-facilities/facilities.csv"
resp = requests.get(dls, timeout=120)
resp.raise_for_status()
with open(DUID_Path+"facilities_WA.csv", 'wb') as output:
    output.write(resp.content)

# WA facilities carry no region column; every row is labelled 'WA1'.
duckdb.sql(f""" create or replace view x as select 'WA1' as Region  , "Facility Code" as DUID ,"Participant Name" as Participant from read_csv_auto('{DUID_Path}facilities_WA.csv')""")

# The technology of each WA facility comes from a separate CSV, joined on duid.
duckdb.sql("""select x.Region,x.DUID, TECHNOLOGY as FuelSourceDescriptor,Participant from x
  left join (select * FROM read_csv_auto('https://github.com/djouallah/aemo_fabric/raw/main/WA_ENERGY.csv',header=1)) as z
  on x.duid=z.duid """).to_view('DUID_WA')

# Union both sources and collapse to one row per trimmed DUID.
df=duckdb.sql(f""" with xx as (select * from DUID_raw union BY NAME select * from DUID_WA)
                select trim(DUID) as DUID,min(Region) as Region, min(FuelSourceDescriptor) as FuelSourceDescriptor,min(Participant) as Participant from xx group by all
                """).arrow()
catalog.create_table_if_not_exists(db+".duid",schema=df.schema,location= table_location+f'/{db}/duid')
catalog.load_table(db+".duid").overwrite(df)
print('duid updated')

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

duid updated


# Read

In [16]:
# Keep element 1 of each identifier returned by the catalog for this namespace.
tables_list = [identifier[1] for identifier in catalog.list_tables(db)]
tables_list

['scada', 'price', 'calendar', 'duid']

In [19]:
%%time
duckdb.sql(f"""
SET azure_transport_option_type = 'curl';
CREATE or replace SECRET secret1 ( TYPE AZURE,account_name {account_name}, CONNECTION_STRING '{CONNECTION_STRING}');
INSTALL iceberg;LOAD iceberg;
""")
for tbl in tables_list:
 duckdb.sql(f"""
 SET  VARIABLE last_snapshot_{tbl} = (select max(file) from glob ('{table_location}/{db}/{tbl}/metadata/*.metadata.json')) ;
 create or replace view {tbl} as select * from  iceberg_scan(getvariable('last_snapshot_{tbl}'))
 """)

CPU times: total: 1.19 s
Wall time: 3.31 s


In [20]:
# Row count of the `scada` view, read through DuckDB.
duckdb.sql(""" select count(*) from scada """)

┌──────────────┐
│ count_star() │
│    int64     │
├──────────────┤
│     13551947 │
└──────────────┘

In [23]:
import daft
# Reconnect to the catalog and open the scada table as a Daft dataframe.
catalog  = connect_catalog()
df = daft.read_iceberg(catalog.load_table("azure.scada"))

In [24]:
# Row count of the same table, read through Daft.
df.count_rows()

ScanWithTask-Aggregate [Stage:2]:   0%|          | 0/1 [00:00<?, ?it/s]

ReduceMerge-Aggregate-Project-Project [Stage:1]:   0%|          | 0/1 [00:00<?, ?it/s]

13551947