# Write

In [4]:
# Optional one-time setup: uncomment to install the dependencies into the kernel's environment.
#%pip install pyparsing  --upgrade
#%pip install -q pyiceberg[adlfs]
#%pip install -q getdaft
#%pip install -q psycopg2

In [5]:
# Iceberg namespace used by this notebook: it is created in the catalog below and
# prefixed to every table identifier (e.g. db + ".scada").
db = "daftaemo"

In [6]:
import configparser

# INI file that keeps connection strings and storage credentials out of the notebook.
CONFIG_PATH = "/lakehouse/default/Files/KV/variable.ini"

config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list of
# files it actually parsed; fail here with the path instead of a later, misleading
# "No section: 'myvars'" error.
if not config.read(CONFIG_PATH):
    raise FileNotFoundError(f"Configuration file not found or unreadable: {CONFIG_PATH}")

# Postgres connection :  postgresql+psycopg2://user:password@xxxx.yyyyy.database.azure.com/db?sslmode=require
postgresql_db               = config.get("myvars", "pg_azure")
AZURE_STORAGE_ACCOUNT_KEY   = config.get("myvars", "AccountKey")
azure_storage_tenant_id     = config.get("myvars", "tenantId")
account_name                = config.get("myvars", "account_name")
table_location              = config.get("myvars", "table_location_azure")

In [7]:
from   pyiceberg.catalog import load_catalog
from   pyiceberg.catalog.sql import SqlCatalog
from   pyiceberg.io.fsspec import FsspecFileIO
import os
from   datetime import datetime
import glob
import os
from   psutil import *
import re 
import requests
from   shutil import unpack_archive
from   urllib.request import urlopen
import pathlib
import daft
from   daft import DataType, col

**<mark>Connect to the Catalog</mark>**

In [8]:
def connect_catalog():
    """Create the PyIceberg SQL catalog used throughout this notebook.

    The connection settings are taken from the module-level variables
    ``postgresql_db``, ``account_name``, ``AZURE_STORAGE_ACCOUNT_KEY``,
    ``azure_storage_tenant_id`` and ``table_location``.

    Returns:
        A ``SqlCatalog`` named ``"default"`` configured with those settings.
    """
    properties = {
        # Catalog database connection string.
        "uri": postgresql_db,
        # Azure storage account settings passed to the file IO layer.
        "adlfs.account-name": account_name,
        "adlfs.account-key": AZURE_STORAGE_ACCOUNT_KEY,
        "adlfs.tenant-id": azure_storage_tenant_id,
        # Use the fsspec-based FileIO implementation.
        "py-io-impl": "pyiceberg.io.fsspec.FsspecFileIO",
        "legacy-current-snapshot-id": True,
        # Root location under which table data is stored.
        "warehouse": table_location,
    }
    return SqlCatalog("default", **properties)

In [9]:
# Open the catalog connection and create the namespace named by `db` unless it already exists.
# `catalog` is a module-level name that get_table_files() and get_Path() read directly.
catalog  = connect_catalog()
catalog.create_namespace_if_not_exists(db)

In [10]:
def download(url,Path,total_files):
    """Download ``.zip`` files listed at ``url`` that are not yet present in ``Path``.

    The page at ``url`` is scanned for ``*.zip`` names. Names are de-duplicated,
    ordered in reverse lexicographic order (latest first when names embed a
    sortable timestamp -- assumption about the listing, confirm for other
    sources), filtered against the archives already in ``Path`` and capped at
    ``total_files``.

    Args:
        url: Listing URL; each file is fetched from ``url + <file name>``.
        Path: Local directory for the archives (created if missing). A trailing
            separator is optional.
        total_files: Maximum number of new files to download in this call.

    Returns:
        The string ``"done"``.
    """
    os.makedirs(Path, exist_ok=True)

    # Close the listing response deterministically instead of leaking it.
    with urlopen(url) as listing:
        result = listing.read().decode('utf-8')

    # The dot before "zip" is escaped so only real ".zip" names match
    # (the unescaped form also matched e.g. "fooXzip").
    pattern = re.compile(r'[\w.]*\.zip')
    filelist = sorted(set(pattern.findall(result)), reverse=True)

    current = {os.path.basename(x) for x in glob.glob(os.path.join(Path, '*.zip'))}
    # Filter while preserving the sorted order, so the cap keeps the first
    # `total_files` names of the listing rather than an arbitrary subset.
    files_to_upload = [name for name in filelist if name not in current][:total_files]
    print(str(len(files_to_upload)) + ' New File Loaded')

    for x in files_to_upload:
        target = os.path.join(Path, x)
        # Stream into a ".part" file first: an interrupted download must not
        # leave a truncated "*.zip" that later runs treat as already fetched.
        partial = target + '.part'
        with requests.get(url+x, stream=True) as resp:
            if not resp.ok:
                # Best-effort: report the failure and carry on with the other files.
                print(f'Skipping {x}: HTTP {resp.status_code}')
                continue
            with open(partial, "wb") as f:
                for chunk in resp.iter_content(chunk_size=4096):
                    f.write(chunk)
        os.replace(partial, target)
    return "done"

In [11]:
def unzip(Source, Destination):
    """Extract every archive in ``Source`` that has no matching CSV in ``Destination``.

    An archive ``<stem>.zip`` counts as already extracted when
    ``<stem>.CSV`` exists in ``Destination``.

    Args:
        Source: Directory containing the ``*.zip`` archives. A trailing
            separator is optional.
        Destination: Directory receiving the extracted files (created if
            missing). A trailing separator is optional.

    Returns:
        ``"done"`` when at least one archive was extracted, otherwise
        ``"nothing to see here"``.
    """
    os.makedirs(Destination, exist_ok=True)

    # os.path.join keeps the glob inside the directory whether or not the
    # caller passed a trailing separator.
    archives = {os.path.basename(x) for x in glob.glob(os.path.join(Source, '*.zip'))}

    # Check which files were unzipped already: map each CSV back to the
    # archive name it is expected to come from.
    extracted = {
        os.path.splitext(os.path.basename(x))[0] + '.zip'
        for x in glob.glob(os.path.join(Destination, '*.CSV'))
    }

    # Unzip only the delta, in a stable order.
    files_to_upload = sorted(archives - extracted)
    print(str(len(files_to_upload)) + ' New File uncompressed')
    if not files_to_upload:
        return "nothing to see here"

    for x in files_to_upload:
        unpack_archive(os.path.join(Source, x), Destination, 'zip')
    return "done"

In [12]:
def get_table_files(db,table):
    """Return the distinct values of the ``file`` column of an Iceberg table.

    Args:
        db: Namespace of the table.
        table: Table name inside ``db``.

    Returns:
        A list with one entry per distinct ``file`` value, read through the
        module-level ``catalog``.
    """
    iceberg_table = catalog.load_table(db+'.'+table)
    distinct_rows = daft.read_iceberg(iceberg_table).select('file').distinct().to_pylist()
    return [row['file'] for row in distinct_rows]

In [13]:
def get_Path(Source,Destination,Nbr_Files_to_Download):
    """List the CSV files in ``Source`` that are not yet recorded in an Iceberg table.

    The table ``db + "." + Destination`` is looked up through the module-level
    ``catalog``; when it exists, the file names already stored in its ``file``
    column are excluded. The number of known files is printed.

    Args:
        Source: Directory containing the ``*.CSV`` files. A trailing separator
            is optional.
        Destination: Name of the target table inside the ``db`` namespace.
        Nbr_Files_to_Download: Maximum number of paths to return.

    Returns:
        Full paths of the new files, sorted by file name and capped at
        ``Nbr_Files_to_Download``.
    """
    if catalog.table_exists(db+"."+Destination):
        existing_files = set(get_table_files(db,Destination))
    else:
        existing_files = set()
    print(len(existing_files))

    filelist_csv = {os.path.basename(x) for x in glob.glob(os.path.join(Source, '*.CSV'))}
    # Sort before capping so each batch is deterministic instead of depending
    # on set iteration order.
    files_to_upload = sorted(filelist_csv - existing_files)[:Nbr_Files_to_Download]
    return [os.path.join(Source, name) for name in files_to_upload]

In [14]:
def clean_scada(files_to_upload_full_Path):
    """Read the given CSV files and shape the DUNIT rows into a typed Daft DataFrame.

    Steps:
      1. Read every field as a string, without headers, allowing rows with a
         variable number of columns, and record the source path in ``fullpath``.
      2. Keep only rows where ``UNIT == 'DUNIT'``, ``VERSION == '3'`` and ``I == 'D'``.
      3. Add ``file`` (the ``[^/]*.CSV`` part extracted from ``fullpath``) and
         drop ``I``, ``XX`` and ``fullpath``.
      4. Cast every column except ``SETTLEMENTDATE``, ``DUID``, ``file`` and
         ``UNIT`` to float64.
      5. Parse ``SETTLEMENTDATE`` as a datetime and derive ``DATE`` and ``year``.

    Args:
        files_to_upload_full_Path: Paths passed straight to ``daft.read_csv``.

    Returns:
        The transformed Daft DataFrame.
    """
    # Positional column names of the CSV rows; order matters because the files
    # are read without headers.
    raw_columns = [
        'I', 'UNIT', 'XX', 'VERSION', 'SETTLEMENTDATE', 'RUNNO',
        'DUID', 'INTERVENTION', 'DISPATCHMODE', 'AGCSTATUS', 'INITIALMW',
        'TOTALCLEARED', 'RAMPDOWNRATE', 'RAMPUPRATE', 'LOWER5MIN',
        'LOWER60SEC', 'LOWER6SEC', 'RAISE5MIN', 'RAISE60SEC',
        'RAISE6SEC', 'MARGINAL5MINVALUE', 'MARGINAL60SECVALUE',
        'MARGINAL6SECVALUE', 'MARGINALVALUE', 'VIOLATION5MINDEGREE',
        'VIOLATION60SECDEGREE', 'VIOLATION6SECDEGREE', 'VIOLATIONDEGREE',
        'LOWERREG', 'RAISEREG', 'AVAILABILITY', 'RAISE6SECFLAGS',
        'RAISE60SECFLAGS', 'RAISE5MINFLAGS', 'RAISEREGFLAGS',
        'LOWER6SECFLAGS', 'LOWER60SECFLAGS', 'LOWER5MINFLAGS',
        'LOWERREGFLAGS', 'RAISEREGAVAILABILITY', 'RAISEREGENABLEMENTMAX',
        'RAISEREGENABLEMENTMIN', 'LOWERREGAVAILABILITY', 'LOWERREGENABLEMENTMAX',
        'LOWERREGENABLEMENTMIN', 'RAISE6SECACTUALAVAILABILITY',
        'RAISE60SECACTUALAVAILABILITY', 'RAISE5MINACTUALAVAILABILITY',
        'RAISEREGACTUALAVAILABILITY', 'LOWER6SECACTUALAVAILABILITY',
        'LOWER60SECACTUALAVAILABILITY', 'LOWER5MINACTUALAVAILABILITY',
        'LOWERREGACTUALAVAILABILITY',
    ]
    # Everything is read as text first; numeric casts happen after filtering.
    schema = {name: DataType.string() for name in raw_columns}

    df = daft.read_csv(
        files_to_upload_full_Path,
        schema=schema,
        infer_schema=False,
        has_headers=False,
        allow_variable_columns=True,
        file_path_column="fullpath",
    )

    wanted_rows = (col("UNIT") == 'DUNIT') & (col("VERSION") == '3') & (col("I") == 'D')
    df = df.where(wanted_rows)

    df = df.with_column('file', col("fullpath").str.extract(r"[^\/]*\.CSV"))
    df = df.exclude('I','XX','fullpath')

    # Columns that keep their string type (SETTLEMENTDATE is parsed separately below).
    keep_as_is = {'SETTLEMENTDATE','DUID','file','UNIT'}
    for col_name in df.column_names:
        if col_name in keep_as_is:
            continue
        df = df.with_column(col_name, col(col_name).cast(DataType.float64()))

    df = df.with_column("SETTLEMENTDATE", col("SETTLEMENTDATE").str.to_datetime("%Y/%m/%d %H:%M:%S"))
    df = df.with_column('DATE', col('SETTLEMENTDATE').cast(DataType.date()))
    df = df.with_column('year', col('SETTLEMENTDATE').dt.year())
    return df

**<mark>Main Logic</mark>**

In [15]:
# Batch size used both as the download cap and as the per-iteration load cap.
Nbr_Files_to_process  = 1000
Source                = "/lakehouse/default/Files/0_Source/ARCHIVE/Daily_Reports/"
Destination           = "/lakehouse/default/Files/Daily_Reports/"

# Fetch new archives, then extract the ones that have no CSV yet.
download("https://nemweb.com.au/Reports/Current/Daily_Reports/", Source, Nbr_Files_to_process)
unzip(Source, Destination)

# Explicit table -> cleaning function mapping. Replaces the previous
# eval(f"clean_{tbl}(...)") dispatch, which executed a dynamically built string.
TABLE_CLEANERS = {'scada': clean_scada}

# Keep loading batches until a full pass finds no new files for any table.
while True:
    processed = False
    for tbl, cleaner in TABLE_CLEANERS.items():
        # get_Path() reads the module-level `catalog`, so it is refreshed before each lookup.
        catalog = connect_catalog()
        files_to_upload_full_Path = get_Path(Destination, tbl,Nbr_Files_to_process)
        if len(files_to_upload_full_Path) > 0:
            df = cleaner(files_to_upload_full_Path)
            # The table is created from the DataFrame's schema on the first load only.
            catalog.create_table_if_not_exists(f'{db}.{tbl}', schema=df.schema().to_pyarrow_schema())
            df.write_iceberg(catalog.load_table(f'{db}.{tbl}'))
            print(f'{tbl} updated')
            processed = True
        else:
            print(f'{tbl} loaded already')
    if not processed:
        break

1 New File Loaded
1 New File uncompressed


ScanWithTask-Aggregate-FanoutHash [Stage:2]:   0%|          | 0/1 [00:00<?, ?it/s]

ReduceMerge-Aggregate [Stage:1]:   0%|          | 0/1 [00:00<?, ?it/s]

2198


ScanWithTask-Project-Project-WriteIceberg [Stage:3]:   0%|          | 0/1 [00:00<?, ?it/s]

scada updated


ScanWithTask-Aggregate-FanoutHash [Stage:5]:   0%|          | 0/1 [00:00<?, ?it/s]

ReduceMerge-Aggregate [Stage:4]:   0%|          | 0/1 [00:00<?, ?it/s]

2208
scada loaded already


# Read

In [16]:
# Reconnect to the catalog, open the scada table of the `db` namespace with Daft
# and display a preview of it.
catalog = connect_catalog()
scada = daft.read_iceberg(catalog.load_table(f"{db}.scada"))
scada.show()

UNIT Utf8,VERSION Float64,"SETTLEMENTDATE Timestamp(Microseconds, None)",RUNNO Float64,DUID Utf8,INTERVENTION Float64,DISPATCHMODE Float64,AGCSTATUS Float64,INITIALMW Float64,TOTALCLEARED Float64,RAMPDOWNRATE Float64,RAMPUPRATE Float64,LOWER5MIN Float64,LOWER60SEC Float64,LOWER6SEC Float64,RAISE5MIN Float64,RAISE60SEC Float64,RAISE6SEC Float64,MARGINAL5MINVALUE Float64,MARGINAL60SECVALUE Float64,MARGINAL6SECVALUE Float64,MARGINALVALUE Float64,VIOLATION5MINDEGREE Float64,VIOLATION60SECDEGREE Float64,VIOLATION6SECDEGREE Float64,VIOLATIONDEGREE Float64,LOWERREG Float64,RAISEREG Float64,AVAILABILITY Float64,RAISE6SECFLAGS Float64,RAISE60SECFLAGS Float64,RAISE5MINFLAGS Float64,RAISEREGFLAGS Float64,LOWER6SECFLAGS Float64,LOWER60SECFLAGS Float64,LOWER5MINFLAGS Float64,LOWERREGFLAGS Float64,RAISEREGAVAILABILITY Float64,RAISEREGENABLEMENTMAX Float64,RAISEREGENABLEMENTMIN Float64,LOWERREGAVAILABILITY Float64,LOWERREGENABLEMENTMAX Float64,LOWERREGENABLEMENTMIN Float64,RAISE6SECACTUALAVAILABILITY Float64,RAISE60SECACTUALAVAILABILITY Float64,RAISE5MINACTUALAVAILABILITY Float64,RAISEREGACTUALAVAILABILITY Float64,LOWER6SECACTUALAVAILABILITY Float64,LOWER60SECACTUALAVAILABILITY Float64,LOWER5MINACTUALAVAILABILITY Float64,LOWERREGACTUALAVAILABILITY Float64,file Utf8,DATE Date,year Int32
DUNIT,3,2024-10-29 04:05:00,1,ADPBA1,0,0,1,0.103,0.0,93.12,93.12,2,2,2,2,2,0,,,,,,,,,0,0,6.0,1,1,1,1,1,1,1,1,7.75999,6,-6,7.75999,6,-6,3,3,3,4,3,3,3,4,PUBLIC_DAILY_202410290000_20241030040503.CSV,2024-10-29,2024
DUNIT,3,2024-10-29 04:05:00,1,ADPPV1,0,0,0,0.0,0.0,120.0,120.0,0,0,0,0,0,0,,,,,,,,,0,0,0.0,0,0,0,0,0,0,0,0,0.0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,PUBLIC_DAILY_202410290000_20241030040503.CSV,2024-10-29,2024
DUNIT,3,2024-10-29 04:05:00,1,AGLHAL,0,0,0,0.0,0.0,720.0,720.0,0,0,0,0,0,0,,,,,,,,,0,0,168.0,0,0,0,0,0,0,0,0,0.0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,PUBLIC_DAILY_202410290000_20241030040503.CSV,2024-10-29,2024
DUNIT,3,2024-10-29 04:05:00,1,AGLSOM,0,0,0,0.0,0.0,480.0,480.0,0,0,0,0,0,0,,,,,,,,,0,0,126.0,0,0,0,0,0,0,0,0,0.0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,PUBLIC_DAILY_202410290000_20241030040503.CSV,2024-10-29,2024
DUNIT,3,2024-10-29 04:05:00,1,ANGAST1,0,0,0,0.0,0.0,840.0,840.0,0,0,0,0,0,0,,,,,,,,,0,0,21.0,0,0,0,0,0,0,0,0,0.0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,PUBLIC_DAILY_202410290000_20241030040503.CSV,2024-10-29,2024
DUNIT,3,2024-10-29 04:05:00,1,APD01,0,0,0,0.0,0.0,0.0,0.0,0,0,0,0,65,21,,,,,,,,,0,0,0.0,1,1,1,0,0,0,0,0,0.0,0,0,0.0,0,0,450,450,450,0,0,0,0,0,PUBLIC_DAILY_202410290000_20241030040503.CSV,2024-10-29,2024
DUNIT,3,2024-10-29 04:05:00,1,ARWF1,0,0,0,53.7,53.35477,240.0,1200.0,0,0,0,0,0,0,,,,,,,,,0,0,53.35477,0,0,0,0,0,0,0,0,0.0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,PUBLIC_DAILY_202410290000_20241030040503.CSV,2024-10-29,2024
DUNIT,3,2024-10-29 04:05:00,1,ASDLBY01,0,0,0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,,,,,,,,,0,0,0.0,0,0,0,0,0,0,0,0,0.0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,PUBLIC_DAILY_202410290000_20241030040503.CSV,2024-10-29,2024
