In [None]:
!pip install deltalake==0.18.2
!pip install duckdb
!pip install azure-identity

In [None]:
# Notebook-wide configuration. Every name below is read as a global by later cells.

# Upper bound on files handled per run: passed to download() as its file limit and
# used by get_Path() to truncate the list of files to load.
Nbr_Files_to_Download  =  600
# Sub-folder under Tables/ and Files/ in the lakehouse; also used as the DuckDB schema name.
schema                 = 'aemo' 
# Workspace name: the container part of the abfss URL and the file system name
# handed to get_onelake_fs().
ws                     = 'sqlengines'  
# Lakehouse name ("<lh>.Lakehouse" in every OneLake path).
lh                     = 'power'
# Value passed as engine= to every write_deltalake() call.
engine                 = 'pyarrow'
# Base URL of the Delta tables; table names are appended directly (note the trailing '/').
table_base_url         =  f'abfss://{ws}@onelake.dfs.fabric.microsoft.com/{lh}.Lakehouse/Tables/{schema}/'
# Root of the Files area, relative to the workspace file system (no trailing '/').
onelake_file_root_path =  f"{lh}.Lakehouse/Files/{schema}"
# 'local' selects interactive browser authentication; any other value uses notebookutils.
runtime                = "fabric"

**_<u><mark>Import Python Libraries</mark></u>_**

In [None]:
from   deltalake.writer           import try_get_deltatable , write_deltalake
from   deltalake                  import DeltaTable
import pyarrow                    as pa
import os
import glob
from   azure.core.credentials     import AccessToken
from   azure.storage.filedatalake import DataLakeServiceClient
from   azure.identity             import DefaultAzureCredential
import duckdb
from   datetime                   import date, timedelta
import time
from   psutil                     import *
import re
import requests
from   shutil                     import unpack_archive
from   urllib.request             import urlopen
from   concurrent.futures         import ThreadPoolExecutor, as_completed

**_<u><mark>Authentication</mark></u>_**

In [None]:
# Acquire a storage access token and publish it through the process environment.
# The rest of the notebook looks it up with os.getenv('azure_storage_token').
if runtime == 'local':
    from   azure.identity import InteractiveBrowserCredential
    storage_token = InteractiveBrowserCredential().get_token("https://storage.azure.com/.default").token
    # Environment variable names are case-sensitive on Linux/macOS. The token used to be
    # stored only under the upper-case name while every reader uses the lower-case one,
    # so os.getenv() returned None. Publish it under both spellings.
    os.environ['AZURE_STORAGE_TOKEN'] = storage_token
    os.environ['azure_storage_token'] = storage_token
    # Register the token with DuckDB's default connection as well.
    duckdb.sql(f""" CREATE or replace SECRET onelake ( TYPE AZURE, PROVIDER ACCESS_TOKEN, ACCESS_TOKEN '{storage_token}')   """)
else:
    # `notebookutils` is not imported in this notebook; it has to be provided by the
    # runtime (this branch is the one taken for runtime == "fabric").
    os.environ['azure_storage_token'] = notebookutils.credentials.getToken('storage')
    print(' all good you are in fabric notebook')

<mark>**Optimize delta**</mark>

In [None]:
def optimize(tables):
    """Compact, vacuum and clean up the metadata of each named Delta table.

    Args:
        tables: iterable of table names; each is appended to ``table_base_url``.

    Returns:
        dict mapping every table name to ``"done"`` or to ``"error: <message>"``
        when any maintenance step raised. A failure never stops the loop.
    """
    outcome = {}
    for name in tables:
        try:
            delta_table = DeltaTable(table_base_url + name)
            delta_table.optimize.compact()
            # retention_hours=0 with enforcement disabled: unreferenced files are
            # removed immediately instead of after the retention period.
            delta_table.vacuum(retention_hours=0, dry_run=False, enforce_retention_duration=False)
            delta_table.cleanup_metadata()
        except Exception as exc:
            outcome[name] = f"error: {exc}"
        else:
            outcome[name] = "done"
    return outcome
def vacuum(tables):
    """Vacuum and clean up the metadata of each named Delta table (no compaction).

    Args:
        tables: iterable of table names; each is appended to ``table_base_url``.

    Returns:
        dict mapping every table name to ``"done"`` or to ``"error: <message>"``
        when a step raised. A failure never stops the loop.
    """
    outcome = {}
    for name in tables:
        try:
            delta_table = DeltaTable(table_base_url + name)
            # Zero retention with enforcement disabled: delete unreferenced files now.
            delta_table.vacuum(retention_hours=0, dry_run=False, enforce_retention_duration=False)
            delta_table.cleanup_metadata()
        except Exception as exc:
            outcome[name] = f"error: {exc}"
        else:
            outcome[name] = "done"
    return outcome

**<mark>Download Some Data from the web</mark>**

In [None]:
def create_directory_if_not_exist(fs,abfss_directory) :
    """Make sure ``abfss_directory`` exists in the file system ``fs``.

    Args:
        fs: file system client exposing ``get_directory_client(path)``.
        abfss_directory: directory path relative to the file system root.

    The existence probe is ``get_directory_properties()``; if it raises any
    ``Exception`` the directory is created. ``KeyboardInterrupt`` and
    ``SystemExit`` are not ``Exception`` subclasses, so interrupting the cell no
    longer triggers a directory creation (the previous bare ``except:`` did).
    """
    directory_client = fs.get_directory_client(abfss_directory)
    try:
        directory_client.get_directory_properties()
    except Exception:
        directory_client.create_directory()
        print(f"Directory '{abfss_directory}' created successfully.")
    else:
        print(f"Directory '{abfss_directory}' exist already.")

In [None]:
def download(url,Path,total_files):
    """Download zip archives listed at ``url`` that are not yet in OneLake.

    Args:
        url: address of an HTML listing; file names are appended to it to build
            each download URL, so it must end with '/'.
        Path: folder suffix (leading and trailing '/') appended both to '/tmp'
            for the local copy and to ``onelake_file_root_path`` for the remote check.
        total_files: maximum number of archives to fetch in this call.

    Returns:
        The string ``"done"``.

    Relies on the notebook globals ``fs`` and ``onelake_file_root_path``.
    """
    os.makedirs('/tmp'+Path, exist_ok=True)

    # Scrape the listing page for zip file names; close the HTTP response when done.
    with urlopen(url) as response:
        result = response.read().decode('utf-8')
    # The dot before "zip" is escaped so only real ".zip" names match
    # (an unescaped '.' would also accept e.g. "unzip").
    pattern = re.compile(r'[\w.]*\.zip')
    # De-duplicate, then sort names in descending order.
    filelist = sorted(set(pattern.findall(result)), reverse=True)

    onelake_folder = onelake_file_root_path + Path
    create_directory_if_not_exist(fs,onelake_folder)
    current = {os.path.basename(path.name) for path in fs.get_paths(path=onelake_folder)}

    # Filter with a comprehension rather than a set difference: a set would throw
    # away the descending order, making the truncation below pick arbitrary files.
    files_to_upload = [name for name in filelist if name not in current][:total_files]
    print(str(len(files_to_upload)) + ' New File Downloaded')
    for x in files_to_upload:
        with requests.get(url+x, stream=True) as resp:
            if not resp.ok:
                # Best effort: report the failure and carry on with the next file.
                print(f"Skipped {x}: HTTP status {resp.status_code}")
                continue
            with open(f"/tmp{Path}{x}", "wb") as f:
                for chunk in resp.iter_content(chunk_size=4096):
                    f.write(chunk)
    return "done"

**<mark>Unzip</mark>**

In [None]:
def uncompress(zip_file, source_path, destination_path):
    """Extract ``source_path/zip_file`` into ``destination_path``.

    Never raises for ordinary errors: the outcome (success or failure) is
    reported on stdout and the function always returns ``None``.
    """
    try:
        archive = os.path.join(source_path, zip_file)
        unpack_archive(archive, destination_path, 'zip')
    except Exception as err:
        print(f"Failed to uncompress {zip_file}: {err}")
        return
    print(f"Uncompressed: {zip_file}")

def unzip(source, destination):
    """Extract every archive in ``source`` whose output is not yet in ``destination``.

    An archive ``NAME.zip`` counts as already extracted when ``destination``
    holds a file ``NAME`` with a csv (any letter case) or ``.json`` extension.

    Args:
        source: directory scanned for ``*.zip`` files.
        destination: directory receiving the extracted files (created if missing).

    Returns:
        ``"Done"`` when at least one archive was submitted for extraction,
        otherwise ``"Nothing to uncompress"``.
    """
    os.makedirs(destination, exist_ok=True)

    # Archives available locally.
    zip_files = [os.path.basename(x) for x in glob.glob(os.path.join(source, '*.zip'))]

    # Map each extracted file back to the archive name it came from. splitext()
    # is used so '.csv' and '.json' outputs are recognised too; replacing only the
    # literal '.CSV' made those archives be extracted again on every run.
    extracted = (glob.glob(os.path.join(destination, '*.[Cc][Ss][Vv]')) +
                 glob.glob(os.path.join(destination, '*.json')))
    uncompressed_files = {
        os.path.splitext(os.path.basename(x))[0] + '.zip' for x in extracted
    }

    # Archives still to extract.
    files_to_uncompress = [z for z in zip_files if z not in uncompressed_files]

    print(f"{len(files_to_uncompress)} new file(s) to uncompress.")

    if not files_to_uncompress:
        return "Nothing to uncompress"

    # Leaving the `with` block waits for all submitted extractions to finish.
    with ThreadPoolExecutor() as executor:
        executor.map(lambda f: uncompress(f, source, destination), files_to_uncompress)
    return "Done"

In [None]:
def get_Path(Source,Table):
    """List local CSV/JSON files under ``Source`` that are not yet recorded in ``Table``.

    Args:
        Source: local folder prefix (must end with '/'); glob patterns and file
            names are appended to it directly.
        Table: location of the Delta table whose ``file`` column holds the names
            of files already loaded.

    Returns:
        Full local paths of the files still to load, at most
        ``Nbr_Files_to_Download`` of them. The order comes from a set difference
        and is therefore arbitrary.
    """
    if try_get_deltatable(Table,storage_options=None) is None:
        # No table yet: nothing has been loaded.
        existing_files = []
    else:
        # Only rows whose date is within the last three days are consulted.
        three_days_ago = (date.today() - timedelta(days=3)).isoformat()
        query = f""" select distinct file as file from delta_scan('{Table}') WHERE date > '{three_days_ago}' """
        existing_files = duckdb.sql(query).df()['file'].tolist()
    print(len(existing_files))

    local_names = [
        os.path.basename(found)
        for pattern in ('*.CSV', '*.json')
        for found in glob.glob(Source + pattern)
    ]
    # A set difference already yields unique names, so no further de-duplication is needed.
    pending = list(set(local_names) - set(existing_files))
    return [Source + name for name in pending][:Nbr_Files_to_Download]

In [None]:
def get_onelake_fs(file_system_name):
    """Build a OneLake file system client authenticated with the token from the environment.

    Args:
        file_system_name: name passed to ``get_file_system_client``.

    Returns:
        The file system client obtained from a ``DataLakeServiceClient`` pointed
        at the OneLake DFS endpoint.
    """
    class CustomTokenCredential:
        """Credential object that always hands back the same pre-fetched token."""

        def __init__(self, token):
            self.token = token

        def get_token(self, *scopes, **kwargs):
            # The expiry is a fixed far-future epoch value; the token's real
            # lifetime is not tracked here.
            return AccessToken(self.token, expires_on=9999999999)

    bearer = os.getenv('azure_storage_token')
    endpoint = "https://onelake.dfs.fabric.microsoft.com"
    client = DataLakeServiceClient(account_url=endpoint, credential=CustomTokenCredential(bearer))
    return client.get_file_system_client(file_system_name)

In [None]:
def List_files_to_upload(fs,local_folder,onelake_folder):
    """Return local files in ``local_folder`` that are absent from ``onelake_folder``.

    Args:
        fs: file system client (``get_paths`` / ``get_directory_client``).
        local_folder: local folder prefix; the glob pattern ``*.*`` is appended
            to it directly, so it needs a trailing '/'.
        onelake_folder: remote directory; created first if it does not exist.

    Returns:
        Paths built as ``local_folder + '/' + name``, sorted by file name in
        descending order.
    """
    create_directory_if_not_exist(fs,onelake_folder)
    remote_names = {os.path.basename(entry.name) for entry in fs.get_paths(path=onelake_folder)}
    local_names = {os.path.basename(found) for found in glob.glob(local_folder+'*.*')}
    missing = sorted(local_names - remote_names, reverse=True)
    return [local_folder+'/' + name for name in missing]

In [None]:
def upload_file(file_system_client, local_file, remote_directory):
    """
    Upload one local file into a remote directory, overwriting any existing copy.

    Args:
        file_system_client: client exposing ``get_file_client(path)``.
        local_file: path of the local file to send.
        remote_directory: remote directory prefix; the file name is appended to
            it directly, so it must end with '/'.

    Returns:
        ``(local_file, None)`` on success, ``(local_file, error_message)`` when
        anything raised. Progress is printed to stdout.
    """
    try:
        # Remote path = directory prefix + bare file name.
        remote_file_path = f"{remote_directory}{os.path.basename(local_file)}"

        print(f"Uploading {local_file} to {remote_file_path}")
        target = file_system_client.get_file_client(remote_file_path)
        with open(local_file, "rb") as payload:
            target.upload_data(payload, overwrite=True)
        print(f"Uploaded {local_file} successfully!")
    except Exception as exc:
        print(f"Failed to upload {local_file}: {exc}")
        return local_file, str(exc)
    return local_file, None

def upload_files(file_system_client, local_folder, remote_directory, max_workers=5):
    """
    Upload, in parallel threads, every file of a local folder that the remote directory lacks.

    Args:
        file_system_client: The file system client.
        local_folder: Local folder to scan; the list of files to send is computed
            by List_files_to_upload().
        remote_directory: The remote directory where files will be uploaded.
        max_workers: Maximum number of threads to use for parallel uploads.

    Returns nothing; the outcome of each upload is printed.
    """
    pending = List_files_to_upload(file_system_client,local_folder,remote_directory)
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        # One task per file.
        jobs = []
        for path in pending:
            jobs.append(pool.submit(upload_file, file_system_client, path, remote_directory))

        # Report each result as soon as its task finishes.
        for job in as_completed(jobs):
            uploaded, failure = job.result()
            if not failure:
                print(f"Successfully uploaded {uploaded}")
            else:
                print(f"Error uploading {uploaded}: {failure}")

**<mark>Price Today </mark>**

In [None]:
def get_price(files_to_upload_full_Path):
  """Parse dispatch price rows out of CSV files and return them as an Arrow table.

  Args:
      files_to_upload_full_Path: interpolated as-is into the ``read_csv(...)``
          call; the notebook passes the list of CSV paths returned by get_Path().

  Returns:
      ``final.arrow()``: the selected price columns plus the derived columns
      ``SETTLEMENTDATE`` (TIMESTAMPTZ), ``date``, ``file``, ``PRIORITY`` (0) and
      ``YEAR``.
  """
  # Read every file with a fixed, all-VARCHAR column layout: first line skipped,
  # no header row, source file name added as a `filename` column, bad rows ignored.
  # Only rows with I='D' and PRICE='PRICE' are kept.
  raw =duckdb.sql(f"""from read_csv({files_to_upload_full_Path},
  Skip=1,header =0,all_varchar=1,
  columns={{
  'I' : 'VARCHAR', 'DISPATCH' : 'VARCHAR', 'PRICE' : 'VARCHAR', 'xx' : 'VARCHAR', 'SETTLEMENTDATE' : 'VARCHAR', 'RUNNO' : 'VARCHAR', 'REGIONID' : 'VARCHAR',
   'DISPATCHINTERVAL' : 'VARCHAR', 'INTERVENTION' : 'VARCHAR', 'RRP' : 'VARCHAR', 'EEP' : 'VARCHAR', 'ROP' : 'VARCHAR', 'APCFLAG' : 'VARCHAR',
    'MARKETSUSPENDEDFLAG' : 'VARCHAR', 'LASTCHANGED' : 'VARCHAR', 'RAISE6SECRRP' : 'VARCHAR', 'RAISE6SECROP' : 'VARCHAR', 'RAISE6SECAPCFLAG' : 'VARCHAR',
    'RAISE60SECRRP' : 'VARCHAR', 'RAISE60SECROP' : 'VARCHAR', 'RAISE60SECAPCFLAG' : 'VARCHAR', 'RAISE5MINRRP' : 'VARCHAR', 'RAISE5MINROP' : 'VARCHAR',
    'RAISE5MINAPCFLAG' : 'VARCHAR', 'RAISEREGRRP' : 'VARCHAR', 'RAISEREGROP' : 'VARCHAR', 'RAISEREGAPCFLAG' : 'VARCHAR', 'LOWER6SECRRP' : 'VARCHAR',
     'LOWER6SECROP' : 'VARCHAR', 'LOWER6SECAPCFLAG' : 'VARCHAR', 'LOWER60SECRRP' : 'VARCHAR', 'LOWER60SECROP' : 'VARCHAR', 'LOWER60SECAPCFLAG' : 'VARCHAR',
     'LOWER5MINRRP' : 'VARCHAR', 'LOWER5MINROP' : 'VARCHAR', 'LOWER5MINAPCFLAG' : 'VARCHAR', 'LOWERREGRRP' : 'VARCHAR', 'LOWERREGROP' : 'VARCHAR',
      'LOWERREGAPCFLAG' : 'VARCHAR', 'PRICE_STATUS' : 'VARCHAR', 'PRE_AP_ENERGY_PRICE' : 'VARCHAR', 'PRE_AP_RAISE6_PRICE' : 'VARCHAR', 'PRE_AP_RAISE60_PRICE' : 'VARCHAR',
       'PRE_AP_RAISE5MIN_PRICE' : 'VARCHAR', 'PRE_AP_RAISEREG_PRICE' : 'VARCHAR', 'PRE_AP_LOWER6_PRICE' : 'VARCHAR', 'PRE_AP_LOWER60_PRICE' : 'VARCHAR',
        'PRE_AP_LOWER5MIN_PRICE' : 'VARCHAR', 'PRE_AP_LOWERREG_PRICE' : 'VARCHAR', 'RAISE1SECRRP' : 'VARCHAR', 'RAISE1SECROP' : 'VARCHAR', 'RAISE1SECAPCFLAG' : 'VARCHAR',
         'LOWER1SECRRP' : 'VARCHAR', 'LOWER1SECROP' : 'VARCHAR', 'LOWER1SECAPCFLAG' : 'VARCHAR', 'PRE_AP_RAISE1_PRICE' : 'VARCHAR',
          'PRE_AP_LOWER1_PRICE' : 'VARCHAR', 'CUMUL_PRE_AP_ENERGY_PRICE' : 'VARCHAR', 'CUMUL_PRE_AP_RAISE6_PRICE' : 'VARCHAR', 'CUMUL_PRE_AP_RAISE60_PRICE' : 'VARCHAR',
          'CUMUL_PRE_AP_RAISE5MIN_PRICE' : 'VARCHAR', 'CUMUL_PRE_AP_RAISEREG_PRICE' : 'VARCHAR', 'CUMUL_PRE_AP_LOWER6_PRICE' : 'VARCHAR',
          'CUMUL_PRE_AP_LOWER60_PRICE' : 'VARCHAR', 'CUMUL_PRE_AP_LOWER5MIN_PRICE' : 'VARCHAR', 'CUMUL_PRE_AP_LOWERREG_PRICE' : 'VARCHAR',
          'CUMUL_PRE_AP_RAISE1_PRICE' : 'VARCHAR', 'CUMUL_PRE_AP_LOWER1_PRICE' : 'VARCHAR', 'OCD_STATUS' : 'VARCHAR', 'MII_STATUS' : 'VARCHAR',
  }},
  filename =1,null_padding = true,ignore_errors=1,auto_detect=false)
  where I='D' and PRICE ='PRICE'

                  """)
  # Every column that is not in the pass-through set below gets cast to DOUBLE.
  # The list comes from set arithmetic, so its order is arbitrary.
  columns = list(set(raw.columns) - {'SETTLEMENTDATE','REGIONID','I','PRICE','filename','OCD_STATUS','MII_STATUS','DISPATCH','PRICE_STATUS','LASTCHANGED'})

  exprs = [
    duckdb.ColumnExpression(x).cast(duckdb.typing.DOUBLE).alias(x)
    for x in columns
  ]
  # Pass-through columns stay VARCHAR; the DOUBLE casts are appended after them.
  rel2 = raw.select('SETTLEMENTDATE','REGIONID','I','PRICE','filename','OCD_STATUS','MII_STATUS','DISPATCH','PRICE_STATUS','LASTCHANGED',*exprs)
  # The query below refers to the Python variable `rel2` by name (DuckDB looks it up
  # from the calling scope), so the variable must not be renamed.
  final=duckdb.sql(""" select *exclude(SETTLEMENTDATE,I,xx,'PRICE','filename'),
  cast (SETTLEMENTDATE as TIMESTAMPTZ) as SETTLEMENTDATE,
  cast(SETTLEMENTDATE as date) as date,
  parse_filename(filename) as file,
  0 as PRIORITY,
  isoyear (cast (SETTLEMENTDATE as TIMESTAMPTZ)) as YEAR
  from rel2  """)
  return final.arrow()

**<mark>SCADA Today </mark>**

In [None]:
def get_scada(files_to_upload_full_Path):
  """Parse unit SCADA rows out of CSV files and return them as an Arrow table.

  Args:
      files_to_upload_full_Path: interpolated as-is into the ``read_csv(...)``
          call; the notebook passes the list of CSV paths returned by get_Path().

  Returns:
      ``scada.arrow()`` with the columns DUID, INITIALMW, INTERVENTION,
      SETTLEMENTDATE, date, file, PRIORITY and YEAR.
  """
  # Fixed eight-column layout, first line skipped, no header row, source file name
  # added as `filename`. Only 'D' rows with a non-zero SCADAVALUE are kept.
  raw =duckdb.sql(f"""from read_csv({files_to_upload_full_Path},
  Skip=1,header =0,all_varchar=1,
  columns={{
  'I' : 'VARCHAR', 'DISPATCH' : 'VARCHAR', 'UNIT_SCADA' : 'VARCHAR', 'xx' : 'VARCHAR', 'SETTLEMENTDATE' : 'timestamp', 'DUID' : 'VARCHAR', 'SCADAVALUE' : 'double','LASTCHANGED' : 'timestamp'
  }},
  filename =1,null_padding = true,ignore_errors=1,auto_detect=false)
  where I='D' and SCADAVALUE !=0
                  """)
  # The query below refers to the Python variable `raw` by name (DuckDB looks it up
  # from the calling scope), so the variable must not be renamed.
  # SCADAVALUE is exposed as INITIALMW; INTERVENTION and PRIORITY are constant 0.
  scada=duckdb.sql(""" select  DUID,SCADAVALUE as INITIALMW, cast(0 as double ) as INTERVENTION,
   cast (SETTLEMENTDATE as TIMESTAMPTZ) as SETTLEMENTDATE,
   cast(SETTLEMENTDATE as date) as date,
   parse_filename(filename) as file,
   0 as PRIORITY ,
   isoyear (cast (SETTLEMENTDATE as timestamp)) as YEAR
   from raw
    """)
  return scada.arrow()

**<mark>DUID </mark>**

In [None]:
def get_duid(Onelake_table):
    """Rebuild the DUID dimension and overwrite the Delta table at ``Onelake_table``.

    Sources:
      * the AEMO "NEM Registration and Exemption List" workbook (sheet
        'PU and Scheduled Loads'), read through DuckDB's spatial extension;
      * the WA facilities CSV, enriched with the TECHNOLOGY column of
        WA_ENERGY.csv fetched from GitHub.

    Side effects: downloads both files under /tmp, creates the tables ``DUID`` and
    ``states`` and the views ``x`` and ``DUID_WA`` on DuckDB's default connection.

    Args:
        Onelake_table: location of the Delta table to overwrite.

    Raises:
        requests.HTTPError: when either download answers with an error status
            (previously the error page was saved and parsing failed later).
    """
    import pathlib

    DUID_Path = "/tmp/default/Files/0_Source/Dimensions/DUID/"
    pathlib.Path(DUID_Path).mkdir(parents=True, exist_ok=True)

    # --- NEM registration list ------------------------------------------------
    url = "https://www.aemo.com.au/-/media/Files/Electricity/NEM/Participant_Information/NEM-Registration-and-Exemption-List.xls"
    # The request is sent with a browser-like User-Agent header.
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'}
    with requests.Session() as s:
        r = s.get(url,headers=headers)
        r.raise_for_status()
        with open(DUID_Path+"NEM-Registration-and-Exemption-List.xls", 'wb') as output:
            output.write(r.content)
    duckdb.sql(f"""
    INSTALL spatial;
    LOAD spatial;
    create or replace table DUID as
    SELECT Region,DUID,first("Fuel Source - Descriptor") as FuelSourceDescriptor,first(Participant) as Participant
    FROM st_read('{DUID_Path}NEM-Registration-and-Exemption-List.xls', layer = 'PU and Scheduled Loads',open_options = ['HEADERS=FORCE'])
    group by all
    """)

    # --- WA facilities --------------------------------------------------------
    dls = "https://data.wa.aemo.com.au/datafiles/post-facilities/facilities.csv"
    resp = requests.get(dls)
    resp.raise_for_status()
    with open(DUID_Path+"facilities_WA.csv", 'wb') as output:
        output.write(resp.content)

    duckdb.sql(f""" select 'WA1' as Region  , "Facility Code" as DUID ,"Participant Name" as Participant from read_csv_auto('{DUID_Path}facilities_WA.csv')""").to_view('x')

    duckdb.sql("""select x.Region,x.DUID, TECHNOLOGY as FuelSourceDescriptor,Participant from x
    left join (select * FROM read_csv_auto('https://github.com/djouallah/aemo_fabric/raw/main/WA_ENERGY.csv',header=1)) as z
    on x.duid=z.duid """).to_view('DUID_WA')

    # --- Region code -> state name lookup ---------------------------------------
    # 'New South Wales' was previously misspelled 'New South Walles' in this table.
    duckdb.sql("""
    create or replace table states(RegionID varchar, States varchar) ;
    insert into states values
    ('WA1' , 'Western Australia') ,
    ('QLD1' , 'Queensland')  ,
    ('NSW1' , 'New South Wales')  ,
    ('TAS1' , 'Tasmania')  ,
    ('SA1' , 'South Australia')  ,
    ('VIC1' , 'Victoria')
    """)

    # One row per (state, DUID); DUIDs of two characters or fewer after trimming are dropped.
    df =duckdb.sql(""" with xx as (select * from DUID union BY NAME select * from DUID_WA)
                select States,trim(DUID) as DUID,min(Region) as Region, min(FuelSourceDescriptor) as FuelSourceDescriptor,
                min(Participant) as Participant
                from xx
                JOIN states on xx.Region = states.RegionID
                where length(trim(DUID)) > 2
                group by all
                """).arrow()
    write_deltalake(Onelake_table, df,mode="overwrite",engine=engine)

_**<mark>Summary</mark>**_

In [None]:
# Schema of the `summary` Delta table.
sch = pa.schema([
                    ("date",        pa.date32()),
                    ("time",        pa.int16()),
                    ("cutoff",      pa.timestamp("us","UTC")),
                    ("DUID",        pa.string()),
                    ("mw",          pa.decimal128(18, 4)),
                    ("price",       pa.decimal128(18, 4)),
               ])
# Create the table only when opening it fails. `except Exception` replaces a bare
# `except:` so that interrupting the kernel does not fall through into table creation.
try:
  DeltaTable(table_base_url+"summary",without_files =True)
  print('summary exist already')
except Exception:
  DeltaTable.create( table_base_url+"summary", schema=sch,mode="overwrite",configuration = {"delta.logRetentionDuration": "1 hours" })

In [None]:
def get_max(tbl):
    """Return the latest ``cutoff`` of the Delta table ``tbl`` as 'YYYY-MM-DD HH:MM:SS'.

    Falls back to ``'1900-01-01'`` when the query yields NULL (no cutoff value).
    """
    query = f""" SELECT  STRFTIME( max(cutoff), '%Y-%m-%d %H:%M:%S') FROM delta_scan('{tbl}') """
    latest = duckdb.sql(query).fetchone()[0]
    if latest is None:
        return '1900-01-01'
    return latest

In [None]:
# SQL script used for a full rebuild of the summary table. It holds three statements:
# two CREATE OR REPLACE VIEW statements over the `scada` and `price` Delta tables
# (INTERVENTION = 0 rows only, and non-zero INITIALMW for scada), then the SELECT
# that produces one row per date / time / DUID with max(INITIALMW) and max(RRP).
# `cutoff` is the latest settlementdate found in the scada view.
# The `{schema}.duid` view is not created here; it must already exist on the
# connection that runs this script.
full_refresh_sql = f"""
      CREATE OR REPLACE VIEW {schema}.scada AS SELECT * FROM delta_scan('{table_base_url}scada') where INTERVENTION = 0 and INITIALMW <> 0 ;
      CREATE OR REPLACE VIEW {schema}.price AS SELECT * FROM delta_scan('{table_base_url}price') where INTERVENTION = 0;
      select
        s.date,
        cast(strftime(s.SETTLEMENTDATE, '%H%M') AS INT16)                       as time ,
        (select max(cast(settlementdate as TIMESTAMPTZ) ) from {schema}.scada)  as cutoff ,
        s.DUID,
        cast(max(s.INITIALMW) AS DECIMAL(18, 4))                                as mw,
        cast(max(p.RRP) AS DECIMAL(18, 4))                                      as price
      from  {schema}.scada   s
            LEFT JOIN {schema}.duid d    ON s.DUID = d.DUID
            LEFT JOIN {schema}.price   p ON s.SETTLEMENTDATE = p.SETTLEMENTDATE AND d.Region = p.REGIONID
      where  s.settlementdate >= '2000-01-01'
      group by all
      order by  s.date, s.DUID, time,price
    """

In [None]:
def Incremental_refresh(schema,max_timestamp):
    """Build the SQL text that selects summary rows newer than ``max_timestamp``.

    The query reads ``{schema}.scada_today``, ``{schema}.duid`` and
    ``{schema}.price_today`` with inner joins, keeps INTERVENTION = 0 rows with a
    non-zero INITIALMW, and only rows whose settlementdate is strictly greater
    than ``max_timestamp`` on both the scada and the price side.

    Args:
        schema: schema name holding the three views.
        max_timestamp: timestamp string; inserted into the SQL text as a quoted
            literal without any escaping.

    Returns:
        The SQL query as a string; nothing is executed here.
    """
    return f"""
      select
        s.date,
        cast(strftime(s.SETTLEMENTDATE, '%H%M') AS INT16)                                                                               as time ,
        (select max(cast(settlementdate as TIMESTAMPTZ) ) from {schema}.scada_today where date  >= cast('{max_timestamp}' as date)   )  as cutoff ,
        s.DUID,
        CAST(max(s.INITIALMW) AS DECIMAL(18, 4))                                                                                         as mw,
        CAST(max(p.RRP) AS DECIMAL(18, 4))                                                                                               as price
      from
        {schema}.scada_today  s
        JOIN {schema}.duid        d ON s.DUID = d.DUID
        JOIN (select * from {schema}.price_today where INTERVENTION  = 0 and date  >= cast('{max_timestamp}' as date)) p
        ON s.SETTLEMENTDATE = p.SETTLEMENTDATE AND d.Region = p.REGIONID
      where
        s.INTERVENTION      = 0
        and INITIALMW      <> 0
        and s.settlementdate > '{max_timestamp}' and p.settlementdate > '{max_timestamp}'
        and s.date           >= cast('{max_timestamp}' as date)
      group by
        all
      order by s.date
    """

# Process Data

In [None]:
# Build the `duid` dimension only when the table cannot be opened.
# `except Exception` (not a bare `except:`) lets a kernel interrupt propagate
# instead of triggering the rebuild.
Onelake_table = table_base_url+'duid'
try:
    dt =DeltaTable(Onelake_table)
    print('duid loaded already')
except Exception:
    get_duid(Onelake_table)

In [None]:
# Build the `mstdatetime` dimension only when the table cannot be opened: one row per
# 5-minute step from 2018-04-01 to 2026-12-31.
# `except Exception` (not a bare `except:`) lets a kernel interrupt propagate.
Onelake_table = table_base_url +'mstdatetime'
try:
    dt =DeltaTable(Onelake_table)
    print('mstdatetime loaded already')
except Exception:
    df=duckdb.sql(""" SELECT cast(unnest(generate_series(cast ('2018-04-01' as date), cast('2026-12-31' as date), interval 5 minute)) as TIMESTAMPTZ) as SETTLEMENTDATE,
            strftime(SETTLEMENTDATE, '%I:%M:%S %p') as time,
            cast(SETTLEMENTDATE as date ) as date,
            EXTRACT(year from date) as year,
            EXTRACT(month from date) as month
            """).arrow()
    write_deltalake(Onelake_table, df,mode="overwrite",engine=engine)

In [None]:
# Build the `calendar` dimension only when the table cannot be opened: one row per
# day from 2018-04-01 to 2026-12-31.
# `except Exception` (not a bare `except:`) lets a kernel interrupt propagate.
Onelake_table = table_base_url+'calendar'
try:
    dt =DeltaTable(Onelake_table)
    print('calendar loaded already')
except Exception:
    df=duckdb.sql(""" SELECT cast(unnest(generate_series(cast ('2018-04-01' as date), cast('2026-12-31' as date), interval 1 day)) as date) as date,
            EXTRACT(year from date) as year,
            EXTRACT(month from date) as month
            """).arrow()
    write_deltalake(Onelake_table, df,mode="overwrite",engine=engine )

In [None]:
# Build the `mstime` dimension only when the table cannot be opened: one row per
# distinct time-of-day label, with `id` being the smallest HHMM number for that label.
# `except Exception` (not a bare `except:`) lets a kernel interrupt propagate.
Onelake_table = table_base_url +'mstime'
try:
    dt =DeltaTable(Onelake_table)
    print('mstime loaded already')
except Exception:
    # The second query refers to the Python variable `xx` by name, so keep that name.
    xx=duckdb.sql(""" SELECT cast(unnest(generate_series(cast ('2018-04-01' as date), cast('2018-04-02' as date), interval 5 minute)) as TIMESTAMPTZ) as SETTLEMENTDATE,
        strftime(SETTLEMENTDATE, '%I:%M:%S %p') as time,CAST(strftime(SETTLEMENTDATE, '%H%M') AS INT16) AS id  """)
    df = duckdb.sql(""" select time, min(id) as id from xx group by time """).arrow()
    write_deltalake(Onelake_table, df,mode="overwrite",engine=engine )

In [None]:
# File system client for the workspace. download() reads this `fs` global directly
# (it takes no client argument), so this cell has to run before any download() call.
fs = get_onelake_fs(f"{ws}")

In [25]:
%%time
# Price pipeline, step 1: fetch new archives, extract them locally, mirror the zips to OneLake.
# Web_Path / Zip_Path / uncompressed_Path / Onelake_table are globals that the next
# cells read; the SCADA section reassigns the same names, so run the cells in order.
Web_Path                = "http://nemweb.com.au/Reports/Current/DispatchIS_Reports/"
Zip_Path                = "/0_Source/Current/DispatchIS_Reports/"
uncompressed_Path       = "/1_Transform/CSV/DispatchIS_Reports/"
Onelake_table           = table_base_url+'price_today'
#############################################
# download(): saves archives missing from OneLake under /tmp + Zip_Path.
download(Web_Path,Zip_Path,Nbr_Files_to_Download)
# unzip(): extracts them into /tmp + uncompressed_Path.
unzip('/tmp'+Zip_Path,'/tmp'+uncompressed_Path)
# upload_files(): sends local zips that OneLake lacks; cpu_count comes from `from psutil import *`.
upload_files(fs, '/tmp'+Zip_Path, onelake_file_root_path + Zip_Path , max_workers= cpu_count())


KeyboardInterrupt: 

In [None]:
%%time
# Price pipeline, step 2: extracted files not yet recorded in the price_today table.
# Uses the uncompressed_Path / Onelake_table globals set by the price download cell.
files_to_upload_full_Path = get_Path('/tmp'+uncompressed_Path,Onelake_table)

In [None]:
%%time
if len(files_to_upload_full_Path) >0 :
  df = get_price(files_to_upload_full_Path)
  write_deltalake(Onelake_table, df,mode="append",partition_by=['date'],engine=engine)
else:
  print('all loaded already')

In [None]:
%%time
# SCADA pipeline, step 1: same three steps as the price section, pointed at the
# Dispatch_SCADA report folder. This reassigns the globals the price cells used.
Web_Path                = "http://nemweb.com.au/Reports/Current/Dispatch_SCADA/"
Zip_Path                = "/0_Source/Current/Dispatch_SCADA/"
uncompressed_Path       = "/1_Transform/CSV/Dispatch_SCADA/"
Onelake_table           = table_base_url+'scada_today'
#############################################
# download(): saves archives missing from OneLake under /tmp + Zip_Path.
download(Web_Path,Zip_Path,Nbr_Files_to_Download)
# unzip(): extracts them into /tmp + uncompressed_Path.
unzip('/tmp'+Zip_Path,'/tmp'+uncompressed_Path)
# upload_files(): sends local zips that OneLake lacks; cpu_count comes from `from psutil import *`.
upload_files(fs, '/tmp'+Zip_Path, onelake_file_root_path + Zip_Path , max_workers= cpu_count())

In [None]:
%%time
# SCADA pipeline, step 2: extracted files not yet recorded in the scada_today table.
# Uses the uncompressed_Path / Onelake_table globals set by the SCADA download cell.
files_to_upload_full_Path = get_Path('/tmp'+uncompressed_Path,Onelake_table)

In [None]:
%%time
if len(files_to_upload_full_Path) >0 :
  df = get_scada(files_to_upload_full_Path)
  write_deltalake(Onelake_table, df,mode="append",partition_by=['date'],engine=engine)
else:
  print('all loaded already')

In [None]:
%%time
Summary_table_path = table_base_url+'summary'  
con = duckdb.connect()  
con.sql(f""" CREATE or replace SECRET onelake ( TYPE AZURE, PROVIDER ACCESS_TOKEN, ACCESS_TOKEN '{os.getenv('azure_storage_token')}')   """)          
con.sql(f"create SCHEMA IF NOT EXISTS {schema} ")
for tbl in ['scada_today','duid','price_today','summary']:
    con.sql(f"CREATE OR REPLACE VIEW {schema}.{tbl} AS SELECT * FROM delta_scan('{table_base_url}{tbl}');")
try:
 dt = DeltaTable(table_base_url+'scada')
 current_version = dt.version()
except:
 current_version = -1
dt = DeltaTable(Summary_table_path)
Previous_version = dt.history()[0].get('scada')
Previous_version = int(Previous_version or '0')
print(f'scada current  version {current_version}')
print(f'scada version used by Summary {Previous_version}')
### if scada table has changed (downstream) or the summary table has a lot of small files then rebuild everything
if (current_version != Previous_version and current_version !=-1) or len(DeltaTable(table_base_url+'summary').files()) > 90 :
    print('scada table was updated, trigger full refresh')
    get_duid(table_base_url+'duid')
    print('table maintenance')
    #### Table maintenance  ##############
    optimize(['price_today','scada_today'])
    vacuum(['summary','duid'])
    ########### Write Summary Table ######
    print('generate summary')
    df = con.sql(full_refresh_sql).record_batch()
    RG=8_000_000
    write_deltalake(Summary_table_path , df, mode="overwrite",max_rows_per_file = RG , max_rows_per_group = RG, min_rows_per_group = RG,
    custom_metadata = {'scada':str(current_version)},engine=engine)
    print('rows fully refreshed')
else:
    print('scada table was not updated, trigger incremental refresh')
    max_timestamp = get_max(Summary_table_path)
    print(max_timestamp)
    df = con.sql(Incremental_refresh(schema,max_timestamp)).arrow()
    r = df.num_rows
    if r >0 :
        write_deltalake(Summary_table_path,df, mode="append",custom_metadata = {'scada':str(current_version)},engine= engine)
        print(f'new {r} rows inserted')
    else :
        print("no new data")  