In [14]:
# Notebook parameters: every later cell reads these module-level names.
# Folder holding the `<name>.sql` files that run_sql() opens.
sql_folder = './builtin/sql/'
# Workspace segment of the abfss:// OneLake URL.
workspace = 'processing'
# Lakehouse name; used as `{LH}.Lakehouse` in the OneLake URL.
LH = 'test'
# Sub-folder of `Tables/` where the Delta tables are written and read.
schema = 'new'
# write_delta() compacts an appended table once it has more files than this.
compaction_threshold = 30

**<mark>Import Packages</mark>**

In [15]:
%%time
from   deltalake     import  write_deltalake,DeltaTable
from   datetime      import datetime, time
from   zoneinfo      import ZoneInfo
import duckdb
import re

CPU times: user 16 µs, sys: 0 ns, total: 16 µs
Wall time: 18.4 µs


**<mark>Function to Run SQL Queries</mark>**

In [16]:
%%time
# Root URL of the lakehouse `Tables/` folder; schema and table name are appended to it.
table_base_url = 'abfss://{}@onelake.dfs.fabric.microsoft.com/{}.Lakehouse/Tables/'.format(workspace, LH)
def run_sql(list_files):
    """Run the named SQL files and return how many of them succeeded.

    Each entry ``x`` of ``list_files`` is read from ``{sql_folder}/{x}.sql`` and
    searched for a directive comment of the form ``-- materialized: (a, b, ...)``:

    * two or more comma-separated parts: the first is the target table and the
      second the write mode. The file is passed to ``write_delta`` and a DuckDB
      view named after the table is (re)created over the written Delta table.
    * fewer than two parts: the file content is executed directly with
      ``duckdb.sql``.
    * no directive at all: the file is not executed; a message says so.

    Failures are reported with ``print`` and never raised, so one bad file does
    not stop the remaining ones. ``KeyboardInterrupt`` and ``SystemExit`` are
    not swallowed.

    Args:
        list_files: iterable of SQL file names without the ``.sql`` extension.

    Returns:
        int: number of files that were executed without error.
    """
    successful_runs = 0
    for x in list_files:
        try:
            file_path = f'{sql_folder}/{x}.sql'
            with open(file_path, 'r') as file:
                sql_content = file.read()

            match = re.search(r"-- materialized:\s*\((.*?)\)", sql_content)
            if not match:
                # Previously this case was skipped without any trace in the output.
                print(f"Data not updated in {x}: no '-- materialized: (...)' directive found")
                continue

            parts = [part.strip() for part in match.group(1).split(',')]

            if len(parts) >= 2:
                table_name, mode = parts[0], parts[1]
                try:
                    write_delta(sql_content, table_name, schema, mode)
                    # Point the in-memory view at the freshly written Delta table.
                    duckdb.sql(f""" create or replace view {table_name} as select * from delta_scan('{table_base_url}{schema}/{table_name}') """)
                    successful_runs += 1
                except Exception as e:
                    print(f"Data not updated in {x}: {e}")
            else:
                try:
                    duckdb.sql(sql_content)
                    print(f"Data updated in {x}")
                    successful_runs += 1
                except Exception as e:
                    print(f"Data not updated in {x}: {e}")

        except FileNotFoundError:
            print(f"Error: The file '{file_path}' was not found.")
        except Exception as e:
            print(f"An error occurred: {e}")
    return successful_runs
def write_delta(sql_content,tbl,schema, mode):
    """Run a query with DuckDB and write its result to a Delta table.

    The table location is ``table_base_url + schema + '/' + tbl``. After the
    write, maintenance depends on ``mode``:

    * ``'append'``: when the table has more than ``compaction_threshold``
      files it is compacted, vacuumed and its metadata cleaned up.
    * ``'overwrite'``: the table is always vacuumed and its metadata cleaned up.
    * any other mode: no maintenance.

    Args:
        sql_content: SQL text whose result set is written.
        tbl: table name (last segment of the table location).
        schema: schema folder under ``table_base_url``.
        mode: write mode forwarded to ``write_deltalake``.

    Returns:
        str: always ``'done'``.
    """
    rows_per_chunk = 8_000_000

    def _purge(delta_tbl):
        # Same vacuum + metadata cleanup is used by both maintenance paths.
        delta_tbl.vacuum(retention_hours=1, dry_run=False, enforce_retention_duration=False)
        delta_tbl.cleanup_metadata()

    target_uri = ''.join((table_base_url, schema, '/', tbl))
    arrow_data = duckdb.sql(sql_content).arrow()
    row_count = arrow_data.num_rows
    write_deltalake(
        target_uri,
        arrow_data,
        mode=mode,
        max_rows_per_file=rows_per_chunk,
        max_rows_per_group=rows_per_chunk,
        min_rows_per_group=rows_per_chunk,
        engine='pyarrow',
    )
    print(f'{row_count} rows inserted into {tbl}, mode {mode}')

    if mode == 'append':
        delta_tbl = DeltaTable(target_uri)
        # Only pay for compaction once enough files have piled up.
        if len(delta_tbl.files()) > compaction_threshold:
            delta_tbl.optimize.compact()
            _purge(delta_tbl)
            print("compaction completed")
    elif mode == 'overwrite':
        delta_tbl = DeltaTable(target_uri)
        _purge(delta_tbl)
        print("vaccum completed")
    return 'done'
def duckdb_attach_lakehouse():
    """Create a DuckDB view for each Delta table found under the configured schema.

    Tables are discovered by globbing for ``_delta_log/*.json`` files below
    ``{table_base_url}{schema}/``. The glob is limited to ``schema`` because the
    views are always created against that schema; scanning the other schemas
    only produced view creations that could not succeed. The view name is the
    last path segment of the table folder.

    A table whose view cannot be created is reported with ``print`` and
    skipped, so one unreadable table does not prevent the others from being
    attached. Finally the tables/views of the ``memory`` database are shown.

    Returns:
        str: always ``'done'``.
    """
    schema_url = f'{table_base_url}{schema}/'
    discovered = duckdb.sql(
        f""" SELECT  distinct(split_part(file, '_delta_log', 1)) as tables FROM glob ("{schema_url}*/_delta_log/*.json") """
    ).fetchall()
    for (table_path,) in discovered:
        # The path ends with '/', so strip it before taking the last segment.
        table = table_path.strip("/").split("/")[-1]
        try:
            duckdb.sql(f"""CREATE OR REPLACE view {table}
                 AS select * FROM delta_scan('{schema_url}{table}');""")
        except Exception as e:
            print(f"View not created for {table}: {e}")
    duckdb.sql(""" select name ,column_names from (show all tables) where database='memory' """).show(max_width=120)
    return 'done'

CPU times: user 9 µs, sys: 0 ns, total: 9 µs
Wall time: 11.7 µs


In [17]:
%%time
# Create the DuckDB views over the lakehouse Delta tables; the cells below query them.
duckdb_attach_lakehouse()

┌─────────────┬────────────────────────────────────────────────────────────────────────────────────────────────────────┐
│    name     │                                              column_names                                              │
│   varchar   │                                               varchar[]                                                │
├─────────────┼────────────────────────────────────────────────────────────────────────────────────────────────────────┤
│ calendar    │ [date, year, month]                                                                                    │
│ duid        │ [DUID, Region, FuelSourceDescriptor, Participant, State]                                               │
│ mstdatetime │ [SETTLEMENTDATE, time, date, year, month]                                                              │
│ price       │ [UNIT, REGIONID, VERSION, RUNNO, INTERVENTION, RRP, EEP, ROP, APCFLAG, MARKETSUSPENDEDFLAG, TOTALDEM…  │
│ price_today │ [REGIONID, RUNNO

'done'

In [18]:
%%time
# Decide between a full load and the early-morning check, then refresh the
# dependent tables when both `price` and `scada` were (re)loaded.
now_brisbane = datetime.now(ZoneInfo("Australia/Brisbane")).time()
start, end = time(4, 0), time(5, 30)  # window in which the nightly check runs
t = 0  # number of base tables loaded by this cell
table_exists = duckdb.sql(
    """ select count(*) from (show all tables) where database='memory' and name in ('scada','price','duid') """
).fetchone()[0]

if table_exists != 3:
    # At least one of the three views is missing: reload the base tables.
    print("table does not exists, load everything")
    run_sql(['price','scada'])
    # NOTE(review): t is forced to 2 even if the load above failed - confirm this is intended.
    t = 2
else:
    print("table scada and price exists, will check in the morning if new files arrived")
    if start <= now_brisbane <= end:
        print("check if nightly load has arrived")
        t = run_sql(['price'])
        # `scada` is only attempted once `price` has loaded successfully.
        if t == 1:
            t += run_sql(['scada'])
        print(f""" Nbr of tables changed {t}:  if both tables change do backup""")

if t == 2:
    run_sql(['calendar','mstdatetime','duid','summary_backfill'])

table scada and price exists, will check in the morning if new files arrived
CPU times: user 13.6 ms, sys: 2.92 ms, total: 16.5 ms
Wall time: 10.6 ms


In [19]:
%%time
# Incremental step: run the intraday SQL files; the cell displays the number
# of files that ran successfully.
print("insert latest data")
run_sql(
    [
        'scada_today',
        'price_today',
        'summary_incremental',
    ]
)

insert latest data
Data not updated in scada_today
Data not updated in price_today
0 rows inserted into summary, mode append
CPU times: user 1.3 s, sys: 168 ms, total: 1.47 s
Wall time: 14.7 s


1