In [None]:
# One-off setup cell: install the pinned DuckDB build used by the rest of the notebook.
!pip install duckdb==1.2.2
import sys
# Stop the interpreter right after the install.
# NOTE(review): presumably this forces a fresh kernel so the newly installed
# version is the one that gets imported below - confirm for your runtime.
sys.exit(0)

In [1]:
# --- Notebook parameters -------------------------------------------------
# Location of the <name>.sql files executed by run_sql(); a value starting
# with "http" is fetched with requests, anything else is opened as a local path.
sql_folder           = 'https://github.com/djouallah/Fabric_Notebooks_Demo/raw/refs/heads/main/orchestration/'
# Workspace and lakehouse names used to build the abfss:// OneLake table URL.
workspace            = 'processing'
LH                   = 'test'
# Schema folder under <lakehouse>/Tables/ that the Delta tables are written to and read from.
schema               = 'new'
# After an append, the table is compacted when it holds more files than this.
compaction_threshold = 50

**<mark>Import Packages</mark>**

In [2]:
%%time
from   deltalake     import  write_deltalake,DeltaTable
from   datetime      import datetime, time
from   zoneinfo      import ZoneInfo
import duckdb
import requests
import re

CPU times: user 827 ms, sys: 138 ms, total: 965 ms
Wall time: 10.7 s


**<mark>Authentication</mark>**

In [3]:
# `os` is imported before the try block so that the fallback branch can use it
# even when the azure.identity import itself is what failed.
import os

# Obtain a storage bearer token: first through azure.identity (local / non-Fabric
# runs); if that is unavailable or fails, fall back to the Fabric-provided
# notebookutils helper.
try:
    from azure.identity import DefaultAzureCredential
    _storage_token = DefaultAzureCredential().get_token("https://storage.azure.com/.default").token
except Exception:  # ImportError or credential failure; do not swallow KeyboardInterrupt
    _storage_token = notebookutils.credentials.getToken('storage')
    print("you are in Fabric notebook")

# Publish the token under both spellings. Environment variable names are
# case-sensitive on Linux, and the DuckDB secret below is built from the
# lower-case name while the azure.identity branch used to set only the
# upper-case one.
os.environ['azure_storage_token'] = _storage_token
os.environ['AZURE_STORAGE_TOKEN'] = _storage_token

you are in Fabric notebook


**<mark>Function to Run SQL Queries</mark>**

In [4]:
%%time
# Root URL of the lakehouse "Tables" area; schema and table names are appended to it.
table_base_url = f'abfss://{workspace}@onelake.dfs.fabric.microsoft.com/{LH}.Lakehouse/Tables/'

# In-memory DuckDB connection shared by every helper defined in this cell.
con = duckdb.connect()

# The authentication cell may have stored the token under either spelling
# (environment variable names are case-sensitive on Linux), so accept both.
_onelake_token = os.getenv('azure_storage_token') or os.getenv('AZURE_STORAGE_TOKEN')
if not _onelake_token:
    # Fail fast instead of registering a secret whose token is the literal
    # string 'None', which would only surface later as an opaque auth error.
    raise RuntimeError("No storage token found in the environment: run the Authentication cell first")

# Register the bearer token so DuckDB can read abfss:// paths on OneLake.
con.sql(f""" CREATE or replace SECRET onelake ( TYPE AZURE, PROVIDER ACCESS_TOKEN, ACCESS_TOKEN '{_onelake_token}')   """)
def _read_sql(file_path):
    """Return the text of one SQL script, fetched over HTTP or read from disk."""
    if sql_folder.startswith("http"):
        # A timeout avoids hanging the notebook, and raise_for_status() stops
        # an HTTP error page (e.g. a 404 body) from being treated as SQL.
        response = requests.get(file_path, timeout=60)
        response.raise_for_status()
        return response.text
    with open(file_path, 'r') as file:
        return file.read()


def run_sql(list_files):
    """Run the named SQL scripts from ``sql_folder`` and count the successes.

    Each script ``<name>.sql`` is expected to carry a header comment of the form
    ``-- materialized: (table_name, mode)``:

    * two or more parts: the query result is written to the Delta table
      ``table_name`` with ``write_delta`` using ``mode``, and a DuckDB view of
      the same name is (re)created on top of it;
    * fewer than two parts: the script is executed as-is on the DuckDB connection;
    * no header at all: the script is skipped and a notice is printed.

    Failures are reported (with the underlying error) and never raised, so one
    bad script does not stop the remaining ones.

    Parameters
    ----------
    list_files : iterable of str
        Script names without the ``.sql`` extension.

    Returns
    -------
    int
        Number of scripts that completed successfully.
    """
    # NOTE(review): the SQL text is executed exactly as retrieved; only point
    # sql_folder at a location you trust.
    successful_runs = 0
    for x in list_files:
        file_path = None
        try:
            # sql_folder may or may not end with '/', so normalise before joining.
            file_path = f"{sql_folder.rstrip('/')}/{x}.sql"
            sql_content = _read_sql(file_path)

            match = re.search(r"-- materialized:\s*\((.*?)\)", sql_content)
            if not match:
                print(f"No '-- materialized:' header found in {x}, nothing was run")
                continue

            parts = [part.strip() for part in match.group(1).split(',')]
            try:
                if len(parts) >= 2:
                    table_name, mode = parts[0], parts[1]
                    write_delta(sql_content, table_name, schema, mode)
                    # Refresh the view so later scripts see the new table version.
                    con.sql(f""" create or replace view {table_name} as select * from delta_scan('{table_base_url}{schema}/{table_name}') """)
                else:
                    con.sql(sql_content)
                    print(f"Data updated in {x}")
                successful_runs += 1
            except Exception as e:
                # Report the reason; a bare message gives nothing to debug with.
                print(f"Data not updated in {x}: {e}")

        except FileNotFoundError:
            print(f"Error: The file '{file_path}' was not found.")
        except Exception as e:
            print(f"An error occurred: {e}")
    return successful_runs
def write_delta(sql_content,tbl,schema, mode):
    """Materialise a query as a Delta table and run housekeeping on it.

    The result of ``sql_content`` is streamed from DuckDB into the Delta table
    ``<table_base_url><schema>/<tbl>`` using ``mode``. Afterwards:

    * ``'overwrite'``: the table is vacuumed and its log metadata cleaned up;
    * ``'append'``: the same is done, preceded by a compaction, but only when
      the table holds more than ``compaction_threshold`` files;
    * any other mode: no housekeeping.

    Always returns the string ``'done'``.
    """
    target = ''.join((table_base_url, schema, '/', tbl))
    rows_per_group = 8_000_000   # used for both the file and the row-group limits

    batches = con.sql(sql_content).record_batch()
    write_deltalake(
        target,
        batches,
        mode=mode,
        max_rows_per_file=rows_per_group,
        max_rows_per_group=rows_per_group,
        min_rows_per_group=rows_per_group,
        engine='pyarrow',
    )
    print(f'table {tbl} updated, Delta mode {mode}')

    def _purge(delta_table):
        # Drop unreferenced data files, then expired log entries.
        delta_table.vacuum(retention_hours=7, dry_run=False, enforce_retention_duration=False)
        delta_table.cleanup_metadata()

    if mode == 'overwrite':
        _purge(DeltaTable(target))
        print("vaccum completed")
    elif mode == 'append':
        delta_table = DeltaTable(target)
        too_many_files = len(delta_table.files()) > compaction_threshold
        if too_many_files:
            delta_table.optimize.compact()
            _purge(delta_table)
            print("compaction completed")
    return 'done'
def duckdb_attach_lakehouse():
    """Create a DuckDB view for every Delta table found in the configured schema.

    Tables are discovered by globbing for ``_delta_log/*.json`` files under
    ``<table_base_url><schema>/``; each one is exposed as a view named after the
    table folder. Discovery is limited to the configured schema because the
    views are always built from ``<schema>/<table>``: listing other schemas
    only produced view definitions that pointed at the wrong path and failed.

    A table whose view cannot be created is reported and skipped. At the end
    the in-memory catalog is printed.

    Returns
    -------
    str
        Always ``'done'``.
    """
    schema_root = f'{table_base_url}{schema}/'
    list_tables = con.sql(f""" SELECT  distinct(split_part(file, '_delta_log', 1)) as tables
                    FROM glob ("{schema_root}*/_delta_log/*.json")
                     """).df()['tables'].tolist()
    for table_path in list_tables:
        # The table name is the last folder of the path that precedes '_delta_log'.
        table = table_path.strip("/").split("/")[-1]
        try:
            con.sql(f"""CREATE OR REPLACE view {table}
             AS select * FROM delta_scan('{schema_root}{table}');""")
        except Exception as e:
            # Best effort: keep attaching the remaining tables, but say what went wrong.
            print(f"view {table} not created: {e}")
    con.sql(""" select name ,column_names from (show all tables) where database='memory' """).show(max_width=120)
    return 'done'

CPU times: user 173 ms, sys: 12.1 ms, total: 185 ms
Wall time: 108 ms


In [5]:
%%time
# Register the lakehouse Delta tables as DuckDB views and print the resulting catalog.
duckdb_attach_lakehouse()

┌─────────────┬────────────────────────────────────────────────────────────────────────────────────────────────────────┐
│    name     │                                              column_names                                              │
│   varchar   │                                               varchar[]                                                │
├─────────────┼────────────────────────────────────────────────────────────────────────────────────────────────────────┤
│ calendar    │ [date, year, month]                                                                                    │
│ duid        │ [DUID, Region, FuelSourceDescriptor, Participant, State]                                               │
│ mstdatetime │ [time, hour]                                                                                           │
│ price       │ [UNIT, REGIONID, VERSION, RUNNO, INTERVENTION, RRP, EEP, ROP, APCFLAG, MARKETSUSPENDEDFLAG, TOTALDEM…  │
│ price_today │ [REGIONID, RUNNO

'done'

In [6]:
%%time
# Current wall-clock time in Brisbane and the window in which the nightly
# files are checked for.
now_brisbane = datetime.now(ZoneInfo("Australia/Brisbane")).time()
start, end = time(4, 0), time(23, 30)

# t holds the number of base tables refreshed by this run; 2 triggers the backfill branch.
t = 0

# How many of the three base views are already registered in DuckDB.
table_exists = con.sql(""" select count(*) from (show all tables) where database='memory' and name in ('scada','price','duid') """).fetchone()[0]

if table_exists != 3:
    # First run (or a missing table): load both base tables and force the backfill.
    # NOTE(review): t is set to 2 whether or not the load succeeded - confirm this is intended.
    print("table does not exists, load everything")
    run_sql(['price','scada'])
    t = 2
else:
    print("table scada and price exists, will check in the morning if new files arrived")
    if start <= now_brisbane <= end:
        print("check if nightly load has arrived")
        t = run_sql(['price','scada'])
        print(f""" Nbr of tables changed {t}:  if both tables change do backup""")

if t != 2:
    print("incremental updates")
    run_sql(['scada_today','price_today','summary_incremental'])
else:
    run_sql(['calendar','duid','mstdatetime','summary_backfill'])

table does not exists, load everything
Data not updated in price


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))