In [12]:
# Location of the `<table_name>.sql` scripts: an http(s) URL prefix (fetched with requests)
# or a local folder path (opened as a file).
sql_folder           = 'https://github.com/djouallah/Fabric_Notebooks_Demo/raw/refs/heads/main/orchestration/sql/'
# Workspace segment of the OneLake abfss:// URL.
workspace            = 'processing'
# Lakehouse name; used as the "<LH>.Lakehouse" segment of the OneLake URL.
LH                   = 'test'
# Folder under the lakehouse "Tables/" directory that tables are written to and read from.
schema               = 'new'
# In append mode a table is compacted once it has more than this many files.
compaction_threshold = 100

**<mark>Import Packages</mark>**

In [13]:
%%time
from   deltalake     import DeltaTable,  write_deltalake
from   datetime      import datetime, time
from   zoneinfo      import ZoneInfo
import duckdb
import requests
import re

CPU times: user 16 µs, sys: 2 µs, total: 18 µs
Wall time: 21.9 µs


**<mark>Functions to Run SQL Queries and Write Delta Tables</mark>**

In [14]:
# Base abfss:// URL of the lakehouse "Tables/" folder; schema and table names are appended to it.
table_base_url = f'abfss://{workspace}@onelake.dfs.fabric.microsoft.com/{LH}.Lakehouse/Tables/'
# One DuckDB connection (no database file given) shared by the functions defined below.
con = duckdb.connect()   
def _register_delta_view(table_name):
    """Create or replace the DuckDB view `table_name` over its Delta table.

    The view reads `delta_scan` on `table_base_url + schema + '/' + table_name`.
    """
    con.sql(f""" CREATE OR REPLACE VIEW {table_name} AS
                SELECT * FROM delta_scan('{table_base_url}{schema}/{table_name}') """)


def run_sql(table_name, mode=None):
    """
    Load `<table_name>.sql` from `sql_folder` and execute it in the given mode.

    Args:
        table_name (str): Name of the SQL script (without `.sql`) and of the
                          target table / DuckDB view. Required.
        mode (str, optional): One of None, 'overwrite', 'append', 'ignore'.
            None        -> run the SQL directly on the DuckDB connection and show the result.
            'overwrite' -> rewrite the Delta table, refresh the view, vacuum.
            'append'    -> append to the Delta table, refresh the view, and compact
                           when the file count exceeds `compaction_threshold`.
            'ignore'    -> do nothing if the Delta table already exists, otherwise
                           create it (written in overwrite mode).

    Returns:
        int: 1 when the write/execution succeeded, 0 otherwise (invalid mode,
             unreadable or empty SQL, or a failure while running it). Errors are
             printed rather than raised. A failure in post-write maintenance
             (vacuum / compaction) is printed but still counts as a success.
    """
    allowed_modes = [None, 'overwrite', 'append', 'ignore']

    # Validate first so that a bad mode never triggers a token fetch or any remote call.
    if mode not in allowed_modes:
        print(f"Error: Invalid mode '{mode}'. Allowed modes are: {', '.join(map(str, allowed_modes))}")
        return 0

    # Refresh the OneLake secret on every call so DuckDB always holds a current storage token.
    con.sql(f""" CREATE or replace SECRET onelake ( TYPE AZURE, PROVIDER ACCESS_TOKEN, ACCESS_TOKEN '{notebookutils.credentials.getToken('storage') }')   """)
    successful_runs = 0

    tbl_path = table_base_url + schema + '/' + table_name
    sql_content = None

    try:
        # sql_folder may or may not end with '/'; normalise so the path never contains '//'.
        file_path = f"{sql_folder.rstrip('/')}/{table_name}.sql"
        is_github = sql_folder.startswith("http")

        if is_github:
            # A timeout keeps a stalled connection from hanging the whole sequence;
            # a timeout error is a RequestException and is reported below.
            response = requests.get(file_path, timeout=30)
            response.raise_for_status()
            sql_content = response.text
        else:
            print(f"Reading SQL from local file: {file_path}")
            with open(file_path, 'r') as file:
                sql_content = file.read()

        if not sql_content or not sql_content.strip():
            print(f"Error: SQL content is empty or could not be read from '{file_path}'.")
            return 0

        if mode is None:
            print(f"Running in mode: {mode} (direct SQL execution) for table: {table_name}")
            try:
                con.sql(sql_content).show()
                print(f"SQL executed successfully for {table_name}")
                successful_runs = successful_runs + 1
            except Exception as e:
                print(f"Error executing SQL for {table_name}: {e}")

        else:
            print(f"Running in mode: {mode} for table: {table_name}")

            if mode == 'overwrite':
                try:
                    con.sql(f'drop VIEW if exists {table_name}')
                    write_delta(sql_content, table_name, schema, mode)
                    _register_delta_view(table_name)
                    # Counted as successful before maintenance: a vacuum failure does not undo the write.
                    successful_runs = successful_runs + 1
                    dt = DeltaTable(tbl_path)
                    dt.vacuum(retention_hours=0, dry_run=False, enforce_retention_duration=False)
                    dt.cleanup_metadata()
                    print("vaccum completed")
                except Exception as e:
                    print(f"Error updating data or creating view in overwrite mode for {table_name}: {e}")

            elif mode == 'append':
                try:
                    write_delta(sql_content, table_name, schema, mode)
                    _register_delta_view(table_name)
                    successful_runs = successful_runs + 1
                    dt = DeltaTable(tbl_path)
                    if len(dt.files()) > compaction_threshold:
                        dt.optimize.compact()
                        dt.vacuum(retention_hours=7, dry_run=False, enforce_retention_duration=False)
                        dt.cleanup_metadata()
                        print("compaction completed")
                except Exception as e:
                    print(f"Error updating data or creating view in append mode for {table_name}: {e}")

            elif mode == 'ignore':
                # Imported here because the notebook's import cell only brings in
                # DeltaTable and write_deltalake.
                from deltalake.exceptions import TableNotFoundError

                # Only a genuine "table not found" may trigger the create-by-overwrite
                # path. Any other failure (auth, network, ...) propagates to the outer
                # handler so an existing table is never overwritten by mistake.
                try:
                    DeltaTable(tbl_path)
                    table_exists = True
                except TableNotFoundError:
                    table_exists = False

                if table_exists:
                    print(f"{table_name} exist already. Ignoring write operation.")
                    successful_runs = successful_runs + 1
                else:
                    print(f"{table_name} does not exist. Writing in overwrite mode.")
                    try:
                        write_delta(sql_content, table_name, schema, 'overwrite')
                        _register_delta_view(table_name)
                        successful_runs = successful_runs + 1
                        dt = DeltaTable(tbl_path)
                        dt.vacuum(retention_hours=7, dry_run=False, enforce_retention_duration=False)
                        dt.cleanup_metadata()
                        print(f"{table_name} created and vacuumed.")
                    except Exception as e:
                        print(f"Error creating table in ignore mode (using overwrite) for {table_name}: {e}")

    except FileNotFoundError:
        print(f"Error: The file '{file_path}' was not found.")
    except requests.exceptions.RequestException as e:
        print(f"Error fetching SQL from URL {file_path}: {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

    return successful_runs

def write_delta(sql_content, tbl, schema, mode):
    """Run `sql_content` on the shared DuckDB connection and write the result to a Delta table.

    Args:
        sql_content (str): SQL query whose result set is written.
        tbl (str): Table name; the target path is `table_base_url + schema + '/' + tbl`.
        schema (str): Schema folder under the lakehouse `Tables/` directory.
        mode (str): Write mode passed straight through to `write_deltalake`.

    Returns:
        str: 'done' when the write succeeds.

    Raises:
        Exception: Any error from the query or the write is printed and re-raised.
    """
    target_path = table_base_url + schema + '/' + tbl
    # The same row count is used for the per-file limit and both row-group bounds.
    rows_per_group = 8_000_000
    writer_options = {
        'mode': mode,
        'max_rows_per_file': rows_per_group,
        'max_rows_per_group': rows_per_group,
        'min_rows_per_group': rows_per_group,
        'engine': 'pyarrow',
    }
    try:
        query_result = con.sql(sql_content).record_batch()
        write_deltalake(target_path, query_result, **writer_options)
        print(f'table {tbl} updated, Delta mode {mode}')
        return 'done'
    except Exception as e:
        print(f"Error writing to delta table {tbl} in mode {mode}: {e}")
        raise

def duckdb_attach_lakehouse():
    """Create a DuckDB view for every Delta table found in the configured lakehouse schema.

    Refreshes the OneLake secret, globs the `_delta_log` folders under
    `Tables/<schema>/`, and creates (or replaces) one view per table, named after
    the table folder. Discovery is limited to `schema` because the views are
    always built against `Tables/<schema>/<table>`; scanning every schema would
    produce views that fail or point at a same-named table in the wrong schema.

    Returns:
        None after printing the list of views and their columns, or the string
        'error' if discovery or the final listing fails. A failure on a single
        view is printed and the remaining tables are still processed.
    """
    con.sql(f""" CREATE or replace SECRET onelake ( TYPE AZURE, PROVIDER ACCESS_TOKEN, ACCESS_TOKEN '{notebookutils.credentials.getToken('storage') }')   """)
    schema_url = f'abfss://{workspace}@onelake.dfs.fabric.microsoft.com/{LH}.Lakehouse/Tables/{schema}'
    try:
        # Each table has many log files; split_part + distinct reduces them to one path per table.
        list_tables = con.sql(f""" SELECT  distinct(split_part(file, '_delta_log', 1)) as tables
                        FROM glob ("{schema_url}/*/_delta_log/*.json")
                         """).df()['tables'].tolist()
        for table_path in list_tables:
            # The last path component of the table folder is the table (and view) name.
            table = table_path.strip("/").split("/")[-1]
            try:
                con.sql(f"""CREATE OR REPLACE view {table}
                 AS select * FROM delta_scan('{schema_url}/{table}');""")
            except Exception as e:
                # Best effort: report the failure and carry on with the other tables.
                print(f"Error creating/replacing view for table {table}: {e}")
        return con.sql(""" select name ,column_names from (show all tables) where database='memory' """).show(max_width=130)
    except Exception as e:
        print(f"An error occurred during lakehouse attachment: {e}")
        return 'error'
def run_sql_sequence(tasks_list):
    """Run a list of SQL tasks in order, halting at the first problem.

    Args:
        tasks_list: Iterable of task entries. Each entry is a list or tuple
            `[table_name]` or `[table_name, mode]`; a missing mode means None.

    Returns:
        bool: True when every task ran successfully (also for an empty list),
              False otherwise.

    A malformed entry marks the sequence as failed and every later entry is
    reported as skipped. A task for which `run_sql` does not return 1 stops the
    loop immediately.
    """
    sequence_ok = True

    for task in tasks_list:
        # An earlier entry failed validation: only report the remaining ones as skipped.
        if not sequence_ok:
            skip_table_name = task[0] if isinstance(task, (list, tuple)) and len(task) > 0 else 'Unknown'
            print(f"Skipping SQL run for table: {skip_table_name} due to previous failure.")
            continue

        # Guard: the entry must be a non-empty list or tuple.
        if not isinstance(task, (list, tuple)) or len(task) < 1:
            print(f"Error: Invalid task entry format: {task}. Expected a list or tuple like ['table_name', 'mode']. Skipping this task.")
            sequence_ok = False
            continue

        table_name = task[0]
        mode = task[1] if len(task) > 1 else None

        # Guard: the table name must be a non-blank string.
        if not isinstance(table_name, str) or not table_name.strip():
            print(f"Error: Invalid table_name in task entry: {task}. table_name must be a non-empty string. Skipping this task.")
            sequence_ok = False
            continue

        # Guard: the mode, when given, must be a string.
        if mode is not None and not isinstance(mode, str):
            print(f"Error: Invalid mode in task entry: {task}. mode must be a string or None. Skipping this task.")
            sequence_ok = False
            continue

        print(f"Attempting to run SQL for table: {table_name} with mode: {mode}")

        result = run_sql(table_name=table_name, mode=mode)

        if result != 1:
            print(f"Failed to run SQL for table: {table_name}. Stopping sequence.")
            sequence_ok = False
            break

        print(f"Successfully ran SQL for table: {table_name}")

    if sequence_ok:
        print("All specified SQL tasks completed successfully.")
    else:
        print("One or more SQL tasks failed.")

    return sequence_ok
# Create DuckDB views over the lakehouse Delta tables and print the resulting view list.
duckdb_attach_lakehouse()


┌─────────────┬──────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐
│    name     │                                                   column_names                                                   │
│   varchar   │                                                    varchar[]                                                     │
├─────────────┼──────────────────────────────────────────────────────────────────────────────────────────────────────────────────┤
│ calendar    │ [date, year, month]                                                                                              │
│ duid        │ [DUID, Region, FuelSourceDescriptor, Participant, State, latitude, longitude]                                    │
│ mstdatetime │ [time, hour]                                                                                                     │
│ price       │ [UNIT, REGIONID, VERSION, RUNNO, INTERVENTION, RRP, EEP, ROP, APCFL

In [15]:
# Nightly task list for run_sql_sequence: each entry is [sql script / table name, write mode].
# Entries run in the order listed and the sequence stops at the first failure.
sql_tasks_to_run_nightly = [
    ['price', 'append'],
    ['scada', 'append'],
    ['duid', 'ignore'],
    ['summary', 'overwrite'],
    ['calendar', 'ignore'],
    ['mstdatetime', 'ignore'],
]

In [16]:
%%time
# Run the nightly sequence only when the current Brisbane wall-clock time falls
# between 04:00 and 05:30 (both inclusive); outside that window this cell does nothing.
now_brisbane         = datetime.now(ZoneInfo("Australia/Brisbane")).time()
start = time(4, 0)   
end   = time(5, 30) 
if start <= now_brisbane <= end:
   run_sql_sequence(sql_tasks_to_run_nightly)

CPU times: user 11 µs, sys: 1 µs, total: 12 µs
Wall time: 13.8 µs


In [17]:
%%time
# Intraday task list: each entry is [sql script / table name, write mode].
# Unlike the nightly list, this one is run unconditionally whenever the cell executes.
sql_tasks_to_intraday = [
    ['price_today', 'append'],
    ['scada_today', 'append'],
    ['duid', 'ignore'],
    ['summary', 'append']
]
# The cell's displayed value is the boolean returned by run_sql_sequence (False if any task failed).
run_sql_sequence(sql_tasks_to_intraday)

Attempting to run SQL for table: price_today with mode: append
Running in mode: append for table: price_today
table price_today updated, Delta mode append
Successfully ran SQL for table: price_today
Attempting to run SQL for table: scada_today with mode: append
Running in mode: append for table: scada_today
Error writing to delta table scada_today in mode append: Parser Error: read_csv cannot take NULL list as parameter
Error updating data or creating view in append mode for scada_today: Parser Error: read_csv cannot take NULL list as parameter
Failed to run SQL for table: scada_today. Stopping sequence.
One or more SQL tasks failed.
CPU times: user 851 ms, sys: 129 ms, total: 980 ms
Wall time: 9.21 s


False