In [10]:
# Folder holding the `<name>.sql` scripts that run_sql() opens by name.
sql_folder     = './builtin/sql/'
# Workspace name; used as the container part of the abfss:// OneLake URL.
workspace      = 'processing'
# Lakehouse name; the URL targets '{LH}.Lakehouse/Tables/'.
LH             = 'test'
# Schema folder under Tables/ that Delta tables are written to and scanned from.
schema         = 'new'

**<mark>Functions to Run SQL Queries, Write Delta Tables and Attach the Lakehouse</mark>**

In [11]:
from   deltalake     import  write_deltalake,DeltaTable
import duckdb
import re
# Root URL of the lakehouse Tables/ folder; '<schema>/<table>' is appended to address one Delta table.
table_base_url = f'abfss://{workspace}@onelake.dfs.fabric.microsoft.com/{LH}.Lakehouse/Tables/'
def run_sql(list_files):
    """Run the named SQL scripts in order and count how many succeeded.

    Each name ``x`` is read from ``{sql_folder}/{x}.sql``. The script text is
    searched for a ``-- materialized: (...)`` marker:

    * marker with at least two comma-separated parts ``(table, mode)``: the
      query result is written with ``write_delta`` and a DuckDB view named
      ``table`` is (re)created over the written Delta table;
    * marker with fewer than two parts: the script is executed as-is with
      ``duckdb.sql``;
    * no marker: the script is skipped and a notice is printed.

    Failures are reported on stdout and never raised, so one bad script does
    not stop the rest of the list.

    Parameters
    ----------
    list_files : iterable of str
        Script names without the ``.sql`` extension.

    Returns
    -------
    int
        Number of scripts that completed without error.
    """
    successful_runs = 0
    for x in list_files:
        try:
            file_path = f'{sql_folder}/{x}.sql'
            with open(file_path, 'r') as file:
                sql_content = file.read()

            match = re.search(r"-- materialized:\s*\((.*?)\)", sql_content)
            if not match:
                # Previously this case was skipped without any trace.
                print(f"Skipped {x}: no '-- materialized: (...)' marker found")
                continue

            parts = [part.strip() for part in match.group(1).split(',')]
            try:
                if len(parts) >= 2:
                    table_name, mode = parts[0], parts[1]
                    write_delta(sql_content, table_name, schema, mode)
                    duckdb.sql(f""" create or replace view {table_name} as select * from delta_scan('{table_base_url}{schema}/{table_name}') """)
                else:
                    duckdb.sql(sql_content)
                    print(f"Data updated in {x}")
                successful_runs += 1
            except Exception as e:
                # Catch Exception (not a bare except) so Ctrl-C still interrupts,
                # and surface the reason instead of hiding it.
                print(f"Data not updated in {x}: {e}")

        except FileNotFoundError:
            print(f"Error: The file '{file_path}' was not found.")
        except Exception as e:
            print(f"An error occurred: {e}")
    return successful_runs
def write_delta(sql_content,tbl,schema, mode):
    """Materialise a query result as a Delta table and compact it when fragmented.

    The query in ``sql_content`` is executed with DuckDB, fetched via
    ``.arrow()`` and written to ``table_base_url + schema + '/' + tbl`` using
    the given deltalake write ``mode``. Unless ``mode`` is ``'ignore'``, the
    table is reopened and, if it has more than 90 files, compacted, vacuumed
    with zero retention and its metadata cleaned up.

    Prints ``'<rows> rows inserted into <tbl>'`` and returns ``None``.
    """
    target_path = table_base_url + "/".join((schema, tbl))
    rows_per_unit = 8_000_000  # same cap used for file size and row-group size
    result = duckdb.sql(sql_content).arrow()
    row_count = result.num_rows

    write_deltalake(
        target_path,
        result,
        mode=mode,
        max_rows_per_file=rows_per_unit,
        max_rows_per_group=rows_per_unit,
        min_rows_per_group=rows_per_unit,
        engine='pyarrow',
    )

    needs_maintenance_check = mode != 'ignore'
    if needs_maintenance_check:
        delta_table = DeltaTable(target_path)
        file_count = len(delta_table.files())
        if file_count > 90:
            delta_table.optimize.compact()
            # Zero retention with enforcement disabled: removes unreferenced files immediately.
            delta_table.vacuum(retention_hours=0, dry_run=False, enforce_retention_duration=False)
            delta_table.cleanup_metadata()

    print(f'{row_count} rows inserted into {tbl}')
    return None
def duckdb_attach_lakehouse():
    """Create a DuckDB view for every Delta table found under the configured schema.

    Table folders are discovered by globbing ``Tables/*/*/_delta_log/*.json``
    in the lakehouse. For each discovered table whose schema folder equals the
    module-level ``schema``, a view named after the table folder is created (or
    replaced) over ``delta_scan`` of ``Tables/{schema}/{table}``. Tables in
    other schema folders are skipped up front: their views were always built
    from the ``{schema}`` path, so scanning them could only fail.

    A table that cannot be attached is reported on stdout and does not stop the
    loop. Finally the tables/views of the ``memory`` database are displayed.

    Returns
    -------
    str
        Always ``'done'``.
    """
    tables_root = f'abfss://{workspace}@onelake.dfs.fabric.microsoft.com/{LH}.Lakehouse/Tables/'

    # NOTE(review): the result of this listing is discarded. Kept because it may
    # be relied on as a first touch of the storage account; confirm and drop if not.
    duckdb.sql(f""" SELECT  * FROM glob ("{tables_root}dbo/*") """).df()['file'].tolist()

    list_tables = duckdb.sql(f""" SELECT  distinct(split_part(file, '_delta_log', 1)) as tables FROM glob ("{tables_root}*/*/_delta_log/*.json") """).df()['tables'].tolist()
    for table_path in list_tables:
        # Path shape: .../Tables/<schema>/<table>/
        parts = table_path.strip("/").split("/")
        if len(parts) < 2 or parts[-2] != schema:
            continue
        table = parts[-1]
        try:
            duckdb.sql(f"""CREATE OR REPLACE view {table}
                 AS select * FROM delta_scan('{tables_root}{schema}/{table}');""")
        except Exception as e:
            # Best effort: keep attaching the remaining tables, but say what failed.
            print(f"Could not attach table {table}: {e}")
    duckdb.sql(""" select name ,column_names from (show all tables) where database='memory' """).show(max_width=120)
    return 'done'

In [16]:
%%time
# Step 1: register views over the Delta tables already present in the lakehouse.
print("load existing tables")
duckdb_attach_lakehouse()

# Step 2: run the "today" scripts.
print("insert latest data")
run_sql(['scada_today','price_today'])

# Step 3: run the backfill scripts; t is the number that completed successfully.
print("check backfill Data")
t = run_sql(['scada','price'])
print(f""" Nbr of tables changed {t}:  if any changes do backup otherwise do incremental """)

# Step 4: the dimension/backfill scripts only run when step 3 reported successes;
# the incremental summary runs in every case.
if t > 0 :
    run_sql(['calendar','mstdatetime','duid','summary_backfill'])
run_sql(['summary_incremental'])

load existing tables
┌─────────────┬────────────────────────────────────────────────────────────────────────────────────────────────────────┐
│    name     │                                              column_names                                              │
│   varchar   │                                               varchar[]                                                │
├─────────────┼────────────────────────────────────────────────────────────────────────────────────────────────────────┤
│ calendar    │ [date, year, month]                                                                                    │
│ duid        │ [DUID, Region, FuelSourceDescriptor, Participant, State]                                               │
│ mstdatetime │ [SETTLEMENTDATE, time, date, year, month]                                                              │
│ price       │ [UNIT, REGIONID, VERSION, RUNNO, INTERVENTION, RRP, EEP, ROP, APCFLAG, MARKETSUSPENDEDFLAG, TOTALDEM…  │
│ price_tod