In [8]:
!pip install -q duckdb    --upgrade
!pip install    obstore   --upgrade
import sys
sys.exit(0)

sys.exit called with value 0. The interpreter will be restarted.


In [9]:
import duckdb
# Force a (re)install of DuckDB's `delta` extension from the `core_nightly`
# repository. NOTE(review): the `delta_scan(...)` calls made by Tasksql further
# down presumably rely on this extension — confirm before removing this cell.
duckdb.sql(" force install delta from core_nightly")

In [10]:
# --- Target lakehouse -------------------------------------------------------
# Workspace, lakehouse and schema are passed to Tasksql.connect() and end up in
# the abfss:// table URLs it builds.
ws = "largedata"
lh = "simple"
schema = "test"

# --- Task runner settings ---------------------------------------------------
# File-count limit handed to Tasksql; an 'append' task compacts the table once
# it holds more files than this.
compaction_threshold = 150

# Base location of the <name>.sql / <name>.py task definitions.
sql_folder = "https://github.com/djouallah/Fabric_Notebooks_Demo/raw/refs/heads/main/orchestration/new/"

# Passed as the third argument to the `download_files` tasks below
# (presumably the maximum number of files fetched per run — TODO confirm).
Nbr_files_to_download = 60

In [11]:
import duckdb
import requests
import os
import sys
import importlib.util
from deltalake import DeltaTable, write_deltalake
from typing import List, Tuple, Union, Any, Optional, Callable, Dict
from string import Template


class Tasksql:
    """
    Simplified Lakehouse task runner supporting:
      - ('script_name', (args,))          → runs script_name.py → script_name(*args)
      - ('table_name', 'mode', {params})  → runs table_name.sql with params, writes to Delta

    On construction the runner opens a DuckDB connection, registers a OneLake
    secret and exposes the Delta tables found under the configured schema as
    DuckDB views. Task files (``<name>.sql`` / ``<name>.py``) are looked up in
    ``sql_folder``, which is treated as a URL when it starts with ``http`` and
    as a local directory otherwise.
    """

    # Seconds to wait for an HTTP response when task files are fetched from a URL.
    _HTTP_TIMEOUT = 30
    # Sentinel returned by _get_storage_token() when no token is configured.
    _NO_TOKEN = "PLACEHOLDER_TOKEN_TOKEN_NOT_AVAILABLE"

    def __init__(self, workspace: str, lakehouse_name: str, schema: str, sql_folder: str, compaction_threshold: int = 10):
        """
        Create the runner and attach the lakehouse tables as views.

        Args:
            workspace: OneLake workspace name (part of the abfss:// URL).
            lakehouse_name: Lakehouse name, without the ``.Lakehouse`` suffix.
            schema: Schema folder under ``Tables/`` to read from and write to.
            sql_folder: Local directory or http(s) base URL holding task files.
            compaction_threshold: An 'append' task compacts the target table
                once it has more files than this.
        """
        self.workspace = workspace
        self.lakehouse_name = lakehouse_name
        self.schema = schema
        self.sql_folder = sql_folder.strip()
        self.compaction_threshold = compaction_threshold
        self.table_base_url = f'abfss://{self.workspace}@onelake.dfs.fabric.microsoft.com/{self.lakehouse_name}.Lakehouse/Tables/'
        self.con = duckdb.connect()
        self._attach_lakehouse()

    @classmethod
    def connect(cls, workspace: str, lakehouse_name: str, schema: str, sql_folder: str, compaction_threshold: int = 10):
        """Announce the connection attempt and return a ready-to-use runner (same arguments as ``__init__``)."""
        print("Connecting to Lakehouse...")
        return cls(workspace, lakehouse_name, schema, sql_folder.strip(), compaction_threshold)

    def _get_storage_token(self):
        """Return the ``AZURE_STORAGE_TOKEN`` environment variable, or the ``_NO_TOKEN`` sentinel when it is unset."""
        return os.environ.get("AZURE_STORAGE_TOKEN", self._NO_TOKEN)

    def _create_onelake_secret(self):
        """
        Register the DuckDB secret named ``onelake``.

        Uses the access token from the environment when one is available;
        otherwise asks the user to log in and creates a persistent secret that
        goes through the Azure CLI credential chain.
        """
        token = self._get_storage_token()
        if token != self._NO_TOKEN:
            self.con.sql(f"CREATE OR REPLACE SECRET onelake (TYPE AZURE, PROVIDER ACCESS_TOKEN, ACCESS_TOKEN '{token}')")
        else:
            print("Please login to Azure CLI")
            self.con.sql("CREATE OR REPLACE PERSISTENT SECRET onelake (TYPE azure, PROVIDER credential_chain, CHAIN 'cli', ACCOUNT_NAME 'onelake')")

    def _attach_lakehouse(self):
        """
        Discover Delta tables in the lakehouse and expose those belonging to
        ``self.schema`` as DuckDB views named after the table.

        Any failure is reported on stdout and swallowed, so construction still
        succeeds when the lakehouse cannot be listed.
        """
        self._create_onelake_secret()
        try:
            # A table is any <schema>/<table>/ folder that contains a _delta_log
            # with at least one JSON commit file.
            list_tables_query = f"""
                SELECT DISTINCT(split_part(file, '_delta_log', 1)) as tables
                FROM glob ("{self.table_base_url}*/*/_delta_log/*.json")
            """
            list_tables_df = self.con.sql(list_tables_query).df()
            list_tables = list_tables_df['tables'].tolist() if not list_tables_df.empty else []

            if not list_tables:
                print(f"No Delta tables found in {self.lakehouse_name}.Lakehouse/Tables.")
                return

            print(f"Found {len(list_tables)} Delta tables. Attaching as views...")

            for table_path in list_tables:
                parts = table_path.strip("/").split("/")
                if len(parts) >= 2:
                    # The last two path components are <schema>/<table>.
                    potential_schema = parts[-2]
                    table = parts[-1]
                    if potential_schema == self.schema:
                        try:
                            self.con.sql(f"""
                                CREATE OR REPLACE VIEW {table}
                                AS SELECT * FROM delta_scan('{self.table_base_url}{self.schema}/{table}');
                            """)
                        except Exception as e:
                            # One broken table must not prevent the others from attaching.
                            print(f"Error creating view for table {table}: {e}")
            print("\nAttached tables (views) in DuckDB:")
            self.con.sql("SELECT name FROM (SHOW ALL TABLES) WHERE database='memory'").show()
        except Exception as e:
            print(f"Error attaching lakehouse: {e}")

    def _read_sql_file(self, table_name: str, params: Optional[Dict] = None) -> Optional[str]:
        """
        Load ``<table_name>.sql`` from ``sql_folder`` and fill in its parameters.

        Placeholders use ``string.Template`` syntax (``$ws``, ``${run_date}``).
        ``ws``, ``lh`` and ``schema`` are always supplied from the runner's own
        settings; entries in ``params`` are added on top and win on conflict.

        Returns:
            The substituted SQL text, or ``None`` (after printing the reason)
            when the file cannot be read, is empty, or substitution fails.
        """
        is_url = self.sql_folder.startswith("http")
        if is_url:
            url = f"{self.sql_folder.rstrip('/')}/{table_name}.sql".strip()
            try:
                # A timeout keeps a stalled server from hanging the notebook.
                resp = requests.get(url, timeout=self._HTTP_TIMEOUT)
                resp.raise_for_status()
                content = resp.text
            except Exception as e:
                print(f"Failed to fetch SQL from {url}: {e}")
                return None
        else:
            path = os.path.join(self.sql_folder, f"{table_name}.sql")
            try:
                with open(path, 'r') as f:
                    content = f.read()
            except Exception as e:
                print(f"Failed to read SQL file {path}: {e}")
                return None

        if not content.strip():
            print(f"SQL file is empty: {table_name}.sql")
            return None

        # Merge system + user params
        full_params = {
            'ws': self.workspace,
            'lh': self.lakehouse_name,
            'schema': self.schema
        }
        if params:
            full_params.update(params)

        # Use string.Template ($ws, ${run_date}) — safe with DuckDB {}
        try:
            template = Template(content)
            content = template.substitute(full_params)
        except KeyError as e:
            # e.args[0] is the bare placeholder name; formatting `e` itself
            # would print its repr and yield "$'name'".
            print(f"Missing parameter in SQL file: ${e.args[0]}")
            return None
        except Exception as e:
            print(f"Error during SQL template substitution: {e}")
            return None

        return content

    def _load_py_function(self, name: str) -> Optional[Callable]:
        """
        Load ``<name>.py`` from ``sql_folder`` and return the callable called
        ``name`` that it defines.

        Returns:
            The callable, or ``None`` when the file is missing, fails to load,
            or does not define a callable with that name.
        """
        is_url = self.sql_folder.startswith("http")
        try:
            if is_url:
                url = f"{self.sql_folder.rstrip('/')}/{name}.py".strip()
                resp = requests.get(url, timeout=self._HTTP_TIMEOUT)
                resp.raise_for_status()
                code = resp.text
                namespace = {}
                # SECURITY: this executes code downloaded from `sql_folder` with
                # the notebook's privileges — only point it at sources you trust.
                exec(code, namespace)
                func = namespace.get(name)
                return func if callable(func) else None
            else:
                path = os.path.join(self.sql_folder, f"{name}.py")
                if not os.path.isfile(path):
                    print(f"Python file not found: {path}")
                    return None
                spec = importlib.util.spec_from_file_location(name, path)
                mod = importlib.util.module_from_spec(spec)
                spec.loader.exec_module(mod)
                func = getattr(mod, name, None)
                return func if callable(func) else None
        except Exception as e:
            print(f"Error loading Python function '{name}': {e}")
            return None

    def _run_py_task(self, name: str, args: tuple) -> int:
        """
        Run the Python task ``name`` with ``args`` and return whatever it returns.

        Returns 0 when the function cannot be loaded or raises. The completion
        message is printed only after the function has actually returned.
        """
        func = self._load_py_function(name)
        if not func:
            return 0
        try:
            print(f"Running Python task: {name}{args}")
            result = func(*args)
        except Exception as e:
            print(f"❌ Error in Python task '{name}': {e}")
            return 0
        print(f"✅ Python task '{name}' completed.")
        return result

    def _write_delta(self, sql: str, table: str, mode: str):
        """Execute ``sql``, write the result to the Delta table ``table`` using ``mode``, then re-point the DuckDB view at it."""
        path = f"{self.table_base_url}{self.schema}/{table}"
        arrow_table = self.con.sql(sql).arrow()
        write_deltalake(
            path, arrow_table, mode=mode,
            max_rows_per_file=8_000_000,
            max_rows_per_group=8_000_000,
            min_rows_per_group=8_000_000,
            engine='pyarrow'
        )
        # Refresh view
        self.con.sql(f"CREATE OR REPLACE VIEW {table} AS SELECT * FROM delta_scan('{path}')")

    def _run_sql_task(self, table: str, mode: str, params: Optional[Dict] = None) -> int:
        """
        Run ``<table>.sql`` and persist its result to the Delta table ``table``.

        Modes:
            overwrite: replace the table, then vacuum with zero retention.
            append: add rows; compact and vacuum once the file count exceeds
                ``compaction_threshold``.
            ignore: create the table only if it cannot be opened; otherwise
                leave it untouched.

        Returns:
            1 on success, 0 on an invalid mode, an unreadable SQL file or any error.
        """
        allowed_modes = {'overwrite', 'append', 'ignore'}
        if mode not in allowed_modes:
            print(f"Invalid mode '{mode}'. Use: {allowed_modes}")
            return 0

        sql = self._read_sql_file(table, params)
        if sql is None:
            return 0

        path = f"{self.table_base_url}{self.schema}/{table}"
        try:
            if mode == 'overwrite':
                self.con.sql(f"DROP VIEW IF EXISTS {table}")
                self._write_delta(sql, table, mode)
                dt = DeltaTable(path)
                dt.vacuum(retention_hours=0, dry_run=False, enforce_retention_duration=False)
                dt.cleanup_metadata()
            elif mode == 'append':
                self._write_delta(sql, table, mode)
                dt = DeltaTable(path)
                file_count = len(dt.files())
                if file_count > self.compaction_threshold:
                    print(f"Compacting {table} (files: {file_count})")
                    dt.optimize.compact()
                    dt.vacuum(dry_run=False)
                    dt.cleanup_metadata()
            elif mode == 'ignore':
                try:
                    DeltaTable(path)
                except Exception:
                    # `except Exception` (not a bare except) lets Ctrl-C and
                    # SystemExit through instead of triggering a write.
                    # NOTE(review): any failure to open the table — not only
                    # "table not found" — still leads to an overwrite here.
                    print(f"{table} doesn't exist. Creating in overwrite mode.")
                    self.con.sql(f"DROP VIEW IF EXISTS {table}")
                    self._write_delta(sql, table, 'overwrite')
                    dt = DeltaTable(path)
                    dt.vacuum(dry_run=False)
                    dt.cleanup_metadata()
            print(f"✅ SQL task '{table}' ({mode}) completed.")
            return 1
        except Exception as e:
            print(f"❌ Error in SQL task '{table}': {e}")
            return 0

    def run_task_sequences(self, tasks: List[Union[Tuple[str, tuple], Tuple[str, str, Dict]]]) -> bool:
        """
        Run tasks with simple syntax:
          - ('download', (url_list, path_list, depth))
          - ('staging_table', 'overwrite', {'run_date': '2024-06-01'})

        Tasks run in order and the sequence stops at the first task whose
        result is not equal to 1. For Python tasks that is the function's own
        return value, so a function returning 0 halts the remaining tasks.

        Returns:
            True when every task returned 1 (or the list is empty), else False.
        """
        for i, task in enumerate(tasks):
            # Validate the shape first so a malformed entry (e.g. an empty
            # tuple) is reported instead of raising on task[0].
            if not isinstance(task, (tuple, list)) or len(task) not in (2, 3):
                print(f"❌ Invalid task format: {task}")
                return False

            name = task[0]
            print(f"\n--- Running Task {i+1}: {name} ---")

            if len(task) == 2:
                # Python task: ('name', (args,))
                args = task[1]
                if not isinstance(args, (tuple, list)):
                    args = (args,)
                result = self._run_py_task(name, tuple(args))
            else:
                # SQL write task: ('table', 'mode', {params})
                mode, params = task[1], task[2]
                if not isinstance(params, dict):
                    print(f"❌ Expected dict as 3rd item in SQL task, got {type(params)}")
                    return False
                result = self._run_sql_task(name, mode, params)

            if result != 1:
                print(f"❌ Task {i+1} failed. Stopping.")
                return False

        print("\n✅ All tasks completed successfully.")
        return True

    def get_connection(self):
        """Return the underlying DuckDB connection (``None`` once ``close()`` has been called)."""
        return self.con

    def close(self):
        """Close the DuckDB connection; further calls are no-ops."""
        if self.con:
            self.con.close()
            # Drop the reference so the guard above is meaningful next time.
            self.con = None
            print("DuckDB connection closed.")

In [12]:
%%time
con = Tasksql.connect( ws,lh,schema, sql_folder, compaction_threshold  )

Connecting to Lakehouse...
Found 8 Delta tables. Attaching as views...

Attached tables (views) in DuckDB:
┌─────────────┐
│    name     │
│   varchar   │
├─────────────┤
│ calendar    │
│ duid        │
│ mstdatetime │
│ price       │
│ price_today │
│ scada       │
│ scada_today │
│ summary     │
└─────────────┘

CPU times: user 1.64 s, sys: 66.6 ms, total: 1.71 s
Wall time: 2.77 s


In [13]:
%%time
con.run_task_sequences([
                        ('download_files', (["http://nemweb.com.au/Reports/Current/DispatchIS_Reports/"],["Reports/Current/DispatchIS_Reports/"], Nbr_files_to_download, ws,lh,6)),
                        ('price_today','append',{'ws': ws,'lh':lh}),
                        ('download_files', (["http://nemweb.com.au/Reports/Current/Dispatch_SCADA/" ],["Reports/Current/Dispatch_SCADA/"], Nbr_files_to_download, ws,lh,6)),
                        ('scada_today','append',{'ws': ws,'lh':lh}),
                        ('duid','ignore',{'ws': ws,'lh':lh}),
                        ('summary', 'append',{})
                        
                    ])


--- Running Task 1: download_files ---
Running Python task: download_files(['http://nemweb.com.au/Reports/Current/DispatchIS_Reports/'], ['Reports/Current/DispatchIS_Reports/'], 60, 'largedata', 'simple', 6)
✅ Python task 'download_files' completed.
Failed to download PUBLIC_DISPATCHIS_202509261920_0000000482307340.zip: HTTP 403
Updated log Reports/Current/DispatchIS_Reports/download_log.csv with 59 new entries
http://nemweb.com.au/Reports/Current/DispatchIS_Reports/ - 59 files extracted and uploaded

--- Running Task 2: price_today ---
✅ SQL task 'price_today' (append) completed.

--- Running Task 3: download_files ---
Running Python task: download_files(['http://nemweb.com.au/Reports/Current/Dispatch_SCADA/'], ['Reports/Current/Dispatch_SCADA/'], 60, 'largedata', 'simple', 6)
✅ Python task 'download_files' completed.
Updated log Reports/Current/Dispatch_SCADA/download_log.csv with 60 new entries
http://nemweb.com.au/Reports/Current/Dispatch_SCADA/ - 60 files extracted and uploaded



True

In [14]:
%%time
con.run_task_sequences([
                        ('download_files', (["https://nemweb.com.au/Reports/Current/Daily_Reports/"],["Reports/Current/Daily_Reports/"],Nbr_files_to_download,ws,lh,6)),
                        ('price','append',{'ws': ws,'lh':lh}),
                        ('scada','append',{'ws': ws,'lh':lh}),
                        ('download_excel',("raw/", ws,lh)),
                        ('duid','overwrite',{'ws': ws,'lh':lh}),
                        ('calendar','ignore',{}),
                        ('mstdatetime','ignore',{}),
                        ('summary','overwrite',{})
                     ])


--- Running Task 1: download_files ---
Running Python task: download_files(['https://nemweb.com.au/Reports/Current/Daily_Reports/'], ['Reports/Current/Daily_Reports/'], 60, 'largedata', 'simple', 6)
✅ Python task 'download_files' completed.
https://nemweb.com.au/Reports/Current/Daily_Reports/ - 0 files extracted (all 60 files already downloaded)
❌ Task 1 failed. Stopping.
CPU times: user 63.2 ms, sys: 1.79 ms, total: 65 ms
Wall time: 972 ms


False