In [1]:
#!pip install pyiceberg[adlfs]  --upgrade
#!pip install duckdb            --upgrade

In [None]:
# Names interpolated into every OneLake path and query in this notebook.
ws = "mimtest"      # first segment of the abfss:// and ATTACH paths
lh = "test"         # lakehouse name (used as <lh>.Lakehouse)
schema = "demo"
tbl = "newdata"     # base table name; cells append _iceberg / _delta

# Host prefix shared by the dfs/blob/table endpoints.
onelake = "onelake"  # in production use onelake only
onelake_endpoint = f"https://{onelake}.table.fabric.microsoft.com/iceberg"

In [3]:
import duckdb
import os
# Bring installed DuckDB extensions up to date; the result table is the cell output.
duckdb.sql(""" update extensions ;""")

┌────────────────┬──────────────┬─────────────────────┬──────────────────┬─────────────────┐
│ extension_name │  repository  │    update_result    │ previous_version │ current_version │
│    varchar     │   varchar    │       varchar       │     varchar      │     varchar     │
├────────────────┼──────────────┼─────────────────────┼──────────────────┼─────────────────┤
│ autocomplete   │ core         │ NO_UPDATE_AVAILABLE │ v1.4.0           │ v1.4.0          │
│ avro           │ core         │ NO_UPDATE_AVAILABLE │ 0c97a61          │ 0c97a61         │
│ azure          │ core         │ NO_UPDATE_AVAILABLE │ 5e458fc          │ 5e458fc         │
│ delta          │ core_nightly │ NO_UPDATE_AVAILABLE │ 03aaf0f          │ 03aaf0f         │
│ ducklake       │ core_nightly │ NO_UPDATE_AVAILABLE │ 51a67d2          │ 51a67d2         │
│ httpfs         │ core_nightly │ NO_UPDATE_AVAILABLE │ 08d99ff          │ 08d99ff         │
│ iceberg        │ core_nightly │ NO_UPDATE_AVAILABLE │ c6757bd9      

In [4]:
def get_token():
    """Make sure ``AZURE_STORAGE_TOKEN`` is present in the process environment.

    When the variable is already set to a non-empty value nothing is fetched.
    Otherwise a token for the ``https://storage.azure.com/.default`` scope is
    requested through ``InteractiveBrowserCredential`` and stored in the
    environment variable for the rest of the notebook to read.

    Returns:
        The literal string ``"done"`` in both cases.
    """
    if not os.environ.get("AZURE_STORAGE_TOKEN"):
        # azure.identity is imported only on this path, so a pre-set token
        # does not require the package.
        from azure.identity import InteractiveBrowserCredential, ChainedTokenCredential
        browser_login = InteractiveBrowserCredential()
        access = ChainedTokenCredential(browser_login).get_token("https://storage.azure.com/.default")
        os.environ["AZURE_STORAGE_TOKEN"] = access.token
    return "done"
get_token()

'done'

In [5]:
# One row per day from 2018-04-01 through 2024-12-31 with derived year and
# month columns, materialised as an Arrow table for the write cells.
calendar_sql = """ 
                        SELECT cast(unnest(generate_series(cast ('2018-04-01' as date), cast('2024-12-31' as date), interval 1 day)) as date) as date,
                        EXTRACT(year from date) as year,
                        EXTRACT(month from date) as month
"""
df = duckdb.sql(calendar_sql).to_arrow_table()

Write Iceberg

In [6]:
# abfss:// URI of the lakehouse's Tables folder, assembled from ws, onelake and lh.
# NOTE: the name is spelled "warhouse" (sic); other cells reference it as-is.
warhouse_path    = f'abfss://{ws}@{onelake}.dfs.fabric.microsoft.com/{lh}.Lakehouse/Tables'

In [7]:
from   pyiceberg.catalog.sql import SqlCatalog
# SQLite-backed Iceberg catalog: metadata pointers live in a local .db file,
# table data is written under warhouse_path using the ADLS token.
catalog = SqlCatalog(
    "default",
    **{
        "uri": f"sqlite:///{onelake}_pyiceberg.db",
        "adls.account-name": onelake,
        "adls.account-host": f"{onelake}.blob.fabric.microsoft.com",
        "adls.token": os.environ.get('AZURE_STORAGE_TOKEN'),
        "warehouse": warhouse_path,
    },
)
iceberg_identifier = f"{schema}.{tbl}_iceberg"
try:
    catalog.create_namespace_if_not_exists(schema)
    if not catalog.table_exists(iceberg_identifier):
        # Create the table from the Arrow schema, then load the rows into it.
        table = catalog.create_table_if_not_exists(iceberg_identifier, schema=df.schema)
        table.overwrite(df)
    else:
        print(f"Table {iceberg_identifier} already exists")
finally:
    # Always close the catalog, even if creating or writing the table raises.
    catalog.close()



Write Delta

In [8]:
from deltalake.writer import write_deltalake
# Authenticate the Delta writer with the same storage token used elsewhere.
storage_options = {
    "bearer_token": os.environ.get("AZURE_STORAGE_TOKEN"),
    "use_fabric_endpoint": "true",
}
# mode='ignore': per the deltalake docs the write is skipped when the table
# already exists — confirm against the installed version.
write_deltalake(
    f"{warhouse_path}/{schema}/{tbl}_delta",
    df,
    mode='ignore',
    storage_options=storage_options,
)

Read

In [9]:
# The same token is used for the Azure secret and for the Iceberg catalog attachment.
storage_token = os.environ.get('AZURE_STORAGE_TOKEN')
attach_sql = f""" 
  CREATE or replace SECRET onelake ( TYPE AZURE, PROVIDER ACCESS_TOKEN, ACCESS_TOKEN '{storage_token}')   ;
  load httpfs ;
  ATTACH or replace '{ws}/{lh}.lakehouse' AS onelake (TYPE ICEBERG, ENDPOINT '{onelake_endpoint}', TOKEN '{storage_token}');
  show all tables;
        """
# Last expression of the cell, so the `show all tables` result is displayed.
duckdb.sql(attach_sql)

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

┌──────────┬────────────────────┬───────────────────────┬──────────────┬──────────────┬───────────┐
│ database │       schema       │         name          │ column_names │ column_types │ temporary │
│ varchar  │      varchar       │        varchar        │  varchar[]   │  varchar[]   │  boolean  │
├──────────┼────────────────────┼───────────────────────┼──────────────┼──────────────┼───────────┤
│ onelake  │ demo               │ calendar_delta        │ [__]         │ [UNKNOWN]    │ false     │
│ onelake  │ demo               │ calendar_iceberg      │ [__]         │ [UNKNOWN]    │ false     │
│ onelake  │ demo               │ calendardelta         │ [__]         │ [UNKNOWN]    │ false     │
│ onelake  │ demo               │ calendariceberg       │ [__]         │ [UNKNOWN]    │ false     │
│ onelake  │ demo               │ newdata_delta         │ [__]         │ [UNKNOWN]    │ false     │
│ onelake  │ demo               │ newdata_iceberg       │ [__]         │ [UNKNOWN]    │ false     │


Read Iceberg using Iceberg Reader

In [10]:
# Make the schema inside the attached `onelake` catalog the default, so the
# following cells can refer to tables by bare name.
duckdb.sql(f""" use onelake.{schema};""")

In [11]:
# Preview three rows of the pyiceberg-written table through the attached Iceberg catalog.
duckdb.sql(f"""  from {tbl}_iceberg limit 3 """)

┌────────────┬───────┬───────┐
│    date    │ year  │ month │
│    date    │ int64 │ int64 │
├────────────┼───────┼───────┤
│ 2018-04-01 │  2018 │     4 │
│ 2018-04-02 │  2018 │     4 │
│ 2018-04-03 │  2018 │     4 │
└────────────┴───────┴───────┘

Read Delta using Iceberg Reader

In [12]:
# Preview three rows of the write_deltalake-written table, resolved through the
# same attached Iceberg catalog.
duckdb.sql(f"""  from {tbl}_delta limit 3 """)

┌────────────┬───────┬───────┐
│    date    │ year  │ month │
│    date    │ int64 │ int64 │
├────────────┼───────┼───────┤
│ 2018-04-01 │  2018 │     4 │
│ 2018-04-02 │  2018 │     4 │
│ 2018-04-03 │  2018 │     4 │
└────────────┴───────┴───────┘

Read Iceberg using Delta Reader

In [13]:
# Point delta_scan directly at the storage folder of the pyiceberg-written table.
# There is no LIMIT here, so the full table is returned (DuckDB truncates the display).
duckdb.sql(f" FROM delta_scan('{warhouse_path}/{schema}/{tbl}_iceberg') ")

┌────────────┬───────┬───────┐
│    date    │ year  │ month │
│    date    │ int64 │ int64 │
├────────────┼───────┼───────┤
│ 2018-04-01 │  2018 │     4 │
│ 2018-04-02 │  2018 │     4 │
│ 2018-04-03 │  2018 │     4 │
│ 2018-04-04 │  2018 │     4 │
│ 2018-04-05 │  2018 │     4 │
│ 2018-04-06 │  2018 │     4 │
│ 2018-04-07 │  2018 │     4 │
│ 2018-04-08 │  2018 │     4 │
│ 2018-04-09 │  2018 │     4 │
│ 2018-04-10 │  2018 │     4 │
│     ·      │    ·  │     · │
│     ·      │    ·  │     · │
│     ·      │    ·  │     · │
│ 2024-12-22 │  2024 │    12 │
│ 2024-12-23 │  2024 │    12 │
│ 2024-12-24 │  2024 │    12 │
│ 2024-12-25 │  2024 │    12 │
│ 2024-12-26 │  2024 │    12 │
│ 2024-12-27 │  2024 │    12 │
│ 2024-12-28 │  2024 │    12 │
│ 2024-12-29 │  2024 │    12 │
│ 2024-12-30 │  2024 │    12 │
│ 2024-12-31 │  2024 │    12 │
├────────────┴───────┴───────┤
│ 2467 rows        3 columns │
│ (20 shown)                 │
└────────────────────────────┘

Read using pyiceberg

In [14]:
from   pyiceberg.catalog import load_catalog
# Connect pyiceberg to the OneLake Iceberg endpoint; the same token is passed
# for the catalog ("token") and for ADLS file access ("adls.token").
storage_token = os.environ.get('AZURE_STORAGE_TOKEN')
onelake_rest_properties = {
    "uri": onelake_endpoint,
    "token": storage_token,
    "warehouse": f"{ws}/{lh}.lakehouse",
    "adls.account-name": onelake,
    "adls.account-host": f"{onelake}.blob.fabric.microsoft.com",
    "adls.token": storage_token,
}
catalog = load_catalog("onelake", **onelake_rest_properties)
# Last expression of the cell: the table list is the displayed output.
catalog.list_tables(schema)

[('demo', 'calendar_delta'),
 ('demo', 'calendar_iceberg'),
 ('demo', 'calendardelta'),
 ('demo', 'calendariceberg'),
 ('demo', 'newdata_delta'),
 ('demo', 'newdata_iceberg')]

In [15]:
# Load the write_deltalake-written table through the pyiceberg catalog and
# preview three rows as a pandas DataFrame.
delta_identifier = f"{schema}.{tbl}_delta"
table = catalog.load_table(delta_identifier)
preview_scan = table.scan(limit=3)
scan = preview_scan.to_pandas()
scan

Unnamed: 0,date,year,month
0,2018-04-01,2018,4
1,2018-04-02,2018,4
2,2018-04-03,2018,4
