<img src="./images/logo.svg" alt="lakeFS logo" width="300"/> <img src="https://www.apache.org/logos/res/iceberg/iceberg.png" alt="Apache Iceberg logo" width="300"/>  

## lakeFS ❤️ Apache Iceberg - a basic example of the integration

## Config

**_If you're not using the provided lakeFS server and MinIO storage then change these values to match your environment_**

### lakeFS endpoint and credentials

In [None]:
# lakeFS server address and the access key pair passed to the client and to
# Spark further down.
# Endpoint example: 'https://username.aws_region_name.lakefscloud.io'
lakefsEndPoint = "http://lakefs:8000"
lakefsAccessKey = "AKIAIOSFODNN7EXAMPLE"
lakefsSecretKey = "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY"

### Object Storage

In [None]:
# Base object-store location; the repository's storage namespace is built as
# "<storageNamespace>/<repo_name>". Example value: "s3://bucket"
storageNamespace = "s3://example"

---

## Setup

**(you shouldn't need to change anything in this section, just run it)**

In [None]:
# Name of the lakeFS repository this notebook looks up (and creates if missing).
repo_name = 'lakefs-iceberg'

### Create lakeFSClient

In [None]:
import lakefs_client
from lakefs_client.models import *
from lakefs_client.client import LakeFSClient

# Client configuration: target the lakeFS endpoint and supply the access key
# pair as username / password.
configuration = lakefs_client.Configuration()
configuration.host = lakefsEndPoint
configuration.username = lakefsAccessKey
configuration.password = lakefsSecretKey

# Client object used by the later cells for all lakeFS API calls.
lakefs = LakeFSClient(configuration)

#### Verify lakeFS credentials by getting lakeFS version

In [None]:
# Check that the endpoint and credentials work by asking lakeFS for its version.
print("Verifying lakeFS credentials…")
try:
    v = lakefs.config.get_lake_fs_version()
except Exception as e:
    # `except Exception` instead of a bare `except:` so KeyboardInterrupt and
    # SystemExit still propagate; the cause is reported rather than discarded.
    print("🛑 failed to get lakeFS version")
    print(f"Error is {e}")
else:
    print(f"…✅lakeFS credentials verified\n\nℹ️lakeFS version {v.version}")

### Define lakeFS Repository

In [None]:
# `os` is needed by the error paths below; without this import they would fail
# with NameError instead of stopping the notebook.
import os

from lakefs_client.exceptions import NotFoundException

# Look the repository up first; create it only when lakeFS reports it missing.
try:
    repo = lakefs.repositories.get_repository(repo_name)
    print(f"Found existing repo {repo.id} using storage namespace {repo.storage_namespace}")
except NotFoundException:
    print(f"Repository {repo_name} does not exist, so going to try and create it now.")
    try:
        repo = lakefs.repositories.create_repository(
            repository_creation=RepositoryCreation(
                name=repo_name,
                storage_namespace=f"{storageNamespace}/{repo_name}",
            )
        )
        print(f"Created new repo {repo.id} using storage namespace {repo.storage_namespace}")
    except lakefs_client.ApiException as e:
        # os._exit() does not flush stdio buffers, so flush the message first.
        print(f"Error creating repo {repo_name}. Error is {e}", flush=True)
        # Stop the process immediately so later cells cannot run without a
        # repository; exit with a non-zero status because this is a failure.
        os._exit(1)
except lakefs_client.ApiException as e:
    # Any other API error while fetching the repository is also fatal.
    print(f"Error getting repo {repo_name}: {e}", flush=True)
    os._exit(1)

### Set up Spark

In [None]:
from pyspark.sql import SparkSession

# Build (or reuse) the Spark session with the Iceberg and lakeFS settings.
spark = (
    SparkSession.builder.appName("Iceberg / Jupyter")
    # Packages: Iceberg Spark runtime and the lakeFS Iceberg integration.
    .config("spark.jars.packages", "org.apache.iceberg:iceberg-spark-runtime-3.3_2.12:1.3.0,io.lakefs:lakefs-iceberg:0.1.1")
    # s3:// paths are handled by S3A, pointed at the lakeFS endpoint with the
    # lakeFS key pair.
    .config("spark.hadoop.fs.s3.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.fs.s3a.endpoint", lakefsEndPoint)
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.access.key", lakefsAccessKey)
    .config("spark.hadoop.fs.s3a.secret.key", lakefsSecretKey)
    # Catalog named "lakefs", implemented by LakeFSCatalog, with the
    # repository as its warehouse.
    .config("spark.sql.catalog.lakefs", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.lakefs.catalog-impl", "io.lakefs.iceberg.LakeFSCatalog")
    .config("spark.sql.catalog.lakefs.warehouse", f"lakefs://{repo_name}")
    .config("spark.sql.catalog.lakefs.uri", lakefsEndPoint)
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
    .getOrCreate()
)
spark.sparkContext.setLogLevel("INFO")

# Last expression of the cell: displays the session summary.
spark

---

---

# Main demo starts here 🚦 👇🏻

## Create an Iceberg table in the lakeFS catalog `main` branch

In [None]:
%%sql

CREATE OR REPLACE TABLE main.rmoff.my_table (id int, data string);

## Write three rows of data to the table

In [None]:
%%sql

INSERT INTO main.rmoff.my_table VALUES(0,"test");

In [None]:
from pyspark.sql.functions import col, when

# Two rows, ids 1 and 2 (the range end is exclusive); "data" is "bar" for
# even ids and "foo" otherwise.
df = spark.range(1, 3).withColumn(
    "data",
    when(col("id") % 2 == 0, "bar").otherwise("foo"),
)

In [None]:
# Append the DataFrame's rows to the table created above.
df.writeTo("main.rmoff.my_table").append()

In [None]:
%%sql

SELECT * FROM main.rmoff.my_table;

## Commit the new table and its data

In [None]:
# Commit the current state of `main`; the returned commit is the cell output.
lakefs.commits.commit(
    repository=repo.id,
    branch="main",
    commit_creation=CommitCreation(
        message="Initial data load",
        metadata={'author': 'rmoff'},
    ),
)

## Create a new branch

_This is copy-on-write; we're not duplicating the data_

In [None]:
# Create branch "dev" from "main".
lakefs.branches.create_branch(
    repository=repo.id,
    branch_creation=BranchCreation(name="dev", source="main"),
)

## Observe that the new branch has the same data as `main`

In [None]:
%%sql

SELECT * FROM dev.rmoff.my_table;

## Insert a row into the `dev` branch's version of the table

In [None]:
%%sql

INSERT INTO dev.rmoff.my_table VALUES(3,"wibble");

## Observe that the `main` version of the table remains unaltered

In [None]:
%%sql

SELECT * FROM main.rmoff.my_table;

## Inspect the `dev` version of the table with the new data

In [None]:
%%sql

SELECT * FROM dev.rmoff.my_table;

## Use a [reference expression](https://docs.lakefs.io/understand/model.html#ref-expressions) to look at only the committed changes (`@`) in the `dev` branch 

In [None]:
%%sql

SELECT * FROM `dev@`.rmoff.my_table 

## Select only uncommitted data on `dev` 

_by comparing current state to committed state_

In [None]:
%%sql

SELECT * FROM `dev`.rmoff.my_table EXCEPT SELECT * FROM `dev@`.rmoff.my_table 

### Commit the changes in `dev`

In [None]:
# Commit the pending change on `dev`; the returned commit is the cell output.
lakefs.commits.commit(
    repository=repo.id,
    branch="dev",
    commit_creation=CommitCreation(
        message="Added a row (wibble)",
        metadata={'author': 'rmoff'},
    ),
)

### `dev` committed state

In [None]:
%%sql

SELECT * FROM `dev@`.rmoff.my_table 

### uncommitted data on `dev` 

_by comparing current state to committed state_

In [None]:
%%sql

SELECT * FROM `dev`.rmoff.my_table EXCEPT SELECT * FROM `dev@`.rmoff.my_table 

### Wait for a while

In [None]:
import time

# Pause for two minutes before re-running the queries below.
# NOTE(review): the notebook does not say why the wait is needed - confirm.
time.sleep(2 * 60)

### `dev` committed state

In [None]:
%%sql

SELECT * FROM `dev@`.rmoff.my_table 

### uncommitted data on `dev` 

_by comparing current state to committed state_

In [None]:
%%sql

SELECT * FROM `dev`.rmoff.my_table EXCEPT SELECT * FROM `dev@`.rmoff.my_table 

## Insert a row into the `dev` branch's version of the table

In [None]:
%%sql

INSERT INTO dev.rmoff.my_table VALUES(4,"snarf");

## Select only uncommitted data on `dev` 

_by comparing current state to committed state_

In [None]:
%%sql

SELECT * FROM `dev`.rmoff.my_table EXCEPT SELECT * FROM `dev@`.rmoff.my_table 

## More references

### `dev` (committed and uncommitted)

In [None]:
%%sql

SELECT * FROM `dev`.rmoff.my_table 

### `dev` (committed)

In [None]:
%%sql

SELECT * FROM `dev@`.rmoff.my_table 

### `dev` (uncommitted)

In [None]:
%%sql

SELECT * FROM `dev`.rmoff.my_table EXCEPT SELECT * FROM `dev@`.rmoff.my_table 

### `dev` (changed since two commits ago)

In [None]:
%%sql

SELECT * FROM `dev`.rmoff.my_table EXCEPT SELECT * FROM `dev^1`.rmoff.my_table 

### Commit the changes in `dev`

In [None]:
# Commit the pending change on `dev`; the returned commit is the cell output.
lakefs.commits.commit(
    repository=repo.id,
    branch="dev",
    commit_creation=CommitCreation(
        message="Added a row (snarf)",
        metadata={'author': 'rmoff'},
    ),
)

### `dev` (uncommitted)

In [None]:
%%sql

SELECT * FROM `dev`.rmoff.my_table EXCEPT SELECT * FROM `dev@`.rmoff.my_table 

### Create a tag

In [None]:
# Create tag "dev-tag-01" pointing at the `dev` ref.
lakefs.tags.create_tag(
    repository=repo.id,
    tag_creation=TagCreation(id="dev-tag-01", ref="dev"),
)

### Insert another row into the `dev` branch's version of the table

In [None]:
%%sql

INSERT INTO dev.rmoff.my_table VALUES(5,"zibble");

### What's changed in the table between now and the tag? 

In [None]:
%%sql

SELECT * FROM dev.rmoff.my_table EXCEPT SELECT * FROM `dev-tag-01`.rmoff.my_table

---

In [None]:
from IPython.display import Markdown as md

# The endpoint 'http://lakefs:8000' is swapped for 'http://localhost:8000' in
# the link; any other endpoint is used unchanged.
# NOTE(review): presumably 'lakefs' is a hostname the browser cannot resolve - confirm.
lakeFSWebUI = 'http://localhost:8000' if lakefsEndPoint == 'http://lakefs:8000' else lakefsEndPoint

# Render a Markdown link to the repository's objects page as the cell output.
md(f"### 👉🏻 View the objects in [lakeFS web UI]({lakeFSWebUI}/repositories/{repo.id}/objects)")