# What happens if you use Iceberg on lakeFS _without_ the new built-in support

# Config

**_If you're not using the provided lakeFS server and MinIO storage then change these values to match your environment_**

### lakeFS endpoint and credentials

In [1]:
# Connection details for the lakeFS server. The values below are the sample
# defaults; replace them when pointing the notebook at another environment.
lakefsEndPoint = "http://lakefs:8000"  # e.g. 'https://username.aws_region_name.lakefscloud.io'
lakefsAccessKey = "AKIAIOSFOLKFSSAMPLES"
lakefsSecretKey = "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY"

### Object Storage

In [2]:
# Object-store location under which the repository's storage namespace is created.
storageNamespace = "s3://example"  # e.g. "s3://bucket"

---

# Setup

**(you shouldn't need to change anything in this section, just run it)**

In [3]:
# Name of the lakeFS repository this notebook looks up (and creates if missing).
repo_name = "iceberg-01"

### Create lakeFSClient

In [4]:
import lakefs_client
from lakefs_client.models import *
from lakefs_client.client import LakeFSClient

# lakeFS credentials and endpoint
# Build the client configuration from the endpoint and key pair defined in the
# Config section, then create the API client used by the rest of the notebook.
configuration = lakefs_client.Configuration()
configuration.host = lakefsEndPoint
configuration.username = lakefsAccessKey
configuration.password = lakefsSecretKey

lakefs = LakeFSClient(configuration)

#### Verify lakeFS credentials by getting lakeFS version

In [5]:
# Smoke-test the endpoint and credentials by fetching the server configuration.
print("Verifying lakeFS credentials…")
try:
    v = lakefs.config.get_config()
except Exception as e:
    # Catch Exception (not a bare `except:`) so KeyboardInterrupt/SystemExit
    # still propagate, and report the cause instead of hiding it.
    print(f"🛑 failed to get lakeFS version: {e}")
else:
    # `version` is the version of the server we are talking to;
    # `latest_version` is the newest published release, not this server's.
    print(f"…✅lakeFS credentials verified\n\nℹ️lakeFS version {v['version_config']['version']}")

Verifying lakeFS credentials…
…✅lakeFS credentials verified

ℹ️lakeFS version 0.104.0


### Define lakeFS Repository

In [6]:
from lakefs_client.exceptions import NotFoundException

# Look the repository up by name; if lakeFS reports it missing, create it under
# `storageNamespace`. Any other API failure is printed and then re-raised so
# that the notebook stops here with the real error, rather than continuing
# without `repo` being defined.
try:
    repo = lakefs.repositories.get_repository(repo_name)
    print(f"Found existing repo {repo.id} using storage namespace {repo.storage_namespace}")
except NotFoundException:
    print(f"Repository {repo_name} does not exist, so going to try and create it now.")
    try:
        repo = lakefs.repositories.create_repository(
            repository_creation=RepositoryCreation(
                name=repo_name,
                storage_namespace=f"{storageNamespace}/{repo_name}",
            )
        )
        print(f"Created new repo {repo.id} using storage namespace {repo.storage_namespace}")
    except lakefs_client.ApiException as e:
        print(f"Error creating repo {repo_name}. Error is {e}")
        # Previously `os._exit(00)`: `os` was never imported, and a hard
        # process exit with status 0 would kill the kernel as if it succeeded.
        raise
except lakefs_client.ApiException as e:
    print(f"Error getting repo {repo_name}: {e}")
    raise

Repository iceberg-01 does not exist, so going to try and create it now.
Created new repo iceberg-01 using storage namespace s3://example/iceberg-01


In [7]:
# The repository name is used verbatim as the bucket part of the s3a:// URI.
# The earlier `.replace('s3', 's3a')` rewrote the repository *name* itself, so
# any repo whose name contained "s3" would have produced a wrong path.
data_dir = f"s3a://{repo.id}"
print(f"Using {data_dir} for data storage")

Using s3a://iceberg-01 for data storage


### Set up Spark

In [8]:
from pyspark.sql import SparkSession

# Spark session that talks to lakeFS through the S3A filesystem and exposes an
# Iceberg Hadoop catalog called "local", rooted at the repository's main branch.
spark = (
    SparkSession.builder
    .appName(f"lakeFS sample / {repo.id}")
    # S3A filesystem pointed at the lakeFS endpoint
    .config("spark.hadoop.fs.s3.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.fs.s3a.endpoint", lakefsEndPoint)
    .config("spark.hadoop.fs.s3a.access.key", lakefsAccessKey)
    .config("spark.hadoop.fs.s3a.secret.key", lakefsSecretKey)
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    # Iceberg runtime and SQL extensions
    .config("spark.jars.packages", "org.apache.iceberg:iceberg-spark-runtime-3.3_2.12:1.3.0")
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
    # "local" catalog: Hadoop-type, warehouse on the main branch, used by default
    .config("spark.sql.catalog.local", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.local.type", "hadoop")
    .config("spark.sql.catalog.local.warehouse", f"{data_dir}/main")
    .config("spark.sql.defaultCatalog", "local")
    .getOrCreate()
)
spark.sparkContext.setLogLevel("INFO")

spark

---

---

# Main demo starts here 🚦 👇🏻

## Load test data and write it as an Iceberg table

In [9]:
# Load the film-permits sample file with the inferSchema and multiline options set.
df = (
    spark.read
    .option("inferSchema", "true")
    .option("multiline", "true")
    .json("/data/nyc_film_permits.json")
)

In [10]:
# Persist the DataFrame as table nyc.permits in the session's default catalog.
df.write.saveAsTable(name="nyc.permits")

In [11]:
# Show the columns of nyc.permits together with the extended table details.
%sql DESCRIBE EXTENDED nyc.permits

col_name,data_type,comment
borough,string,
category,string,
communityboard_s,string,
country,string,
enddatetime,string,
enteredon,string,
eventagency,string,
eventid,string,
eventtype,string,
parkingheld,string,


## Commit the data to the `main` branch

In [12]:
# Commit the current state of the main branch; as the cell's last expression,
# the call's return value is what gets displayed.
lakefs.commits.commit(
    repository=repo.id,
    branch="main",
    commit_creation=CommitCreation(
        message="Initial data load",
        metadata={
            "author": "rmoff",
            "data source": "https://data.cityofnewyork.us/City-Government/Film-Permits/tg4x-b46p",
        },
    ),
)

{'committer': 'everything-bagel',
 'creation_date': 1689580028,
 'id': '0fc579cd5cfead861aaafcb37eff349514209ff4cd6d1a8b0df10073244eeedf',
 'message': 'Initial data load',
 'meta_range_id': '',
 'metadata': {'author': 'rmoff',
              'data source': 'https://data.cityofnewyork.us/City-Government/Film-Permits/tg4x-b46p'},
 'parents': ['3f71ab5438445542cb6528af2067dc0311a087c0dbce4194d9478d53c1d174b1']}

## Create branch

In [13]:
# Create branch "dev" with "main" as its source.
lakefs.branches.create_branch(
    repository=repo.id,
    branch_creation=BranchCreation(name="dev", source="main"),
)

'0fc579cd5cfead861aaafcb37eff349514209ff4cd6d1a8b0df10073244eeedf'

## Query the table

In [14]:
# Count the rows in nyc.permits.
%sql SELECT COUNT(*) FROM nyc.permits

count(1)
1000


# Stop the Spark session and create a new one to read the dev version of the table

In [15]:
from pyspark.sql import SparkSession

# Stop the running session first. Without this, getOrCreate() simply hands back
# the existing session, whose "local" catalog was already initialised against
# the main-branch warehouse, so the dev warehouse setting below would not be
# what the catalog actually uses.
spark.stop()

# Same configuration as the first session, except that the "local" catalog's
# warehouse now points at the dev branch.
spark = (
    SparkSession.builder
    .appName(f"lakeFS sample / {repo.id}")
    # S3A filesystem pointed at the lakeFS endpoint
    .config("spark.hadoop.fs.s3.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.fs.s3a.endpoint", lakefsEndPoint)
    .config("spark.hadoop.fs.s3a.access.key", lakefsAccessKey)
    .config("spark.hadoop.fs.s3a.secret.key", lakefsSecretKey)
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    # Iceberg runtime and SQL extensions
    .config("spark.jars.packages", "org.apache.iceberg:iceberg-spark-runtime-3.3_2.12:1.3.0")
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
    # "local" catalog: Hadoop-type, warehouse on the dev branch, used by default
    .config("spark.sql.catalog.local", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.local.type", "hadoop")
    .config("spark.sql.catalog.local.warehouse", f"{data_dir}/dev")
    .config("spark.sql.defaultCatalog", "local")
    .getOrCreate()
)
spark.sparkContext.setLogLevel("INFO")

spark

In [16]:
# Count the rows in nyc.permits again, this time from the session built in the cell above.
%sql SELECT COUNT(*) FROM nyc.permits

count(1)
1000


In [17]:
# Show the columns of nyc.permits together with the extended table details.
%sql DESCRIBE EXTENDED nyc.permits

col_name,data_type,comment
borough,string,
category,string,
communityboard_s,string,
country,string,
enddatetime,string,
enteredon,string,
eventagency,string,
eventid,string,
eventtype,string,
parkingheld,string,


In [18]:
# List the namespaces visible in the current catalog.
%sql show databases

namespace
nyc


In [19]:
# List the tables in the nyc namespace.
%sql show tables from nyc

namespace,tableName,isTemporary
nyc,permits,False


In [20]:
# Show the table properties recorded for nyc.permits.
%sql show tblproperties nyc.permits

key,value
current-snapshot-id,5861622983075030793
format,iceberg/parquet
format-version,1
write.format.default,parquet


In [21]:
# Describe nyc.permits using the DESCRIBE TABLE EXTENDED form of the statement.
%sql DESCRIBE TABLE EXTENDED nyc.permits

col_name,data_type,comment
borough,string,
category,string,
communityboard_s,string,
country,string,
enddatetime,string,
enteredon,string,
eventagency,string,
eventid,string,
eventtype,string,
parkingheld,string,


In [22]:
# Re-read the film-permits sample file with the inferSchema and multiline options set.
df = (
    spark.read
    .option("inferSchema", "true")
    .option("multiline", "true")
    .json("/data/nyc_film_permits.json")
)

In [23]:
# Persist the DataFrame as a new table, nyc.permits_written_to_dev.
df.write.saveAsTable(name="nyc.permits_written_to_dev")