# What happens if you use Iceberg on lakeFS _without_ the new built-in support

# Config

**_If you're not using the provided lakeFS server and MinIO storage then change these values to match your environment_**

### lakeFS endpoint and credentials

In [None]:
# Connection settings for the lakeFS server. These are reused further down
# both for the lakeFS Python client (via environment variables) and for
# Spark's S3A filesystem configuration.
# NOTE(review): the defaults look like the sample values for the bundled
# lakeFS server - replace them when pointing at your own installation.
lakefsEndPoint = 'http://lakefs:8000' # e.g. 'https://username.aws_region_name.lakefscloud.io'
lakefsAccessKey = 'AKIAIOSFOLKFSSAMPLES'
lakefsSecretKey = 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY'

### Object Storage

In [None]:
# Object-store location prefix; the repository's storage namespace is built
# as "<storageNamespace>/<repo_name>" when the repository is created below.
storageNamespace = 's3://example' # e.g. "s3://bucket"

---

# Setup

**(you shouldn't need to change anything in this section, just run it)**

In [None]:
# Name of the lakeFS repository used throughout this notebook. It also forms
# the root of the s3a:// path that Spark reads from and writes to.
repo_name = "iceberg-01"

### Versioning Information

In [None]:
# Branch names: mainBranch is the repository's default branch and the first
# Spark warehouse location; devBranch is created from it later in the demo.
mainBranch = "main"
devBranch = "dev"

### Import libraries

In [None]:
import os
import lakefs
from assets.lakefs_demo import print_commit, print_diff

### Set environment variables

In [None]:
# Publish the lakeFS endpoint and credentials as LAKECTL_* environment
# variables in a single update.
# NOTE(review): presumably these are what lakefs.client.Client() reads when it
# is constructed without arguments in the verification cell - confirm.
os.environ.update(
    {
        "LAKECTL_SERVER_ENDPOINT_URL": lakefsEndPoint,
        "LAKECTL_CREDENTIALS_ACCESS_KEY_ID": lakefsAccessKey,
        "LAKECTL_CREDENTIALS_SECRET_ACCESS_KEY": lakefsSecretKey,
    }
)

#### Verify lakeFS credentials by getting lakeFS version

In [None]:
# Smoke-test the connection settings by asking the lakeFS server for its version.
print("Verifying lakeFS credentials…")
try:
    v=lakefs.client.Client().version
except Exception as err:
    # Catch Exception rather than using a bare `except:` so that interrupting
    # the kernel (KeyboardInterrupt) is not reported as a credentials failure,
    # and show the underlying error so the cause can be diagnosed.
    print("🛑 failed to get lakeFS version")
    print(f"   {type(err).__name__}: {err}")
else:
    print(f"…✅lakeFS credentials verified\n\nℹ️lakeFS version {v}")

### Define lakeFS Repository

In [None]:
# Create the repository under "<storageNamespace>/<repo_name>" with mainBranch
# as its default branch.
# NOTE(review): exist_ok=True presumably makes this cell safe to re-run when
# the repository already exists - confirm against the lakeFS SDK docs.
repo = lakefs.Repository(repo_name).create(
    storage_namespace=f"{storageNamespace}/{repo_name}",
    default_branch=mainBranch,
    exist_ok=True,
)

# Handle on the default branch; used later to commit the initial data load.
branchMain = repo.branch(mainBranch)
print(repo)

In [None]:
# Spark talks to lakeFS through the S3A filesystem (its endpoint is set to the
# lakeFS server in the Spark configuration), so the data root is the
# repository name under the s3a:// scheme; branch names are appended later.
# The repository name is used verbatim: rewriting "s3" inside it (as the
# previous .replace('s3', 's3a') did) would silently point Spark at a
# different repository whenever the name happens to contain "s3".
data_dir = f"s3a://{repo_name}"
print(f"Using {data_dir} for data storage")

### Set up Spark

In [None]:
from pyspark.sql import SparkSession

# Build the Spark session used for the first half of the demo:
#   * S3A filesystem settings route all s3/s3a traffic to the lakeFS endpoint
#     using the lakeFS credentials and path-style addressing.
#   * The Iceberg runtime and SQL extensions are loaded, and a Hadoop-type
#     Iceberg catalog called "local" is made the default catalog.
#   * The catalog warehouse is rooted at the main branch of the repository.
spark = (
    SparkSession.builder.appName(f"lakeFS sample / {repo.id}")
    .config("spark.hadoop.fs.s3.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.fs.s3a.endpoint", lakefsEndPoint)
    .config("spark.hadoop.fs.s3a.access.key", lakefsAccessKey)
    .config("spark.hadoop.fs.s3a.secret.key", lakefsSecretKey)
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.jars.packages", "org.apache.iceberg:iceberg-spark-runtime-3.3_2.12:1.3.0")
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
    .config("spark.sql.catalog.local", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.local.type", "hadoop")
    .config("spark.sql.catalog.local.warehouse", f"{data_dir}/{mainBranch}")
    .config("spark.sql.defaultCatalog", "local")
    .getOrCreate()
)
spark.sparkContext.setLogLevel("INFO")

# Leave the session as the cell's last expression so the notebook displays it.
spark

---

---

# Main demo starts here 🚦 👇🏻

## Load test data and write it as an Iceberg table

In [None]:
# Read the film-permits JSON file into a DataFrame, letting Spark infer the
# schema; "multiline" allows JSON records to span several lines.
df = (
    spark.read
    .option("inferSchema", "true")
    .option("multiline", "true")
    .json("/data/nyc_film_permits.json")
)

In [None]:
# Persist the DataFrame as table "permits" in the "nyc" namespace of the
# session's default catalog (the Iceberg "local" catalog configured above).
df.write.saveAsTable("nyc.permits")

In [None]:
# Inspect the columns and extended metadata of the table that was just written.
%sql DESCRIBE EXTENDED nyc.permits

## Commit the data to the `main` branch

In [None]:
# Commit everything written so far to the main branch, recording the author
# and the public source of the dataset as commit metadata.
ref = branchMain.commit(
    message="Initial data load",
    metadata={
        "author": "lakefs",
        "data source": "https://data.cityofnewyork.us/City-Government/Film-Permits/tg4x-b46p",
    },
)

# Display the commit that the returned reference resolves to.
print_commit(ref.get_commit())

## Create branch

In [None]:
# Branch dev off main.
# NOTE(review): exist_ok=True presumably lets the cell be re-run when the
# branch already exists - confirm against the lakeFS SDK docs.
branchDev = repo.branch(devBranch).create(
    source_reference=mainBranch,
    exist_ok=True,
)

# Print the id of the commit the new branch points at.
print(f"{devBranch} ref:", branchDev.get_commit().id)

## Query the table

In [None]:
# Row count of nyc.permits as seen by the current Spark session.
%sql SELECT COUNT(*) FROM nyc.permits

# Stop the Spark session and create a new one to read the dev version of the table

In [None]:
from pyspark.sql import SparkSession

# getOrCreate() hands back the already-running session when one exists, so
# the session whose catalog warehouse points at the main branch must be
# stopped first. Without this the builder below would not produce a fresh
# session and the "local" catalog would keep resolving tables under main.
spark.stop()

# Same configuration as the first session, except that the Iceberg catalog
# warehouse is now rooted at the dev branch of the repository.
spark = (
    SparkSession.builder.appName(f"lakeFS sample / {repo.id}")
    .config("spark.hadoop.fs.s3.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.fs.s3a.endpoint", lakefsEndPoint)
    .config("spark.hadoop.fs.s3a.access.key", lakefsAccessKey)
    .config("spark.hadoop.fs.s3a.secret.key", lakefsSecretKey)
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.jars.packages", "org.apache.iceberg:iceberg-spark-runtime-3.3_2.12:1.3.0")
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
    .config("spark.sql.catalog.local", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.local.type", "hadoop")
    .config("spark.sql.catalog.local.warehouse", f"{data_dir}/{devBranch}")
    .config("spark.sql.defaultCatalog", "local")
    .getOrCreate()
)
spark.sparkContext.setLogLevel("INFO")

# Leave the session as the cell's last expression so the notebook displays it.
spark

In [None]:
# Repeat the row count, this time through the session built for the dev branch.
%sql SELECT COUNT(*) FROM nyc.permits

In [None]:
# Inspect the columns and extended metadata of nyc.permits from this session.
%sql DESCRIBE EXTENDED nyc.permits

In [None]:
# List the databases (namespaces) visible in the current catalog.
%sql show databases

In [None]:
# List the tables in the nyc database.
%sql show tables from nyc

In [None]:
# Show the table properties recorded for nyc.permits.
%sql show tblproperties nyc.permits

In [None]:
# Describe nyc.permits again, using the DESCRIBE TABLE EXTENDED form.
%sql DESCRIBE TABLE EXTENDED nyc.permits

In [None]:
# Re-read the film-permits JSON file with the current Spark session, letting
# Spark infer the schema; "multiline" allows records to span several lines.
df = (
    spark.read
    .option("inferSchema", "true")
    .option("multiline", "true")
    .json("/data/nyc_film_permits.json")
)

In [None]:
# Write a second table through the current session's default catalog; the
# table name records that it is meant to land under the dev branch warehouse.
df.write.saveAsTable("nyc.permits_written_to_dev")

## More Questions?

###### Join the lakeFS Slack group - https://lakefs.io/slack