## Data As Code - lakeFS basic example 

### Glue session configuration

In [None]:
%stop_session
%session_id_prefix 'iceberg-books-demo'
%idle_timeout 120
%glue_version 4.0
%worker_type G.1X
%number_of_workers 2

%additional_python_modules 'lakefs'
%extra_jars https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-spark-runtime-3.3_2.12/1.3.0/iceberg-spark-runtime-3.3_2.12-1.3.0.jar, https://repo1.maven.org/maven2/io/lakefs/lakefs-iceberg/0.1.3/lakefs-iceberg-0.1.3.jar, https://repo1.maven.org/maven2/io/lakefs/lakefs-spark-extensions_2.12/0.0.3/lakefs-spark-extensions_2.12-0.0.3.jar

%spark_conf spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions,io.lakefs.iceberg.extension.LakeFSSparkSessionExtensions

## Config

**_The values below are placeholders: replace them with your own lakeFS endpoint, credentials, and object-storage namespace before running the notebook._**

### lakeFS endpoint and credentials

In [None]:
# Connection settings for the lakeFS server. The endpoint is reused further
# down as the S3A endpoint and as the Iceberg catalog URI; the key pair is
# reused for both the lakeFS client and the Spark S3A credentials.
# These are sample values: substitute your own and never commit real secrets.
lakefsEndPoint = "https://username.aws_region_name.lakefscloud.io"
lakefsAccessKey = "AKIAIOSFOLKFSSAMPLES"
lakefsSecretKey = "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY"

### Object Storage

In [None]:
# Base object-store URI; the repository's storage namespace is built
# underneath it when the repository is created.
storageNamespace = "s3://example"  # e.g. "s3://bucket"

---

## Setup

**(you shouldn't need to change anything in this section, just run it)**

In [None]:
# Name of the lakeFS repository used throughout the notebook; it is also
# embedded in the storage namespace and in the Iceberg warehouse URI.
repo_name = "data-as-code-repo"

### Versioning Information

In [None]:
# Branch names: `mainBranch` is the repository's default branch and
# `devBranch` is the sandbox branch that is later created from it.
mainBranch = "main"
devBranch = "dev"

### Import libraries

In [None]:
import os
import lakefs
from pyspark.sql.functions import when, col

### Define some helper functions

In [None]:
def print_commit(log):
    """Print a human-readable summary of a commit.

    Args:
        log: Commit-like object exposing the attributes ``message``, ``id``,
            ``committer``, ``creation_date``, ``parents`` and ``metadata``.
            ``creation_date`` is interpreted as a Unix timestamp in seconds
            and rendered in UTC.

    Returns:
        None. The summary is written to standard output.
    """
    from datetime import datetime, timezone
    from pprint import pprint

    # `datetime.utcfromtimestamp` is deprecated since Python 3.12; the
    # timezone-aware call below yields the same UTC wall-clock fields.
    created_at = datetime.fromtimestamp(log.creation_date, tz=timezone.utc)

    print('Message:', log.message)
    print('ID:', log.id)
    print('Committer:', log.committer)
    print('Creation Date:', created_at.strftime('%Y-%m-%d %H:%M:%S'))
    print('Parents:', log.parents)
    print('Metadata:')
    pprint(log.metadata)

### Set environment variables

In [None]:
# Export the connection settings through the LAKECTL_* environment variables
# (presumably where the lakeFS client looks for its configuration — confirm
# against the lakeFS SDK documentation).
os.environ.update({
    "LAKECTL_SERVER_ENDPOINT_URL": lakefsEndPoint,
    "LAKECTL_CREDENTIALS_ACCESS_KEY_ID": lakefsAccessKey,
    "LAKECTL_CREDENTIALS_SECRET_ACCESS_KEY": lakefsSecretKey,
})

#### Verify lakeFS credentials by getting lakeFS version

In [None]:
# Smoke-test the connection by asking the server for its version.
print("Verifying lakeFS credentials…")
try:
    v=lakefs.client.Client().version
except Exception as err:
    # Catch only real errors (a bare `except:` would also swallow
    # KeyboardInterrupt/SystemExit) and show the cause so that a wrong
    # endpoint or key can be diagnosed. The notebook deliberately keeps going.
    print("🛑 failed to get lakeFS version")
    print(f"   {type(err).__name__}: {err}")
else:
    print(f"…✅lakeFS credentials verified\n\nℹ️lakeFS version {v}")

### Define lakeFS Repository

In [None]:
# Create the repository, or reuse it when it already exists (exist_ok=True),
# and keep a handle on its default branch for the commits made later on.
repo_storage_namespace = f"{storageNamespace}/{repo_name}/4/18/a"
repo = lakefs.Repository(repo_name).create(
    storage_namespace=repo_storage_namespace,
    default_branch=mainBranch,
    exist_ok=True,
)
branchMain = repo.branch(mainBranch)
print(repo)

### Set up Spark

In [None]:
# Hadoop/S3A settings: route s3:// and s3a:// access through the lakeFS
# endpoint using the lakeFS key pair.
hadoop_conf = spark.sparkContext._jsc.hadoopConfiguration()
for option_name, option_value in [
    ("fs.s3.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem"),
    ("fs.s3a.secret.key", lakefsSecretKey),
    ("fs.s3a.endpoint", lakefsEndPoint),
    ("fs.s3a.access.key", lakefsAccessKey),
    ("fs.s3a.path.style.access", "true"),
]:
    hadoop_conf.set(option_name, option_value)

# Register an Iceberg catalog named `lakefs`, backed by the lakeFS catalog
# implementation, and make it the session's default catalog.
for option_name, option_value in [
    ("spark.sql.defaultCatalog", "lakefs"),
    ("spark.sql.catalog.lakefs", "org.apache.iceberg.spark.SparkCatalog"),
    ("spark.sql.catalog.lakefs.catalog-impl", "io.lakefs.iceberg.LakeFSCatalog"),
    ("spark.sql.catalog.lakefs.warehouse", f"lakefs://{repo_name}"),
    ("spark.sql.catalog.lakefs.uri", lakefsEndPoint),
    ("spark.sql.catalog.lakefs.cache-enabled", "false"),
]:
    spark.conf.set(option_name, option_value)

---

---

## Create Iceberg tables in the lakeFS catalog on the `main` branch

In [None]:
%%sql

-- Authors table on the main branch (re-created on every run).
CREATE OR REPLACE TABLE main.lakefs_demo.authors (
    id   INT,
    name STRING
) USING iceberg;


In [None]:
%%sql

-- Books table on the main branch; author_id refers to authors.id.
CREATE OR REPLACE TABLE main.lakefs_demo.books (
    id        INT,
    title     STRING,
    author_id INT
) USING iceberg;


In [None]:
%%sql

-- Sales table on the main branch; book_id refers to books.id.
CREATE OR REPLACE TABLE main.lakefs_demo.book_sales (
    id        INT,
    sale_date DATE,
    book_id   INT,
    price     DOUBLE
) USING iceberg;


In [None]:
%%sql

-- Seed the authors table.
-- String literals use standard single quotes, matching the other literals in
-- this notebook; double quotes are only read as strings in Spark's default
-- (non-ANSI) mode.
INSERT INTO main.lakefs_demo.authors (id, name)
VALUES (1, 'J.R.R. Tolkien'), (2, 'George R.R. Martin'),
       (3, 'Agatha Christie'), (4, 'Isaac Asimov'), (5, 'Stephen King');

In [None]:
%%sql

-- Seed the books table (two titles per author).
-- String literals use standard single quotes, matching the other literals in
-- this notebook; double quotes are only read as strings in Spark's default
-- (non-ANSI) mode.
INSERT INTO main.lakefs_demo.books (id, title, author_id)
VALUES (1, 'The Lord of the Rings', 1), (2, 'The Hobbit', 1),
       (3, 'A Song of Ice and Fire', 2), (4, 'A Clash of Kings', 2),
       (5, 'And Then There Were None', 3), (6, 'Murder on the Orient Express', 3),
       (7, 'Foundation', 4), (8, 'I, Robot', 4),
       (9, 'The Shining', 5), (10, 'It', 5);

In [None]:
%%sql

-- Seed the sales table with twenty sales of books 1-5.
INSERT INTO main.lakefs_demo.book_sales (id, sale_date, book_id, price)
VALUES
    ( 1, DATE '2024-04-12', 1, 25.50),
    ( 2, DATE '2024-04-11', 2, 17.99),
    ( 3, DATE '2024-04-10', 3, 12.95),
    ( 4, DATE '2024-04-13', 4, 32.00),
    ( 5, DATE '2024-04-12', 5, 29.99),
    ( 6, DATE '2024-03-15', 1, 23.99),
    ( 7, DATE '2024-02-22', 2, 19.50),
    ( 8, DATE '2024-01-10', 3, 14.95),
    ( 9, DATE '2023-12-05', 4, 28.00),
    (10, DATE '2023-11-18', 5, 27.99),
    (11, DATE '2023-10-26', 2, 18.99),
    (12, DATE '2023-10-12', 1, 22.50),
    (13, DATE '2024-04-09', 3, 11.95),
    (14, DATE '2024-03-28', 4, 35.00),
    (15, DATE '2024-04-05', 5, 31.99),
    (16, DATE '2024-03-01', 1, 27.50),
    (17, DATE '2024-02-14', 2, 21.99),
    (18, DATE '2024-01-07', 3, 13.95),
    (19, DATE '2023-12-20', 4, 29.00),
    (20, DATE '2023-11-03', 5, 28.99);

In [None]:
# Commit the freshly loaded tables on the main branch and show the commit.
initial_commit_metadata = {'author': 'lakefs'}
ref = branchMain.commit(message="Initial data load", metadata=initial_commit_metadata)
print_commit(ref.get_commit())

# Main demo starts here 🚦 👇🏻

## Read my production data from my main branch

In [None]:
%%sql

-- Peek at the authors table on main.
SELECT *
FROM main.lakefs_demo.authors
LIMIT 5;

In [None]:
%%sql

-- Peek at the books table on main.
SELECT *
FROM main.lakefs_demo.books
LIMIT 5;

In [None]:
%%sql

-- Peek at the sales table on main.
SELECT *
FROM main.lakefs_demo.book_sales
LIMIT 5;

## Mess with the data - Create a development sandbox

In [None]:
# Branch `dev` off `main`; exist_ok=True makes re-running this cell harmless.
dev_branch_handle = repo.branch(devBranch)
branchDev = dev_branch_handle.create(source_reference=mainBranch, exist_ok=True)
dev_head_id = branchDev.get_commit().id
print(f"{devBranch} ref:", dev_head_id)

## Read data from my development sandbox

In [None]:
%%sql

-- Same table, read through the dev branch.
SELECT *
FROM dev.lakefs_demo.book_sales
LIMIT 5;

In [None]:
%%sql
-- Total sales per branch, side by side. The label column is named and the
-- sum is rounded to cents, like the other aggregates in this notebook, so
-- floating-point noise from summing DOUBLE prices is not displayed.
SELECT 'Prod' AS environment, ROUND(SUM(price), 2) AS total_sales
FROM main.lakefs_demo.book_sales
UNION ALL
SELECT 'Dev' AS environment, ROUND(SUM(price), 2) AS total_sales
FROM dev.lakefs_demo.book_sales;

In [None]:
%%sql
-- Top three authors by total sales on main.
SELECT
    a.name                    AS author_name,
    ROUND(SUM(sale.price), 2) AS total_sales
FROM main.lakefs_demo.books AS bk
LEFT JOIN main.lakefs_demo.authors AS a
       ON bk.author_id = a.id
LEFT JOIN main.lakefs_demo.book_sales AS sale
       ON bk.id = sale.book_id
GROUP BY a.name
ORDER BY total_sales DESC
LIMIT 3;

## Running pipelines in isolation

### Remove Cancelled Sales

In [None]:
%%sql

-- Drop the cancelled sales on the dev branch only; main is left untouched.
DELETE FROM dev.lakefs_demo.book_sales
WHERE id IN (1, 2, 6, 10, 15);

### Who are my top selling authors?

In [None]:
%%sql

-- Top three authors by total sales on dev (after the deletions).
SELECT
    a.name                    AS author_name,
    ROUND(SUM(sale.price), 2) AS total_sales
FROM dev.lakefs_demo.books AS bk
LEFT JOIN dev.lakefs_demo.authors AS a
       ON bk.author_id = a.id
LEFT JOIN dev.lakefs_demo.book_sales AS sale
       ON bk.id = sale.book_id
GROUP BY a.name
ORDER BY total_sales DESC
LIMIT 3;


### Compare dev and main

In [None]:
%%sql
-- The same ranking computed on main, for comparison with dev.
SELECT
    a.name                    AS author_name,
    ROUND(SUM(sale.price), 2) AS total_sales
FROM main.lakefs_demo.books AS bk
LEFT JOIN main.lakefs_demo.authors AS a
       ON bk.author_id = a.id
LEFT JOIN main.lakefs_demo.book_sales AS sale
       ON bk.id = sale.book_id
GROUP BY a.name
ORDER BY total_sales DESC
LIMIT 3;



## `Data diff`
refs_data_diff is an SQL table-valued function (TVF). The expression:
##### `refs_data_diff(PREFIX, FROM_SCHEMA, TO_SCHEMA, TABLE)`
yields a relation that compares the "from" table `PREFIX.FROM_SCHEMA.TABLE` with the "to" table `PREFIX.TO_SCHEMA.TABLE`. Its output is the difference: a relation (like a view) that adds a single column, `lakefs_change`, to the table schema.

* Rows that appear only in the first version of the table (in the example, on branch `main`) appear in the difference with `lakefs_change == '-'`.
* Rows that appear only in the second version of the table (in the example, on branch `dev`) appear in the difference with `lakefs_change == '+'`.
* Rows that appear in both versions of the table do not appear in the difference.

In [None]:
%%sql

-- Row-level difference of book_sales between main ("from") and dev ("to").
SELECT *
FROM refs_data_diff(
    'lakefs',
    'main',
    'dev',
    'lakefs_demo.book_sales'
);

### Commit dev changes

In [None]:
# Re-acquire the dev branch handle so this cell does not depend on the
# earlier branch-creation cell's variable.
branchDev = repo.branch(devBranch)

# Commit metadata: the author plus a link to the notebook that produced the change.
dev_commit_metadata = {
    'author': 'lakefs',
    '::lakefs::CodeVersion::url[url:ui]': 'http://localhost:8888/lab/workspaces/auto-y/tree/iceberg-books.ipynb',
}
ref = branchDev.commit(message="Removed Cancelled Sales", metadata=dev_commit_metadata)

print_commit(ref.get_commit())

### Merge Changes

In [None]:
# Merge the committed dev changes into main and print whatever
# merge_into returns (presumably a reference to the merge commit — confirm
# against the lakeFS SDK documentation).
res = branchDev.merge_into(branchMain)
print(res)