<img src="./images/logo.svg" alt="lakeFS logo" width="300"/>

# lakeFS and Delta Lake diff

This notebook demonstrates using Delta Lake with lakeFS, and shows how to compare (diff) a Delta table across lakeFS branches.

## Config

**_If you're not using the provided lakeFS server and MinIO storage then change these values to match your environment_**

### lakeFS endpoint and credentials

In [None]:
# Endpoint and credentials of the lakeFS server. Later cells use these values
# both for the lakeFS client and for Spark's S3A configuration; change them
# if you are not using the provided lakeFS server.
lakefsEndPoint = 'http://lakefs:8000' # e.g. 'https://username.aws_region_name.lakefscloud.io'
lakefsAccessKey = 'AKIAIOSFOLKFSSAMPLES'
lakefsSecretKey = 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY'

### Object Storage

In [None]:
# Object-store location; the repository's storage namespace is built as
# "<storageNamespace>/<repo_name>" when the repository is created.
storageNamespace = 's3://example' # e.g. "s3://bucket"

---

## Setup

**(you shouldn't need to change anything in this section, just run it)**

In [None]:
# Name of the lakeFS repository used throughout this notebook.
repo_name = "delta-lake-diff"

### Import libraries

In [None]:
import os
import lakefs
from assets.lakefs_demo import print_commit, print_diff

### Set environment variables

In [None]:
# Export the connection settings as LAKECTL_* environment variables in a
# single call (keys are applied in the order listed).
os.environ.update({
    "LAKECTL_SERVER_ENDPOINT_URL": lakefsEndPoint,
    "LAKECTL_CREDENTIALS_ACCESS_KEY_ID": lakefsAccessKey,
    "LAKECTL_CREDENTIALS_SECRET_ACCESS_KEY": lakefsSecretKey,
})

#### Verify lakeFS credentials by getting lakeFS version

In [None]:
# Sanity-check the connection by asking the lakeFS server for its version.
print("Verifying lakeFS credentials…")
try:
    v = lakefs.client.Client().version
except Exception as err:
    # Catch Exception rather than using a bare `except:` so that
    # KeyboardInterrupt / SystemExit still propagate, and show the
    # underlying error so a failure can actually be diagnosed.
    print("🛑 failed to get lakeFS version")
    print(f"{type(err).__name__}: {err}")
else:
    print(f"…✅lakeFS credentials verified\n\nℹ️lakeFS version {v}")

### Define lakeFS Repository

In [None]:
# Create the repository; exist_ok=True is passed so that re-running this
# cell against an already-existing repository is accepted.
repo = lakefs.Repository(repo_name).create(
    storage_namespace=f"{storageNamespace}/{repo_name}",
    default_branch='main',
    exist_ok=True,
)

# Handle on the default branch, used by later cells for commits and diffs.
branchMain = repo.branch('main')

print(repo)

### Set up Spark

In [None]:
from pyspark.sql import SparkSession

# Build a Spark session that talks to lakeFS through the S3A filesystem
# (endpoint and keys are the lakeFS ones) and has the Delta Lake extension,
# catalog and log store configured.
spark = (
    SparkSession.builder
    .appName("lakeFS / Jupyter")
    # S3A filesystem settings
    .config("spark.hadoop.fs.s3.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.fs.s3a.endpoint", lakefsEndPoint)
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.access.key", lakefsAccessKey)
    .config("spark.hadoop.fs.s3a.secret.key", lakefsSecretKey)
    # Delta Lake settings
    .config("spark.jars.packages", "io.delta:delta-core_2.12:2.3.0")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.delta.logStore.class", "org.apache.spark.sql.delta.storage.S3SingleDriverLogStore")
    .getOrCreate()
)
spark.sparkContext.setLogLevel("INFO")

# Last expression of the cell: lets the notebook render the session summary.
spark

---

# Main demo starts here 🚦 👇🏻

## Load some data into lakeFS

Read a Parquet file from the `/data/userdata` directory

In [None]:
# Load the first sample file. The path is a plain string literal: the
# original `f` prefix had no placeholders and was redundant.
df = spark.read.parquet("/data/userdata/userdata1.parquet")

How many rows of data?

In [None]:
# Number of rows in the freshly loaded DataFrame.
display(df.count())

What does the data look like?

In [None]:
# Print one record in vertical layout. DataFrame.show() prints the rows itself
# and returns None, so wrapping it in display() only added a stray "None" to
# the cell output.
df.show(n=1, vertical=True)

## Write data to lakeFS (on the `main` branch) in Delta format

In [None]:
# Branch that the following write targets.
branch='main'

In [None]:
# Write the DataFrame as a Delta table under <repo>/<branch>/demo/users,
# replacing whatever is already at that path.
df.write.format("delta").mode('overwrite').save(f"s3a://{repo_name}/{branch}/demo/users")

#### 👉🏻[The data as seen from lakeFS](http://localhost:8000/repositories/delta-lake-diff/objects?ref=main&path=demo%2Fusers%2F)

### Commit the new file in `main`

In [None]:
# Commit the uncommitted changes on `main` and print the resulting commit.
ref = branchMain.commit(message='Initial user data load')
print_commit(ref.get_commit())

## Create a branch

In [None]:
# From here on `branch` names the working branch; later cells build table
# paths from it.
branch='modify_user_data'

In [None]:
# Create the working branch from `main`; exist_ok=True is passed so that
# re-running the cell when the branch already exists is accepted.
branchModifyUserData = repo.branch(branch).create(source_reference="main", exist_ok=True)
# Show the ID of the commit the new branch points at.
print(f"{branch} ref:", branchModifyUserData.get_commit().id)

### List the current branches in the repository

In [None]:
# Print the ID of every branch in the repository, one per line.
for existing_branch in repo.branches():
    print(existing_branch.id)

## Add some new data with merge

In [None]:
from delta.tables import *
from pyspark.sql.functions import *

In [None]:
# Load the second sample file. The path is a plain string literal: the
# original `f` prefix had no placeholders and was redundant.
new_df = spark.read.parquet("/data/userdata/userdata2.parquet")

In [None]:
# Open the users Delta table as it exists on the current working branch.
users_deltaTable = DeltaTable.forPath(spark, f"s3a://{repo_name}/{branch}/demo/users")

In [None]:
# Merge the new rows into the table, matching on `id`. Only a not-matched
# clause is given, so rows whose id already exists are left untouched and
# unmatched source rows are inserted.
(
    users_deltaTable.alias("users")
    .merge(source=new_df.alias("new_users"), condition="users.id = new_users.id")
    .whenNotMatchedInsertAll()
    .execute()
)

### Commit in lakeFS

In [None]:
# Commit the merge result on the working branch and print the commit.
ref = branchModifyUserData.commit(message='Merge in new user data')
print_commit(ref.get_commit())

## Update some data

In [None]:
# Handle on the users table on the working branch; reused by the update and
# delete cells below.
deltaTable = DeltaTable.forPath(spark, f"s3a://{repo_name}/{branch}/demo/users")

In [None]:
# Before the update: sample of country / ip_address for users in Portugal or China.
deltaTable.toDF().filter(col("country").isin("Portugal", "China")).select("country","ip_address").show(5)

In [None]:
# Overwrite ip_address for every row whose country is Portugal.
# The value carries its own inner single quotes ('x.x.x.x'), which presumably
# makes it a SQL string literal rather than a column reference; confirm
# against the Delta API docs.
deltaTable.update(
    condition = "country == 'Portugal'",
    set = { "ip_address" : "'x.x.x.x'" })

In [None]:
# After the update: same preview as before, showing up to 10 rows.
deltaTable.toDF().filter(col("country").isin("Portugal", "China")).select("country","ip_address").show(10)

### Commit in lakeFS

In [None]:
# Commit the update on the working branch and print the commit.
ref = branchModifyUserData.commit(message='Mask all IPs for users in Portugal')
print_commit(ref.get_commit())

## Delete some data

In [None]:
# Number of rows with salary above 60000 before the delete.
deltaTable.toDF().filter(col("salary") > 60000).count()

In [None]:
# Delete every row whose salary is above 60000.
deltaTable.delete(col("salary") > 60000)

In [None]:
# Same count as before the delete, re-run to confirm the matching rows are gone.
deltaTable.toDF().filter(col("salary") > 60000).count()

### Commit in lakeFS

In [None]:
# Commit the delete on the working branch and print the commit.
ref = branchModifyUserData.commit(message='Delete users with salary over 60k')
print_commit(ref.get_commit())

## Look at the data and diffs

### File level diffs

In [None]:
# Diff `main` against the working branch and print the result.
diff = branchMain.diff(other_ref=branchModifyUserData)
print_diff(diff)

### Delta operation level diffs

In [None]:
def diff_delta_tables(repository, table_path, ref_a, ref_b):
    """Return the Delta history entries present on ``ref_b`` but not on ``ref_a``.

    Both tables are read from ``s3a://<repository>/<ref>/<table_path>`` using
    the notebook-level ``spark`` session. Two history entries are considered
    the same when their ``version`` and ``timestamp`` both match.

    Args:
        repository: lakeFS repository name (first component of the s3a path).
        table_path: Path of the Delta table inside the ref.
        ref_a: Reference whose history is subtracted (the baseline).
        ref_b: Reference whose extra history entries are returned.

    Returns:
        A DataFrame holding the rows of ``ref_b``'s history that have no
        matching row in ``ref_a``'s history.
    """
    def _history(ref):
        # History DataFrame of the table as seen on the given ref.
        return DeltaTable.forPath(spark, f"s3a://{repository}/{ref}/{table_path}").history()

    baseline = _history(ref_a)
    candidate = _history(ref_b)

    same_entry = (candidate.version == baseline.version) & (candidate.timestamp == baseline.timestamp)
    # Left anti join: keep only candidate rows without a match in the baseline.
    return candidate.join(baseline, same_entry, "leftanti")

In [None]:
# Show the Delta history entries that exist on `modify_user_data` but not on `main`.
diff_delta_tables(repository=repo_name, table_path='/demo/users', ref_a='main', ref_b='modify_user_data').show()

### Record count diffs

In [None]:
from pyspark.sql.types import IntegerType, StringType, StructType, StructField

def delta_table_compare_branches(repository, table_path, refs):
    """Print the row count of a Delta table on each of the given refs.

    Each table is read from ``s3a://<repository>/<ref>/<table_path>`` using
    the notebook-level ``spark`` session, and the counts are shown as a
    two-column table (``Branch``, ``Count``).

    Args:
        repository: lakeFS repository name (first component of the s3a path).
        table_path: Path of the Delta table inside each ref.
        refs: Iterable of ref names (e.g. branch names) to compare.

    Returns:
        None. The comparison is printed via ``DataFrame.show``.
    """
    # Use the `repository` argument for the path. The original read the
    # notebook global `repo.id`, silently ignoring the parameter.
    counts = [
        (ref, spark.read.format('delta').load(f's3a://{repository}/{ref}/{table_path}').count())
        for ref in refs
    ]
    schema = StructType([
        StructField("Branch", StringType(), True),
        StructField("Count", IntegerType(), True),
    ])
    spark.createDataFrame(data=counts, schema=schema).show(truncate=False)

In [None]:
# Refs whose row counts are compared.
refs = ['main', 'modify_user_data']

delta_table_compare_branches(repo_name, '/demo/users', refs)

## More Questions?

###### Join the lakeFS Slack group - https://lakefs.io/slack