# [Integration of lakeFS with Delta Lake](https://docs.lakefs.io/integrations/delta.html)

## Use Case: Isolated Testing Environment & Rollback of multi-table transactions

## Change your lakeFS credentials

In [None]:
# Connection settings for the lakeFS server. Replace all three placeholders
# before running the remaining cells.
lakefsEndPoint = "<lakeFS Endpoint URL>"  # e.g. 'https://username.aws_region_name.lakefscloud.io'
lakefsAccessKey = "<lakeFS Access Key>"
lakefsSecretKey = "<lakeFS Secret Key>"

## Storage Information
#### Change the Storage Namespace to a location in the bucket you’ve configured. The storage namespace is a location in the underlying storage where data for this repository will be stored.

In [None]:
# Location in the underlying object store that is handed to lakeFS as the
# repository's storage namespace when the repository is created.
storageNamespace = 's3://<S3 Bucket Name>/' # e.g. "s3://username-lakefs-cloud/"

## Versioning Information

In [None]:
# Branch the tables are first written to, and the branch used for the experiment.
sourceBranch, newBranch = "main", "experiment2"
# Names of the two Delta tables; each is used as the last path segment
# under a branch (s3a://<repo>/<branch>/<table>).
DeltaTable1, DeltaTable2 = "delta-table1", "delta-table2"

## Working with the lakeFS Python client API

In [None]:
import lakefs_client
from lakefs_client import models
from lakefs_client.client import LakeFSClient

# Build the API client from the lakeFS endpoint and the access/secret key pair
# (the key pair is supplied as the username/password of the configuration).
configuration = lakefs_client.Configuration(
    host=lakefsEndPoint,
    username=lakefsAccessKey,
    password=lakefsSecretKey,
)

client = LakeFSClient(configuration)

## You can change the lakeFS repo name (use an existing repo or provide a new repo name)

In [None]:
# Name of the lakeFS repository; it is also the first path segment of every
# s3a:// table path built in this notebook.
repo = "my-repo"

## If the repo mentioned above already exists on your lakeFS server, skip the following step; otherwise, create a new repo:

In [None]:
# Describe the repository: its name, where its data is stored, and which
# branch is created as the default one.
new_repo = models.RepositoryCreation(
    name=repo,
    storage_namespace=storageNamespace,
    default_branch=sourceBranch,
)
client.repositories.create_repository(repository_creation=new_repo)

## Run PySpark with the Delta Lake package and additional configurations

In [None]:
import os

# Arguments PySpark reads from the environment when it launches: pull in the
# Delta Lake package and register the Delta SQL extension and catalog.
_submit_args = [
    '--packages io.delta:delta-core_2.12:2.0.0',
    '--conf "spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension"',
    '--conf "spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog"',
    'pyspark-shell',
]
os.environ['PYSPARK_SUBMIT_ARGS'] = ' '.join(_submit_args)

## S3A Gateway configuration

In [None]:
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
sc = SparkContext.getOrCreate()
spark = SparkSession(sc)

# Point the S3A filesystem at the lakeFS endpoint, authenticate with the
# lakeFS key pair, and enable path-style access.
hadoop_conf = sc._jsc.hadoopConfiguration()
for s3a_option, s3a_value in (
    ("fs.s3a.access.key", lakefsAccessKey),
    ("fs.s3a.secret.key", lakefsSecretKey),
    ("fs.s3a.endpoint", lakefsEndPoint),
    ("fs.s3a.path.style.access", "true"),
):
    hadoop_conf.set(s3a_option, s3a_value)

## Create a Delta Table in source branch

In [None]:
# Table path layout: s3a://<repo>/<branch>/<table>
dataPath1 = f"s3a://{repo}/{sourceBranch}/{DeltaTable1}"
# Single-column DataFrame with ids 0..4, written as a Delta table
# (replacing whatever is already at that path).
data = spark.range(0, 5)
data.write.save(dataPath1, format="delta", mode="overwrite")

## Create another Delta Table in source branch

In [None]:
# Table path layout: s3a://<repo>/<branch>/<table>
dataPath2 = f"s3a://{repo}/{sourceBranch}/{DeltaTable2}"
# Single-column DataFrame with ids 10..19, written as a Delta table
# (replacing whatever is already at that path).
data = spark.range(10, 20)
data.write.save(dataPath2, format="delta", mode="overwrite")

## Read from Delta Tables

In [None]:
# Load the first Delta table from its current path and print its rows.
df = spark.read.load(dataPath1, format="delta")
df.show()

In [None]:
# Load the second Delta table from its current path and print its rows.
df = spark.read.load(dataPath2, format="delta")
df.show()

## Commit changes and attach some metadata

In [None]:
# Commit message plus free-form key/value metadata recorded with the commit.
commit_details = models.CommitCreation(
    message='Added delta tables!',
    metadata={'using': 'python_api'},
)
client.commits.commit(repository=repo, branch=sourceBranch, commit_creation=commit_details)

# Experimentation Starts

## List the repository branches by using lakeFS Python client API

In [None]:
# Fetch the branch listing for the repository and display its entries.
branch_listing = client.branches.list_branches(repository=repo)
branch_listing.results

## Create a new branch

In [None]:
# The experiment branch is created from sourceBranch.
experiment_branch = models.BranchCreation(name=newBranch, source=sourceBranch)
client.branches.create_branch(repository=repo, branch_creation=experiment_branch)

## Update 1st Delta Table

In [None]:
# Import only the names this notebook uses. A wildcard import of
# pyspark.sql.functions would shadow Python builtins such as sum, min and max
# in every later cell.
from delta.tables import DeltaTable
from pyspark.sql.functions import expr

# Address the table through the experiment branch, so the update is written
# under newBranch rather than sourceBranch.
dataPath1 = f"s3a://{repo}/{newBranch}/{DeltaTable1}"
deltaTable = DeltaTable.forPath(spark, dataPath1)

# Update every even value by adding 100 to it
deltaTable.update(
    condition=expr("id % 2 == 0"),
    set={"id": expr("id + 100")},
)

## Read from updated Delta Table

In [None]:
# Load the first Delta table from the path set in the update cell and print its rows.
df = spark.read.load(dataPath1, format="delta")
df.show()

## Update 2nd Delta Table

In [None]:
# Address the table through the experiment branch, so the update is written
# under newBranch rather than sourceBranch.
dataPath2 = f"s3a://{repo}/{newBranch}/{DeltaTable2}"
deltaTable = DeltaTable.forPath(spark, dataPath2)

# Update every odd value by adding 200 to it
odd_ids = expr("id % 2 == 1")
deltaTable.update(condition=odd_ids, set={"id": expr("id + 200")})

## Read from updated Delta Table

In [None]:
# Load the second Delta table from the path set in the update cell and print its rows.
df = spark.read.load(dataPath2, format="delta")
df.show()

## Commit changes and attach some metadata

In [None]:
# One commit on the experiment branch records the updates to both tables.
commit_details = models.CommitCreation(
    message='Updated multiple Delta Tables',
    metadata={'using': 'python_api'},
)
client.commits.commit(repository=repo, branch=newBranch, commit_creation=commit_details)

## Diff between the new branch and the source branch

In [None]:
# Compare the two branches (left = sourceBranch, right = newBranch) and
# display the resulting entries.
branch_diff = client.refs.diff_refs(repository=repo, left_ref=sourceBranch, right_ref=newBranch)
branch_diff.results

# Experimentation Completes

## Either delete the new branch or merge it into the source branch (run only one of the next two cells)

## Delete new branch

In [None]:
# Option 1: drop the experiment branch. Skip this cell if you intend to merge
# instead, because the merge cell uses newBranch as its source.
client.branches.delete_branch(
    repository=repo,
    branch=newBranch,
)

## Or merge new branch to source branch

In [None]:
# Option 2: merge the experiment branch into sourceBranch. The displayed
# result is what the revert step further down refers to as the 'reference'.
client.refs.merge_into_branch(
    repository=repo,
    source_ref=newBranch,
    destination_branch=sourceBranch,
)

## Read Delta Tables from source branch

In [None]:
# Point back at sourceBranch and print the first table as it now stands there.
dataPath1 = f"s3a://{repo}/{sourceBranch}/{DeltaTable1}"
df = spark.read.load(dataPath1, format="delta")
df.show()

In [None]:
# Point back at sourceBranch and print the second table as it now stands there.
dataPath2 = f"s3a://{repo}/{sourceBranch}/{DeltaTable2}"
df = spark.read.load(dataPath2, format="delta")
df.show()

## If you merged the new branch into the source branch, you can revert the committed changes for all Delta Tables on the source branch

### Go to the lakeFS UI and get the commit ID, or copy the 'reference' from the output of the previous merge statement

In [None]:
# Replace the placeholder with the ID of the commit to undo.
commit_id = "<lakeFS Commit Id>"
# NOTE(review): parent_number=1 presumably selects the first parent of a merge
# commit as the side to revert against - confirm against the lakeFS revert API.
revert_request = models.RevertCreation(ref=commit_id, parent_number=1)
client.branches.revert_branch(repository=repo, branch=sourceBranch, revert_creation=revert_request)

## Read Delta Tables again from source branch

In [None]:
# Print the first table on sourceBranch again, after the revert step.
dataPath1 = f"s3a://{repo}/{sourceBranch}/{DeltaTable1}"
df = spark.read.load(dataPath1, format="delta")
df.show()

In [None]:
# Print the second table on sourceBranch again, after the revert step.
dataPath2 = f"s3a://{repo}/{sourceBranch}/{DeltaTable2}"
df = spark.read.load(dataPath2, format="delta")
df.show()