<img src="https://docs.lakefs.io/assets/logo.svg" alt="lakeFS logo" width=300/> 

# Integration of lakeFS with Spark and Python

## Use Case: Isolated Testing Environment

## Access lakeFS using the S3A gateway

## Config

### lakeFS endpoint and credentials

Change these if you are using a lakeFS server other than the one provided in the samples repo.

In [1]:
# Connection settings for the lakeFS server. The same three values are used
# later for both the lakeFS API client and Spark's S3A configuration.
# The key pair below is a sample one; do not commit real credentials here.
lakefsEndPoint = 'http://lakefs:8000' # e.g. 'https://username.aws_region_name.lakefscloud.io' 
lakefsAccessKey = 'AKIAIOSFODNN7EXAMPLE'
lakefsSecretKey = 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY'

### Storage Information

If you're not using the lakeFS server from the samples repo, change the Storage Namespace to a location in the bucket you’ve configured. The storage namespace is a location in the underlying storage where data for this repository will be stored.

In [2]:
# Location in the underlying object store where the repository's data is kept.
storageNamespace = 's3://example/spark-demo/' # e.g. "s3://bucket/path-for-lakefs-content"

## Setup

### Configuring lakeFSClient and Spark

In [21]:
import lakefs_client
from lakefs_client.models import *
from lakefs_client.client import LakeFSClient

# lakeFS credentials and endpoint
# The access key / secret key pair is supplied as the client's
# username / password, and the endpoint as its host.
configuration = lakefs_client.Configuration()
configuration.username = lakefsAccessKey
configuration.password = lakefsSecretKey
configuration.host = lakefsEndPoint

# API client used by every lakeFS call in the rest of the notebook.
lakefs = LakeFSClient(configuration)

### Define lakeFS Repository

_If the repository does not exist yet, the cell below will try to create it for you._

In [22]:
# Name of the lakeFS repository this notebook looks up (and creates if missing).
repo_name = "spark-demo"

In [23]:
from lakefs_client.exceptions import NotFoundException

# Look the repository up first and create it only when lakeFS reports that it
# does not exist. Any other API failure is printed and then re-raised, so the
# notebook stops at this cell with the real error instead of killing the kernel.
try:
    repo = lakefs.repositories.get_repository(repo_name)
    print(f"Found existing repo {repo.id} using storage namespace {repo.storage_namespace}")
except NotFoundException:
    print(f"Repository {repo_name} does not exist, so going to try and create it now.")
    try:
        repo = lakefs.repositories.create_repository(
            repository_creation=RepositoryCreation(
                name=repo_name,
                # Use the namespace configured in the "Storage Information"
                # cell rather than a hard-coded bucket; the trailing slash is
                # dropped so the default resolves to s3://example/spark-demo.
                storage_namespace=storageNamespace.rstrip('/')))
        print(f"Created new repo {repo.id} using storage namespace {repo.storage_namespace}")
    except lakefs_client.ApiException as e:
        print(f"Error creating repo {repo_name}. Error is {e}")
        raise
except lakefs_client.ApiException as e:
    print(f"Error getting repo {repo_name}: {e}")
    raise

Repository spark-demo does not exist, so going to try and create it now.
Created new repo spark-demo using storage namespace s3://example/spark-demo


In [24]:
from pyspark.sql import SparkSession

# Hadoop/S3A settings that point Spark at the lakeFS endpoint, using the
# lakeFS key pair as the S3 credentials.
s3aGatewaySettings = {
    "spark.hadoop.fs.s3.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",
    "spark.hadoop.fs.s3a.endpoint": lakefsEndPoint,
    "spark.hadoop.fs.s3a.path.style.access": "true",
    "spark.hadoop.fs.s3a.access.key": lakefsAccessKey,
    "spark.hadoop.fs.s3a.secret.key": lakefsSecretKey,
}

sessionBuilder = SparkSession.builder.appName("lakeFS / Jupyter")
for settingName, settingValue in s3aGatewaySettings.items():
    sessionBuilder = sessionBuilder.config(settingName, settingValue)

spark = sessionBuilder.getOrCreate()
spark.sparkContext.setLogLevel("INFO")

# Leave the session as the last expression so the notebook renders its summary.
spark

## Versioning Information 

In [25]:
# Branch the sample file is uploaded and committed to; also the source of the new branch.
sourceBranch = "main"
# Branch created for the experiment.
newBranch = "experiment1"
# Path on the new branch where the partitioned output is written.
newPath = "partitioned_data"
# File read from /data and uploaded to lakeFS under the same name.
fileName = "lakefs_test.csv"

## Upload a file

In [26]:
import os
# Only a single file can be uploaded per call; it is passed as the `content` argument.
# The context manager closes the local file handle once the upload has finished.
with open(f"/data/{fileName}", 'rb') as contentToUpload:
    uploadResult = lakefs.objects.upload_object(
        repository=repo.id,
        branch=sourceBranch,
        path=fileName, content=contentToUpload)

# Last expression of the cell, so the notebook still displays the upload result.
uploadResult

{'checksum': 'afe805e5a3c3ec7fa05645a6a2a6e607',
 'content_type': 'text/csv',
 'mtime': 1685028215,
 'path': 'lakefs_test.csv',
 'path_type': 'object',
 'physical_address': 's3://example/spark-demo/data/gnfodhqge7ks77e4jro0/chnnqtqge7ks77e4jsi0',
 'size_bytes': 9}

## Commit changes and attach some metadata

In [27]:
# Describe the commit first, then record it on the source branch.
firstCommit = CommitCreation(
    message='Added my first file!',
    metadata={'using': 'python_api'},
)

lakefs.commits.commit(repository=repo.id, branch=sourceBranch, commit_creation=firstCommit)

{'committer': 'everything-bagel',
 'creation_date': 1685028218,
 'id': '2df3f13a80a80d9e16c81452c62ea19400b01022ca22e4195c1a13b6c8a630e5',
 'message': 'Added my first file!',
 'meta_range_id': '',
 'metadata': {'using': 'python_api'},
 'parents': ['b83bc0359451bb83accda9aeb0325daf16eca1725fd5ea0e18c5c3c5dc89d166']}

## Reading data by using S3A Gateway

In [28]:
# Path layout used here: s3a://<repository>/<branch>/<object path>.
dataPath = f"s3a://{repo.id}/{sourceBranch}/{fileName}"
print(f"Reading CSV from {dataPath}")
# No reader options are passed, so the columns show up as _c0, _c1, ... below.
df = spark.read.csv(dataPath)
df.show()

Reading CSV from s3a://spark-demo/main/lakefs_test.csv
+---+---+---+---+---+
|_c0|_c1|_c2|_c3|_c4|
+---+---+---+---+---+
|  1|  2|  3|  4|  5|
+---+---+---+---+---+



# Experimentation Starts

## List the repository branches by using lakeFS Python client API

In [29]:
from tabulate import tabulate

# One table row per branch: its name and the commit it currently points to.
branchListing = lakefs.branches.list_branches(repository=repo.id)
results = [[branch.id, branch.commit_id] for branch in branchListing.results]

print(tabulate(results, headers=['id','commit_id']))

id    commit_id
----  ----------------------------------------------------------------
main  2df3f13a80a80d9e16c81452c62ea19400b01022ca22e4195c1a13b6c8a630e5


## Create a new branch

In [30]:
# The experiment branch starts from the current state of the source branch.
experimentBranch = BranchCreation(name=newBranch, source=sourceBranch)

lakefs.branches.create_branch(repository=repo.id, branch_creation=experimentBranch)

'2df3f13a80a80d9e16c81452c62ea19400b01022ca22e4195c1a13b6c8a630e5'

## Partition the data and write to new branch by using S3A Gateway

In [31]:
# Destination on the experiment branch, addressed as s3a://<repository>/<branch>/<path>.
newDataPath = f"s3a://{repo.id}/{newBranch}/{newPath}"

# Write the DataFrame back as CSV, partitioned by its first column (_c0).
df.write.partitionBy("_c0").csv(newDataPath)

## Diffing a single branch will show all the uncommitted changes on that branch

In [32]:
from tabulate import tabulate

# Uncommitted changes on the experiment branch, one table row per changed object.
uncommittedDiff = lakefs.branches.diff_branch(repository=repo.id, branch=newBranch)
results = [
    [change.path, change.path_type, change.size_bytes, change.type]
    for change in uncommittedDiff.results
]

print(tabulate(results, headers=['Path','Path Type','Size(Bytes)','Type']))

Path                                                                             Path Type      Size(Bytes)  Type
-------------------------------------------------------------------------------  -----------  -------------  ------
partitioned_data/_SUCCESS                                                        object                   8  added
partitioned_data/_c0=1/part-00000-8baf51ac-521c-4644-a534-ea76530facf4.c000.csv  object                   8  added


## Commit changes and attach some metadata

In [33]:
# Describe the commit first, then record it on the experiment branch.
partitionCommit = CommitCreation(
    message='Partitioned CSV file!',
    metadata={'using': 'python_api'},
)

lakefs.commits.commit(repository=repo.id, branch=newBranch, commit_creation=partitionCommit)

{'committer': 'everything-bagel',
 'creation_date': 1685028240,
 'id': 'd42951db69bb460da47455083cecbaffcac4f8a774238b6831437100f6a20f60',
 'message': 'Partitioned CSV file!',
 'meta_range_id': '',
 'metadata': {'using': 'python_api'},
 'parents': ['2df3f13a80a80d9e16c81452c62ea19400b01022ca22e4195c1a13b6c8a630e5']}

## Diff between the new branch and the source branch

In [34]:
from tabulate import tabulate

# Differences between the source branch (left) and the experiment branch (right).
branchDiff = lakefs.refs.diff_refs(
    repository=repo.id,
    left_ref=sourceBranch,
    right_ref=newBranch,
)
results = [
    [change.path, change.path_type, change.size_bytes, change.type]
    for change in branchDiff.results
]

print(tabulate(results, headers=['Path','Path Type','Size(Bytes)','Type']))

Path                                                                             Path Type      Size(Bytes)  Type
-------------------------------------------------------------------------------  -----------  -------------  ------
partitioned_data/_SUCCESS                                                        object                   8  added
partitioned_data/_c0=1/part-00000-8baf51ac-521c-4644-a534-ea76530facf4.c000.csv  object                   8  added


# Experimentation Completes

## Option A: Experimentation fails, so just delete the new branch

In [39]:
# Option A: delete the experiment branch. Run this OR the merge cell below,
# not both, because the merge uses this branch as its source.
lakefs.branches.delete_branch(
    repository=repo.id,
    branch=newBranch)

## Option B: Experimentation succeeds, so merge new branch to the main branch (atomic promotion to production)

In [38]:
# Option B: merge the experiment branch into the source branch.
# NOTE(review): the recorded output shows lakeFS answering 400 "no changes"
# for this call; presumably a re-run after an earlier merge — confirm.
lakefs.refs.merge_into_branch(
    repository=repo.id,
    source_ref=newBranch, 
    destination_branch=sourceBranch)

ApiException: (400)
Reason: Bad Request
HTTP response headers: HTTPHeaderDict({'Content-Type': 'application/json', 'X-Content-Type-Options': 'nosniff', 'X-Request-Id': '2a5a3d74-4b18-4b65-9f87-1562b9c6154f', 'Date': 'Thu, 25 May 2023 15:24:31 GMT', 'Content-Length': '45'})
HTTP response body: {"message":"update branch main: no changes"}



## Diff between the new branch and the source branch

In [36]:
from tabulate import tabulate

# Differences between the source branch (left) and the experiment branch (right).
postMergeDiff = lakefs.refs.diff_refs(
    repository=repo.id,
    left_ref=sourceBranch,
    right_ref=newBranch,
)
results = [
    [change.path, change.path_type, change.size_bytes, change.type]
    for change in postMergeDiff.results
]

print(tabulate(results, headers=['Path','Path Type','Size(Bytes)','Type']))

Path    Path Type    Size(Bytes)    Type
------  -----------  -------------  ------


## If you merged the new branch into the main branch, you can atomically roll back all changes

In [37]:
# Revert the commit the source branch currently points to.
# NOTE(review): parent_number=1 presumably selects the first parent of the
# merge commit as the side to revert against — confirm in the lakeFS API docs.
rollbackRequest = RevertCreation(ref=sourceBranch, parent_number=1)

lakefs.branches.revert_branch(
    repository=repo.id,
    branch=sourceBranch,
    revert_creation=rollbackRequest,
)

## More Questions?

###### Join the lakeFS Slack group - https://lakefs.io/slack