<img src="./images/logo.svg" alt="lakeFS logo" width="300"/>

# Integration of lakeFS with Spark and Python

Use Case: Isolated Testing Environment

Access lakeFS using the S3A gateway

## Config

### lakeFS endpoint and credentials

Change these if you are using a lakeFS server other than the one provided in the samples repo.

In [None]:
# URL of the lakeFS server,
# e.g. 'https://username.aws_region_name.lakefscloud.io'
lakefsEndPoint = "http://lakefs:8000"

# Key pair used both by the lakeFS API client and by Spark's S3A settings.
lakefsAccessKey = "AKIAIOSFODNN7EXAMPLE"
lakefsSecretKey = "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY"

### Storage Information

If you're not using the lakeFS server from the samples repo, change the Storage Namespace to a location in the bucket you’ve configured. The storage namespace is a location in the underlying storage where data for this repository will be stored.

In [None]:
# Base location in the underlying object store, e.g. "s3://bucket".
# The repository name is appended to it when the repository is created.
storageNamespace = "s3://example"

In [None]:
# Name of the lakeFS repository used throughout this notebook.
repo_name = 'spark-demo'

## Setup

### Create lakeFSClient

In [None]:
import lakefs_client
from lakefs_client.models import *
from lakefs_client.client import LakeFSClient

# Build the client configuration: server address first, then the key pair
# (access key as username, secret key as password).
configuration = lakefs_client.Configuration()
configuration.host = lakefsEndPoint
configuration.username = lakefsAccessKey
configuration.password = lakefsSecretKey

# Client object through which the following cells call the lakeFS API.
lakefs = LakeFSClient(configuration)

### Define lakeFS Repository

In [None]:
# `os` is imported here because the error paths below call os._exit, and no
# earlier cell in this notebook imports it.
import os

from lakefs_client.exceptions import NotFoundException

# Look the repository up by name; create it only when lakeFS reports that it
# does not exist. Any other API error is printed and the process is terminated.
try:
    repo = lakefs.repositories.get_repository(repo_name)
    print(f"Found existing repo {repo.id} using storage namespace {repo.storage_namespace}")
except NotFoundException:
    # This handler must stay ahead of the generic ApiException handler so that
    # a missing repository leads to creation rather than termination.
    print(f"Repository {repo_name} does not exist, so going to try and create it now.")
    try:
        # The repository's data is stored under <storageNamespace>/<repo_name>.
        repo = lakefs.repositories.create_repository(
            repository_creation=RepositoryCreation(
                name=repo_name,
                storage_namespace=f"{storageNamespace}/{repo_name}"))
        print(f"Created new repo {repo.id} using storage namespace {repo.storage_namespace}")
    except lakefs_client.ApiException as e:
        print(f"Error creating repo {repo_name}. Error is {e}")
        os._exit(0)
except lakefs_client.ApiException as e:
    print(f"Error getting repo {repo_name}: {e}")
    os._exit(0)

### Set up Spark

In [None]:
from pyspark.sql import SparkSession

# Configure Spark's S3A filesystem with the lakeFS endpoint and key pair, so
# that paths read and written through S3A are served by lakeFS.
spark = (
    SparkSession.builder.appName("lakeFS / Jupyter")
    .config("spark.hadoop.fs.s3.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.fs.s3a.endpoint", lakefsEndPoint)
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.access.key", lakefsAccessKey)
    .config("spark.hadoop.fs.s3a.secret.key", lakefsSecretKey)
    .getOrCreate()
)
spark.sparkContext.setLogLevel("INFO")

# Bare expression so the notebook shows the session as the cell output.
spark

## Versioning Information 

In [None]:
# Branch that receives the uploaded file and, later, the merge.
sourceBranch = 'main'
# Branch created from sourceBranch for the experiment.
newBranch = 'experiment01'
# Path on the new branch where the partitioned output is written.
newPath = 'partitioned_data'
# File read from /data and uploaded under the same name.
fileName = 'lakefs_test.csv'

## Upload a file

In [None]:
import os

# Upload one local file to the source branch. The file object is passed via
# the parameter named "content"; only a single file is sent per upload call.
# The `with` block makes sure the file handle is closed once the upload
# finishes, including when the call raises.
with open(f"/data/{fileName}", 'rb') as contentToUpload:
    uploadResult = lakefs.objects.upload_object(
        repository=repo.id,
        branch=sourceBranch,
        path=fileName, content=contentToUpload)

# Bare expression so the notebook still shows the API response as cell output.
uploadResult

## Commit changes and attach some metadata

In [None]:
# Describe the commit (message plus free-form metadata), then record it on
# the source branch.
commitDetails = CommitCreation(
    message='Added my first file!',
    metadata={'using': 'python_api'},
)
lakefs.commits.commit(
    repository=repo.id,
    branch=sourceBranch,
    commit_creation=commitDetails,
)

## Reading data by using S3A Gateway

In [None]:
# Objects are addressed through the gateway as s3a://<repository>/<branch>/<path>.
dataPath = "s3a://{}/{}/{}".format(repo.id, sourceBranch, fileName)
print("Reading CSV from {}".format(dataPath))

# Load the uploaded CSV into a DataFrame and print a preview of it.
df = spark.read.csv(dataPath)
df.show()

# Experimentation Starts

## List the repository branches by using lakeFS Python client API

In [None]:
from tabulate import tabulate

# Fetch the repository's branches and lazily turn each one into a table row
# of [branch id, commit id].
branchListing = lakefs.branches.list_branches(repository=repo.id)
results = ([branch.id, branch.commit_id] for branch in branchListing.results)

print(tabulate(results, headers=['id', 'commit_id']))

## Create a new branch

In [None]:
# Create the experiment branch from the source branch.
branchRequest = BranchCreation(name=newBranch, source=sourceBranch)
lakefs.branches.create_branch(
    repository=repo.id,
    branch_creation=branchRequest,
)

## Partition the data and write to new branch by using S3A Gateway

In [None]:
# Destination on the new branch: s3a://<repository>/<branch>/<path>.
newDataPath = "s3a://{}/{}/{}".format(repo.id, newBranch, newPath)

# Write the DataFrame as CSV, partitioned by the "_c0" column.
# NOTE(review): "_c0" is presumably the auto-generated name of the first
# column, since the CSV was read without a header option — confirm.
partitionedWriter = df.write.partitionBy("_c0")
partitionedWriter.csv(newDataPath)

## Diffing a single branch will show all the uncommitted changes on that branch

In [None]:
from tabulate import tabulate

# Fetch the diff for the new branch and lazily turn each entry into a table row.
branchDiff = lakefs.branches.diff_branch(repository=repo.id, branch=newBranch)
results = (
    [entry.path, entry.path_type, entry.size_bytes, entry.type]
    for entry in branchDiff.results
)

print(tabulate(results, headers=['Path', 'Path Type', 'Size(Bytes)', 'Type']))

## Commit changes and attach some metadata

In [None]:
# Describe the commit (message plus free-form metadata), then record it on
# the new branch.
commitDetails = CommitCreation(
    message='Partitioned CSV file!',
    metadata={'using': 'python_api'},
)
lakefs.commits.commit(
    repository=repo.id,
    branch=newBranch,
    commit_creation=commitDetails,
)

## Diff between the new branch and the source branch

In [None]:
from tabulate import tabulate

# Compare the source branch (left) with the new branch (right) and lazily
# turn each diff entry into a table row.
refsDiff = lakefs.refs.diff_refs(
    repository=repo.id,
    left_ref=sourceBranch,
    right_ref=newBranch,
)
results = (
    [entry.path, entry.path_type, entry.size_bytes, entry.type]
    for entry in refsDiff.results
)

print(tabulate(results, headers=['Path', 'Path Type', 'Size(Bytes)', 'Type']))

# Experimentation Completes

## Option A: Experimentation fails, so just delete the new branch

In [None]:
# Delete the experiment branch from the repository.
lakefs.branches.delete_branch(repository=repo.id, branch=newBranch)

## Option B: Experimentation succeeds, so merge new branch to the main branch (atomic promotion to production)

In [None]:
# Merge the experiment branch into the source branch.
lakefs.refs.merge_into_branch(
    repository=repo.id,
    source_ref=newBranch,
    destination_branch=sourceBranch,
)

## Diff between the new branch and the source branch

In [None]:
from tabulate import tabulate

# Compare the source branch (left) with the new branch (right) again and
# lazily turn each diff entry into a table row.
refsDiff = lakefs.refs.diff_refs(
    repository=repo.id,
    left_ref=sourceBranch,
    right_ref=newBranch,
)
results = (
    [entry.path, entry.path_type, entry.size_bytes, entry.type]
    for entry in refsDiff.results
)

print(tabulate(results, headers=['Path', 'Path Type', 'Size(Bytes)', 'Type']))

## If you merged the new branch to the main branch then you can atomically roll back all changes

In [None]:
# Revert the commit that the source branch currently points to.
# NOTE(review): parent_number=1 presumably selects the first parent when the
# reverted commit is a merge commit — confirm against the lakeFS API docs.
revertRequest = RevertCreation(ref=sourceBranch, parent_number=1)
lakefs.branches.revert_branch(
    repository=repo.id,
    branch=sourceBranch,
    revert_creation=revertRequest,
)

## More Questions?

###### Join the lakeFS Slack group - https://lakefs.io/slack