<img src="./images/logo.svg" alt="lakeFS logo" width=300/> 

# Creating Dev-Test environments with lakeFS branches

## Config

### lakeFS endpoint and credentials

Change these if using lakeFS other than provided in the samples repo. 

In [1]:
lakefsEndPoint = 'http://lakefs:8000' # e.g. 'https://username.aws_region_name.lakefscloud.io' 
lakefsAccessKey = 'AKIAIOSFODNN7EXAMPLE'
lakefsSecretKey = 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY'

### Storage Information

If you're not using sample repo lakeFS, then change the Storage Namespace to a location in the bucket you’ve configured. The storage namespace is a location in the underlying storage where data for this repository will be stored.

In [2]:
storageNamespace = 's3://example' # e.g. "s3://bucket"

In [3]:
repo_name = "netflix"

## Setup

### Configuring lakeFSClient and Spark

In [4]:
import lakefs_client
from lakefs_client.models import *
from lakefs_client.client import LakeFSClient

# lakeFS credentials and endpoint
configuration = lakefs_client.Configuration()
configuration.username = lakefsAccessKey
configuration.password = lakefsSecretKey
configuration.host = lakefsEndPoint

lakefs = LakeFSClient(configuration)

### Define lakeFS Repository

_This should already exist; if not, go and create it through the lakeFS UI_

In [5]:
from lakefs_client.exceptions import NotFoundException

try:
    repo=lakefs.repositories.get_repository(repo_name)
    print(f"Found existing repo {repo.id} using storage namespace {repo.storage_namespace}")
except NotFoundException as f:
    print(f"Repository {repo_name} does not exist, so going to try and create it now.")
    try:
        repo=lakefs.repositories.create_repository(repository_creation=RepositoryCreation(name=repo_name,
                                                                                                storage_namespace=f"{storageNamespace}/{repo_name}"))
        print(f"Created new repo {repo.id} using storage namespace {repo.storage_namespace}")
    except lakefs_client.ApiException as e:
        print(f"Error creating repo {repo_name}. Error is {e}")
        os._exit(00)
except lakefs_client.ApiException as e:
    print(f"Error getting repo {repo_name}: {e}")
    os._exit(00)

Found existing repo netflix using storage namespace s3://example/netflix


In [6]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("lakeFS / Jupyter") \
        .config("spark.hadoop.fs.s3.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
        .config("spark.hadoop.fs.s3a.endpoint", lakefsEndPoint) \
        .config("spark.hadoop.fs.s3a.path.style.access", "true") \
        .config("spark.hadoop.fs.s3a.access.key", lakefsAccessKey) \
        .config("spark.hadoop.fs.s3a.secret.key", lakefsSecretKey) \
        .getOrCreate()
spark.sparkContext.setLogLevel("INFO")

spark

## Creating Ingest and Staging branches

In [7]:
ingest_branch = "ingress-landing-area"
staging_branch = "staging-area"
prod_branch = "main"


In [8]:
lakefs.branches.list_branches(repo_name)


{'pagination': {'has_more': False,
                'max_per_page': 1000,
                'next_offset': '',
                'results': 2},
 'results': [{'commit_id': '3fcdef0716c0ade9ca876136343c93b814ceb670bf87378c6d4b67470e9bb277',
              'id': 'ingress-landing-area'},
             {'commit_id': '3fcdef0716c0ade9ca876136343c93b814ceb670bf87378c6d4b67470e9bb277',
              'id': 'main'}]}

In [9]:
lakefs.branches.create_branch(repository=repo_name, 
                              branch_creation=BranchCreation(name=ingest_branch, 
                                                                    source=prod_branch)
                             )

ApiException: (409)
Reason: Conflict
HTTP response headers: HTTPHeaderDict({'Content-Type': 'application/json', 'X-Content-Type-Options': 'nosniff', 'X-Request-Id': 'ba52ed07-f111-486d-9878-0d9499da9808', 'Date': 'Thu, 25 May 2023 15:45:21 GMT', 'Content-Length': '48'})
HTTP response body: {"message":"branch already exists: not unique"}



In [10]:
lakefs.branches.create_branch(repository=repo_name, 
                              branch_creation=BranchCreation(name=staging_branch, 
                                                                    source=prod_branch)
                             )

'3fcdef0716c0ade9ca876136343c93b814ceb670bf87378c6d4b67470e9bb277'

In [11]:
lakefs.branches.list_branches(repo_name)


{'pagination': {'has_more': False,
                'max_per_page': 1000,
                'next_offset': '',
                'results': 3},
 'results': [{'commit_id': '3fcdef0716c0ade9ca876136343c93b814ceb670bf87378c6d4b67470e9bb277',
              'id': 'ingress-landing-area'},
             {'commit_id': '3fcdef0716c0ade9ca876136343c93b814ceb670bf87378c6d4b67470e9bb277',
              'id': 'main'},
             {'commit_id': '3fcdef0716c0ade9ca876136343c93b814ceb670bf87378c6d4b67470e9bb277',
              'id': 'staging-area'}]}

## Load some sample data about Netflix movies

The daily partition lands in ingress path (branch)

In [12]:
from datetime import date, time

In [13]:
ingest_data = "movies.csv"

ingest_path = f'dt={str(date.today())}/{ingest_data}'
ingest_path


'dt=2023-05-25/movies.csv'

In [14]:
with open(f'/data/{ingest_data}', 'rb') as f:
    lakefs.objects.upload_object(repository=repo_name, 
                                 branch=ingest_branch, 
                                 path=ingest_path, 
                                 content=f
                                )


In [15]:
lakefs.branches.diff_branch(repository=repo_name, 
                            branch=ingest_branch).results


[{'path': 'dt=2023-05-25/movies.csv',
  'path_type': 'object',
  'size_bytes': 1071619,
  'type': 'added'}]

In [16]:
lakefs.commits.commit(repository=repo_name,
                      branch=ingest_branch,
                      commit_creation=CommitCreation(
                          message="netflix movie data arrived at landing area (today's partition)")
                     )

{'committer': 'everything-bagel',
 'creation_date': 1685029538,
 'id': '0793c59b95b7ebd28071fe4e63d7cc8da76d6223b90337d94a86d41db9d8ff20',
 'message': "netflix movie data arrived at landing area (today's partition)",
 'meta_range_id': '',
 'metadata': {},
 'parents': ['3fcdef0716c0ade9ca876136343c93b814ceb670bf87378c6d4b67470e9bb277']}

## Copying daily partition from ingress to staging area (branch)

In [17]:
staging_long_path = f"s3a://{repo_name}/{staging_branch}"
staging_long_path

's3a://netflix/staging-area'

In [18]:
csv_path = f"{staging_long_path}/raw/dt={str(date.today())}/csv"
csv_path

's3a://netflix/staging-area/raw/dt=2023-05-25/csv'

In [19]:
movies_df = spark.read.option("header","true").csv(f"s3a://{repo_name}/{ingest_branch}/{ingest_path}")

In [20]:
movies_df.write.option("header",True)\
        .mode("append")\
        .csv(csv_path)
    

In [21]:
lakefs.branches.diff_branch(repository=repo_name, 
                            branch=staging_branch).results


[{'path': 'raw/dt=2023-05-25/csv/_SUCCESS',
  'path_type': 'object',
  'size_bytes': 1062839,
  'type': 'added'},
 {'path': 'raw/dt=2023-05-25/csv/part-00000-8b364c80-42c7-4e41-ac47-62f0dcbd0fd4-c000.csv',
  'path_type': 'object',
  'size_bytes': 1062839,
  'type': 'added'}]

In [22]:
lakefs.commits.commit(repository=repo_name,
                      branch=staging_branch,
                      commit_creation=CommitCreation(
                          message="netflix movie data copied to staging area (today's partition)")
                     )

{'committer': 'everything-bagel',
 'creation_date': 1685029547,
 'id': '63137ec9334022ee755f3ee416896e15799d3443405b0dcf0f7b2038fb0a1733',
 'message': "netflix movie data copied to staging area (today's partition)",
 'meta_range_id': '',
 'metadata': {},
 'parents': ['3fcdef0716c0ade9ca876136343c93b814ceb670bf87378c6d4b67470e9bb277']}

## Data Exploration and Cleaning in staging area (branch)

In [23]:
movies_df = spark.read.option("header","true").csv(csv_path)
df_columns=movies_df.columns


In [24]:
print(movies_df.count())
print(movies_df.printSchema())


8791
root
 |-- show_id: string (nullable = true)
 |-- type: string (nullable = true)
 |-- title: string (nullable = true)
 |-- director: string (nullable = true)
 |-- country: string (nullable = true)
 |-- date_added: string (nullable = true)
 |-- release_year: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- duration: string (nullable = true)
 |-- listed_in: string (nullable = true)

None


In [25]:
movies_df.show(10)

+-------+-------+--------------------+-------------------+--------------+----------+------------+------+---------+--------------------+
|show_id|   type|               title|           director|       country|date_added|release_year|rating| duration|           listed_in|
+-------+-------+--------------------+-------------------+--------------+----------+------------+------+---------+--------------------+
|     s1|  Movie|Dick Johnson Is Dead|    Kirsten Johnson| United States| 9/25/2021|        2020| PG-13|   90 min|       Documentaries|
|     s3|TV Show|           Ganglands|    Julien Leclercq|        France| 9/24/2021|        2021| TV-MA| 1 Season|Crime TV Shows, I...|
|     s6|TV Show|       Midnight Mass|      Mike Flanagan| United States| 9/24/2021|        2021| TV-MA| 1 Season|TV Dramas, TV Hor...|
|    s14|  Movie|Confessions of an...|      Bruno Garotti|        Brazil| 9/22/2021|        2021| TV-PG|   91 min|Children & Family...|
|     s8|  Movie|             Sankofa|       Hai

In [31]:
#movies_df = 
movies_df.sample(False,0.1,0)

DataFrame[show_id: string, type: string, title: string, director: string, country: string, date_added: string, release_year: string, rating: string, duration: string, listed_in: string]

## Null checks

In [32]:
from pyspark.sql.functions import col,isnan,when,count

In [33]:
movies_df.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in df_columns]).show()


+-------+----+-----+--------+-------+----------+------------+------+--------+---------+
|show_id|type|title|director|country|date_added|release_year|rating|duration|listed_in|
+-------+----+-----+--------+-------+----------+------------+------+--------+---------+
|      0|   0|    0|       0|      0|         0|           0|     0|       0|        0|
+-------+----+-----+--------+-------+----------+------------+------+--------+---------+



In [34]:
movies_df = movies_df.na.drop("any")

In [35]:
movies_df.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in df_columns]).show()


+-------+----+-----+--------+-------+----------+------------+------+--------+---------+
|show_id|type|title|director|country|date_added|release_year|rating|duration|listed_in|
+-------+----+-----+--------+-------+----------+------------+------+--------+---------+
|      0|   0|    0|       0|      0|         0|           0|     0|       0|        0|
+-------+----+-----+--------+-------+----------+------------+------+--------+---------+



## Writing Transformed Parquet files to staging area

In [36]:
movies_df.write.option("header",True)\
        .partitionBy("country")\
        .mode("append")\
        .parquet(f"{staging_long_path}/analytics/movies-by-country-parquet")
    

### View uncommitted changes and clean up the files not needed

Go to the lakeFS UI to inspect the uncommitted changes, e.g. http://localhost:8000/repositories/example/changes?ref=staging-area&prefix=analytics%2Fmovies-by-country-parquet%2F

## Commit the changes to staging 

In [37]:
lakefs.commits.commit(repository=repo_name,
                      branch=staging_branch,
                      commit_creation=CommitCreation(
                          message='loaded paritioned movies parquet to staging area'))


{'committer': 'everything-bagel',
 'creation_date': 1685029917,
 'id': '119efd6e663f2c9eee25f25c30b3d5f17051dfb8855bb86b56a200239523a1ec',
 'message': 'loaded paritioned movies parquet to staging area',
 'meta_range_id': '',
 'metadata': {},
 'parents': ['63137ec9334022ee755f3ee416896e15799d3443405b0dcf0f7b2038fb0a1733']}

## Merging Daily Data (Parquet files) to Prod

In [38]:
lakefs.refs.merge_into_branch(repository=repo_name, 
                              source_ref=staging_branch, 
                              destination_branch=prod_branch)


{'reference': 'dde63e3495ffb74c4fa4ca79ad059d0e452bd9fd97a340d79916ac0a5e7d57c0',
 'summary': {'added': 0, 'changed': 0, 'conflict': 0, 'removed': 0}}