<img src="./images/logo.svg" alt="lakeFS logo" width="300"/>

# Creating Dev-Test environments with lakeFS branches

## Config

### lakeFS endpoint and credentials

Change these if you are using a lakeFS server other than the one provided in the samples repo.

In [None]:
# Endpoint of the lakeFS server. The same value is used for the lakeFS API
# client and as the S3A endpoint in the Spark session configuration.
lakefsEndPoint = 'http://lakefs:8000' # e.g. 'https://username.aws_region_name.lakefscloud.io'
# Access key / secret key pair used to authenticate against lakeFS.
lakefsAccessKey = 'AKIAIOSFODNN7EXAMPLE'
lakefsSecretKey = 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY'

### Storage Information

If you're not using the lakeFS instance provided by the samples repo, change the Storage Namespace to a location in the bucket you’ve configured. The storage namespace is a location in the underlying storage where data for this repository will be stored.

In [None]:
# Base location in the underlying object store. When the repository has to be
# created, its storage namespace is "<storageNamespace>/<repo_name>".
storageNamespace = 's3://example' # e.g. "s3://bucket"

In [None]:
# Name of the lakeFS repository used by every cell in this notebook.
repo_name = "netflix"

## Setup

### Configuring lakeFSClient and Spark

In [None]:
import lakefs_client
from lakefs_client.models import *
from lakefs_client.client import LakeFSClient

# Client configuration: the lakeFS access key is passed as the username and
# the secret key as the password.
configuration = lakefs_client.Configuration()
configuration.host = lakefsEndPoint
configuration.username = lakefsAccessKey
configuration.password = lakefsSecretKey

# Client object through which all lakeFS API calls below are made.
lakefs = LakeFSClient(configuration)

### Define lakeFS Repository

_If the repository does not exist yet, the next cell tries to create it under the storage namespace configured above._

In [None]:
import os

from lakefs_client.exceptions import NotFoundException

# Look the repository up and create it when lakeFS reports it as missing.
# On any other API error the process is terminated with os._exit so that the
# remaining cells do not run without a usable repository. `os` is imported
# here because no earlier cell imports it.
try:
    repo = lakefs.repositories.get_repository(repo_name)
    print(f"Found existing repo {repo.id} using storage namespace {repo.storage_namespace}")
except NotFoundException:
    print(f"Repository {repo_name} does not exist, so going to try and create it now.")
    try:
        # New repositories are stored under "<storageNamespace>/<repo_name>".
        repo = lakefs.repositories.create_repository(
            repository_creation=RepositoryCreation(
                name=repo_name,
                storage_namespace=f"{storageNamespace}/{repo_name}",
            )
        )
        print(f"Created new repo {repo.id} using storage namespace {repo.storage_namespace}")
    except lakefs_client.ApiException as e:
        print(f"Error creating repo {repo_name}. Error is {e}")
        os._exit(0)
except lakefs_client.ApiException as e:
    print(f"Error getting repo {repo_name}: {e}")
    os._exit(0)

In [None]:
from pyspark.sql import SparkSession
# Spark session whose S3A filesystem is pointed at the lakeFS endpoint and
# authenticated with the lakeFS key pair; "s3://" URIs are mapped to the S3A
# implementation as well.
spark = (
    SparkSession.builder
    .appName("lakeFS / Jupyter")
    .config("spark.hadoop.fs.s3.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.fs.s3a.endpoint", lakefsEndPoint)
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.access.key", lakefsAccessKey)
    .config("spark.hadoop.fs.s3a.secret.key", lakefsSecretKey)
    .getOrCreate()
)
spark.sparkContext.setLogLevel("INFO")

# Display the session summary as the cell output.
spark

## Creating Ingest and Staging branches

In [None]:
# Branch that receives the newly uploaded raw file.
ingest_branch = "ingress-landing-area"
# Branch where the data is copied, cleaned and transformed.
staging_branch = "staging-area"
# Branch the two branches above are created from, and the target of the final merge.
prod_branch = "main"


In [None]:
# Show the branches of the repository before the new ones are created.
lakefs.branches.list_branches(repository=repo_name)


In [None]:
# Create the ingest branch with the production branch as its source.
ingest_branch_creation = BranchCreation(name=ingest_branch, source=prod_branch)
lakefs.branches.create_branch(repository=repo_name,
                              branch_creation=ingest_branch_creation)

In [None]:
# Create the staging branch with the production branch as its source.
staging_branch_creation = BranchCreation(name=staging_branch, source=prod_branch)
lakefs.branches.create_branch(repository=repo_name,
                              branch_creation=staging_branch_creation)

In [None]:
# List the branches again to confirm that the new ones are present.
lakefs.branches.list_branches(repository=repo_name)


## Load some sample data about Netflix movies

The daily partition lands in the ingress path (on the ingest branch).

In [None]:
from datetime import date, time

In [None]:
# File name of the sample data set.
ingest_data = "movies.csv"

# Object path partitioned by today's date, e.g. "dt=2024-01-31/movies.csv".
# Formatting a date in an f-string yields the same ISO text as str(date).
ingest_path = f"dt={date.today()}/{ingest_data}"
ingest_path


In [None]:
# Upload the local sample file to the ingest branch under the partitioned path.
# The file is opened in binary mode and handed to the client as-is.
with open(f'/data/{ingest_data}', 'rb') as movies_file:
    lakefs.objects.upload_object(
        repository=repo_name,
        branch=ingest_branch,
        path=ingest_path,
        content=movies_file,
    )


In [None]:
# Show the diff entries of the ingest branch before committing.
ingest_diff = lakefs.branches.diff_branch(repository=repo_name, branch=ingest_branch)
ingest_diff.results


In [None]:
# Commit the uploaded file on the ingest branch.
ingest_commit = CommitCreation(
    message="netflix movie data arrived at landing area (today's partition)"
)
lakefs.commits.commit(repository=repo_name,
                      branch=ingest_branch,
                      commit_creation=ingest_commit)

## Copying daily partition from ingress to staging area (branch)

In [None]:
# Root URI of the staging branch for Spark: s3a://<repository>/<branch>.
staging_long_path = "s3a://{}/{}".format(repo_name, staging_branch)
staging_long_path

In [None]:
# Location of today's raw CSV partition on the staging branch.
csv_path = f"{staging_long_path}/raw/dt={date.today()}/csv"
csv_path

In [None]:
# Read the uploaded CSV from the ingest branch; the first row is the header.
ingest_uri = f"s3a://{repo_name}/{ingest_branch}/{ingest_path}"
movies_df = spark.read.option("header", "true").csv(ingest_uri)

In [None]:
# Copy the data to today's raw partition on the staging branch. Append mode
# adds to whatever is already stored at csv_path.
(
    movies_df.write
    .option("header", True)
    .mode("append")
    .csv(csv_path)
)
    

In [None]:
# Show the diff entries of the staging branch before committing.
staging_diff = lakefs.branches.diff_branch(repository=repo_name, branch=staging_branch)
staging_diff.results


In [None]:
# Commit the copied CSV partition on the staging branch.
staging_csv_commit = CommitCreation(
    message="netflix movie data copied to staging area (today's partition)"
)
lakefs.commits.commit(repository=repo_name,
                      branch=staging_branch,
                      commit_creation=staging_csv_commit)

## Data Exploration and Cleaning in staging area (branch)

In [None]:
# Re-read the staged CSV partition (header row included) and keep the column
# names for the null checks further down.
staging_reader = spark.read.option("header", "true")
movies_df = staging_reader.csv(csv_path)
df_columns = movies_df.columns


In [None]:
# Number of rows in the staged data.
print(movies_df.count())
# printSchema() prints the schema itself and returns None, so wrapping it in
# print() would add a stray "None" line to the output.
movies_df.printSchema()


In [None]:
# Print the first 10 rows.
movies_df.show(n=10)

In [None]:
# Take a sample with fraction 0.1, without replacement, seed 0. The result is
# not assigned back, so movies_df itself is left unchanged.
movies_df.sample(withReplacement=False, fraction=0.1, seed=0)

## Null checks

In [None]:
from pyspark.sql.functions import col,isnan,when,count

In [None]:
# One aggregate per column: the number of rows where the value is NaN or NULL.
null_counts = [
    count(when(isnan(name) | col(name).isNull(), name)).alias(name)
    for name in df_columns
]
movies_df.select(null_counts).show()


In [None]:
# Drop every row that has a null in any column.
movies_df = movies_df.dropna(how="any")

In [None]:
# Recompute the per-column NaN/NULL counts after the rows were dropped.
remaining_null_counts = [
    count(when(isnan(name) | col(name).isNull(), name)).alias(name)
    for name in df_columns
]
movies_df.select(remaining_null_counts).show()


## Writing Transformed Parquet files to staging area

In [None]:
# Write the cleaned data as Parquet, partitioned by country, to the analytics
# area of the staging branch. Append mode adds to any existing output.
parquet_target = f"{staging_long_path}/analytics/movies-by-country-parquet"
(
    movies_df.write
    .option("header", True)
    .partitionBy("country")
    .mode("append")
    .parquet(parquet_target)
)
    

### View uncommitted changes and clean up the files not needed

Go to the lakeFS UI to inspect the uncommitted changes, e.g. http://localhost:8000/repositories/netflix/changes?ref=staging-area&prefix=analytics%2Fmovies-by-country-parquet%2F

## Commit the changes to staging 

In [None]:
# Commit the partitioned Parquet output on the staging branch. The commit
# message is stored in the repository history, so its spelling
# ("partitioned") is corrected here.
lakefs.commits.commit(repository=repo_name,
                      branch=staging_branch,
                      commit_creation=CommitCreation(
                          message='loaded partitioned movies parquet to staging area'))


## Merging Daily Data (Parquet files) to Prod

In [None]:
# Merge the staging branch into the production branch.
lakefs.refs.merge_into_branch(
    repository=repo_name,
    source_ref=staging_branch,
    destination_branch=prod_branch,
)
