<img src="./images/logo.svg" alt="lakeFS logo" width="300"/>

# Creating Dev-Test environments with lakeFS branches

## Config

**_If you're not using the provided lakeFS server and MinIO storage then change these values to match your environment_**

### lakeFS endpoint and credentials

In [None]:
# Endpoint and key pair for the lakeFS server. Later cells reuse these values
# for the LAKECTL_* environment variables and for Spark's S3A settings.
lakefsEndPoint = 'http://lakefs:8000' # e.g. 'https://username.aws_region_name.lakefscloud.io' 
lakefsAccessKey = 'AKIAIOSFOLKFSSAMPLES'
lakefsSecretKey = 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY'

### Object Storage

In [None]:
# Base object-store URI. The repository's storage namespace is built from it as
# f"{storageNamespace}/{repo_name}" when the repository is created.
storageNamespace = 's3://example' # e.g. "s3://bucket"

---

## Setup

**(you shouldn't need to change anything in this section, just run it)**

In [None]:
# Name of the lakeFS repository used throughout this notebook; it also forms
# the first path segment of the s3a:// URIs handed to Spark.
repo_name = "netflix"

### Versioning Information

In [None]:
# Branch names for the three stages of the flow: landing, staging, production.
ingest_branch = "ingress-landing-area"
staging_branch = "staging-area"
prod_branch = "main"  # also passed as the repository's default branch

### Import libraries

In [None]:
import os
import lakefs
from assets.lakefs_demo import print_commit, print_diff
from datetime import date, time
from pyspark.sql.functions import col,isnan,when,count

### Set environment variables

In [None]:
# Publish the endpoint and key pair as LAKECTL_* environment variables.
# Presumably the lakeFS client, which is constructed without arguments in the
# verification cell, reads its configuration from these — confirm in SDK docs.
os.environ.update(
    {
        "LAKECTL_SERVER_ENDPOINT_URL": lakefsEndPoint,
        "LAKECTL_CREDENTIALS_ACCESS_KEY_ID": lakefsAccessKey,
        "LAKECTL_CREDENTIALS_SECRET_ACCESS_KEY": lakefsSecretKey,
    }
)

#### Verify lakeFS credentials by getting lakeFS version

In [None]:
# Smoke-test the configured endpoint and credentials by asking the server for
# its version.
print("Verifying lakeFS credentials…")
try:
    v = lakefs.client.Client().version
except Exception as err:
    # Catch Exception instead of using a bare `except:` so KeyboardInterrupt and
    # SystemExit still propagate, and report the cause so the failure can be
    # diagnosed (wrong endpoint, bad keys, server down, ...).
    print(f"🛑 failed to get lakeFS version: {err}")
else:
    print(f"…✅lakeFS credentials verified\n\nℹ️lakeFS version {v}")

### Define lakeFS Repository

In [None]:
# Create the repository under the configured storage namespace (exist_ok=True
# is passed so an already-existing repository is accepted), then keep a handle
# to the production branch for the diffs and the final merge.
repo = lakefs.Repository(repo_name).create(
    storage_namespace=f"{storageNamespace}/{repo_name}",
    default_branch=prod_branch,
    exist_ok=True,
)
branchProd = repo.branch(prod_branch)
print(repo)

### Set up Spark

In [None]:
from pyspark.sql import SparkSession

# Hadoop S3A settings: s3:// and s3a:// paths are served by S3AFileSystem and
# directed at the lakeFS endpoint with the lakeFS key pair.
s3a_settings = {
    "spark.hadoop.fs.s3.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",
    "spark.hadoop.fs.s3a.endpoint": lakefsEndPoint,
    "spark.hadoop.fs.s3a.path.style.access": "true",
    "spark.hadoop.fs.s3a.access.key": lakefsAccessKey,
    "spark.hadoop.fs.s3a.secret.key": lakefsSecretKey,
}

# Apply the settings in declaration order, then build (or reuse) the session.
session_builder = SparkSession.builder.appName("lakeFS / Jupyter")
for setting, value in s3a_settings.items():
    session_builder = session_builder.config(setting, value)

spark = session_builder.getOrCreate()
spark.sparkContext.setLogLevel("INFO")

# Last expression of the cell: the notebook renders the session object.
spark

## Creating Ingest and Staging branches

In [None]:
# Create the landing-area branch from production (exist_ok=True is passed so an
# existing branch is accepted) and print the commit id it currently points at.
branchIngest = repo.branch(ingest_branch).create(
    source_reference=prod_branch,
    exist_ok=True,
)
print(f"{ingest_branch} ref:", branchIngest.get_commit().id)

In [None]:
# Create the staging branch from production (exist_ok=True is passed so an
# existing branch is accepted) and print the commit id it currently points at.
branchStaging = repo.branch(staging_branch).create(
    source_reference=prod_branch,
    exist_ok=True,
)
print(f"{staging_branch} ref:", branchStaging.get_commit().id)

In [None]:
# Print every branch the repository reports, one per line.
for existing_branch in repo.branches():
    print(existing_branch)

## Load some sample data about Netflix movies

The daily partition lands in the ingress path (branch).

In [None]:
# File name of the sample data set, and the date-partitioned object path
# (dt=YYYY-MM-DD/<file>) it is stored under on the ingest branch.
ingest_data = "movies.csv"
ingest_path = f"dt={date.today().isoformat()}/{ingest_data}"

# Echo the computed path as the cell output.
ingest_path

In [None]:
# Read the local sample file and upload it to the ingest branch at ingest_path.
# The `with` block closes the file handle deterministically; the previous
# open(...).read() left the handle open until garbage collection.
with open(f'/data/{ingest_data}', 'r') as source_file:
    contentToUpload = source_file.read()
print(branchIngest.object(ingest_path).upload(data=contentToUpload, mode='wb', pre_sign=False))

In [None]:
# Commit the uploaded partition on the ingest branch and print the commit.
ref = branchIngest.commit(
    message="netflix movie data arrived at landing area (today's partition)",
)
print_commit(ref.get_commit())

In [None]:
# Compare the production branch against the ingest branch and print the
# resulting differences with the demo helper.
diff = branchProd.diff(other_ref=branchIngest)
print_diff(diff)

## Copying daily partition from ingress to staging area (branch)

In [None]:
# s3a:// prefix of the form <repository>/<branch> for the staging branch; Spark
# resolves it against the endpoint configured in the Spark session.
staging_long_path = f"s3a://{repo_name}/{staging_branch}"
# Echo the prefix as the cell output.
staging_long_path

In [None]:
# Destination for today's raw CSV partition on the staging branch
# (<staging prefix>/raw/dt=YYYY-MM-DD/csv).
csv_path = f"{staging_long_path}/raw/dt={date.today().isoformat()}/csv"

# Echo the destination as the cell output.
csv_path

In [None]:
# Load the uploaded CSV from the ingest branch, treating the first row as the
# header.
ingest_csv_uri = f"s3a://{repo_name}/{ingest_branch}/{ingest_path}"
movies_df = spark.read.option("header", "true").csv(ingest_csv_uri)

In [None]:
# Copy the partition to the staging branch as CSV with a header row. Append
# mode adds to whatever is already stored under csv_path.
(
    movies_df.write
    .option("header", True)
    .mode("append")
    .csv(csv_path)
)

In [None]:
# Commit the copied CSV partition on the staging branch and print the commit.
ref = branchStaging.commit(
    message="netflix movie data copied to staging area (today's partition)",
)
print_commit(ref.get_commit())

In [None]:
# Compare the production branch against the staging branch and print the
# resulting differences with the demo helper.
diff = branchProd.diff(other_ref=branchStaging)
print_diff(diff)

## Data Exploration and Cleaning in staging area (branch)

In [None]:
# Re-read the partition from the staging branch (header row as column names)
# and remember the column list for the null checks that follow.
csv_reader = spark.read.option("header", "true")
movies_df = csv_reader.csv(csv_path)
df_columns = movies_df.columns

In [None]:
# Row count, followed by the schema.
print(movies_df.count())
# printSchema() writes the schema to stdout itself and returns None, so wrapping
# it in print() emitted a stray "None" after the schema.
movies_df.printSchema()

In [None]:
# Print the first 10 rows as a table.
movies_df.show(10)

In [None]:
# Seeded sample of roughly 10% of the rows, without replacement. As the cell's
# only expression, the resulting DataFrame object is what the notebook shows.
movies_df.sample(withReplacement=False, fraction=0.1, seed=0)

## Null checks

In [None]:
# For each column, count the rows whose value is NaN or NULL, and show the
# counts as a single-row table.
null_or_nan_counts = [
    count(when(isnan(name) | col(name).isNull(), name)).alias(name)
    for name in df_columns
]
movies_df.select(null_or_nan_counts).show()

In [None]:
# Discard every row that has a null in at least one column.
movies_df = movies_df.na.drop(how="any")

In [None]:
# Repeat the per-column NaN/NULL count on the cleaned DataFrame so the effect
# of the drop can be checked.
remaining_null_counts = [
    count(when(isnan(name) | col(name).isNull(), name)).alias(name)
    for name in df_columns
]
movies_df.select(remaining_null_counts).show()

## Writing Transformed Parquet files to staging area

In [None]:
# Write the cleaned data to the staging branch as Parquet, partitioned by
# country. Append mode adds to whatever is already stored at the target.
# NOTE(review): "header" is a CSV option; presumably the Parquet writer ignores
# it — confirm before removing.
parquet_target = f"{staging_long_path}/analytics/movies-by-country-parquet"
(
    movies_df.write
    .option("header", True)
    .partitionBy("country")
    .mode("append")
    .parquet(parquet_target)
)

### View uncommitted changes and clean up the files not needed

Go to the lakeFS UI to inspect the uncommitted changes, e.g. http://localhost:8000/repositories/netflix/changes?ref=staging-area&prefix=analytics%2Fmovies-by-country-parquet%2F

## Commit the changes to staging 

In [None]:
# Commit the Parquet output on the staging branch and print the commit.
# NOTE(review): "paritioned" in the commit message is a typo for "partitioned";
# left unchanged here because it is recorded in the commit at runtime.
ref = branchStaging.commit(message='loaded paritioned movies parquet to staging area')
print_commit(ref.get_commit())

## Merging Daily Data (Parquet files) to Prod

In [None]:
# Merge the staging branch into production and print what the merge call
# returns.
res = branchStaging.merge_into(branchProd)
print(res)

## More Questions?

###### Join the lakeFS Slack group - https://lakefs.io/slack