# Use Case: Reprocess and Backfill Data with new ETL logic

## Setup Task: Change your lakeFS credentials

In [None]:
# lakeFS connection settings. The angle-bracket values are placeholders:
# replace them with your own access key, secret key and endpoint URL.
# NOTE(review): presumably read by ./ReprocessData/Setup.ipynb to build the
# lakeFS client — confirm there. Avoid committing real credentials.
lakefsAccessKey = '<lakeFS Access Key>'
lakefsSecretKey = '<lakeFS Secret Key>'
lakefsEndPoint = '<lakeFS Endpoint URL>' # e.g. 'https://username.aws_region_name.lakefscloud.io'

## Setup Task: Storage Information
#### Change the Storage Namespace to a location in the bucket you’ve configured. The storage namespace is a location in the underlying storage where data for this repository will be stored.

In [None]:
# Location in the underlying object store for this repository's data; it is
# passed as `storage_namespace` when the repository is created.
storageNamespace = 's3://<S3 Bucket Name>/' # e.g. "s3://username-lakefs-cloud/"

## Setup Task: Versioning Information

In [None]:
# Default branch of the repository; also the destination of the final merge.
mainBranch = "main"
# NOTE(review): not referenced in the visible part of this notebook —
# presumably used by ETL.ipynb; confirm.
ingestBranch = "ingest"
# Input file name; reassigned later before the second ETL run.
fileName = "lakefs_test.csv"
# Object name of the processed output that the display cells read per branch.
processedFileName = "lakefs_test_processed.csv"

## Run additional [Setup](./ReprocessData/Setup.ipynb) tasks

In [None]:
# Run the Setup notebook from this notebook.
# NOTE(review): later cells use `client`, `models`, `spark` and
# `processedDataFileSchema`, none of which are defined in the visible cells —
# presumably Setup.ipynb defines them; confirm.
%run ./ReprocessData/Setup.ipynb

## You can change the lakeFS repo name (use an existing repo or provide a new repo name)

In [None]:
# Name of the lakeFS repository: used for repository creation, for the merge
# call, and as the first component of the s3a:// data paths.
repo = "my-repo"

## If the above-mentioned repo already exists on your lakeFS server, you can skip the following step; otherwise, create a new repo:

In [None]:
# Create the repository `repo`, backed by `storageNamespace`, with
# `mainBranch` as its default branch.
client.repositories.create_repository(
    repository_creation=models.RepositoryCreation(
        name=repo,
        storage_namespace=storageNamespace,
        default_branch=mainBranch,
    ),
)

## The [ETL](./ReprocessData/ETL.ipynb) job normally runs as a batch job, but we run it manually here for the demo

### It will take around a minute to run this ETL job

In [None]:
# First run of the ETL notebook.
# NOTE(review): presumably it ingests `fileName` and writes
# `processedFileName` — confirm in ETL.ipynb.
%run ./ReprocessData/ETL.ipynb

# Reprocessing Starts

## Set the name for reprocessing branch and run [Reprocessing](./ReprocessData/Reprocessing.ipynb) job

In [None]:
# Branch name for the reprocessing run. It is assigned before the %run so it
# is already defined when the Reprocessing notebook executes.
# NOTE(review): presumably Reprocessing.ipynb reads `reprocessBranch` — confirm.
reprocessBranch = "new-logic"
%run ./ReprocessData/Reprocessing.ipynb

## While the ETL logic is being fixed, the old ETL job is still running in parallel.

## Received new data file

In [None]:
# Point `fileName` at the newly arrived data file before the next ETL run.
# NOTE(review): presumably ETL.ipynb reads `fileName` — confirm.
fileName = "lakefs_test_new.csv"

## Run [ETL](./ReprocessData/ETL.ipynb) job again

In [None]:
# Second ETL run, executed after `fileName` was reassigned to the new data file.
%run ./ReprocessData/ETL.ipynb

## Now the reprocessing branch is behind the main branch in terms of data

In [None]:
# Show the processed file as it currently exists on the reprocessing branch.
print(f"Processed data on {reprocessBranch} branch")
# Path layout used here: s3a://<repository>/<branch>/<object name>.
dataPath = f"s3a://{repo}/{reprocessBranch}/{processedFileName}"

# Read the CSV with an explicit schema instead of inferring one, then print it.
# NOTE(review): `spark` and `processedDataFileSchema` are not defined in the
# visible cells — presumably provided by Setup.ipynb; confirm.
df = spark.read.format("csv").schema(processedDataFileSchema).load(dataPath)
df.show()

In [None]:
# Show the processed file on the main branch for comparison.
# The message is built from `mainBranch` so it stays in step with the path
# below if the branch name is changed in the setup cell.
print(f"Processed data on {mainBranch} branch")
# Path layout used here: s3a://<repository>/<branch>/<object name>.
dataPath = f"s3a://{repo}/{mainBranch}/{processedFileName}"

# Read the CSV with an explicit schema instead of inferring one, then print it.
# NOTE(review): `spark` and `processedDataFileSchema` are not defined in the
# visible cells — presumably provided by Setup.ipynb; confirm.
df = spark.read.format("csv").schema(processedDataFileSchema).load(dataPath)
df.show()

## Once the ETL logic is fixed, pause the old ETL job to deploy the new ETL logic

## Set the name for "Backfill and Deploy" branch

In [None]:
# Name of the branch used for the backfill run; it is later the source ref of
# the merge into the main branch.
backfillAndDeployBranch = "backfill-and-deploy"

## Run [Reprocessing](./ReprocessData/Reprocessing.ipynb) job again on "Backfill and Deploy" branch

In [None]:
# Re-run the Reprocessing notebook with `reprocessBranch` repointed at the
# backfill branch. The assignment must come before the %run.
# NOTE(review): presumably Reprocessing.ipynb reads `reprocessBranch` — confirm.
reprocessBranch = backfillAndDeployBranch
%run ./ReprocessData/Reprocessing.ipynb

## Now the "Backfill and Deploy" branch has the same data as the main branch and the correct ETL logic

In [None]:
# Show the processed file as it exists on the backfill-and-deploy branch.
print(f"Processed data on {backfillAndDeployBranch} branch")
# Path layout used here: s3a://<repository>/<branch>/<object name>.
dataPath = f"s3a://{repo}/{backfillAndDeployBranch}/{processedFileName}"

# Read the CSV with an explicit schema instead of inferring one, then print it.
# NOTE(review): `spark` and `processedDataFileSchema` are not defined in the
# visible cells — presumably provided by Setup.ipynb; confirm.
df = spark.read.format("csv").schema(processedDataFileSchema).load(dataPath)
df.show()

In [None]:
# Show the processed file on the main branch before the merge, for comparison
# with the backfill branch.
# The message is built from `mainBranch` so it stays in step with the path
# below if the branch name is changed in the setup cell.
print(f"Processed data on {mainBranch} branch")
# Path layout used here: s3a://<repository>/<branch>/<object name>.
dataPath = f"s3a://{repo}/{mainBranch}/{processedFileName}"

# Read the CSV with an explicit schema instead of inferring one, then print it.
# NOTE(review): `spark` and `processedDataFileSchema` are not defined in the
# visible cells — presumably provided by Setup.ipynb; confirm.
df = spark.read.format("csv").schema(processedDataFileSchema).load(dataPath)
df.show()

## Merge "Backfill and Deploy" branch to Main branch

In [None]:
# Merge the backfill branch into the main branch of the repository.
client.refs.merge_into_branch(
    repository=repo,
    source_ref=backfillAndDeployBranch,
    destination_branch=mainBranch,
)

# Reprocessing and Backfill Complete

## Verify data on Main branch

In [None]:
# Show the processed file on the main branch after the merge.
# The message is built from `mainBranch` so it stays in step with the path
# below if the branch name is changed in the setup cell.
print(f"Processed data on {mainBranch} branch")
# Path layout used here: s3a://<repository>/<branch>/<object name>.
dataPath = f"s3a://{repo}/{mainBranch}/{processedFileName}"

# Read the CSV with an explicit schema instead of inferring one, then print it.
# NOTE(review): `spark` and `processedDataFileSchema` are not defined in the
# visible cells — presumably provided by Setup.ipynb; confirm.
df = spark.read.format("csv").schema(processedDataFileSchema).load(dataPath)
df.show()

## Now you can schedule the new ETL job