# Use Case: Reprocess and Backfill Data with new ETL logic

## Create branch

In [None]:
# Describe the new branch: named reprocessBranch, created from mainBranch.
branchSpec = BranchCreation(name=reprocessBranch, source=mainBranch)

# Ask lakeFS to create the branch in the current repository.
lakefs.branches.create_branch(repository=repo.id, branch_creation=branchSpec)

print(f"🟩 Created {reprocessBranch} branch from main branch")

## Read data

In [None]:
print(f"\n🟩 Reading data from {reprocessBranch} branch")
dataPath = f"s3a://{repo.id}/{reprocessBranch}/{processedFileName}"

# Load the processed CSV with the explicit schema, then discard the two
# derived columns; they are recomputed in the next step.
df = (
    spark.read.format("csv")
    .schema(processedDataFileSchema)
    .load(dataPath)
    .drop("Total_Sales", "Average_Sales_per_Product_Category")
)
df.show()

## Fix ETL logic

In [None]:
print("🟩 Processed data with correct value for Average field")

# Product-category sales columns that feed both derived fields below.
salesColumns = ['Apparel_Sales', 'Books_Sales', 'Electronics_Sales', 'Furniture_Sales', 'Toys_Sales']

# Calculate total: add the category columns left to right.
totalSales = df[salesColumns[0]]
for columnName in salesColumns[1:]:
    totalSales = totalSales + df[columnName]
df = df.withColumn('Total_Sales', totalSales)

# Calculate average: divide the total by the number of category columns (5).
# Taking the divisor from the column list keeps it in step with the columns
# that were actually summed, instead of relying on a hard-coded constant.
df = df.withColumn('Average_Sales_per_Product_Category', df.Total_Sales / len(salesColumns))
# Cache the recomputed DataFrame before displaying it.
df.cache()
df.show()

## Overwrite processed data

In [None]:
newDataPath = f"s3a://{repo.id}/{reprocessBranch}/{processedFileName}"

# Write the DataFrame as CSV to the processed-file path on the reprocess
# branch, replacing whatever is already stored there.
csvWriter = df.write.format("csv").mode("overwrite")
csvWriter.save(newDataPath)

print(f"🟩 Overwrote processed data to {reprocessBranch} branch")

## Commit changes and attach new Git code URL

In [None]:
gitURL = 'https://github.com/treeverse/lakeFS-samples/blob/main/03-apache-spark-python-demo/Airflow/etl_task2_1.py'

# Commit payload: the message plus the Git URL recorded as commit metadata.
commitDetails = CommitCreation(
    message='Fixed ETL job',
    metadata={'Git URL': gitURL},
)

# Commit the changes on the reprocess branch.
lakefs.commits.commit(
    repository=repo.id,
    branch=reprocessBranch,
    commit_creation=commitDetails,
)

print(f"\n🟩 Committed {reprocessBranch} branch and added Git URL for the new ETL logic as a metadata")
print(f"Example Git URL: {gitURL}")

## Diff between the reprocess branch and the main branch

In [None]:
print(f"\n🟩 Diff between {reprocessBranch} branch and the main branch")

# Compare main (left) against the reprocess branch (right).
branchDiff = lakefs.refs.diff_refs(
    repository=repo.id,
    left_ref=mainBranch,
    right_ref=reprocessBranch,
)

# One table row per diff entry: path, path type, size in bytes, change type.
results = [
    [entry.path, entry.path_type, entry.size_bytes, entry.type]
    for entry in branchDiff.results
]

from tabulate import tabulate
diffTable = tabulate(results, headers=['Path','Path Type','Size(Bytes)','Type'])
print(diffTable)

print("\n🟩 🟩 Process finished 🟩 🟩")