# Use Case: Reprocess and Backfill Data with new ETL logic

## Create Ingestion Branch

In [None]:
# Build a timestamp suffix so that every run gets its own ingestion branch name.
branch_date_time = format(datetime.datetime.now(), "_%Y-%m-%d_%H-%M-%S")
latestIngestBranch = ingestBranch + branch_date_time

# Create the branch from mainBranch; exist_ok=True is passed so an already
# existing branch of the same name is accepted.
branchIngest = repo.branch(latestIngestBranch).create(
    source_reference=mainBranch,
    exist_ok=True,
)

print(f"🟩 Created ingestion branch: {latestIngestBranch}")

## Ingest data file

In [None]:
# Read the local data file inside a `with` block so the file handle is closed
# as soon as the content has been read (a bare open(...).read() leaves the
# handle open until it is garbage-collected).
with open(f"/data/{fileName}", 'r') as dataFile:
    contentToUpload = dataFile.read()

# Upload the content to the ingestion branch under the same object name.
branchIngest.object(fileName).upload(data=contentToUpload, mode='wb', pre_sign=False)

print("🟩 Ingested data file: " + fileName)

## Read data

In [None]:
print("\n🟩 Reading data from ingestion branch")

# Address the uploaded object as s3a://<repository id>/<branch>/<object>.
dataPath = f"s3a://{repo.id}/{latestIngestBranch}/{fileName}"

# Load the CSV with the explicit schema, then display the rows.
csvReader = spark.read.format("csv").schema(dataFileSchema)
df = csvReader.load(dataPath)
df.show()

## ETL Logic

### Calculate the average (note: the calculation is wrong — it divides by 4 instead of 5)

In [None]:
print("🟩 Processed data with wrong value for Average field. Average value is Total divided 4 instead of dividing by 5")

# Total sales: sum of the five per-category sales columns.
totalSales = (
    df['Apparel_Sales']
    + df['Books_Sales']
    + df['Electronics_Sales']
    + df['Furniture_Sales']
    + df['Toys_Sales']
)
df = df.withColumn('Total_Sales', totalSales)

# Average per category: the divisor is 4 although five categories are summed.
# This cell's heading and message describe the result as the wrong average,
# so the divisor is left as it is here.
df = df.withColumn('Average_Sales_per_Product_Category', df['Total_Sales'] / 4)
df.show()

## Append processed data

In [None]:
# Destination of the processed output on the ingestion branch.
newDataPath = f"s3a://{repo.id}/{latestIngestBranch}/{processedFileName}"

# Write as CSV in append mode, so output already present at the path is kept.
csvWriter = df.write.format("csv").mode("append")
csvWriter.save(newDataPath)

print("🟩 Appended processed data to ingestion branch")

## Commit changes and attach Git code URL

In [None]:
# URL of the ETL code, recorded on the commit as metadata.
gitURL = 'https://github.com/treeverse/lakeFS-samples/blob/main/01_standalone_examples/airflow-01/airflow/New_DAG/etl_task1.py'
commitMetadata = {'Git URL': gitURL}

# Commit the changes on the ingestion branch together with the metadata.
branchIngest.commit(
    message='Committed by ETL job',
    metadata=commitMetadata,
)

print("\n🟩 Committed ingestion branch and added Git URL for the ETL logic as a metadata")
print(f"Example Git URL: {gitURL}")

## Merge ingestion branch into main branch

In [None]:
# Merge the ingestion branch into the main branch.
branchIngest.merge_into(branchMain)

# Both status lines are emitted by one call; sep="\n" puts each on its own line.
print(
    "\n🟩 Merged ingestion branch to main branch",
    "\n🟩 🟩 ETL job finished 🟩 🟩 ",
    sep="\n",
)