<img src="https://lakefs.io/wp-content/uploads/2022/09/lakeFS-Logo.svg" alt="lakeFS logo" width=250/>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<img src="https://www.apache.org/logos/res/kafka/default.png" alt="Apache Kafka" width=200/>  

## lakeFS ❤️ Apache Kafka - an example using streaming data

# Config

**_If you're not using the provided lakeFS server and MinIO storage then change these values to match your environment_**

### lakeFS endpoint and credentials

In [None]:
# URL of the lakeFS server, e.g. 'https://username.aws_region_name.lakefscloud.io'
# when using lakeFS Cloud instead of the provided server.
lakefsEndPoint = "http://lakefs:8000"

# Access key / secret key pair used to authenticate against lakeFS.
# NOTE(review): these look like the sample credentials of the provided demo
# server - replace them for any other environment.
lakefsAccessKey = "AKIAIOSFOLKFSSAMPLES"
lakefsSecretKey = "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY"

### Storage Information

In [None]:
# Root storage location under which the repository's storage namespace is
# created, e.g. "s3://bucket".
storageNamespace = "s3://example"

---

# Setup

**(you shouldn't need to change anything in this section, just run it)**

In [None]:
# Name of the lakeFS repository this notebook creates (or reuses). The same
# value is also used as the S3 bucket name in the Kafka S3 sink connector
# config and in the s3a:// paths read by Spark.
repo_name = "kafka-stream-demo"

## Versioning Information 

In [None]:
# Branch used as the repository's default branch and as the source reference
# for the streaming and ingestion branches created later in this notebook.
sourceBranch = "main"

### Import libraries

In [None]:
import os
import lakefs
import datetime
from assets.lakefs_demo import print_commit, print_diff

### Set environment variables

In [None]:
# Export the lakeFS endpoint and credentials as LAKECTL_* environment
# variables in a single update call.
os.environ.update(
    {
        "LAKECTL_SERVER_ENDPOINT_URL": lakefsEndPoint,
        "LAKECTL_CREDENTIALS_ACCESS_KEY_ID": lakefsAccessKey,
        "LAKECTL_CREDENTIALS_SECRET_ACCESS_KEY": lakefsSecretKey,
    }
)

### Verify lakeFS credentials by getting lakeFS version

In [None]:
# Sanity-check the configured endpoint and credentials by asking the lakeFS
# server for its version; report success or failure without aborting the
# notebook.
print("Verifying lakeFS credentials…")
try:
    v = lakefs.client.Client().version
except Exception as err:
    # Catch Exception (not a bare `except:`) so that KeyboardInterrupt and
    # SystemExit still propagate, and show the cause so a wrong endpoint can
    # be told apart from wrong credentials.
    print(f"🛑 failed to get lakeFS version: {err}")
else:
    print(f"…✅lakeFS credentials verified\n\nℹ️lakeFS version {v}")

### Define lakeFS Repository

In [None]:
# Create the demo repository under "<storageNamespace>/<repo_name>";
# exist_ok=True is passed so re-running the cell with an existing repository
# is accepted.
repo = lakefs.Repository(repo_name).create(
    storage_namespace=f"{storageNamespace}/{repo_name}",
    default_branch=sourceBranch,
    exist_ok=True,
)

# Handle to the source branch, used later as the merge destination.
branchMain = repo.branch(sourceBranch)

print(repo)

### Set up Spark

In [None]:
from pyspark.sql import SparkSession

# Hadoop S3A settings that point Spark at the lakeFS endpoint, using the
# lakeFS access key / secret key as the S3 credentials.
_spark_conf = {
    "spark.hadoop.fs.s3.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",
    "spark.hadoop.fs.s3a.endpoint": lakefsEndPoint,
    "spark.hadoop.fs.s3a.path.style.access": "true",
    "spark.hadoop.fs.s3a.access.key": lakefsAccessKey,
    "spark.hadoop.fs.s3a.secret.key": lakefsSecretKey,
}

_builder = SparkSession.builder.appName("Kafka / Jupyter")
for _key, _val in _spark_conf.items():
    _builder = _builder.config(_key, _val)
spark = _builder.getOrCreate()

# Last expression of the cell: lets the notebook display the session.
spark

### Initialize a new Kafka producer

In [None]:
from time import sleep

from kafka import KafkaProducer

# Producer connected to the Kafka broker at lakefs-broker:29092; used later
# to publish the demo messages.
producer = KafkaProducer(bootstrap_servers=["lakefs-broker:29092"])

---

---

# Main demo starts here 🚦 👇🏻

# Ingest Streaming Data

### Create a dirty branch for streaming data

In [None]:
# Create a uniquely named branch ("streaming_<timestamp>") off the source
# branch; the Kafka S3 sink connector writes the raw stream into it.
streaming_branch = "streaming" + datetime.datetime.now().strftime("_%Y%m%dT%H%M%S")
branchStreaming = repo.branch(streaming_branch).create(source_reference=sourceBranch)
# Report the actual source branch rather than a hard-coded "main", so the
# message stays correct if sourceBranch is changed in the setup section.
print(f"Created {streaming_branch} branch from {sourceBranch}")

### Configure Kafka S3 Sink Connector to sink data to streaming branch in lakeFS repo

In [None]:
# Build a curl command line that PUTs a JSON configuration for the connector
# named "test_connector" to the Kafka Connect REST endpoint at connect:8083.
# The configuration sets up an S3 sink connector for the "quickstart" topic
# whose store URL and credentials are the lakeFS endpoint and keys, whose
# bucket name is the repository name, and whose topics directory is
# "<streaming_branch>/ingest".
# The trailing backslashes are Python line continuations inside the string
# literal, so the resulting command is a single line.
# NOTE(review): "flush.size": "100" presumably makes the connector write one
# object per 100 records - confirm against the connector documentation.
# NOTE(review): the values are interpolated into the shell string unescaped;
# acceptable for the fixed demo values, but unsafe for arbitrary input.
command = 'curl -s -X PUT -H  "Content-Type:application/json" http://connect:8083/connectors/test_connector/config \
    -d \'{ \
    "connector.class": "io.confluent.connect.s3.S3SinkConnector", \
    "tasks.max": "1", \
    "key.converter": "org.apache.kafka.connect.storage.StringConverter", \
    "value.converter": "org.apache.kafka.connect.storage.StringConverter", \
    "topics": "quickstart", \
    "topics.dir": "' + streaming_branch + '/ingest", \
    "format.class": "io.confluent.connect.s3.format.json.JsonFormat", \
    "flush.size": "100", \
    "schema.compatibility": "NONE", \
    "s3.bucket.name": "' + repo_name + '", \
    "s3.region": "us-east-1", \
    "storage.class": "io.confluent.connect.s3.storage.S3Storage", \
    "store.url": "' + lakefsEndPoint + '", \
    "aws.access.key.id": "' + lakefsAccessKey + '", \
    "aws.secret.access.key": "' + lakefsSecretKey + '", \
    "partitioner.class": "io.confluent.connect.storage.partitioner.DefaultPartitioner" \
    }\''

# IPython shell escape: run the command string built above in a subshell.
! $command

### Produce some streaming data

In [None]:
# Publish 100 small test messages ("message-0" … "message-99") to the
# "quickstart" topic that the S3 sink connector is subscribed to.
for e in range(100):
    producer.send('quickstart', f"message-{e}".encode('utf-8'))

# send() is asynchronous, so flush first to make sure every message has been
# handed to the broker, and only then pause to give the sink connector time
# to write the data before the next cell reads it. (Previously the sleep ran
# before the flush, so the wait was not guaranteed to follow delivery.)
producer.flush()
sleep(3)

### Read streaming data written to the lakeFS repo

In [None]:
# Location of the objects written for partition 0 of the "quickstart" topic
# on the streaming branch.
dataPath = f"s3a://{repo_name}/{streaming_branch}/ingest/quickstart/partition=0/"
print(f"Reading ingested data from {dataPath}")

# Load the files with the CSV reader and give the single default column
# ("_c0") a meaningful name.
_raw_df = spark.read.csv(dataPath)
df = _raw_df.withColumnRenamed("_c0", "data")
df.show()

### Create an ingestion branch to process and load streaming data

In [None]:
# Timestamp suffix shared by the ingestion branch name and the output path
# written in the next step.
ingest_time = datetime.datetime.now().strftime("_%Y%m%dT%H%M%S")
ingest_branch = "ingest" + ingest_time

# Create the ingestion branch off the source branch.
branchIngest = repo.branch(ingest_branch).create(source_reference=sourceBranch)
# Report the actual source branch rather than a hard-coded "main", so the
# message stays correct if sourceBranch is changed in the setup section.
print(f"Created {ingest_branch} branch from {sourceBranch}")

### Append streaming data to ingestion branch

In [None]:
# Write the DataFrame as CSV under a timestamped "stream/quickstart<time>"
# prefix on the ingestion branch.
_ingest_target = f"s3a://{repo.id}/{ingest_branch}/stream/quickstart{ingest_time}"
df.write.csv(_ingest_target)

### Commit streaming data load

In [None]:
# Key/value metadata attached to the commit to record where the data came from.
_commit_metadata = {
    'author': 'demo user',
    'data source': 'Kafka',
    'Kafka topic': 'quickstart',
}

# Commit the files written to the ingestion branch and print the resulting commit.
ref = branchIngest.commit(message='Streaming data load', metadata=_commit_metadata)
print_commit(ref.get_commit())

### Merge ingestion branch to main branch

In [None]:
# Merge the ingestion branch into the source branch object (branchMain) and
# print whatever merge_into returns.
res = branchIngest.merge_into(branchMain)
print(res)

---

---

---