In [1]:
%pip install happybase

Defaulting to user installation because normal site-packages is not writeable
Collecting happybase
  Downloading happybase-1.2.0.tar.gz (40 kB)
     |################################| 40 kB 206 kB/s            
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting thriftpy2>=0.4
  Downloading thriftpy2-0.5.2.tar.gz (782 kB)
     |################################| 782 kB 282 kB/s            
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Installing backend dependencies ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25h  Downloading thriftpy2-0.5.1.tar.gz (781 kB)
     |################################| 781 kB 215 kB/s            
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Installing backend dependencies ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting ply<4.0

In [34]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col
from pyspark.sql.types import StructType, StringType, IntegerType, TimestampType, DoubleType
from happybase import Connection
import sys
import traceback

In [35]:
spark = SparkSession.builder \
    .appName("KafkaStreamingToHBase") \
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.8,org.apache.hbase:hbase-client:2.4.9,org.apache.hbase:hbase-common:2.4.9") \
    .getOrCreate()

spark

In [36]:
# Define schema for the incoming JSON data
schema = StructType() \
    .add("eventType", StringType()) \
    .add("customerId", StringType()) \
    .add("productId", StringType()) \
    .add("timestamp", TimestampType()) \
    .add("metadata", StructType()
        .add("category", StringType())
        .add("source", StringType())
    ) \
    .add("quantity", IntegerType()) \
    .add("totalAmount", DoubleType()) \
    .add("paymentMethod", StringType()) \
    .add("recommendedProductId", StringType()) \
    .add("algorithm", StringType())


In [37]:
# Kafka connection details
bootstrap_servers = "pkc-56d1g.eastus.azure.confluent.cloud:9092"
kafka_topic = "Emad_topic" # add topic name
kafka_username = "JUKQQM4ZM632RECA"
kafka_password = "UUkrPuSttgOC0U9lY3ZansNsKfN9fbxZPFwrGxudDrfv+knTD4rCwK+KdIzVPX0D"

In [38]:
df = spark \
    .readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", bootstrap_servers) \
    .option("subscribe", kafka_topic) \
    .option("startingOffsets", "earliest") \
    .option("kafka.security.protocol", "SASL_SSL") \
    .option("kafka.sasl.mechanism", "PLAIN") \
    .option("kafka.sasl.jaas.config",
            f'org.apache.kafka.common.security.plain.PlainLoginModule required username="{kafka_username}" password="{kafka_password}";') \
    .load()

In [39]:
json_df = df.selectExpr("CAST(value AS STRING)").select(from_json("value", schema).alias("data")).select("data.*")

In [40]:
def writeToHBase(df, epoch_id):
    print(f"Batch {epoch_id}: {df.count()} records")
    df.show(5, truncate=False)  # Show first 5 records

    rdd = df.rdd
    
    rdd.foreachPartition(lambda rows: writePartition(rows))

def writePartition(rows):
    
    hbase_host = 'hbase'
    hbase_table = 'CUSTEVENTS:CustomerEvents'
    
    try:
        connection = Connection(hbase_host)
        table = connection.table(hbase_table)
        print(f"Successfully connected to HBase and opened table {hbase_table}", file=sys.stderr)
        print(f"Available tables: {tables}", file=sys.stderr)
        
        if hbase_table.encode() not in tables:
            print(f"Table {hbase_table} does not exist!", file=sys.stderr)
            return
        
        row_count = 0
        for row in rows:
            try:
                row_key = row['customerId']
                data = row.asDict()
                hbase_data = {f'cf:{k}'.encode(): str(v).encode() for k, v in data.items()}
                table.put(row_key.encode(), hbase_data)
                row_count += 1
            except Exception as row_error:
                print(f"Error processing row: {str(row_error)}", file=sys.stderr)
                print(f"Row data: {row}", file=sys.stderr)
                traceback.print_exc(file=sys.stderr)
        
        print(f"Wrote {row_count} rows to HBase", file=sys.stderr)
        
    except Exception as e:
        print(f"Error in writePartition: {str(e)}", file=sys.stderr)
        traceback.print_exc(file=sys.stderr)
    finally:
        if 'connection' in locals():
            connection.close()

In [41]:
query = json_df \
    .writeStream \
    .foreachBatch(writeToHBase) \
    .outputMode("append") \
    .start()


In [42]:
query.awaitTermination()


Batch 0: 537 records
+-------------------+----------+---------+-------------------+---------------------+--------+-----------+-------------+--------------------+-------------+
|eventType          |customerId|productId|timestamp          |metadata             |quantity|totalAmount|paymentMethod|recommendedProductId|algorithm    |
+-------------------+----------+---------+-------------------+---------------------+--------+-----------+-------------+--------------------+-------------+
|productView        |68829     |3432     |2024-07-09 11:15:27|[Clothing, Search]   |null    |null       |null         |null                |null         |
|productView        |30516     |6488     |2024-07-09 11:15:29|[Electronics, Direct]|null    |null       |null         |null                |null         |
|recommendationClick|45014     |7822     |2024-07-09 11:15:31|[,]                  |null    |null       |null         |6637                |content_based|
|productView        |78939     |8505     |2024-07

StreamingQueryException: 'An exception was raised by the Python Proxy. Return Message: Traceback (most recent call last):\n  File "/opt/spark2/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 2381, in _call_proxy\n    return_value = getattr(self.pool[obj_id], method)(*params)\n  File "/opt/spark2/python/pyspark/sql/utils.py", line 191, in call\n    raise e\n  File "/opt/spark2/python/pyspark/sql/utils.py", line 188, in call\n    self.func(DataFrame(jdf, self.sql_ctx), batch_id)\n  File "<ipython-input-40-6e62a8a5909d>", line 7, in writeToHBase\n    rdd.foreachPartition(lambda rows: writePartition(rows))\n  File "/opt/spark2/python/pyspark/rdd.py", line 806, in foreachPartition\n    self.mapPartitions(func).count()  # Force evaluation\n  File "/opt/spark2/python/pyspark/rdd.py", line 1055, in count\n    return self.mapPartitions(lambda i: [sum(1 for _ in i)]).sum()\n  File "/opt/spark2/python/pyspark/rdd.py", line 1046, in sum\n    return self.mapPartitions(lambda x: [sum(x)]).fold(0, operator.add)\n  File "/opt/spark2/python/pyspark/rdd.py", line 917, in fold\n    vals = self.mapPartitions(func).collect()\n  File "/opt/spark2/python/pyspark/rdd.py", line 816, in collect\n    sock_info = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())\n  File "/opt/spark2/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1257, in __call__\n    answer, self.gateway_client, self.target_id, self.name)\n  File "/opt/spark2/python/pyspark/sql/utils.py", line 63, in deco\n    return f(*a, **kw)\n  File "/opt/spark2/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py", line 328, in get_return_value\n    format(target_id, ".", name), value)\npy4j.protocol.Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.\n: org.apache.spark.SparkException: Job aborted due to stage failure: Task 1 in stage 3.0 failed 4 times, most recent failure: Lost task 1.3 in stage 3.0 (TID 10, itvdelab, executor 1): org.apache.spark.api.python.PythonException: Traceback (most recent call last):\n  File "/opt/spark2/python/pyspark/worker.py", line 364, in main\n    func, profiler, deserializer, serializer = read_command(pickleSer, infile)\n  File "/opt/spark2/python/pyspark/worker.py", line 69, in read_command\n    command = serializer._read_with_length(file)\n  File "/opt/spark2/python/pyspark/serializers.py", line 173, in _read_with_length\n    return self.loads(obj)\n  File "/opt/spark2/python/pyspark/serializers.py", line 587, in loads\n    return pickle.loads(obj, encoding=encoding)\nModuleNotFoundError: No module named \'happybase\'\n\n\tat org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:456)\n\tat org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:592)\n\tat org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:575)\n\tat org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:410)\n\tat org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)\n\tat scala.collection.Iterator$class.foreach(Iterator.scala:891)\n\tat org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)\n\tat scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)\n\tat scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)\n\tat scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)\n\tat scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)\n\tat org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)\n\tat scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)\n\tat org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)\n\tat scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)\n\tat org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)\n\tat org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$15.apply(RDD.scala:990)\n\tat org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$15.apply(RDD.scala:990)\n\tat org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2107)\n\tat org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2107)\n\tat org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)\n\tat org.apache.spark.scheduler.Task.run(Task.scala:123)\n\tat org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:411)\n\tat org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)\n\tat org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:417)\n\tat java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)\n\tat java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)\n\tat java.lang.Thread.run(Thread.java:748)\n\nDriver stacktrace:\n\tat org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1925)\n\tat org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1913)\n\tat org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1912)\n\tat scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)\n\tat scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)\n\tat org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1912)\n\tat org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:948)\n\tat org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:948)\n\tat scala.Option.foreach(Option.scala:257)\n\tat org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:948)\n\tat org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2146)\n\tat org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2095)\n\tat org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2084)\n\tat org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)\n\tat org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:759)\n\tat org.apache.spark.SparkContext.runJob(SparkContext.scala:2067)\n\tat org.apache.spark.SparkContext.runJob(SparkContext.scala:2088)\n\tat org.apache.spark.SparkContext.runJob(SparkContext.scala:2107)\n\tat org.apache.spark.SparkContext.runJob(SparkContext.scala:2132)\n\tat org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:990)\n\tat org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)\n\tat org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)\n\tat org.apache.spark.rdd.RDD.withScope(RDD.scala:385)\n\tat org.apache.spark.rdd.RDD.collect(RDD.scala:989)\n\tat org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:166)\n\tat org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)\n\tat sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\n\tat sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\n\tat sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\n\tat java.lang.reflect.Method.invoke(Method.java:498)\n\tat py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\n\tat py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\n\tat py4j.Gateway.invoke(Gateway.java:282)\n\tat py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\n\tat py4j.commands.CallCommand.execute(CallCommand.java:79)\n\tat py4j.GatewayConnection.run(GatewayConnection.java:238)\n\tat java.lang.Thread.run(Thread.java:748)\nCaused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):\n  File "/opt/spark2/python/pyspark/worker.py", line 364, in main\n    func, profiler, deserializer, serializer = read_command(pickleSer, infile)\n  File "/opt/spark2/python/pyspark/worker.py", line 69, in read_command\n    command = serializer._read_with_length(file)\n  File "/opt/spark2/python/pyspark/serializers.py", line 173, in _read_with_length\n    return self.loads(obj)\n  File "/opt/spark2/python/pyspark/serializers.py", line 587, in loads\n    return pickle.loads(obj, encoding=encoding)\nModuleNotFoundError: No module named \'happybase\'\n\n\tat org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:456)\n\tat org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:592)\n\tat org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:575)\n\tat org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:410)\n\tat org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)\n\tat scala.collection.Iterator$class.foreach(Iterator.scala:891)\n\tat org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)\n\tat scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)\n\tat scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)\n\tat scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)\n\tat scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)\n\tat org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)\n\tat scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)\n\tat org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)\n\tat scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)\n\tat org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)\n\tat org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$15.apply(RDD.scala:990)\n\tat org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$15.apply(RDD.scala:990)\n\tat org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2107)\n\tat org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2107)\n\tat org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)\n\tat org.apache.spark.scheduler.Task.run(Task.scala:123)\n\tat org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:411)\n\tat org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)\n\tat org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:417)\n\tat java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)\n\tat java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)\n\t... 1 more\n\n\n=== Streaming Query ===\nIdentifier: [id = 6bdec090-76a3-4ed8-9185-d05444a58758, runId = b9c87492-35e9-4ea7-9765-fd53bd975bf7]\nCurrent Committed Offsets: {}\nCurrent Available Offsets: {KafkaV2[Subscribe[Emad_topic]]: {"Emad_topic":{"1":274,"0":263}}}\n\nCurrent State: ACTIVE\nThread State: RUNNABLE\n\nLogical Plan:\nProject [data#321.eventType AS eventType#323, data#321.customerId AS customerId#324, data#321.productId AS productId#325, data#321.timestamp AS timestamp#326, data#321.metadata AS metadata#327, data#321.quantity AS quantity#328, data#321.totalAmount AS totalAmount#329, data#321.paymentMethod AS paymentMethod#330, data#321.recommendedProductId AS recommendedProductId#331, data#321.algorithm AS algorithm#332]\n+- Project [jsontostructs(StructField(eventType,StringType,true), StructField(customerId,StringType,true), StructField(productId,StringType,true), StructField(timestamp,TimestampType,true), StructField(metadata,StructType(StructField(category,StringType,true), StructField(source,StringType,true)),true), StructField(quantity,IntegerType,true), StructField(totalAmount,DoubleType,true), StructField(paymentMethod,StringType,true), StructField(recommendedProductId,StringType,true), StructField(algorithm,StringType,true), value#319, Some(GMT)) AS data#321]\n   +- Project [cast(value#306 as string) AS value#319]\n      +- StreamingExecutionRelation KafkaV2[Subscribe[Emad_topic]], [key#305, value#306, topic#307, partition#308, offset#309L, timestamp#310, timestampType#311]\n'

In [1]:
spark.stop()

NameError: name 'spark' is not defined