In [1]:
import os
# Ask PySpark to pull in the Kafka source for Structured Streaming when it
# launches; this is set here, ahead of the cell that creates the SparkSession.
os.environ.update({
    'PYSPARK_SUBMIT_ARGS': '--packages org.apache.spark:spark-sql-kafka-0-10_2.12:3.0.2 pyspark-shell',
})

In [2]:
from pyspark.sql.session import SparkSession
import json

In [3]:
# Create the Spark session used by the rest of the notebook (or pick up the one
# already running in this kernel); the application is registered as "test".
session_builder = SparkSession.builder.appName("test")
spark = session_builder.getOrCreate()

In [1]:
from fauked.io import SparkStreamingDataSet

In [2]:
# Kafka source options: consume the "hello-world" topic from the local broker,
# starting at the earliest offset, with record headers included.
kafka_load_args = {
    "kafka.bootstrap.servers": "localhost:19092",
    "subscribe": "hello-world",
    "includeHeaders": "true",
    "startingOffsets": "earliest",
}

# NOTE(review): filepath is the placeholder "IGNORE" — presumably unused for the
# Kafka format; confirm against SparkStreamingDataSet.
ssds = SparkStreamingDataSet(
    filepath="IGNORE",
    file_format="kafka",
    load_args=kafka_load_args,
)

In [3]:
# Load through the dataset wrapper, which was configured for the "hello-world" Kafka topic.
# presumably returns a streaming DataFrame — TODO confirm against SparkStreamingDataSet.load
df = ssds.load()

In [4]:
# Read the "hello-world" topic directly with the Spark Kafka source, starting at
# the earliest offset and including record headers. This rebinds `df`.
df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "localhost:19092")
    .option("subscribe", "hello-world")
    .option("includeHeaders", "true")
    .option("startingOffsets", "earliest")
    .load()
)

In [4]:
# Show the columns of the streaming DataFrame read from Kafka.
df.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)
 |-- headers: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- key: string (nullable = true)
 |    |    |-- value: binary (nullable = true)



In [None]:
# Hand the streaming DataFrame to the dataset wrapper's save().
# NOTE(review): the sink this writes to is decided inside SparkStreamingDataSet,
# which is not shown in this notebook — confirm before relying on it.
ssds.save(df)

In [None]:
# Stream every Kafka record to the console sink in append mode and block until
# the query ends or the cell is interrupted.
console_query = df.writeStream.format("console").outputMode("append").start()
try:
    console_query.awaitTermination()
finally:
    # Interrupting the cell only aborts the Python-side wait; stop the query
    # explicitly so it does not keep running in the background.
    console_query.stop()

In [10]:
# Print each incoming row through the foreach sink and block until the query
# ends or the cell is interrupted.
foreach_query = df.writeStream.foreach(lambda row: print(row)).start()
try:
    foreach_query.awaitTermination()
finally:
    # Interrupting the cell only aborts the Python-side wait; stop the query
    # explicitly so it does not keep running in the background.
    foreach_query.stop()

KeyboardInterrupt: 

In [12]:
from pyspark.sql.types import StructType

# Two string columns, "key" and "value". StructType.add() appends the field to
# the schema it is called on, so the fields can be added one statement at a time.
userSchema = StructType()
userSchema.add("key", "string")
userSchema.add("value", "string")

In [6]:
from pyspark.sql.functions import col, from_json
from pyspark.sql.types import ArrayType, FloatType, StringType, StructField, StructType

# The Kafka source exposes the payload as binary; cast it to a string so it can
# be parsed as JSON.
string_df = df.selectExpr("CAST(value AS STRING)")

# Print out the new dataframe schema
#string_df.printSchema()

# Expected JSON layout of each message: a list of column names plus a
# two-dimensional array of floats.
schema = StructType([
    StructField("columns", ArrayType(StringType())),
    StructField("data", ArrayType(ArrayType(FloatType()))),
])

# Parse the JSON text into a struct column and flatten its fields into top-level
# columns. The select refers to the column by the exact name it was created
# with, so it does not depend on Spark's case-insensitive name resolution.
json_df = (
    string_df
    .withColumn("jsonData", from_json(col("value"), schema))
    .select("jsonData.*")
)

# Print out the dataframe schema
#json_df.printSchema()

# Write output to the terminal and block until the query ends or the cell is
# interrupted.
console_query = json_df.writeStream.format("console").outputMode("append").start()
try:
    console_query.awaitTermination()
finally:
    # Interrupting the cell only aborts the Python-side wait; stop the query
    # explicitly so it does not keep running in the background.
    console_query.stop()

# Write output to kafka topic (unfinished sketch, left disabled; note that the
# schema above has no "id" field to use as the key)
#json_df.selectExpr("id AS key", "to_json(struct(*)) AS value")\

KeyboardInterrupt: 

Read from the `bey` Kafka topic

In [4]:
# Read the "bey" topic with the Spark Kafka source, starting at the earliest
# offset and including record headers. This rebinds `df`.
df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "localhost:19092")
    .option("subscribe", "bey")
    .option("includeHeaders", "true")
    .option("startingOffsets", "earliest")
    .load()
)



from pyspark.sql.functions import col, from_json
from pyspark.sql.types import ArrayType, FloatType, IntegerType, StringType, StructField, StructType

# The Kafka source exposes the payload as binary; cast it to a string so it can
# be parsed as JSON.
string_df = df.selectExpr("CAST(value AS STRING)")

# Print out the new dataframe schema
#string_df.printSchema()

# Expected JSON layout of each "bey" message: a list of column names, a
# two-dimensional array of floats, and an integer id.
schema = StructType([
    StructField("columns", ArrayType(StringType())),
    StructField("data", ArrayType(ArrayType(FloatType()))),
    StructField("id", IntegerType()),
])

# Parse the JSON text into a struct column and flatten its fields into top-level
# columns. The select refers to the column by the exact name it was created
# with, so it does not depend on Spark's case-insensitive name resolution.
json_df = (
    string_df
    .withColumn("jsonData", from_json(col("value"), schema))
    .select("jsonData.*")
)

# Print out the dataframe schema
json_df.printSchema()

# Write output to the terminal
#json_df.writeStream.format("console").outputMode("append").start().awaitTermination()

# Write output to kafka topic (unfinished sketch, left disabled)
#json_df.selectExpr("id AS key", "to_json(struct(*)) AS value")\

root
 |-- columns: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- data: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: float (containsNull = true)
 |-- id: integer (nullable = true)



Now do the same for the `hey` Kafka topic

In [5]:
# Read the "hey" topic with the Spark Kafka source, starting at the earliest
# offset and including record headers.
other_df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "localhost:19092")
    .option("subscribe", "hey")
    .option("includeHeaders", "true")
    .option("startingOffsets", "earliest")
    .load()
)


from pyspark.sql.functions import col, from_json
from pyspark.sql.types import ArrayType, FloatType, IntegerType, StringType, StructField, StructType

# The Kafka source exposes the payload as binary; cast it to a string so it can
# be parsed as JSON.
other_string_df = other_df.selectExpr("CAST(value AS STRING)")

# Print out the new dataframe schema
#other_string_df.printSchema()

# Expected JSON layout of each "hey" message: a string field and an integer id.
other_schema = StructType([
    StructField("stuff", StringType()),
    StructField("id", IntegerType()),
])

# Parse the JSON text into a struct column and flatten its fields into top-level
# columns. The select refers to the column by the exact name it was created
# with, so it does not depend on Spark's case-insensitive name resolution.
other_json_df = (
    other_string_df
    .withColumn("jsonData", from_json(col("value"), other_schema))
    .select("jsonData.*")
)

# Print out the dataframe schema
other_json_df.printSchema()

# Write output to the terminal
#other_json_df.writeStream.format("console").outputMode("append").start().awaitTermination()

# Write output to kafka topic (unfinished sketch, left disabled)
#other_json_df.selectExpr("id AS key", "to_json(struct(*)) AS value")\

root
 |-- stuff: string (nullable = true)
 |-- id: integer (nullable = true)



In [6]:
# Join the two parsed streams on their shared "id" column.
# NOTE(review): neither side defines a watermark, so Spark presumably has to
# keep join state for every row it has seen — confirm that is acceptable
# outside of a short demo.
joined_df = json_df.join(other_json_df, on="id")

In [7]:
# Stream the joined rows to the console sink in append mode and block until the
# query ends or the cell is interrupted.
joined_console_query = joined_df.writeStream.format("console").outputMode("append").start()
try:
    joined_console_query.awaitTermination()
finally:
    # Interrupting the cell only aborts the Python-side wait; stop the query
    # explicitly so it does not keep running in the background.
    joined_console_query.stop()

KeyboardInterrupt: 

In [8]:
# Check whether the joined DataFrame is a streaming one.
joined_df.isStreaming

True

In [11]:
# For every joined row, print its "stuff" value multiplied by 100, and block
# until the query ends or the cell is interrupted.
# NOTE(review): int(row.stuff) assumes "stuff" always holds an integer-like
# string; a missing or non-numeric value would raise inside the sink — confirm
# against the producer of the "hey" topic.
joined_foreach_query = joined_df.writeStream.foreach(lambda row: print(int(row.stuff) * 100)).start()
try:
    joined_foreach_query.awaitTermination()
finally:
    # Interrupting the cell only aborts the Python-side wait; stop the query
    # explicitly so it does not keep running in the background.
    joined_foreach_query.stop()

KeyboardInterrupt: 

In [10]:
# Expose the Kafka stream to Spark SQL under the view name "updates".
df.createOrReplaceTempView("updates")
# Only the resulting DataFrame's repr is displayed here; this cell does not
# start a query, so no count is actually computed.
spark.sql("select count(*) from updates")

DataFrame[count(1): bigint]

In [12]:
# Project just the raw message payload; the displayed repr shows the column is
# still binary (it has not been cast to a string here).
df.select("value")

DataFrame[value: binary]