In [16]:
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder \
    .config("spark.streaming.stopGracefullyOnShutdown", True) \
    .config('spark.jars.packages', 'org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.0') \
    .config("spark.sql.shuffle.partitions", 4) \
    .master("local").getOrCreate()

In [17]:
streaming_df=spark.read.parquet("hdfs://namenode:9000/PurchasedItems")
streaming_df.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [24]:
json_expanded_df = streaming_df \
    .withColumn("value", streaming_df["value"].cast("string")) \
    .select("value")

In [19]:
json_df = streaming_df.selectExpr("cast(value as string) as value", "timestamp")
json_df.printSchema() 

root
 |-- value: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)



In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, ArrayType

# SparkSession oluşturma
spark = SparkSession.builder \
    .appName("Read from Hadoop") \
    .getOrCreate()

# Kafka'dan okunan verilerin şemasını tanımlama
purchased_schema = StructType([
    StructField("session_id", IntegerType(), True),
    StructField("time_stamp", StringType(), True),
    StructField("user_id", IntegerType(), True),
    StructField("total_price", FloatType(), True),
    StructField("order_id", IntegerType(), True),
    StructField("payment_type", StringType(), True),
    StructField("products", ArrayType(
        StructType([
            StructField("product_id", IntegerType(), True),
            StructField("price", FloatType(), True),
            StructField("discount", IntegerType(), True),
            StructField("item_count", IntegerType(), True)
        ])
    ), True)
])

# Kafka'dan okunan verileri DataFrame'e yükleme ve şemaya uygun olarak dönüştürme
kafka_df = spark.read \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "broker:29092") \
    .option("subscribe", "PurchasedItems") \
    .option("startingOffsets", "earliest") \
    .load()

# Binary verileri uygun bir formata dönüştürme
json_expanded_df = kafka_df \
    .withColumn("value", kafka_df["value"].cast("string"))

# JSON verisini şemaya uygun olarak parse etme
parsed_df = json_expanded_df.selectExpr("CAST(value AS STRING)") \
    .selectExpr("from_json(value, 'purchased_schema') AS data") \
    .select("data.*")

# DataFrame'i geçici bir tabloya kaydetme
parsed_df.createOrReplaceTempView("purchased_items")

# Oluşturulan tabloyu sorgulama ve sonucu ekrana yazdırma
spark.sql("SELECT * FROM purchased_items").show(truncate=False)




AnalysisException: Failed to find data source: kafka. Please deploy the application as per the deployment section of Structured Streaming + Kafka Integration Guide.