# 1. Load LIB

In [1]:
import sys.process._
import org.apache.spark.sql.streaming.Trigger
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.SparkSession

import org.apache.spark.sql.streaming.Trigger
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions._
import org.apache.spark.sql.Dataset


# 2. Load function & params

In [2]:
/** Stops every streaming query that is currently active in the active SparkSession.
  *
  * For each query a line "Stopped <description>" is printed. The description is taken
  * from the first source of the query's last progress report. `lastProgress` is null
  * until the query has reported progress at least once, and the source list may be
  * empty; in those cases the query name (or, for unnamed queries, its id) is printed
  * instead, so that a freshly started query cannot abort the loop and leave the
  * remaining queries running.
  */
def killAll(): Unit = {
    SparkSession
        .active
        .streams
        .active
        .foreach { query =>
            // Resolve the label before stopping; every step tolerates missing data.
            val desc = Option(query.lastProgress)
                .flatMap(_.sources.headOption)
                .map(_.description)
                .orElse(Option(query.name)) // name is null for unnamed queries
                .getOrElse(query.id.toString)
            query.stop()
            println(s"Stopped ${desc}")
        }
}

killAll: ()Unit


In [3]:
/** Builds, without starting, a console-sink writer for the given DataFrame.
  *
  * @param df DataFrame whose `writeStream` is configured
  * @return the configured `DataStreamWriter`; the caller decides when to call `start`
  */
def createConsoleSink(df: DataFrame) = {
    // Console output settings: do not cut long cell values, print up to 20 rows per batch.
    val consoleOptions = Map(
        "truncate" -> "false",
        "numRows" -> "20"
    )

    df.writeStream
        .format("console")
        // One micro-batch every 10 seconds. Without an explicit trigger Spark runs
        // the next micro-batch as soon as the previous one has finished.
        .trigger(Trigger.ProcessingTime("10 seconds"))
        .options(consoleOptions)
}


createConsoleSink: (df: org.apache.spark.sql.DataFrame)org.apache.spark.sql.streaming.DataStreamWriter[org.apache.spark.sql.Row]


In [4]:
/** Builds, without starting, a parquet file-sink writer for the given DataFrame.
  *
  * Both the output path and the checkpoint location point at `/tmp/<fileName>`, so
  * the checkpoint files end up in the same directory as the parquet part files.
  *
  * @param df       DataFrame whose `writeStream` is configured
  * @param fileName name of the directory under `/tmp` used by this sink
  * @return the configured `DataStreamWriter`; the caller decides when to call `start`
  */
def createParquetSink(df: DataFrame, fileName: String) = {
    val targetDir = s"/tmp/$fileName"

    df.writeStream
        .format("parquet")
        .options(Map(
            "path" -> targetDir,
            "checkpointLocation" -> targetDir
        ))
    // An explicit trigger is intentionally left disabled for this sink:
    //.trigger(Trigger.ProcessingTime("10 seconds"))
}

createParquetSink: (df: org.apache.spark.sql.DataFrame, fileName: String)org.apache.spark.sql.streaming.DataStreamWriter[org.apache.spark.sql.Row]


In [9]:
// Streaming DataFrame read from the "rate" source; its schema is printed right away.
val sdf_rate = spark.readStream.format("rate").load()
sdf_rate.printSchema()
// sdf_rate.explain(true)

root
 |-- timestamp: timestamp (nullable = true)
 |-- value: long (nullable = true)



sdf_rate = [timestamp: timestamp, value: bigint]


[timestamp: timestamp, value: bigint]

In [6]:
// Static (batch) DataFrame with airport data: the CSV has a header row and the
// column types are inferred from the content.
val csvOptions = Map(
    "header" -> "true",
    "inferSchema" -> "true"
)
val airports = spark
    .read
    .options(csvOptions)
    .csv("airport-codes.csv")

airports.printSchema()
// Show a single record in vertical layout, cutting values longer than 100 characters.
airports.show(numRows = 1, truncate = 100, vertical = true)

Waiting for a Spark session to start...

root
 |-- ident: string (nullable = true)
 |-- type: string (nullable = true)
 |-- name: string (nullable = true)
 |-- elevation_ft: integer (nullable = true)
 |-- continent: string (nullable = true)
 |-- iso_country: string (nullable = true)
 |-- iso_region: string (nullable = true)
 |-- municipality: string (nullable = true)
 |-- gps_code: string (nullable = true)
 |-- iata_code: string (nullable = true)
 |-- local_code: string (nullable = true)
 |-- coordinates: string (nullable = true)

-RECORD 0------------------------------------------
 ident        | 00A                                
 type         | heliport                           
 name         | Total Rf Heliport                  
 elevation_ft | 11                                 
 continent    | NA                                 
 iso_country  | US                                 
 iso_region   | US-PA                              
 municipality | Bensalem                           
 gps_code     | 00A                 

csvOptions = Map(header -> true, inferSchema -> true)
airports = [ident: string, type: string ... 10 more fields]


[ident: string, type: string ... 10 more fields]

# 3. example1: Console Sink

In [10]:
// Configure the console sink for the rate stream, then launch the streaming query.
val sink_console = createConsoleSink(sdf_rate)
val sq_console = sink_console.start()

-------------------------------------------
Batch: 0
-------------------------------------------
+---------+-----+
|timestamp|value|
+---------+-----+
+---------+-----+



sink_console = org.apache.spark.sql.streaming.DataStreamWriter@13f1c1c7
sq_console = org.apache.spark.sql.execution.streaming.StreamingQueryWrapper@5623cac8


org.apache.spark.sql.execution.streaming.StreamingQueryWrapper@5623cac8

-------------------------------------------
Batch: 1
-------------------------------------------
+-----------------------+-----+
|timestamp              |value|
+-----------------------+-----+
|2023-03-12 13:59:42.223|0    |
|2023-03-12 13:59:44.223|2    |
|2023-03-12 13:59:46.223|4    |
|2023-03-12 13:59:48.223|6    |
|2023-03-12 13:59:43.223|1    |
|2023-03-12 13:59:45.223|3    |
|2023-03-12 13:59:47.223|5    |
+-----------------------+-----+



In [11]:
// Stop all active streaming queries (here: the console-sink query over the rate stream).
killAll()

Stopped RateStreamV2[rowsPerSecond=1, rampUpTimeSeconds=0, numPartitions=default


# 4. example2: Parquet Sink1

In [13]:
// Write the rate stream as parquet files under /tmp/tmp_01.parquet and start the query.
val sink_pq = createParquetSink(sdf_rate, "tmp_01.parquet")
val sq_pq = sink_pq.start()

sink_pq = org.apache.spark.sql.streaming.DataStreamWriter@7b6c920a
sq_pq = org.apache.spark.sql.execution.streaming.StreamingQueryWrapper@5084a09d


org.apache.spark.sql.execution.streaming.StreamingQueryWrapper@5084a09d

In [14]:
// Stop all active streaming queries (here: the parquet-sink query over the rate stream).
killAll()

Stopped RateStreamV2[rowsPerSecond=1, rampUpTimeSeconds=0, numPartitions=default


In [15]:
"hadoop fs -ls /tmp/tmp_01.parquet".!!

"Found 55 items
drwxr-xr-x   - dinar.sadykov hdfs          0 2023-03-12 14:00 /tmp/tmp_01.parquet/_spark_metadata
drwxr-xr-x   - dinar.sadykov hdfs          0 2023-03-12 14:00 /tmp/tmp_01.parquet/commits
-rw-r--r--   3 dinar.sadykov hdfs         45 2023-03-12 13:35 /tmp/tmp_01.parquet/metadata
drwxr-xr-x   - dinar.sadykov hdfs          0 2023-03-12 14:00 /tmp/tmp_01.parquet/offsets
-rw-r--r--   3 dinar.sadykov hdfs        790 2023-03-12 13:36 /tmp/tmp_01.parquet/part-00000-014d8859-c5e4-4f06-a3a4-bdd30cdc3379-c000.snappy.parquet
-rw-r--r--   3 dinar.sadykov hdfs        790 2023-03-12 13:36 /tmp/tmp_01.parquet/part-00000-0ab1e079-7c1a-40d0-b4b4-af45e6198219-c000.snappy.parquet
-rw-r--r--   3 dinar.sadykov hdfs        790 2023-03-12 14:00 /tmp/tmp_01.parquet/part-00000-17b...


In [16]:
// Read back, as a static DataFrame, what the streaming query wrote to the parquet sink.
val rates = spark.read.parquet("/tmp/tmp_01.parquet")

println(rates.count())
rates.printSchema()
rates.show(numRows = 5, truncate = false)

1471
root
 |-- timestamp: timestamp (nullable = true)
 |-- value: long (nullable = true)

+-----------------------+-----+
|timestamp              |value|
+-----------------------+-----+
|2023-03-12 13:36:29.651|31   |
|2023-03-12 13:36:31.651|33   |
|2023-03-12 13:36:33.651|35   |
|2023-03-12 13:36:35.651|37   |
|2023-03-12 13:36:37.651|39   |
+-----------------------+-----+
only showing top 5 rows



rates = [timestamp: timestamp, value: bigint]


[timestamp: timestamp, value: bigint]

# 5. example3: Parquet Sink1 + ident

In [17]:
// Distinct airport identifiers among the first 200 rows, collected to the driver.
val idents = airports
    .select("ident")
    .limit(200)
    .distinct()
    .as[String]
    .collect()

// Add an "ident" column to the rate stream: the collected identifiers are turned into
// an array literal, the array is shuffled and its first element is taken.
val ident_sdf_rate = {
    val identLiterals = idents.map(ident => lit(ident))
    val pickedIdent = shuffle(array(identLiterals: _*))(0)
    sdf_rate.withColumn("ident", pickedIdent)
}

idents = Array(00A, 00AA, 00AK, 00AL, 00AR, 00AS, 00AZ, 00CA, 00CL, 00CN, 00CO, 00FA, 00FD, 00FL, 00GA, 00GE, 00HI, 00ID, 00IG, 00II, 00IL, 00IN, 00IS, 00KS, 00KY, 00LA, 00LL, 00LS, 00MD, 00MI, 00MN, 00MO, 00MT, 00N, 00NC, 00NJ, 00NK, 00NY, 00OH, 00OI, 00OK, 00OR, 00PA, 00PN, 00PS, 00S, 00SC, 00SD, 00TA, 00TE, 00TN, 00TS, 00TX, 00UT, 00VA, 00VI, 00W, 00WA, 00WI, 00WN, 00WV, 00WY, 00XS, 01A, 01AK, 01AL, 01AR, 01AZ, 01C, 01CA, 01CL, 01CN, 01CO, 01CT, 01FA, 01FD, 01FL, 01GA, 01GE, 01IA, 01ID, 01II, 01IL, 01IN, 01IS, 01J, 01K, 01KS, 01KY, 01LA, 01LL, 01LS, 01MA, 01MD, 01ME, 01MI, 01MN, 01MO, 01MT, 01NC, 01NE, 01NH, 01NJ, 01NM, 01NV, 01NY, 01OI, 01OK, 01OR, 01PA, 01PN, 01PS, 01SC, 01TA, 01TE, 01TN, 01TS, 01TX, 01U, 01UT, 01VA, 01WA, 01WI, 01WN, 01WT, 01WY, 01XA, 01XS, 02AK, 02...


Array(00A, 00AA, 00AK, 00AL, 00AR, 00AS, 00AZ, 00CA, 00CL, 00CN, 00CO, 00FA, 00FD, 00FL, 00GA, 00GE, 00HI, 00ID, 00IG, 00II, 00IL, 00IN, 00IS, 00KS, 00KY, 00LA, 00LL, 00LS, 00MD, 00MI, 00MN, 00MO, 00MT, 00N, 00NC, 00NJ, 00NK, 00NY, 00OH, 00OI, 00OK, 00OR, 00PA, 00PN, 00PS, 00S, 00SC, 00SD, 00TA, 00TE, 00TN, 00TS, 00TX, 00UT, 00VA, 00VI, 00W, 00WA, 00WI, 00WN, 00WV, 00WY, 00XS, 01A, 01AK, 01AL, 01AR, 01AZ, 01C, 01CA, 01CL, 01CN, 01CO, 01CT, 01FA, 01FD, 01FL, 01GA, 01GE, 01IA, 01ID, 01II, 01IL, 01IN, 01IS, 01J, 01K, 01KS, 01KY, 01LA, 01LL, 01LS, 01MA, 01MD, 01ME, 01MI, 01MN, 01MO, 01MT, 01NC, 01NE, 01NH, 01NJ, 01NM, 01NV, 01NY, 01OI, 01OK, 01OR, 01PA, 01PN, 01PS, 01SC, 01TA, 01TE, 01TN, 01TS, 01TX, 01U, 01UT, 01VA, 01WA, 01WI, 01WN, 01WT, 01WY, 01XA, 01XS, 02AK, 02...

In [18]:
// Write the rate stream enriched with "ident" to /tmp/tmp_02.parquet and start the query.
val ident_sq_pq_sink = createParquetSink(ident_sdf_rate, "tmp_02.parquet")
val ident_sq_pq = ident_sq_pq_sink.start()

ident_sq_pq_sink = org.apache.spark.sql.streaming.DataStreamWriter@13f6d0ba
ident_sq_pq = org.apache.spark.sql.execution.streaming.StreamingQueryWrapper@1ece07ef


org.apache.spark.sql.execution.streaming.StreamingQueryWrapper@1ece07ef

In [19]:
// Stop all active streaming queries (here: the parquet-sink query with the "ident" column).
killAll()

Stopped RateStreamV2[rowsPerSecond=1, rampUpTimeSeconds=0, numPartitions=default


In [20]:
// Run `hadoop fs -ls` as an external process and print the output it returns.
println("hadoop fs -ls /tmp/tmp_02.parquet/*".!!)

Found 102 items
-rw-r--r--   3 dinar.sadykov hdfs        257 2023-03-12 13:50 /tmp/tmp_02.parquet/_spark_metadata/0
-rw-r--r--   3 dinar.sadykov hdfs        258 2023-03-12 13:50 /tmp/tmp_02.parquet/_spark_metadata/1
-rw-r--r--   3 dinar.sadykov hdfs        258 2023-03-12 13:50 /tmp/tmp_02.parquet/_spark_metadata/10
-rw-r--r--   3 dinar.sadykov hdfs        258 2023-03-12 14:02 /tmp/tmp_02.parquet/_spark_metadata/100
-rw-r--r--   3 dinar.sadykov hdfs        258 2023-03-12 14:02 /tmp/tmp_02.parquet/_spark_metadata/101
-rw-r--r--   3 dinar.sadykov hdfs        258 2023-03-12 13:50 /tmp/tmp_02.parquet/_spark_metadata/11
-rw-r--r--   3 dinar.sadykov hdfs        258 2023-03-12 13:50 /tmp/tmp_02.parquet/_spark_metadata/12
-rw-r--r--   3 dinar.sadykov hdfs        258 2023-03-12 13:50 /tmp/tmp_02.parquet/_spark_metadata/13
-rw-r--r--   3 dinar.sadykov hdfs        258 2023-03-12 13:50 /tmp/tmp_02.parquet/_spark_metadata/14
-rw-r--r--   3 dinar.sadykov hdfs        258 2023-03-12 13:50 /tmp/tmp_02.p

In [21]:
// Read one individual part file written by the sink to inspect its content.
val ident_pq = spark.read.parquet(
    "/tmp/tmp_02.parquet/part-00000-01408689-bc5e-44ca-b0e0-31f18bd4d7fe-c000.snappy.parquet"
)

println(ident_pq.count())
ident_pq.printSchema()
ident_pq.show(numRows = 5, truncate = false)

1
root
 |-- timestamp: timestamp (nullable = true)
 |-- value: long (nullable = true)
 |-- ident: string (nullable = true)

+-----------------------+-----+-----+
|timestamp              |value|ident|
+-----------------------+-----+-----+
|2023-03-12 14:01:58.601|697  |00AL |
+-----------------------+-----+-----+



ident_pq = [timestamp: timestamp, value: bigint ... 1 more field]


[timestamp: timestamp, value: bigint ... 1 more field]

# 6. Работа с Kafka с помощью Static Dataframe

In [22]:
// Read the whole sink directory this time (the previous cell read a single part file).
val ident_pq = spark.read.parquet("/tmp/tmp_02.parquet/")

println(ident_pq.count())
ident_pq.printSchema()
ident_pq.show(numRows = 5, truncate = false)

700
root
 |-- timestamp: timestamp (nullable = true)
 |-- value: long (nullable = true)
 |-- ident: string (nullable = true)

+-----------------------+-----+-----+
|timestamp              |value|ident|
+-----------------------+-----+-----+
|2023-03-12 13:51:01.601|40   |01NC |
|2023-03-12 13:51:03.601|42   |01XA |
|2023-03-12 13:51:05.601|44   |00TA |
|2023-03-12 13:51:07.601|46   |00VI |
|2023-03-12 13:51:09.601|48   |00NY |
+-----------------------+-----+-----+
only showing top 5 rows



ident_pq = [timestamp: timestamp, value: bigint ... 1 more field]


[timestamp: timestamp, value: bigint ... 1 more field]

In [None]:
// def writeKafka[T](topic: String, data: Dataset[T]): Unit = {
//     val kafkaParams = Map(
//         "kafka.bootstrap.servers" -> "spark-master-1.newprolab.com:6667"
//     )
    
//     data
//         .toJSON
//         .withColumn("topic", lit(topic))
//         .write
//         .format("kafka")
//         .options(kafkaParams)
//         .save
// }

// writeKafka("test_topic0", ident_pq)

In [25]:
// Batch read of the Kafka topic: no offsets are specified in these options.
val kafkaParams = Map(
    "kafka.bootstrap.servers" -> "spark-master-1.newprolab.com:6667",
    "subscribe" -> "test_topic0"
)

val sdf_kafka0 = spark
    .read
    .format("kafka")
    .options(kafkaParams)
    .load()

sdf_kafka0.printSchema()
sdf_kafka0.show(3)
// Last expression of the cell: the notebook displays the number of records read.
sdf_kafka0.count()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)

+----+--------------------+-----------+---------+------+--------------------+-------------+
| key|               value|      topic|partition|offset|           timestamp|timestampType|
+----+--------------------+-----------+---------+------+--------------------+-------------+
|null|[7B 22 74 69 6D 6...|test_topic0|        0|   310|2023-03-08 23:48:...|            0|
|null|[7B 22 74 69 6D 6...|test_topic0|        0|   311|2023-03-08 23:48:...|            0|
|null|[7B 22 74 69 6D 6...|test_topic0|        0|   312|2023-03-08 23:48:...|            0|
+----+--------------------+-----------+---------+------+--------------------+-------------+
only showing top 3 rows



kafkaParams = Map(kafka.bootstrap.servers -> spark-master-1.newprolab.com:6667, subscribe -> test_topic0)
sdf_kafka0 = [key: binary, value: binary ... 5 more fields]


81

Чтение из Kafka имеет несколько особенностей:
- по умолчанию читается все содержимое топика. Поскольку обычно в нем много данных, эта операция может создать большую нагрузку на кластер Kafka и Spark приложение
- колонки `value` и `key` имеют тип `binary`, который необходимо десериализовать

Чтобы прочитать только определенную часть топика, нам необходимо задать минимальный и максимальный оффсет для чтения с помощью параметров `startingOffsets`, `endingOffsets`. Возьмем два случайных события:

In [27]:
// Sample a few events; their offsets are used to prepare the startingOffsets and
// endingOffsets parameters.

sdf_kafka0
    .sample(0.1)
    .limit(10)
    .select(col("topic"), col("partition"), col("offset"))
    .show()

+-----------+---------+------+
|      topic|partition|offset|
+-----------+---------+------+
|test_topic0|        0|   316|
|test_topic0|        0|   329|
|test_topic0|        0|   340|
|test_topic0|        0|   356|
|test_topic0|        0|   360|
|test_topic0|        0|   366|
|test_topic0|        0|   371|
|test_topic0|        0|   385|
+-----------+---------+------+



In [28]:
// Batch read limited to an offset range of partition 0 of test_topic0.
// NOTE(review): confirm whether the ending offset is inclusive or exclusive, i.e.
// whether the record at offset 385 is part of the result.
val kafkaParams = Map(
    "kafka.bootstrap.servers" -> "spark-master-1.newprolab.com:6667",
    "subscribe" -> "test_topic0",
    "startingOffsets" -> """ { "test_topic0": { "0": 316 } } """,
    "endingOffsets" -> """ { "test_topic0": { "0": 385 } }  """
    // disabled option:
    // , "failOnDataLoss" -> "false"
)

val sdf_kafka1 = spark.read.format("kafka").options(kafkaParams).load()

sdf_kafka1.printSchema()
sdf_kafka1.show(20)

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)

+----+--------------------+-----------+---------+------+--------------------+-------------+
| key|               value|      topic|partition|offset|           timestamp|timestampType|
+----+--------------------+-----------+---------+------+--------------------+-------------+
|null|[7B 22 74 69 6D 6...|test_topic0|        0|   316|2023-03-08 23:48:...|            0|
|null|[7B 22 74 69 6D 6...|test_topic0|        0|   317| 2023-03-08 23:48:04|            0|
|null|[7B 22 74 69 6D 6...|test_topic0|        0|   318| 2023-03-08 23:48:04|            0|
|null|[7B 22 74 69 6D 6...|test_topic0|        0|   319| 2023-03-08 23:48:04|            0|
|null|[7B 22 74 69 6D 6...|test_topic0|        0|   320|2023-03-08 23:48:

kafkaParams = Map(kafka.bootstrap.servers -> spark-master-1.newprolab.com:6667, subscribe -> test_topic0, startingOffsets -> " { "test_topic0": { "0": 316 } } ", endingOffsets -> " { "test_topic0": { "0": 385 } }  ")
sdf_kafka1 = [key: binary, value: binary ... 5 more fields]


[key: binary, value: binary ... 5 more fields]

По умолчанию параметр `startingOffsets` имеет значение `earliest`, а `endingOffsets` - `latest`. Поэтому, когда мы не указывали эти параметры, Spark прочитал содержимое всего топика

Чтобы получить наши данные, которые мы записали в топик, нам необходимо их десериализовать. В нашем случае достаточно использовать `.cast("string")`, однако это работает не всегда, т.к. формат данных может быть произвольным.

In [30]:
// Deserialize the binary Kafka payload by casting it to a string (one JSON document per row).
val sdf_kafka1_json = sdf_kafka1
    .select(col("value").cast("string"))
    .as[String]

sdf_kafka1_json.show(numRows = 3, truncate = false)

// Let Spark parse the JSON strings and infer the schema from them.
val sdf_kafka1_json_parsed = spark.read.json(sdf_kafka1_json)

sdf_kafka1_json_parsed.printSchema()
sdf_kafka1_json_parsed.show(numRows = 3, truncate = false)

+-----------------------------------------------------------------------+
|value                                                                  |
+-----------------------------------------------------------------------+
|{"timestamp":"2022-10-31T19:53:49.336+03:00","value":11,"ident":"02II"}|
|{"timestamp":"2022-10-31T19:53:51.336+03:00","value":13,"ident":"01IA"}|
|{"timestamp":"2022-10-31T19:53:53.336+03:00","value":15,"ident":"00S"} |
+-----------------------------------------------------------------------+
only showing top 3 rows

root
 |-- ident: string (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- value: long (nullable = true)

+-----+-----------------------------+-----+
|ident|timestamp                    |value|
+-----+-----------------------------+-----+
|02II |2022-10-31T19:53:49.336+03:00|11   |
|01IA |2022-10-31T19:53:51.336+03:00|13   |
|00S  |2022-10-31T19:53:53.336+03:00|15   |
+-----+-----------------------------+-----+
only showing top 3 rows



sdf_kafka1_json = [value: string]
sdf_kafka1_json_parsed = [ident: string, timestamp: string ... 1 more field]


[ident: string, timestamp: string ... 1 more field]

# 7. Работа с Kafka с помощью Streaming DF

При создании SDF из Kafka необходимо помнить, что:
- `startingOffsets` по умолчанию имеет значение `latest`
- `endingOffsets` использовать нельзя
- количество сообщений за батч можно (и нужно) ограничить параметром `maxOffsetsPerTrigger` (по умолчанию он не задан, и первый батч будет содержать данные всего топика)

In [34]:
// Streaming read of the topic from the earliest offset, limited to 2 offsets per trigger.
val kafkaParams = Map(
    "kafka.bootstrap.servers" -> "spark-master-1.newprolab.com:6667",
    "subscribe" -> "test_topic0",
    "startingOffsets" -> "earliest",
    "maxOffsetsPerTrigger" -> "2"
)

val sdf_kafka2 = spark
    .readStream // streaming source here; the earlier Kafka cells used the batch `.read`
    .format("kafka")
    .options(kafkaParams)
    .load()

// Keep the payload as text together with the coordinates of each message.
val sdf_kafka2_parsed = sdf_kafka2.select(
    col("value").cast("string"),
    col("topic"),
    col("partition"),
    col("offset")
)

val sdf_kafka2_sink = createConsoleSink(sdf_kafka2_parsed)

val sdf_kafka2_sq = sdf_kafka2_sink.start()

kafkaParams = Map(kafka.bootstrap.servers -> spark-master-1.newprolab.com:6667, subscribe -> test_topic0, startingOffsets -> earliest, maxOffsetsPerTrigger -> 2)
sdf_kafka2 = [key: binary, value: binary ... 5 more fields]
sdf_kafka2_parsed = [value: string, topic: string ... 2 more fields]
sdf_kafka2_sink = org.apache.spark.sql.streaming.DataStreamWriter@52fe6256
sdf_kafka2_sq = org.apache.spark.sql.execution.streaming.StreamingQueryWrapper@20d8d7d5


org.apache.spark.sql.execution.streaming.StreamingQueryWrapper@20d8d7d5

-------------------------------------------
Batch: 0
-------------------------------------------
+----------------------------------------------------------------------+-----------+---------+------+
|value                                                                 |topic      |partition|offset|
+----------------------------------------------------------------------+-----------+---------+------+
|{"timestamp":"2022-10-31T19:53:38.336+03:00","value":0,"ident":"00KS"}|test_topic0|0        |310   |
|{"timestamp":"2022-10-31T19:53:40.336+03:00","value":2,"ident":"00MO"}|test_topic0|0        |311   |
+----------------------------------------------------------------------+-----------+---------+------+

-------------------------------------------
Batch: 1
-------------------------------------------
+----------------------------------------------------------------------+-----------+---------+------+
|value                                                                 |topic      |partiti

-------------------------------------------
Batch: 12
-------------------------------------------
+-----------------------------------------------------------------------+-----------+---------+------+
|value                                                                  |topic      |partition|offset|
+-----------------------------------------------------------------------+-----------+---------+------+
|{"timestamp":"2022-10-31T19:54:25.336+03:00","value":47,"ident":"02OH"}|test_topic0|0        |334   |
|{"timestamp":"2022-10-31T19:54:27.336+03:00","value":49,"ident":"00IG"}|test_topic0|0        |335   |
+-----------------------------------------------------------------------+-----------+---------+------+

-------------------------------------------
Batch: 13
-------------------------------------------
+-----------------------------------------------------------------------+-----------+---------+------+
|value                                                                  |topic    

Если мы перезапустим этот стрим, он повторно прочитает все данные. Чтобы обеспечить сохранение состояния стрима после обработки каждого батча, нам необходимо добавить параметр `checkpointLocation` в опции `writeStream`:

In [35]:
/** Builds, without starting, a console-sink writer that also has a checkpoint location.
  *
  * @param chkName name of the checkpoint directory under `/tmp`
  * @param df      DataFrame whose `writeStream` is configured
  * @return the configured `DataStreamWriter`; the caller decides when to call `start`
  */
def createConsoleSinkWithCheckpoint(chkName: String, df: DataFrame) = {
    val sinkOptions = Map(
        "checkpointLocation" -> s"/tmp/$chkName",
        "truncate" -> "false",
        "numRows" -> "20"
    )

    df.writeStream
        .format("console")
        // One micro-batch every 10 seconds.
        .trigger(Trigger.ProcessingTime("10 seconds"))
        .options(sinkOptions)
}

createConsoleSinkWithCheckpoint: (chkName: String, df: org.apache.spark.sql.DataFrame)org.apache.spark.sql.streaming.DataStreamWriter[org.apache.spark.sql.Row]


In [36]:
// Same Kafka stream, but the console sink now checkpoints to /tmp/tmp_03.
val sdf_kafka2_sink_2 = createConsoleSinkWithCheckpoint("tmp_03", sdf_kafka2_parsed)
val sdf_kafka2_sq_2 = sdf_kafka2_sink_2.start()

sdf_kafka2_sink_2 = org.apache.spark.sql.streaming.DataStreamWriter@2eba5948
sdf_kafka2_sq_2 = org.apache.spark.sql.execution.streaming.StreamingQueryWrapper@3ed3a585


org.apache.spark.sql.execution.streaming.StreamingQueryWrapper@3ed3a585

-------------------------------------------
Batch: 0
-------------------------------------------
+----------------------------------------------------------------------+-----------+---------+------+
|value                                                                 |topic      |partition|offset|
+----------------------------------------------------------------------+-----------+---------+------+
|{"timestamp":"2022-10-31T19:53:38.336+03:00","value":0,"ident":"00KS"}|test_topic0|0        |310   |
|{"timestamp":"2022-10-31T19:53:40.336+03:00","value":2,"ident":"00MO"}|test_topic0|0        |311   |
+----------------------------------------------------------------------+-----------+---------+------+

-------------------------------------------
Batch: 1
-------------------------------------------
+----------------------------------------------------------------------+-----------+---------+------+
|value                                                                 |topic      |partiti

-------------------------------------------
Batch: 12
-------------------------------------------
+-----------------------------------------------------------------------+-----------+---------+------+
|value                                                                  |topic      |partition|offset|
+-----------------------------------------------------------------------+-----------+---------+------+
|{"timestamp":"2022-10-31T19:54:25.336+03:00","value":47,"ident":"02OH"}|test_topic0|0        |334   |
|{"timestamp":"2022-10-31T19:54:27.336+03:00","value":49,"ident":"00IG"}|test_topic0|0        |335   |
+-----------------------------------------------------------------------+-----------+---------+------+

-------------------------------------------
Batch: 13
-------------------------------------------
+-----------------------------------------------------------------------+-----------+---------+------+
|value                                                                  |topic    

In [37]:
// Stop all active streaming queries; explicit parentheses mark the call as side-effecting.
killAll()

Stopped KafkaV2[Subscribe[test_topic0]]
Stopped KafkaV2[Subscribe[test_topic0]]


In [38]:
// List the checkpoint directory used by the checkpointed console sink.
println("hadoop fs -ls /tmp/tmp_03/".!!)

Found 4 items
drwxr-xr-x   - dinar.sadykov hdfs          0 2023-03-12 14:21 /tmp/tmp_03/commits
-rw-r--r--   3 dinar.sadykov hdfs         45 2023-03-12 14:18 /tmp/tmp_03/metadata
drwxr-xr-x   - dinar.sadykov hdfs          0 2023-03-12 14:21 /tmp/tmp_03/offsets
drwxr-xr-x   - dinar.sadykov hdfs          0 2023-03-12 14:18 /tmp/tmp_03/sources



In [39]:
// List the `offsets` sub-directory of the checkpoint.
println("hadoop fs -ls /tmp/tmp_03/offsets/".!!)

Found 15 items
-rw-r--r--   3 dinar.sadykov hdfs        432 2023-03-12 14:18 /tmp/tmp_03/offsets/0
-rw-r--r--   3 dinar.sadykov hdfs        432 2023-03-12 14:18 /tmp/tmp_03/offsets/1
-rw-r--r--   3 dinar.sadykov hdfs        432 2023-03-12 14:20 /tmp/tmp_03/offsets/10
-rw-r--r--   3 dinar.sadykov hdfs        432 2023-03-12 14:20 /tmp/tmp_03/offsets/11
-rw-r--r--   3 dinar.sadykov hdfs        432 2023-03-12 14:20 /tmp/tmp_03/offsets/12
-rw-r--r--   3 dinar.sadykov hdfs        432 2023-03-12 14:20 /tmp/tmp_03/offsets/13
-rw-r--r--   3 dinar.sadykov hdfs        432 2023-03-12 14:21 /tmp/tmp_03/offsets/14
-rw-r--r--   3 dinar.sadykov hdfs        432 2023-03-12 14:19 /tmp/tmp_03/offsets/2
-rw-r--r--   3 dinar.sadykov hdfs        432 2023-03-12 14:19 /tmp/tmp_03/offsets/3
-rw-r--r--   3 dinar.sadykov hdfs        432 2023-03-12 14:19 /tmp/tmp_03/offsets/4
-rw-r--r--   3 dinar.sadykov hdfs        432 2023-03-12 14:19 /tmp/tmp_03/offsets/5
-rw-r--r--   3 dinar.sadykov hdfs        432 2023-03-12 

In [41]:
// Print the beginning of checkpoint file `offsets/4` to see what it stores.
println("hadoop fs -head /tmp/tmp_03/offsets/4/".!!)

v1
{"batchWatermarkMs":0,"batchTimestampMs":1678619960002,"conf":{"spark.sql.streaming.stateStore.providerClass":"org.apache.spark.sql.execution.streaming.state.HDFSBackedStateStoreProvider","spark.sql.streaming.flatMapGroupsWithState.stateFormatVersion":"2","spark.sql.streaming.multipleWatermarkPolicy":"min","spark.sql.streaming.aggregation.stateFormatVersion":"2","spark.sql.shuffle.partitions":"200"}}
{"test_topic0":{"0":320}}



# 8. Выводы:

- Работать с Kafka можно как с использованием Static DF, так и с помощью Streaming DF
- Чтобы стрим запоминал свое состояние после остановки, необходимо использовать checkpoint - директорию на HDFS (или локальной ФС), в которую будет сохраняться состояние стрима после каждого батча
- Apache Kafka - распределенная система, обеспечивающая передачу потока данных в слабосвязанных системах