# 1. Load LIB

In [1]:
import sys.process._
import org.apache.spark.sql.streaming.Trigger
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.SparkSession

import org.apache.spark.sql.streaming.Trigger
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions._
import org.apache.spark.sql.Dataset
import org.apache.log4j._

# 2. Load function & params

In [2]:
/** Stops every active streaming query of the active Spark session and prints one
  * "Stopped ..." line per query.
  *
  * The label is the description of the query's first source when progress information
  * is available. A query that has not reported progress yet has `lastProgress == null`,
  * and a progress record may list no sources; in those cases the query name (or its id
  * when the query is unnamed) is printed instead, so the query is still stopped.
  */
def killAll() = {
    SparkSession
        .active
        .streams
        .active
        .foreach { query =>
            // Resolve the label before stopping, tolerating missing progress/sources.
            val desc = Option(query.lastProgress)
                .flatMap(_.sources.headOption)
                .map(_.description)
                .getOrElse(s"query ${Option(query.name).getOrElse(query.id.toString)}")
            query.stop()
            println(s"Stopped ${desc}")
        }
}

killAll: ()Unit


In [3]:
/** Builds, without starting, a console writer for a streaming DataFrame.
  *
  * The returned writer fires a micro-batch every 10 seconds, prints up to 20 rows
  * per batch and does not truncate cell values. Call `.start` on the result to run it.
  */
def createConsoleSink(df: DataFrame) = {
    // Explicit 10-second processing-time trigger instead of the default trigger.
    val everyTenSeconds = Trigger.ProcessingTime("10 seconds")
    val consoleOptions = Map("truncate" -> "false", "numRows" -> "20")

    df.writeStream
        .format("console")
        .trigger(everyTenSeconds)
        .options(consoleOptions)
}


createConsoleSink: (df: org.apache.spark.sql.DataFrame)org.apache.spark.sql.streaming.DataStreamWriter[org.apache.spark.sql.Row]


In [4]:
/** Builds, without starting, a parquet file writer for a streaming DataFrame.
  *
  * @param df            streaming DataFrame to persist
  * @param fileName      name of the output directory, created under `/tmp`
  * @param checkpointDir optional checkpoint location; when omitted the checkpoint is
  *                      kept inside the output directory itself (`/tmp/<fileName>`),
  *                      which places `commits`, `offsets` and `metadata` next to the
  *                      parquet part files. Pass a separate path to keep the data
  *                      directory free of checkpoint files.
  * @return the configured writer; call `.start` on it to run the query
  */
def createParquetSink(df: DataFrame, 
                      fileName: String,
                      checkpointDir: Option[String] = None) = {
    val outputPath = s"/tmp/$fileName"

    df
    .writeStream
    .format("parquet")
    .option("path", outputPath)
    .option("checkpointLocation", checkpointDir.getOrElse(outputPath))
    //.trigger(Trigger.ProcessingTime("10 seconds"))
}

createParquetSink: (df: org.apache.spark.sql.DataFrame, fileName: String)org.apache.spark.sql.streaming.DataStreamWriter[org.apache.spark.sql.Row]


In [5]:
// Streaming DataFrame backed by the "rate" source; reused by the examples that follow.
val sdf_rate = spark.readStream.format("rate").load()
sdf_rate.printSchema()
// sdf_rate.explain(true)

Waiting for a Spark session to start...

root
 |-- timestamp: timestamp (nullable = true)
 |-- value: long (nullable = true)



sdf_rate = [timestamp: timestamp, value: bigint]


[timestamp: timestamp, value: bigint]

In [6]:
// CSV reader settings: the first line is a header and column types are inferred.
val csvOptions = Map("header" -> "true", "inferSchema" -> "true")
val airports = spark
    .read
    .options(csvOptions)
    .csv("airport-codes.csv")

airports.printSchema()
// Show one record vertically, cutting values longer than 100 characters.
airports.show(numRows = 1, truncate = 100, vertical = true)

root
 |-- ident: string (nullable = true)
 |-- type: string (nullable = true)
 |-- name: string (nullable = true)
 |-- elevation_ft: integer (nullable = true)
 |-- continent: string (nullable = true)
 |-- iso_country: string (nullable = true)
 |-- iso_region: string (nullable = true)
 |-- municipality: string (nullable = true)
 |-- gps_code: string (nullable = true)
 |-- iata_code: string (nullable = true)
 |-- local_code: string (nullable = true)
 |-- coordinates: string (nullable = true)

-RECORD 0------------------------------------------
 ident        | 00A                                
 type         | heliport                           
 name         | Total Rf Heliport                  
 elevation_ft | 11                                 
 continent    | NA                                 
 iso_country  | US                                 
 iso_region   | US-PA                              
 municipality | Bensalem                           
 gps_code     | 00A                 

csvOptions = Map(header -> true, inferSchema -> true)
airports = [ident: string, type: string ... 10 more fields]


[ident: string, type: string ... 10 more fields]

In [7]:
// Infers the schema of JSON documents stored as text in column `col` of a DataFrame.
//
// The schema is inferred from the value in the first row only, so it reflects just the
// fields present in that one document.
//
// sdf_tmp: DataFrame holding the JSON strings
// col:     name of the column that contains the JSON text
// returns: the StructType inferred by Spark for that single document
// throws:  IllegalArgumentException when the DataFrame is empty or the first value is null
def col_to_schema(sdf_tmp: DataFrame, col: String): org.apache.spark.sql.types.StructType = {
    val session = sdf_tmp.sparkSession

    // take(1) fetches a single row instead of collecting the whole DataFrame to the driver.
    val row_data: String = sdf_tmp
        .select(col)
        .take(1)
        .headOption
        .flatMap(row => Option(row.get(0)))
        .map(_.toString)
        .getOrElse(throw new IllegalArgumentException(
            s"Column '$col' has no non-null first value to infer a JSON schema from"))

    // Use the Dataset[String] overload of json(); the RDD[String] overload is deprecated.
    session.read.json(
        session.createDataset(Seq(row_data))(org.apache.spark.sql.Encoders.STRING)
    ).schema
}

col_to_schema: (sdf_tmp: org.apache.spark.sql.DataFrame, col: String)org.apache.spark.sql.types.StructType


# 3. example1: Console Sink

In [8]:
// Attach the console sink to the rate stream and launch the streaming query.
val sink_console = createConsoleSink(sdf_rate)
val sq_console = sink_console.start()

-------------------------------------------
Batch: 0
-------------------------------------------
+---------+-----+
|timestamp|value|
+---------+-----+
+---------+-----+



sink_console = org.apache.spark.sql.streaming.DataStreamWriter@50175bc2
sq_console = org.apache.spark.sql.execution.streaming.StreamingQueryWrapper@662bd605


org.apache.spark.sql.execution.streaming.StreamingQueryWrapper@662bd605

-------------------------------------------
Batch: 1
-------------------------------------------
+----------------------+-----+
|timestamp             |value|
+----------------------+-----+
|2023-03-12 16:15:45.71|0    |
|2023-03-12 16:15:47.71|2    |
|2023-03-12 16:15:46.71|1    |
|2023-03-12 16:15:48.71|3    |
+----------------------+-----+



In [9]:
// Stop every active streaming query (here: the console query started above).
killAll()

Stopped RateStreamV2[rowsPerSecond=1, rampUpTimeSeconds=0, numPartitions=default


# 4. example2: Parquet Sink1

In [10]:
// Remove output left by a previous run. -f keeps `hadoop fs -rm` from returning a
// non-zero exit code (which makes `!!` throw) when the directory does not exist yet.
println("hadoop fs -rm -r -f /tmp/tmp_01.parquet".!!)

23/03/12 16:17:01 INFO fs.TrashPolicyDefault: Moved: 'hdfs://spark-master-1.newprolab.com:8020/tmp/tmp_01.parquet' to trash at: hdfs://spark-master-1.newprolab.com:8020/user/dinar.sadykov/.Trash/Current/tmp/tmp_01.parquet


In [11]:
// Persist the rate stream as parquet files under /tmp/tmp_01.parquet and start the query.
val sink_pq = createParquetSink(sdf_rate, "tmp_01.parquet")
val sq_pq = sink_pq.start()

sink_pq = org.apache.spark.sql.streaming.DataStreamWriter@2177c0da
sq_pq = org.apache.spark.sql.execution.streaming.StreamingQueryWrapper@1593eeac


org.apache.spark.sql.execution.streaming.StreamingQueryWrapper@1593eeac

In [12]:
// Let the query run for 10-20 seconds, then stop every active streaming query.
killAll()

Stopped RateStreamV2[rowsPerSecond=1, rampUpTimeSeconds=0, numPartitions=default


In [13]:
"hadoop fs -ls /tmp/tmp_01.parquet".!!

"Found 15 items
drwxr-xr-x   - dinar.sadykov hdfs          0 2023-03-12 16:17 /tmp/tmp_01.parquet/_spark_metadata
drwxr-xr-x   - dinar.sadykov hdfs          0 2023-03-12 16:17 /tmp/tmp_01.parquet/commits
-rw-r--r--   3 dinar.sadykov hdfs         45 2023-03-12 16:17 /tmp/tmp_01.parquet/metadata
drwxr-xr-x   - dinar.sadykov hdfs          0 2023-03-12 16:17 /tmp/tmp_01.parquet/offsets
-rw-r--r--   3 dinar.sadykov hdfs        790 2023-03-12 16:17 /tmp/tmp_01.parquet/part-00000-0441a012-ef95-4c13-93ef-7db3d9a7deda-c000.snappy.parquet
-rw-r--r--   3 dinar.sadykov hdfs        392 2023-03-12 16:17 /tmp/tmp_01.parquet/part-00000-0b2187e5-1444-44ef-91aa-d715f86e155f-c000.snappy.parquet
-rw-r--r--   3 dinar.sadykov hdfs        790 2023-03-12 16:17 /tmp/tmp_01.parquet/part-00000-39f...


In [14]:
// Read back, as a static DataFrame, what the streaming query wrote to the parquet sink.
val rates = spark.read.parquet("/tmp/tmp_01.parquet")
println(rates.count())
rates.printSchema()
rates.show(numRows = 5, truncate = false)

8
root
 |-- timestamp: timestamp (nullable = true)
 |-- value: long (nullable = true)

+-----------------------+-----+
|timestamp              |value|
+-----------------------+-----+
|2023-03-12 16:17:03.705|0    |
|2023-03-12 16:17:04.705|1    |
|2023-03-12 16:17:05.705|2    |
|2023-03-12 16:17:06.705|3    |
|2023-03-12 16:17:07.705|4    |
+-----------------------+-----+
only showing top 5 rows



rates = [timestamp: timestamp, value: bigint]


[timestamp: timestamp, value: bigint]

# 5. example3: Parquet Sink1 + ident

In [15]:
// Airport identifiers from up to 200 rows, de-duplicated and collected to the driver.
val idents = airports
    .select("ident")
    .limit(200)
    .distinct
    .as[String]
    .collect()

// Tag each rate row with an ident: the array of all idents is shuffled per row
// and its first element is taken.
val ident_sdf_rate = sdf_rate.withColumn(
    "ident",
    shuffle(array(idents.map(id => lit(id)): _*)).getItem(0)
)

idents = Array(00A, 00AA, 00AK, 00AL, 00AR, 00AS, 00AZ, 00CA, 00CL, 00CN, 00CO, 00FA, 00FD, 00FL, 00GA, 00GE, 00HI, 00ID, 00IG, 00II, 00IL, 00IN, 00IS, 00KS, 00KY, 00LA, 00LL, 00LS, 00MD, 00MI, 00MN, 00MO, 00MT, 00N, 00NC, 00NJ, 00NK, 00NY, 00OH, 00OI, 00OK, 00OR, 00PA, 00PN, 00PS, 00S, 00SC, 00SD, 00TA, 00TE, 00TN, 00TS, 00TX, 00UT, 00VA, 00VI, 00W, 00WA, 00WI, 00WN, 00WV, 00WY, 00XS, 01A, 01AK, 01AL, 01AR, 01AZ, 01C, 01CA, 01CL, 01CN, 01CO, 01CT, 01FA, 01FD, 01FL, 01GA, 01GE, 01IA, 01ID, 01II, 01IL, 01IN, 01IS, 01J, 01K, 01KS, 01KY, 01LA, 01LL, 01LS, 01MA, 01MD, 01ME, 01MI, 01MN, 01MO, 01MT, 01NC, 01NE, 01NH, 01NJ, 01NM, 01NV, 01NY, 01OI, 01OK, 01OR, 01PA, 01PN, 01PS, 01SC, 01TA, 01TE, 01TN, 01TS, 01TX, 01U, 01UT, 01VA, 01WA, 01WI, 01WN, 01WT, 01WY, 01XA, 01XS, 02AK, 02...


Array(00A, 00AA, 00AK, 00AL, 00AR, 00AS, 00AZ, 00CA, 00CL, 00CN, 00CO, 00FA, 00FD, 00FL, 00GA, 00GE, 00HI, 00ID, 00IG, 00II, 00IL, 00IN, 00IS, 00KS, 00KY, 00LA, 00LL, 00LS, 00MD, 00MI, 00MN, 00MO, 00MT, 00N, 00NC, 00NJ, 00NK, 00NY, 00OH, 00OI, 00OK, 00OR, 00PA, 00PN, 00PS, 00S, 00SC, 00SD, 00TA, 00TE, 00TN, 00TS, 00TX, 00UT, 00VA, 00VI, 00W, 00WA, 00WI, 00WN, 00WV, 00WY, 00XS, 01A, 01AK, 01AL, 01AR, 01AZ, 01C, 01CA, 01CL, 01CN, 01CO, 01CT, 01FA, 01FD, 01FL, 01GA, 01GE, 01IA, 01ID, 01II, 01IL, 01IN, 01IS, 01J, 01K, 01KS, 01KY, 01LA, 01LL, 01LS, 01MA, 01MD, 01ME, 01MI, 01MN, 01MO, 01MT, 01NC, 01NE, 01NH, 01NJ, 01NM, 01NV, 01NY, 01OI, 01OK, 01OR, 01PA, 01PN, 01PS, 01SC, 01TA, 01TE, 01TN, 01TS, 01TX, 01U, 01UT, 01VA, 01WA, 01WI, 01WN, 01WT, 01WY, 01XA, 01XS, 02AK, 02...

In [16]:
// Remove output left by a previous run. -f keeps `hadoop fs -rm` from returning a
// non-zero exit code (which makes `!!` throw) when the directory does not exist yet.
println("hadoop fs -rm -r -f /tmp/tmp_02.parquet".!!)

23/03/12 16:17:40 INFO fs.TrashPolicyDefault: Moved: 'hdfs://spark-master-1.newprolab.com:8020/tmp/tmp_02.parquet' to trash at: hdfs://spark-master-1.newprolab.com:8020/user/dinar.sadykov/.Trash/Current/tmp/tmp_02.parquet


In [17]:
// Persist the ident-enriched rate stream under /tmp/tmp_02.parquet and start the query.
val ident_sq_pq_sink = createParquetSink(ident_sdf_rate, "tmp_02.parquet")
val ident_sq_pq = ident_sq_pq_sink.start()

ident_sq_pq_sink = org.apache.spark.sql.streaming.DataStreamWriter@325b44b8
ident_sq_pq = org.apache.spark.sql.execution.streaming.StreamingQueryWrapper@60ea23f8


org.apache.spark.sql.execution.streaming.StreamingQueryWrapper@60ea23f8

In [18]:
// Let the query run for 10-20 seconds, then stop every active streaming query.
killAll()

Stopped RateStreamV2[rowsPerSecond=1, rampUpTimeSeconds=0, numPartitions=default


In [19]:
// List what is inside every entry of the sink directory (e.g. the _spark_metadata log files).
println("hadoop fs -ls /tmp/tmp_02.parquet/*".!!)

Found 10 items
-rw-r--r--   3 dinar.sadykov hdfs        257 2023-03-12 16:17 /tmp/tmp_02.parquet/_spark_metadata/0
-rw-r--r--   3 dinar.sadykov hdfs        258 2023-03-12 16:17 /tmp/tmp_02.parquet/_spark_metadata/1
-rw-r--r--   3 dinar.sadykov hdfs        257 2023-03-12 16:17 /tmp/tmp_02.parquet/_spark_metadata/2
-rw-r--r--   3 dinar.sadykov hdfs        258 2023-03-12 16:17 /tmp/tmp_02.parquet/_spark_metadata/3
-rw-r--r--   3 dinar.sadykov hdfs        258 2023-03-12 16:17 /tmp/tmp_02.parquet/_spark_metadata/4
-rw-r--r--   3 dinar.sadykov hdfs        258 2023-03-12 16:17 /tmp/tmp_02.parquet/_spark_metadata/5
-rw-r--r--   3 dinar.sadykov hdfs        258 2023-03-12 16:17 /tmp/tmp_02.parquet/_spark_metadata/6
-rw-r--r--   3 dinar.sadykov hdfs        258 2023-03-12 16:17 /tmp/tmp_02.parquet/_spark_metadata/7
-rw-r--r--   3 dinar.sadykov hdfs        258 2023-03-12 16:17 /tmp/tmp_02.parquet/_spark_metadata/8
-rw-r--r--   3 dinar.sadykov hdfs       2560 2023-03-12 16:17 /tmp/tmp_02.parquet/_sp

In [20]:
// Read a single part file written by the streaming query.
// NOTE(review): the part-file name below comes from one particular run; after re-running
// the stream it presumably has to be replaced with a name from a fresh directory listing.
val ident_pq = spark.read
    .parquet("/tmp/tmp_02.parquet/part-00000-192cfc2b-465c-4e91-b47d-75b5bc42fef9-c000.snappy.parquet")

println(ident_pq.count)
ident_pq.printSchema
ident_pq.show(5, false)

1
root
 |-- timestamp: timestamp (nullable = true)
 |-- value: long (nullable = true)
 |-- ident: string (nullable = true)

+----------------------+-----+-----+
|timestamp             |value|ident|
+----------------------+-----+-----+
|2023-03-12 16:17:45.01|1    |00A  |
+----------------------+-----+-----+



ident_pq = [timestamp: timestamp, value: bigint ... 1 more field]


[timestamp: timestamp, value: bigint ... 1 more field]

# 6. Работа с Kafka с помощью Static DataFrame

In [21]:
// Read the whole sink directory this time, i.e. all rows the stream has written.
val ident_pq = spark
    .read
    .parquet("/tmp/tmp_02.parquet/")

println(ident_pq.count())
ident_pq.printSchema()
ident_pq.show(numRows = 5, truncate = false)

9
root
 |-- timestamp: timestamp (nullable = true)
 |-- value: long (nullable = true)
 |-- ident: string (nullable = true)

+----------------------+-----+-----+
|timestamp             |value|ident|
+----------------------+-----+-----+
|2023-03-12 16:17:44.01|0    |01MD |
|2023-03-12 16:17:46.01|2    |00NK |
|2023-03-12 16:17:47.01|3    |00IL |
|2023-03-12 16:17:48.01|4    |01MD |
|2023-03-12 16:17:49.01|5    |00LS |
+----------------------+-----+-----+
only showing top 5 rows



ident_pq = [timestamp: timestamp, value: bigint ... 1 more field]


[timestamp: timestamp, value: bigint ... 1 more field]

In [None]:
// def writeKafka[T](topic: String, data: Dataset[T]): Unit = {
//     val kafkaParams = Map(
//         "kafka.bootstrap.servers" -> "spark-master-1.newprolab.com:6667"
//     )
    
//     data
//         .toJSON
//         .withColumn("topic", lit(topic))
//         .write
//         .format("kafka")
//         .options(kafkaParams)
//         .save
// }

// writeKafka("test_topic0", ident_pq)

In [22]:
// Connection settings: broker address and the topic to subscribe to.
val kafkaParams = Map(
    "kafka.bootstrap.servers" -> "spark-master-1.newprolab.com:6667",
    "subscribe" -> "test_topic0"
)

// Batch (non-streaming) read of the topic into a static DataFrame.
val sdf_kafka0 = spark.read.format("kafka").options(kafkaParams).load()

sdf_kafka0.printSchema()
sdf_kafka0.show(numRows = 3)
// Last expression of the cell, so the notebook echoes the row count.
sdf_kafka0.count()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)

+----+--------------------+-----------+---------+------+--------------------+-------------+
| key|               value|      topic|partition|offset|           timestamp|timestampType|
+----+--------------------+-----------+---------+------+--------------------+-------------+
|null|[7B 22 74 69 6D 6...|test_topic0|        0|   310|2023-03-08 23:48:...|            0|
|null|[7B 22 74 69 6D 6...|test_topic0|        0|   311|2023-03-08 23:48:...|            0|
|null|[7B 22 74 69 6D 6...|test_topic0|        0|   312|2023-03-08 23:48:...|            0|
+----+--------------------+-----------+---------+------+--------------------+-------------+
only showing top 3 rows



kafkaParams = Map(kafka.bootstrap.servers -> spark-master-1.newprolab.com:6667, subscribe -> test_topic0)
sdf_kafka0 = [key: binary, value: binary ... 5 more fields]


81

Чтение из Kafka имеет несколько особенностей:
- по умолчанию читается все содержимое топика. Поскольку обычно в нем много данных, эта операция может создать большую нагрузку на кластер Kafka и Spark приложение
- колонки `value` и `key` имеют тип `binary`, который необходимо десериализовать

Чтобы прочитать только определенную часть топика, нам необходимо задать минимальный и максимальный оффсет для чтения с помощью параметров `startingOffsets` , `endingOffsets`. Возьмем два случайных события:

In [23]:
// Sample a few (topic, partition, offset) triples; the offsets shown are used to
// prepare the startingOffsets and endingOffsets parameters in the next cell.

sdf_kafka0
    .sample(0.1)
    .limit(10)
    .select('topic, 'partition, 'offset)
    .show

+-----------+---------+------+
|      topic|partition|offset|
+-----------+---------+------+
|test_topic0|        0|   322|
|test_topic0|        0|   329|
|test_topic0|        0|   364|
|test_topic0|        0|   377|
+-----------+---------+------+



In [24]:
// Restrict the batch read to an offset range in partition 0 of test_topic0.
// NOTE(review): endingOffsets is presumably exclusive, so offset 377 itself may not be
// returned — confirm against the Spark Kafka integration guide.
val kafkaParams = Map(
        "kafka.bootstrap.servers" -> "spark-master-1.newprolab.com:6667",
        "subscribe" -> "test_topic0",
        "startingOffsets" -> """ { "test_topic0": { "0": 322 } } """,
        "endingOffsets" -> """ { "test_topic0": { "0": 377 } }  """//,
        //"failOnDataLoss" -> "false"
    )


// Static read of only the configured offset range.
val sdf_kafka1 = spark
    .read
    .format("kafka")
    .options(kafkaParams)
    .load

sdf_kafka1.printSchema
sdf_kafka1.show(20)

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)

+----+--------------------+-----------+---------+------+--------------------+-------------+
| key|               value|      topic|partition|offset|           timestamp|timestampType|
+----+--------------------+-----------+---------+------+--------------------+-------------+
|null|[7B 22 74 69 6D 6...|test_topic0|        0|   322|2023-03-08 23:48:...|            0|
|null|[7B 22 74 69 6D 6...|test_topic0|        0|   323|2023-03-08 23:48:...|            0|
|null|[7B 22 74 69 6D 6...|test_topic0|        0|   324|2023-03-08 23:48:...|            0|
|null|[7B 22 74 69 6D 6...|test_topic0|        0|   325|2023-03-08 23:48:...|            0|
|null|[7B 22 74 69 6D 6...|test_topic0|        0|   326|2023-03-08 23:48:

kafkaParams = Map(kafka.bootstrap.servers -> spark-master-1.newprolab.com:6667, subscribe -> test_topic0, startingOffsets -> " { "test_topic0": { "0": 322 } } ", endingOffsets -> " { "test_topic0": { "0": 377 } }  ")
sdf_kafka1 = [key: binary, value: binary ... 5 more fields]


[key: binary, value: binary ... 5 more fields]

По умолчанию параметр `startingOffsets` имеет значение `earliest`, а `endingOffsets` - `latest`. Поэтому, когда мы не указывали эти параметры, Spark прочитал содержимое всего топика

Чтобы получить наши данные, которые мы записали в топик, нам необходимо их десериализовать. В нашем случае достаточно использовать `.cast("string")`, однако это работает не всегда, т.к. формат данных может быть произвольным.

In [25]:
// The Kafka `value` column is binary; here it holds JSON text, so a string cast decodes it.
val sdf_kafka1_json = sdf_kafka1.select('value.cast("string")).as[String]

sdf_kafka1_json.show(numRows = 3, truncate = false)

// Let Spark infer the JSON schema from the strings and parse them into columns.
val sdf_kafka1_json_parsed = spark.read.json(sdf_kafka1_json)

sdf_kafka1_json_parsed.printSchema()
sdf_kafka1_json_parsed.show(numRows = 3, truncate = false)

+-----------------------------------------------------------------------+
|value                                                                  |
+-----------------------------------------------------------------------+
|{"timestamp":"2022-10-31T19:54:01.336+03:00","value":23,"ident":"01CA"}|
|{"timestamp":"2022-10-31T19:54:03.336+03:00","value":25,"ident":"00IS"}|
|{"timestamp":"2022-10-31T19:54:05.336+03:00","value":27,"ident":"01U"} |
+-----------------------------------------------------------------------+
only showing top 3 rows

root
 |-- ident: string (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- value: long (nullable = true)

+-----+-----------------------------+-----+
|ident|timestamp                    |value|
+-----+-----------------------------+-----+
|01CA |2022-10-31T19:54:01.336+03:00|23   |
|00IS |2022-10-31T19:54:03.336+03:00|25   |
|01U  |2022-10-31T19:54:05.336+03:00|27   |
+-----+-----------------------------+-----+
only showing top 3 rows



sdf_kafka1_json = [value: string]
sdf_kafka1_json_parsed = [ident: string, timestamp: string ... 1 more field]


[ident: string, timestamp: string ... 1 more field]

# 7. Работа с Kafka с помощью Streaming DF

При создании SDF из Kafka необходимо помнить, что:
- `startingOffsets` по умолчанию имеет значение `latest`
- `endingOffsets` использовать нельзя
- количество сообщений за батч можно (и нужно) ограничить параметром `maxOffsetsPerTrigger` (по умолчанию он не задан, и первый батч будет содержать данные всего топика)

In [26]:
// Start from the earliest offsets and cap every micro-batch at 2 offsets.
val kafkaParams = Map(
    "kafka.bootstrap.servers" -> "spark-master-1.newprolab.com:6667",
    "subscribe" -> "test_topic0",
    "startingOffsets" -> "earliest",
    "maxOffsetsPerTrigger" -> "2"
)

// readStream (instead of read, as in the static examples) makes this a streaming DataFrame.
val sdf_kafka2 = spark.readStream.format("kafka").options(kafkaParams).load()

// Decode the payload to text and keep the record coordinates alongside it.
val sdf_kafka2_parsed = sdf_kafka2.select(
    'value.cast("string"),
    'topic,
    'partition,
    'offset
)

val sdf_kafka2_sink = createConsoleSink(sdf_kafka2_parsed)

val sdf_kafka2_sq = sdf_kafka2_sink.start()

kafkaParams = Map(kafka.bootstrap.servers -> spark-master-1.newprolab.com:6667, subscribe -> test_topic0, startingOffsets -> earliest, maxOffsetsPerTrigger -> 2)
sdf_kafka2 = [key: binary, value: binary ... 5 more fields]
sdf_kafka2_parsed = [value: string, topic: string ... 2 more fields]
sdf_kafka2_sink = org.apache.spark.sql.streaming.DataStreamWriter@7e383d0b
sdf_kafka2_sq = org.apache.spark.sql.execution.streaming.StreamingQueryWrapper@3aa5fd55


org.apache.spark.sql.execution.streaming.StreamingQueryWrapper@3aa5fd55

-------------------------------------------
Batch: 0
-------------------------------------------
+----------------------------------------------------------------------+-----------+---------+------+
|value                                                                 |topic      |partition|offset|
+----------------------------------------------------------------------+-----------+---------+------+
|{"timestamp":"2022-10-31T19:53:38.336+03:00","value":0,"ident":"00KS"}|test_topic0|0        |310   |
|{"timestamp":"2022-10-31T19:53:40.336+03:00","value":2,"ident":"00MO"}|test_topic0|0        |311   |
+----------------------------------------------------------------------+-----------+---------+------+

-------------------------------------------
Batch: 1
-------------------------------------------
+----------------------------------------------------------------------+-----------+---------+------+
|value                                                                 |topic      |partiti

Если мы перезапустим этот стрим, он повторно прочитает все данные. Чтобы обеспечить сохранение состояния стрима после обработки каждого батча, нам необходимо добавить параметр `checkpointLocation` в опции `writeStream`:

In [27]:
/** Builds, without starting, a console writer that also checkpoints its progress.
  *
  * Same trigger and display settings as `createConsoleSink` (which this delegates to, so
  * the two cannot drift apart), plus a checkpoint directory so that a restarted query can
  * resume from the saved state.
  *
  * @param chkName name of the checkpoint directory, created under `/tmp`
  * @param df      streaming DataFrame to print
  * @return the configured writer; call `.start` on it to run the query
  */
def createConsoleSinkWithCheckpoint(chkName: String, df: DataFrame) = {
    createConsoleSink(df)
        .option("checkpointLocation", s"/tmp/$chkName")
}

createConsoleSinkWithCheckpoint: (chkName: String, df: org.apache.spark.sql.DataFrame)org.apache.spark.sql.streaming.DataStreamWriter[org.apache.spark.sql.Row]


In [28]:
// Remove the checkpoint left by a previous run. -f keeps `hadoop fs -rm` from returning
// a non-zero exit code (which makes `!!` throw) when the directory does not exist yet.
println("hadoop fs -rm -r -f /tmp/tmp_03".!!)

23/03/12 16:20:16 INFO fs.TrashPolicyDefault: Moved: 'hdfs://spark-master-1.newprolab.com:8020/tmp/tmp_03' to trash at: hdfs://spark-master-1.newprolab.com:8020/user/dinar.sadykov/.Trash/Current/tmp/tmp_03


In [29]:
// Same Kafka stream to the console, now with its checkpoint kept in /tmp/tmp_03.
val sdf_kafka2_sink_2 = createConsoleSinkWithCheckpoint("tmp_03", sdf_kafka2_parsed)
val sdf_kafka2_sq_2 = sdf_kafka2_sink_2.start()

sdf_kafka2_sink_2 = org.apache.spark.sql.streaming.DataStreamWriter@5c401598
sdf_kafka2_sq_2 = org.apache.spark.sql.execution.streaming.StreamingQueryWrapper@7853a778


org.apache.spark.sql.execution.streaming.StreamingQueryWrapper@7853a778

-------------------------------------------
Batch: 0
-------------------------------------------
+----------------------------------------------------------------------+-----------+---------+------+
|value                                                                 |topic      |partition|offset|
+----------------------------------------------------------------------+-----------+---------+------+
|{"timestamp":"2022-10-31T19:53:38.336+03:00","value":0,"ident":"00KS"}|test_topic0|0        |310   |
|{"timestamp":"2022-10-31T19:53:40.336+03:00","value":2,"ident":"00MO"}|test_topic0|0        |311   |
+----------------------------------------------------------------------+-----------+---------+------+

-------------------------------------------
Batch: 1
-------------------------------------------
+----------------------------------------------------------------------+-----------+---------+------+
|value                                                                 |topic      |partiti

In [30]:
// Let the queries run for 10-20 seconds, then stop every active streaming query.
// Called with explicit parentheses, like the other call sites of this side-effecting method.
killAll()

Stopped KafkaV2[Subscribe[test_topic0]]
Stopped KafkaV2[Subscribe[test_topic0]]


In [31]:
// List the checkpoint directory created by the checkpointed console query.
println("hadoop fs -ls /tmp/tmp_03/".!!)

Found 4 items
drwxr-xr-x   - dinar.sadykov hdfs          0 2023-03-12 16:20 /tmp/tmp_03/commits
-rw-r--r--   3 dinar.sadykov hdfs         45 2023-03-12 16:20 /tmp/tmp_03/metadata
drwxr-xr-x   - dinar.sadykov hdfs          0 2023-03-12 16:20 /tmp/tmp_03/offsets
drwxr-xr-x   - dinar.sadykov hdfs          0 2023-03-12 16:20 /tmp/tmp_03/sources



In [32]:
// List the offset files inside the checkpoint directory.
println("hadoop fs -ls /tmp/tmp_03/offsets/".!!)

Found 2 items
-rw-r--r--   3 dinar.sadykov hdfs        432 2023-03-12 16:20 /tmp/tmp_03/offsets/0
-rw-r--r--   3 dinar.sadykov hdfs        432 2023-03-12 16:20 /tmp/tmp_03/offsets/1



In [34]:
// Print the beginning of offsets file "1" to see which Kafka offsets were recorded.
println("hadoop fs -head /tmp/tmp_03/offsets/1/".!!)

v1
{"batchWatermarkMs":0,"batchTimestampMs":1678627220950,"conf":{"spark.sql.streaming.stateStore.providerClass":"org.apache.spark.sql.execution.streaming.state.HDFSBackedStateStoreProvider","spark.sql.streaming.flatMapGroupsWithState.stateFormatVersion":"2","spark.sql.streaming.multipleWatermarkPolicy":"min","spark.sql.streaming.aggregation.stateFormatVersion":"2","spark.sql.shuffle.partitions":"200"}}
{"test_topic0":{"0":314}}



lastException: Throwable = null


# 8. Выводы:

- Работать с Kafka можно как с использованием Static DF, так и с помощью Streaming DF
- Чтобы стрим запоминал свое состояние после остановки, необходимо использовать checkpoint - директорию на HDFS (или локальной ФС), в которую будет сохраняться состояние стрима после каждого батча
- Apache Kafka - распределенная система, обеспечивающая передачу потока данных в слабосвязанных системах

# 9. laba04b

In [35]:
// Topic name written into the `topic` column of the generated streams below.
val topic1 = "dinar.sadykov"

topic1 = dinar.sadykov


dinar.sadykov

In [36]:
// Purchase events with a known user id.
val sdf_buy1 = spark
    .read
    .json("visits/buy")
    .filter("uid is not NULL")

// View events with a known user id.
val sdf_view1 = spark
    .read
    .json("visits/view")
    .filter("uid is not NULL")

// unionByName matches columns by name; a plain union is positional and would silently
// misalign columns if the two inferred schemas ever differed in column order.
// The result is range-partitioned by uid into 200 partitions and cached for reuse.
val sdf_visit1 = sdf_buy1.unionByName(sdf_view1)
    .repartitionByRange(200, 'uid)
    .cache()

sdf_visit1.printSchema
sdf_visit1.show(numRows = 10, truncate = 10)

root
 |-- category: string (nullable = true)
 |-- date: string (nullable = true)
 |-- event_type: string (nullable = true)
 |-- item_id: string (nullable = true)
 |-- item_price: long (nullable = true)
 |-- uid: string (nullable = true)
 |-- p_date: integer (nullable = true)

+----------+--------+----------+----------+----------+----------+--------+
|  category|    date|event_type|   item_id|item_price|       uid|  p_date|
+----------+--------+----------+----------+----------+----------+--------+
|Kitchen...|20200426|       buy|Kitchen...|      1245|036ab2e...|20200426|
|Enterta...|20200426|       buy|Enterta...|       252|032df29...|20200426|
|Enterta...|20200206|       buy|Enterta...|      1815|0342da4...|20200206|
|Kitchen...|20200206|       buy|Kitchen...|      3691|0355d72...|20200206|
|Enterta...|20200206|       buy|Enterta...|      3432|038168f...|20200206|
|Enterta...|20200306|       buy|Enterta...|      2211|039331b...|20200306|
|Mobile-...|20200220|       buy|Mobile-...|     

sdf_buy1 = [category: string, date: string ... 5 more fields]
sdf_view1 = [category: string, date: string ... 5 more fields]
sdf_visit1 = [category: string, date: string ... 5 more fields]


[category: string, date: string ... 5 more fields]

In [37]:
////////////////////////////////////////////////////////
////////////////////////////////////////////////////////
// Collects the visits matching `condition` as JSON strings of (uid, item_id, item_price):
// at most 200 rows are taken and then de-duplicated before being pulled to the driver.
def sampleAsJson(condition: String): Array[String] =
    sdf_visit1
        .filter(condition)
        .select('uid, 'item_id, 'item_price)
        .limit(200)
        .distinct
        .toJSON
        .as[String]
        .collect()

val buy_list = sampleAsJson(" event_type == 'buy' ")
val view_list = sampleAsJson(" event_type == 'view' ")

// Turns the rate stream into Kafka-shaped records. Column mapping:
//   timestamp -> timestamp
//   value     -> offset
//   ident     -> value      (a payload picked by shuffling the array per row and taking element 0)
//   topic     -> topic      (constant topic1)
//   randint   -> partition  (rounded rand() * 10 + 5, cast to int)
def asKafkaLikeStream(payloads: Array[String]): DataFrame =
    sdf_rate
        .withColumn("ident", shuffle(array(payloads.map(p => lit(p)): _*)).getItem(0))
        .withColumn("topic", lit(topic1))
        .withColumnRenamed("value", "offset")
        .withColumn("partition", round(org.apache.spark.sql.functions.rand() * 10 + 5, 0).cast("int"))
        .withColumnRenamed("ident", "value")

val buy_rate = asKafkaLikeStream(buy_list)
buy_rate.printSchema()

val view_rate = asKafkaLikeStream(view_list)
view_rate.printSchema()

root
 |-- timestamp: timestamp (nullable = true)
 |-- offset: long (nullable = true)
 |-- value: string (nullable = true)
 |-- topic: string (nullable = false)
 |-- partition: integer (nullable = true)

root
 |-- timestamp: timestamp (nullable = true)
 |-- offset: long (nullable = true)
 |-- value: string (nullable = true)
 |-- topic: string (nullable = false)
 |-- partition: integer (nullable = true)



buy_list = Array({"uid":"036ab2e1-ac19-41bb-8db4-487e1202c860","item_id":"Kitchen-appliances-14","item_price":1245}, {"uid":"032df295-1a28-4900-b405-dee9e9271c98","item_id":"Entertainment-equipment-0","item_price":252}, {"uid":"0342da4b-6ef6-488e-b331-11777764e768","item_id":"Entertainment-equipment-18","item_price":1815}, {"uid":"0355d721-ad22-4730-b8de-27cf43795b28","item_id":"Kitchen-appliances-14","item_price":3691}, {"uid":"038168f6-3507-48bf-9593-708ad6db6309","item_id":"Entertainment-equipment-17","item_price":3432}, {"uid":"039331bf-0c05-4746-ba32-fa66f2475082","item_id":"Entertainment-equipment-9","item_price":2211}, {"uid":"034568f3-8b80-4740-9f31-371e3148de3a","item_id":"Mobile-phones-5","item_price":3472}, {"uid":"037ddbad-ad22-42ac-89f1-a0466fa24072","item_id...


Array({"uid":"036ab2e1-ac19-41bb-8db4-487e1202c860","item_id":"Kitchen-appliances-14","item_price":1245}, {"uid":"032df295-1a28-4900-b405-dee9e9271c98","item_id":"Entertainment-equipment-0","item_price":252}, {"uid":"0342da4b-6ef6-488e-b331-11777764e768","item_id":"Entertainment-equipment-18","item_price":1815}, {"uid":"0355d721-ad22-4730-b8de-27cf43795b28","item_id":"Kitchen-appliances-14","item_price":3691}, {"uid":"038168f6-3507-48bf-9593-708ad6db6309","item_id":"Entertainment-equipment-17","item_price":3432}, {"uid":"039331bf-0c05-4746-ba32-fa66f2475082","item_id":"Entertainment-equipment-9","item_price":2211}, {"uid":"034568f3-8b80-4740-9f31-371e3148de3a","item_id":"Mobile-phones-5","item_price":3472}, {"uid":"037ddbad-ad22-42ac-89f1-a0466fa24072","item_id...

In [38]:
// val sink_tmp = createConsoleSink(view_rate)
// val sq_tmp = sink_tmp.start

//After 10-20 second
// killAll()

// Remove output left by a previous run. -f keeps `hadoop fs -rm` from returning a
// non-zero exit code (which makes `!!` throw) when the directory does not exist yet.
println("hadoop fs -rm -r -f /tmp/tmp_04_b.parquet".!!)

23/03/12 16:21:46 INFO fs.TrashPolicyDefault: Moved: 'hdfs://spark-master-1.newprolab.com:8020/tmp/tmp_04_b.parquet' to trash at: hdfs://spark-master-1.newprolab.com:8020/user/dinar.sadykov/.Trash/Current/tmp/tmp_04_b.parquet1678627306779


In [39]:
// Remove output left by a previous run. -f keeps `hadoop fs -rm` from returning a
// non-zero exit code (which makes `!!` throw) when the directory does not exist yet.
println("hadoop fs -rm -r -f /tmp/tmp_04_v.parquet".!!)

23/03/12 16:21:48 INFO fs.TrashPolicyDefault: Moved: 'hdfs://spark-master-1.newprolab.com:8020/tmp/tmp_04_v.parquet' to trash at: hdfs://spark-master-1.newprolab.com:8020/user/dinar.sadykov/.Trash/Current/tmp/tmp_04_v.parquet


In [40]:
// Persist the generated "buy" stream under /tmp/tmp_04_b.parquet and start the query.
val buy_sq_pq_sink = createParquetSink(buy_rate, "tmp_04_b.parquet")
val buy_sq_pq = buy_sq_pq_sink.start()

buy_sq_pq_sink = org.apache.spark.sql.streaming.DataStreamWriter@764205a0
buy_sq_pq = org.apache.spark.sql.execution.streaming.StreamingQueryWrapper@79305ac2


org.apache.spark.sql.execution.streaming.StreamingQueryWrapper@79305ac2

In [41]:
// Persist the generated "view" stream under /tmp/tmp_04_v.parquet and start the query.
val view_sq_pq_sink = createParquetSink(view_rate, "tmp_04_v.parquet")
val view_sq_pq = view_sq_pq_sink.start()

view_sq_pq_sink = org.apache.spark.sql.streaming.DataStreamWriter@25b6783d
view_sq_pq = org.apache.spark.sql.execution.streaming.StreamingQueryWrapper@5ebed4f2


lastException: Throwable = null


org.apache.spark.sql.execution.streaming.StreamingQueryWrapper@5ebed4f2

In [42]:
// Run this cell manually once the parquet sinks above have been active for 10-20 seconds.
// killAll() stops every active streaming query of the current Spark session and prints
// the description of each query's first source.
// NOTE(review): killAll() reads `lastProgress` of each query, so presumably at least one
// micro-batch should have completed before this is called - confirm.
killAll()

Stopped RateStreamV2[rowsPerSecond=1, rampUpTimeSeconds=0, numPartitions=default
Stopped RateStreamV2[rowsPerSecond=1, rampUpTimeSeconds=0, numPartitions=default


In [43]:
// Show what the `buy` parquet sink left in its HDFS output directory.
{
  // `!!` runs the command and returns its standard output as a String.
  val buyListing = "hadoop fs -ls /tmp/tmp_04_b.parquet".!!
  println(buyListing)
}

Found 17 items
drwxr-xr-x   - dinar.sadykov hdfs          0 2023-03-12 16:22 /tmp/tmp_04_b.parquet/_spark_metadata
drwxr-xr-x   - dinar.sadykov hdfs          0 2023-03-12 16:22 /tmp/tmp_04_b.parquet/commits
-rw-r--r--   3 dinar.sadykov hdfs         45 2023-03-12 16:21 /tmp/tmp_04_b.parquet/metadata
drwxr-xr-x   - dinar.sadykov hdfs          0 2023-03-12 16:21 /tmp/tmp_04_b.parquet/offsets
-rw-r--r--   3 dinar.sadykov hdfs       2327 2023-03-12 16:21 /tmp/tmp_04_b.parquet/part-00000-0451f711-156f-4e4f-b9ef-7d9f266ea6f2-c000.snappy.parquet
-rw-r--r--   3 dinar.sadykov hdfs       2353 2023-03-12 16:21 /tmp/tmp_04_b.parquet/part-00000-0c72f24d-5e0e-4eca-a5ca-9f37847bf523-c000.snappy.parquet
-rw-r--r--   3 dinar.sadykov hdfs       2344 2023-03-12 16:21 /tmp/tmp_04_b.parquet/part-00000-1c23f3bc-52cb-4b87-80c7-7010935e7b24-c000.snappy.parquet
-rw-r--r--   3 dinar.sadykov hdfs       2353 2023-03-12 16:21 /tmp/tmp_04_b.parquet/part-00000-264c4fe0-58f2-4b5f-aafd-4aa798e35e7a-c000.snappy.parquet


In [44]:
// Load a single parquet part file written by the `buy` sink to inspect its contents.
// NOTE(review): the part-file name below belongs to one particular run; the output
// directory is deleted and regenerated earlier in this notebook, so the name has to be
// refreshed from the `hadoop fs -ls` listing before re-running this cell.
val sdf_tmp = spark.read
    .parquet("/tmp/tmp_04_b.parquet/part-00000-0451f711-156f-4e4f-b9ef-7d9f266ea6f2-c000.snappy.parquet")
// Print the row count explicitly: a bare `count()` in the middle of a cell runs a
// Spark job but its result is never displayed.
println(s"Row count: ${sdf_tmp.count()}")
sdf_tmp.printSchema()
sdf_tmp.show(3)

root
 |-- timestamp: timestamp (nullable = true)
 |-- offset: long (nullable = true)
 |-- value: string (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)

+--------------------+------+--------------------+-------------+---------+
|           timestamp|offset|               value|        topic|partition|
+--------------------+------+--------------------+-------------+---------+
|2023-03-12 16:21:...|     9|{"uid":"03ee2a93-...|dinar.sadykov|       15|
+--------------------+------+--------------------+-------------+---------+



sdf_tmp = [timestamp: timestamp, offset: bigint ... 3 more fields]


[timestamp: timestamp, offset: bigint ... 3 more fields]

In [45]:
// Show what the `view` parquet sink left in its HDFS output directory.
{
  // `!!` runs the command and returns its standard output as a String.
  val viewListing = "hadoop fs -ls /tmp/tmp_04_v.parquet".!!
  println(viewListing)
}

Found 17 items
drwxr-xr-x   - dinar.sadykov hdfs          0 2023-03-12 16:22 /tmp/tmp_04_v.parquet/_spark_metadata
drwxr-xr-x   - dinar.sadykov hdfs          0 2023-03-12 16:22 /tmp/tmp_04_v.parquet/commits
-rw-r--r--   3 dinar.sadykov hdfs         45 2023-03-12 16:21 /tmp/tmp_04_v.parquet/metadata
drwxr-xr-x   - dinar.sadykov hdfs          0 2023-03-12 16:22 /tmp/tmp_04_v.parquet/offsets
-rw-r--r--   3 dinar.sadykov hdfs       2362 2023-03-12 16:21 /tmp/tmp_04_v.parquet/part-00000-0bcf1e15-af57-45ed-bce4-1e992e794a8a-c000.snappy.parquet
-rw-r--r--   3 dinar.sadykov hdfs       2245 2023-03-12 16:22 /tmp/tmp_04_v.parquet/part-00000-22a21e8d-07a0-4137-8f0b-a429de4b6d64-c000.snappy.parquet
-rw-r--r--   3 dinar.sadykov hdfs       2297 2023-03-12 16:21 /tmp/tmp_04_v.parquet/part-00000-41beddc0-5acf-4de5-a417-ccd4b91e97d0-c000.snappy.parquet
-rw-r--r--   3 dinar.sadykov hdfs       2290 2023-03-12 16:21 /tmp/tmp_04_v.parquet/part-00000-49adcc02-b73f-436b-b101-0fdbb17ee099-c000.snappy.parquet


In [47]:
// Load a single parquet part file written by the `view` sink to inspect its contents.
// NOTE(review): the part-file name below belongs to one particular run; the output
// directory is deleted and regenerated earlier in this notebook, so the name has to be
// refreshed from the `hadoop fs -ls` listing before re-running this cell.
val sdf_tmp = spark.read
    .parquet("/tmp/tmp_04_v.parquet/part-00000-0bcf1e15-af57-45ed-bce4-1e992e794a8a-c000.snappy.parquet")
// Print the row count explicitly: a bare `count()` in the middle of a cell runs a
// Spark job but its result is never displayed.
println(s"Row count: ${sdf_tmp.count()}")
sdf_tmp.printSchema()
sdf_tmp.show(3)

root
 |-- timestamp: timestamp (nullable = true)
 |-- offset: long (nullable = true)
 |-- value: string (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)

+--------------------+------+--------------------+-------------+---------+
|           timestamp|offset|               value|        topic|partition|
+--------------------+------+--------------------+-------------+---------+
|2023-03-12 16:21:...|     2|{"uid":"03634065-...|dinar.sadykov|       12|
+--------------------+------+--------------------+-------------+---------+



sdf_tmp = [timestamp: timestamp, offset: bigint ... 3 more fields]


[timestamp: timestamp, offset: bigint ... 3 more fields]

In [48]:
// Obtain the schema of the JSON strings stored in the `value` column.
// (col_to_schema is defined earlier in the notebook; presumably it infers the
// schema from the data - confirm against its definition.)
val schema = col_to_schema(sdf_tmp, "value")

// With the schema known, parse the JSON string into a struct and expand its
// fields into top-level columns next to the record metadata.
val sdf_tmp_parse = sdf_tmp
    .withColumn("root", from_json(col("value"), schema))
    .select(
        col("timestamp"),
        col("offset"),
        col("topic"),
        col("partition"),
        col("root.*")
    )

sdf_tmp_parse.printSchema()
sdf_tmp_parse.show(3)

root
 |-- timestamp: timestamp (nullable = true)
 |-- offset: long (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- item_id: string (nullable = true)
 |-- item_price: long (nullable = true)
 |-- uid: string (nullable = true)

+--------------------+------+-------------+---------+--------------------+----------+--------------------+
|           timestamp|offset|        topic|partition|             item_id|item_price|                 uid|
+--------------------+------+-------------+---------+--------------------+----------+--------------------+
|2023-03-12 16:21:...|     2|dinar.sadykov|       12|Household-applian...|      3785|03634065-874e-411...|
+--------------------+------+-------------+---------+--------------------+----------+--------------------+



schema = StructType(StructField(item_id,StringType,true), StructField(item_price,LongType,true), StructField(uid,StringType,true))
sdf_tmp_parse = [timestamp: timestamp, offset: bigint ... 5 more fields]


[timestamp: timestamp, offset: bigint ... 5 more fields]