In [1]:
import json
from pyspark.sql import Row
from pyspark.sql.functions import udf

In [2]:
# Display the `spark` object to confirm the session is available.
# NOTE(review): `spark` is never assigned in the visible cells; presumably it is
# the SparkSession pre-created by the PySpark kernel -- confirm.
spark

In [6]:
# Batch-read (spark.read, not readStream) the "events" Kafka topic from the
# broker at kafka:29092, covering everything from the earliest to the latest
# offset available when the query runs.
raw_events = (
    spark.read
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka:29092")
    .option("subscribe", "events")
    .option("startingOffsets", "earliest")
    .option("endingOffsets", "latest")
    .load()
)

In [7]:
# Preview the first rows of the raw Kafka records (long cells are truncated).
raw_events.show()

+----+--------------------+------+---------+------+--------------------+-------------+
| key|               value| topic|partition|offset|           timestamp|timestampType|
+----+--------------------+------+---------+------+--------------------+-------------+
|null|[7B 22 65 76 65 6...|events|        0|     0|2021-12-07 19:52:...|            0|
|null|[7B 22 65 76 65 6...|events|        0|     1|2021-12-07 19:52:...|            0|
|null|[7B 22 65 76 65 6...|events|        0|     2|2021-12-07 19:52:...|            0|
|null|[7B 22 65 76 65 6...|events|        0|     3|2021-12-07 19:52:...|            0|
|null|[7B 22 65 76 65 6...|events|        0|     4|2021-12-07 19:52:...|            0|
|null|[7B 22 65 76 65 6...|events|        0|     5|2021-12-07 19:52:...|            0|
|null|[7B 22 79 5F 63 6...|events|        0|     6|2021-12-07 19:52:...|            0|
|null|[7B 22 79 5F 63 6...|events|        0|     7|2021-12-07 19:52:...|            0|
|null|[7B 22 79 5F 63 6...|events|        0

In [8]:
# Print the column names, types and nullability of the raw Kafka records.
raw_events.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [9]:
# Number of records read from the topic.
raw_events.count()

105

In [13]:
# The Kafka `value` column is binary; cast it to a string so the JSON payload
# is readable, then preview the result.
value_as_text = raw_events['value'].cast('string')
all_events = raw_events.select(value_as_text)
all_events.show()

+--------------------+
|               value|
+--------------------+
|{"event_type": "a...|
|{"event_type": "c...|
|{"event_type": "c...|
|{"event_type": "c...|
|{"event_type": "c...|
|{"event_type": "c...|
|{"event_type": "c...|
|{"event_type": "c...|
|{"event_type": "c...|
|{"event_type": "c...|
|{"event_type": "c...|
|{"event_type": "c...|
|{"event_type": "c...|
|{"event_type": "c...|
|{"event_type": "c...|
|{"event_type": "c...|
|{"event_type": "c...|
|{"event_type": "c...|
|{"event_type": "c...|
|{"event_type": "c...|
+--------------------+
only showing top 20 rows



In [15]:
# Decode the 7th event (index 6) to inspect the fields of one payload.
# take(7) fetches only the first seven rows instead of collecting the whole
# DataFrame to the driver just to index a single record.
json.loads(all_events.take(7)[6].value)

{'event_type': 'check',
 'neighboring_bombs': 35,
 'outcome': 'hit_mine',
 'session_id': 'ac796b7b-cf99-4aad-bb93-2c87a46e946a',
 'x_coord': 34,
 'y_coord': 98}

In [16]:
# Event type names used in this notebook's event stream.
events_list = [
    'a_startup_event',
    'check',
    'flag',
    'solution',
]

In [10]:
# Event type that the `test` UDF below keeps.
name = 'check'

@udf('boolean')
def test(event_as_json):
    """Return True when the JSON event's ``event_type`` equals ``name``.

    Args:
        event_as_json: the event payload as a JSON string. May be None,
            because the Kafka ``value`` column is nullable.

    Returns:
        bool: True only if the payload parses to a JSON object whose
        ``event_type`` field equals the module-level ``name``. Null,
        malformed, non-object, or ``event_type``-less payloads yield False
        rather than raising, so one bad record cannot abort the Spark job.
    """
    try:
        event = json.loads(event_as_json)
    except (TypeError, ValueError):
        # TypeError: null / non-string input; ValueError: invalid JSON
        # (json.JSONDecodeError is a ValueError subclass).
        return False
    # Valid JSON need not be an object (e.g. a list or a number).
    return isinstance(event, dict) and event.get('event_type') == name

In [11]:
# Keep only the events accepted by the `test` UDF. The JSON payload is exposed
# as a string column named `stats`, and the Kafka timestamp is cast to string.
check_events = (
    raw_events
    .select(
        raw_events.value.cast('string').alias('stats'),
        raw_events.timestamp.cast('string'),
    )
    .filter(test('stats'))
)

In [12]:
def _stats_to_row(record):
    """Build a Row from the record's timestamp plus its decoded JSON fields."""
    payload = json.loads(record.stats)
    return Row(timestamp=record.timestamp, **payload)


# Flatten each JSON payload into columns: one column per JSON key, alongside
# the record's timestamp.
extracted_check_events = check_events.rdd.map(_stats_to_row).toDF()

In [13]:
# Preview the flattened events (one column per JSON field, plus timestamp).
extracted_check_events.show()

+----------+-----------------+--------+--------------------+--------------------+-------+-------+
|event_type|neighboring_bombs| outcome|          session_id|           timestamp|x_coord|y_coord|
+----------+-----------------+--------+--------------------+--------------------+-------+-------+
|     check|               28|    safe|9d814b0a-03d1-4c3...|2021-12-07 19:52:...|     49|     25|
|     check|               36|    safe|9d814b0a-03d1-4c3...|2021-12-07 19:52:...|     39|     95|
|     check|               52|    safe|9d814b0a-03d1-4c3...|2021-12-07 19:52:...|      7|     86|
|     check|               30|    safe|9d814b0a-03d1-4c3...|2021-12-07 19:52:...|     61|     45|
|     check|               49|    safe|9d814b0a-03d1-4c3...|2021-12-07 19:52:...|     12|     81|
|     check|               50|    safe|5e7390a7-2876-478...|2021-12-07 19:52:...|      5|     72|
|     check|               35|hit_mine|5e7390a7-2876-478...|2021-12-07 19:52:...|     32|     47|
|     check|        

In [31]:
# Persist the flattened events as Parquet, replacing any previous output at
# this path.
check_writer = extracted_check_events.write.mode('overwrite')
check_writer.parquet('/tmp/check_cell')

In [2]:
# Load the Parquet dataset stored at /tmp/check_cell into a DataFrame.
check = spark.read.parquet('/tmp/check_cell')

In [3]:
# Expose the DataFrame to Spark SQL as `check_event_table`.
# createOrReplaceTempView is the replacement for registerTempTable, which has
# been deprecated since Spark 2.0.
check.createOrReplaceTempView('check_event_table')

In [5]:
# (rows, columns) of the SQL result. Computed in Spark rather than via
# toPandas().shape, which would pull every row to the driver just to read the
# dimensions.
check_event_df = spark.sql("select * from check_event_table")
(check_event_df.count(), len(check_event_df.columns))

(25, 7)