In [1]:
import json
from pyspark.sql import Row
from pyspark.sql.functions import udf

In [2]:
# Echo the SparkSession handle to confirm the session is alive.
# NOTE(review): `spark` is not created in any visible cell — presumably injected by the
# PySpark kernel/shell; confirm, or build it explicitly so Restart & Run All works.
spark

In [3]:
# Batch-read the "events" topic from the Kafka broker, from the earliest
# through the latest available offsets.
raw_events = (
    spark.read.format("kafka")
    .option("kafka.bootstrap.servers", "kafka:29092")
    .option("subscribe", "events")
    .option("startingOffsets", "earliest")
    .option("endingOffsets", "latest")
    .load()
)

In [4]:
# Preview the raw Kafka records; `value` is still binary here, so it prints as bytes.
raw_events.show()

+----+--------------------+------+---------+------+--------------------+-------------+
| key|               value| topic|partition|offset|           timestamp|timestampType|
+----+--------------------+------+---------+------+--------------------+-------------+
|null|[7B 22 65 76 65 6...|events|        0|     0|2021-12-07 00:47:...|            0|
|null|[7B 22 65 76 65 6...|events|        0|     1|2021-12-07 00:47:...|            0|
|null|[7B 22 65 76 65 6...|events|        0|     2|2021-12-07 00:47:...|            0|
|null|[7B 22 65 76 65 6...|events|        0|     3|2021-12-07 00:47:...|            0|
|null|[7B 22 79 5F 63 6...|events|        0|     4|2021-12-07 00:47:...|            0|
|null|[7B 22 65 76 65 6...|events|        0|     5|2021-12-07 00:47:...|            0|
|null|[7B 22 65 76 65 6...|events|        0|     6|2021-12-07 00:47:...|            0|
|null|[7B 22 79 5F 63 6...|events|        0|     7|2021-12-07 00:47:...|            0|
|null|[7B 22 79 5F 63 6...|events|        0

In [5]:
# Inspect the column names and types of the Kafka source DataFrame.
raw_events.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [71]:
# Peek at the payload of the record at index 1.
# NOTE(review): `all_events` is not defined in any visible cell — presumably `raw_events`
# with `value` cast to string; confirm and restore the defining cell so Run All works.
all_events.collect()[1].value

'{"event_type": "uncover_safe_space", "y_coord": 49, "x_coord": 53, "session_id": "b8953d22-e3dd-4753-a2bb-ecf18dc2b2da", "datetime": "2021-12-07 00:47:19.545635", "neighboring_bombs": 33}'

In [72]:
# Payload of the record at index 300; this event type carries a different set of keys
# than the one at index 1 (no "neighboring_bombs"), so the JSON schema varies by event type.
# NOTE(review): depends on `all_events`, which no visible cell defines.
all_events.collect()[300].value

'{"y_coord": 77, "datetime": "2021-12-07 00:47:22.475078", "event_type": "correct_flag", "session_id": "9e8134e0-c413-4257-a5b4-d3c88df9ff39", "x_coord": 5}'

In [94]:
# Parse the payload at index 300 into a dict to check that it is valid JSON.
# NOTE(review): depends on `all_events`, which no visible cell defines.
json.loads(all_events.collect()[300].value)

{'datetime': '2021-12-07 00:47:22.475078',
 'event_type': 'correct_flag',
 'session_id': '9e8134e0-c413-4257-a5b4-d3c88df9ff39',
 'x_coord': 5,
 'y_coord': 77}

In [136]:
# Number of records read from the topic.
raw_events.count()

780

In [112]:
# Event type names seen in the stream (presumably everything the producer emits — confirm).
events_list = [
    'a_startup_event',
    'hit_mine',
    'uncover_safe_space',
    'correct_flag',
    'incorrect_flag',
    'solution',
]

In [172]:
# Event type that the `test` UDF below accepts.
name = 'uncover_safe_space'

@udf('boolean')
def test(event_as_json):
    """Return True when the JSON-encoded event's "event_type" equals the global ``name``.

    Args:
        event_as_json: JSON text of a single event, or None for a null payload.

    Returns:
        bool: True only when the decoded object has an "event_type" equal to
        ``name``; False for a null payload or an object without that key.

    Raises:
        ValueError: if the payload is not valid JSON (propagated from ``json.loads``).
    """
    if event_as_json is None:
        # json.loads(None) raises TypeError; a null payload simply is not a match.
        return False
    event = json.loads(event_as_json)
    # .get avoids a KeyError on events that carry no "event_type" field.
    return event.get('event_type') == name

In [174]:
# Decode the binary Kafka payload to text (column "stats") and keep only the
# rows accepted by the `test` UDF.
safe_events = (
    raw_events
    .select(raw_events.value.cast('string').alias('stats'))
    .filter(test('stats'))
)

In [175]:
# Expand each JSON string into a Row with one field per JSON key, then let
# toDF() build a DataFrame from those Rows.
extracted_safe_events = (
    safe_events.rdd
    .map(lambda record: Row(**json.loads(record.stats)))
    .toDF()
)

In [176]:
# Preview the flattened events: one column per JSON key.
extracted_safe_events.show()

+--------------------+------------------+-----------------+--------------------+-------+-------+
|            datetime|        event_type|neighboring_bombs|          session_id|x_coord|y_coord|
+--------------------+------------------+-----------------+--------------------+-------+-------+
|2021-12-07 00:47:...|uncover_safe_space|               33|b8953d22-e3dd-475...|     53|     49|
|2021-12-07 00:47:...|uncover_safe_space|               25|b8953d22-e3dd-475...|     53|     12|
|2021-12-07 00:47:...|uncover_safe_space|               55|b8953d22-e3dd-475...|     38|      3|
|2021-12-07 00:47:...|uncover_safe_space|               19|b8953d22-e3dd-475...|     66|     60|
|2021-12-07 00:47:...|uncover_safe_space|               24|b8953d22-e3dd-475...|     61|      7|
|2021-12-07 00:47:...|uncover_safe_space|               53|b8953d22-e3dd-475...|      6|     45|
|2021-12-07 00:47:...|uncover_safe_space|                7|b8953d22-e3dd-475...|     91|     81|
|2021-12-07 00:47:...|uncover_

In [177]:
# Persist the flattened events as Parquet, replacing any existing output at this path.
extracted_safe_events.write.parquet('/tmp/click_safe_cell', mode='overwrite')

In [178]:
# Load the Parquet files at /tmp/click_safe_cell back into a DataFrame.
safe = spark.read.format('parquet').load('/tmp/click_safe_cell')

In [179]:
# Expose the DataFrame to Spark SQL under the name "safe".
# createOrReplaceTempView supersedes registerTempTable, which is deprecated since Spark 2.0.
safe.createOrReplaceTempView('safe')

In [186]:
# Query the "safe" table through Spark SQL and convert the whole result to a
# pandas DataFrame on the driver.
spark.sql("select * from safe").toPandas()

Unnamed: 0,datetime,event_type,neighboring_bombs,session_id,x_coord,y_coord
0,2021-12-07 00:47:19.545635,uncover_safe_space,33,b8953d22-e3dd-4753-a2bb-ecf18dc2b2da,53,49
1,2021-12-07 00:47:19.561825,uncover_safe_space,25,b8953d22-e3dd-4753-a2bb-ecf18dc2b2da,53,12
2,2021-12-07 00:47:19.571946,uncover_safe_space,55,b8953d22-e3dd-4753-a2bb-ecf18dc2b2da,38,3
3,2021-12-07 00:47:19.593213,uncover_safe_space,19,b8953d22-e3dd-4753-a2bb-ecf18dc2b2da,66,60
4,2021-12-07 00:47:19.625799,uncover_safe_space,24,b8953d22-e3dd-4753-a2bb-ecf18dc2b2da,61,7
5,2021-12-07 00:47:19.666876,uncover_safe_space,53,b8953d22-e3dd-4753-a2bb-ecf18dc2b2da,6,45
6,2021-12-07 00:47:19.681671,uncover_safe_space,7,b8953d22-e3dd-4753-a2bb-ecf18dc2b2da,91,81
7,2021-12-07 00:47:19.710408,uncover_safe_space,3,b8953d22-e3dd-4753-a2bb-ecf18dc2b2da,99,15
8,2021-12-07 00:47:19.725023,uncover_safe_space,20,b8953d22-e3dd-4753-a2bb-ecf18dc2b2da,68,80
9,2021-12-07 00:47:19.740969,uncover_safe_space,52,b8953d22-e3dd-4753-a2bb-ecf18dc2b2da,3,70


In [185]:
# Author's note: the code was not retyped; the same cells were copy-pasted to
# produce the "uncover safe space" data.
# NOTE(review): no visible cell registers a `startups` table — presumably an earlier run
# of the same filter/extract/write/register cells with name = 'a_startup_event' built it; confirm.
spark.sql("select * from startups").toPandas()

Unnamed: 0,datetime,event_type,session_id
0,2021-12-07 00:47:19.524459,a_startup_event,b8953d22-e3dd-4753-a2bb-ecf18dc2b2da
1,2021-12-07 00:47:21.215454,a_startup_event,9e8134e0-c413-4257-a5b4-d3c88df9ff39
2,2021-12-07 00:47:22.596074,a_startup_event,db0dbc38-c194-4840-8802-3b43d22f000c
3,2021-12-07 00:47:24.139882,a_startup_event,503ffb87-35ab-4e7c-8dca-141ed8063b2b
4,2021-12-07 00:47:25.557702,a_startup_event,cdab81d5-2099-40fc-bf0d-ae3e6a293fd1
