In [15]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import decode, from_json, length
import os

In [2]:
# Display the installed PySpark version string as this cell's output.
pyspark.__version__

'2.4.0'

In [3]:
#os.environ['PYSPARK_SUBMIT_ARGS'] = '--jars /home/jovyan/lib/pravega-connectors-spark-0.4.0-SNAPSHOT.jar,/home/jovyan/lib/pravega-keycloak-credentials-0.4.0-2030.d99411b-0.0.1-020.26736d2-shadow.jar pyspark-shell'
#os.environ['pravega_client_auth_method'] = 'Bearer'
#os.environ['pravega_client_auth_loadDynamic'] = 'true'
#os.environ['KEYCLOAK_SERVICE_ACCOUNT_FILE'] = '/home/jovyan/keycloak.json'

In [4]:
# Session-level settings applied before the session is created: the connector
# jars to load and the driver/executor memory sizes.
session_settings = [
    ('spark.jars', '/home/jovyan/lib/pravega-connectors-spark-0.4.0-SNAPSHOT.jar,/home/jovyan/lib/pravega-keycloak-credentials-0.4.0-2030.d99411b-0.0.1-020.26736d2-shadow.jar'),
    ('spark.driver.memory', '4g'),
    ('spark.executor.memory', '4g'),
]

session_builder = SparkSession.builder.appName('test1')
for setting_key, setting_value in session_settings:
    session_builder = session_builder.config(setting_key, setting_value)

# Reuse an already running session if there is one, otherwise start a new one.
spark = session_builder.getOrCreate()

In [5]:
# Runtime SQL options set on the live session: a single shuffle partition and
# Arrow-based execution enabled.
for option_name, option_value in (
    ('spark.sql.shuffle.partitions', '1'),
    ('spark.sql.execution.arrow.enabled', 'true'),
):
    spark.conf.set(option_name, option_value)
#spark.conf.set('spark.jars', '/home/jovyan/lib/pravega-connectors-spark-0.4.0-SNAPSHOT.jar,/home/jovyan/lib/pravega-keycloak-credentials-0.4.0-2030.d99411b-0.0.1-020.26736d2-shadow.jar')

In [6]:
# List every (key, value) pair of the SparkContext configuration as the cell output.
context_conf = spark.sparkContext.getConf()
context_conf.getAll()

[('spark.jars',
  '/home/jovyan/lib/pravega-connectors-spark-0.4.0-SNAPSHOT.jar,/home/jovyan/lib/pravega-keycloak-credentials-0.4.0-2030.d99411b-0.0.1-020.26736d2-shadow.jar'),
 ('spark.driver.memory', '4g'),
 ('spark.executor.memory', '4g'),
 ('spark.driver.host', 'jupyter-claudio'),
 ('spark.executor.id', 'driver'),
 ('spark.driver.port', '38083'),
 ('spark.repl.local.jars',
  'file:///home/jovyan/lib/pravega-connectors-spark-0.4.0-SNAPSHOT.jar,file:///home/jovyan/lib/pravega-keycloak-credentials-0.4.0-2030.d99411b-0.0.1-020.26736d2-shadow.jar'),
 ('spark.app.id', 'local-1556433112123'),
 ('spark.rdd.compress', 'True'),
 ('spark.serializer.objectStreamReset', '100'),
 ('spark.master', 'local[*]'),
 ('spark.submit.deployMode', 'client'),
 ('spark.ui.showConsoleProgress', 'true'),
 ('spark.app.name', 'test1')]

In [46]:
controller = 'tcp://nautilus-pravega-controller.nautilus-pravega.svc.cluster.local:9090'
scope = 'examples'

# Options handed to the "pravega" data source for a batch read of the
# "video" stream in the scope above.
pravega_options = {
    "controller": controller,
    "scope": scope,
    "stream": "video",
    "encoding": "chunked_v1",
}
df = spark.read.format("pravega").options(**pravega_options).load()
#df.show()

In [47]:
# DDL description of the JSON document carried by each event.
schema = 'timestamp timestamp, frame_number int, camera int, ssrc int, data binary'

# Each step keeps all existing columns ('*') and appends new ones:
#   1. rename the source column 'event' to 'raw_event'
#   2. decode it as UTF-8 text into 'event_string'
#   3. parse that text as JSON into the struct column 'event'
#   4. promote the struct's fields to top-level columns
#   5. append the byte length of 'data'
df = (df
      .withColumnRenamed('event', 'raw_event')
      .select('*', decode('raw_event', 'UTF-8').alias('event_string'))
      .select('*', from_json('event_string', schema=schema).alias('event'))
      .select('*', 'event.*')
      .select('*', length('data'))
      )

In [49]:
df.printSchema()

root
 |-- raw_event: binary (nullable = true)
 |-- scope: string (nullable = true)
 |-- stream: string (nullable = true)
 |-- segment_id: long (nullable = true)
 |-- offset: long (nullable = true)
 |-- event_string: string (nullable = true)
 |-- event: struct (nullable = true)
 |    |-- timestamp: timestamp (nullable = true)
 |    |-- frame_number: integer (nullable = true)
 |    |-- camera: integer (nullable = true)
 |    |-- ssrc: integer (nullable = true)
 |    |-- data: binary (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- frame_number: integer (nullable = true)
 |-- camera: integer (nullable = true)
 |-- ssrc: integer (nullable = true)
 |-- data: binary (nullable = true)
 |-- length(data): integer (nullable = true)



In [50]:
# Keep at most 10 rows and mark that small result for caching so the cells
# that follow reuse it.
first_rows = df.limit(10)
df = first_rows.cache()

In [51]:
# Leave out the raw/intermediate/binary columns for this preview only;
# df itself is not modified.
hidden_columns = ['raw_event', 'event_string', 'event', 'data']
df.drop(*hidden_columns).show()

+--------+------+----------+---------+--------------------+------------+------+----------+------------+
|   scope|stream|segment_id|   offset|           timestamp|frame_number|camera|      ssrc|length(data)|
+--------+------+----------+---------+--------------------+------------+------+----------+------------+
|examples| video|         0|        0|2019-04-25 05:54:...|           0|     0|1227050840|    14798051|
|examples| video|         0| 19731145|2019-04-25 05:54:...|           1|     0|1227050840|     2429487|
|examples| video|         0| 22970630|2019-04-25 05:55:...|           2|     0|1227050840|    11734712|
|examples| video|         0| 38617259|2019-04-25 05:55:...|           3|     0|1227050840|    16315434|
|examples| video|         0| 60371612|2019-04-25 05:55:...|           4|     0|1227050840|      448155|
|examples| video|         0| 60969273|2019-04-25 05:55:...|           5|     0|1227050840|    20487721|
|examples| video|         0| 88286774|2019-04-25 05:55:...|     

In [52]:
def f(row, output_dir='/home/jovyan'):
    """Write one row's binary payload to ``<output_dir>/file-<frame_number>.png``.

    Parameters
    ----------
    row : object exposing ``frame_number`` and ``data`` attributes
        ``frame_number`` is formatted with ``%d`` into the file name and
        ``data`` is the bytes-like payload written to the file.
    output_dir : str, optional
        Directory that receives the file. Defaults to ``/home/jovyan``, the
        location that was previously hard-coded, so ``df.foreach(f)`` keeps
        producing the same paths.

    Returns
    -------
    None

    Notes
    -----
    Rows whose ``frame_number`` or ``data`` is ``None`` are skipped. Both
    columns are nullable in the printed schema; without this guard a null
    payload would leave behind an empty file and then raise ``TypeError``.
    """
    if row.frame_number is None or row.data is None:
        return
    filename = os.path.join(output_dir, 'file-%d.png' % (row.frame_number,))
    # An existing file with the same frame number is overwritten.
    with open(filename, 'wb') as output:
        output.write(row.data)
# Run f on every row of df for its side effect of saving each row's `data`
# to a file; nothing is returned to the notebook.
df.foreach(f)