In [1]:
# Configure the necessary Spark environment
import os
import sys

spark_home = os.environ.get('SPARK_HOME', None)
sys.path.insert(0, spark_home + "/python")

# Add the py4j to the path.
# You may need to change the version number to match your install -- currently using spark 2.4
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

#os.environ['PYSPARK_SUBMIT_ARGS']="--jars /work/ericr/spark/sparkdev/postgresql.jar --executor-memory 40g --executor-cores 16 pyspark-shell"

# Initialize PySpark to predefine the SparkContext variable 'sc'
#execfile(os.path.join(spark_home, 'python/pyspark/shell.py'))

In [2]:
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql.functions import flatten, explode, col
import json


In [3]:
spark = SparkSession.builder.appName('Basic').config("spark.executor.extraJavaOptions","--executor-memory 40G --executor-cores 40 --driver-class-path $SPARK_HOME/postgresql.jar").getOrCreate()

In [4]:
dbconfig = {"url": "jdbc:postgresql://localhost/qxedb",
            "dbtable": "gateway_eventnotification",
            "user": "hermes",
            "password": "mysecret",
            "driver": "org.postgresql.Driver"}

In [5]:
spark

In [6]:
df = spark.read.jdbc(url="jdbc:postgresql://localhost/qxedb", 
                      table="gateway_eventnotification",
                      properties=dbconfig)

In [7]:
sqlcontext = SQLContext(spark.sparkContext)

In [8]:
df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- payload: string (nullable = true)
 |-- event_category_id: string (nullable = true)
 |-- table_data: string (nullable = true)



In [9]:
payload_df = sqlcontext.read.json(df.rdd.map(lambda r: r.payload))

In [10]:
payload_df.printSchema()

root
 |-- data_type: string (nullable = true)
 |-- events: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- dateTime: string (nullable = true)
 |    |    |-- eventCategory: string (nullable = true)
 |    |    |-- eventDetail: struct (nullable = true)
 |    |    |    |-- brand: string (nullable = true)
 |    |    |    |-- connectionInfo: struct (nullable = true)
 |    |    |    |    |-- localIpv4Address: struct (nullable = true)
 |    |    |    |    |    |-- addr: string (nullable = true)
 |    |    |    |    |    |-- subnetMask: string (nullable = true)
 |    |    |    |    |-- sourceInSameSubnet: string (nullable = true)
 |    |    |    |    |-- sourceIpv4Address: struct (nullable = true)
 |    |    |    |    |    |-- addr: string (nullable = true)
 |    |    |    |    |    |-- subnetMask: string (nullable = true)
 |    |    |    |    |-- sourceIpv6Address: struct (nullable = true)
 |    |    |    |    |    |-- addr: string (nullable = true)
 |   

In [11]:
versions = payload_df.select("version")

In [12]:
versions.show()

+-------+
|version|
+-------+
|  1.1.0|
|  1.1.0|
|  1.1.0|
|  1.1.0|
|  1.1.0|
|  1.1.0|
|  1.1.0|
|  1.1.0|
|  1.1.0|
|  1.1.0|
|  1.1.0|
|   null|
|  1.1.0|
|  1.1.0|
|  1.1.0|
|   null|
|  1.1.0|
|  1.1.0|
|  1.1.0|
|  1.1.0|
+-------+
only showing top 20 rows



In [13]:
x = payload_df.select(explode(payload_df.events).alias("event"))

In [14]:
x.head(1)

[Row(event=Row(dateTime='2001-02-19T19:01:42.0Z', eventCategory='supply', eventDetail=Row(brand='HP', connectionInfo=None, connectionState=None, connectionStateCurrent=None, connectionStateMax=None, connectionStateMin=None, consumableLabelCode='M', consumableManufacturingSignature='020101f1126ffa0f8285c448769c641023286cab1acee82f43ccc6b7be426800c0186019ec36be2bca80e84947cbc294521fe51eae1a245306fc10bf96fd023caae3083d0dc42b2a20d1f04655986f9a31657407bb', consumablePercentageLevelRemaining=90, consumableProductNumber=None, consumableSerialNumber='560256648', consumableState=None, consumableUniqueId='000000000000000045f0ff19d100088c', dataSource=None, domainConfig=None, downloadDuration=None, dustLevel=None, errorCategory=None, errorCode=None, errorType=None, eventCategory=None, eventCode='17.99.32', eventDetailAssert=None, eventDetailConsumable=None, eventDetailErrorRecovery=None, eventDetailJobStatus=None, eventDetailSystemError=None, eventDetailType=None, eventIntArray=None, eventOccurre

In [15]:
flx = x.filter("event.eventCategory == 'systemError'").select("event.eventDetail.eventCode")

In [16]:
counts = flx.groupBy("eventCode").count().sort("count", ascending=False).collect()

In [17]:
systemErrors = x.filter("event.eventCategory == 'systemError'")

In [18]:
counts

[Row(eventCode='71.00.E9', count=41835),
 Row(eventCode='', count=3399),
 Row(eventCode=None, count=2798),
 Row(eventCode='71.02.1B', count=2641),
 Row(eventCode='71.00.0E', count=2479),
 Row(eventCode='71.02.01', count=2396),
 Row(eventCode='71.02.34', count=1057),
 Row(eventCode='71.00.ED', count=796),
 Row(eventCode='71.02.3B', count=708),
 Row(eventCode='71.02.55', count=596),
 Row(eventCode='71.02.53', count=502),
 Row(eventCode='71.02.45', count=463),
 Row(eventCode='71.02.18', count=463),
 Row(eventCode='71.B1.11', count=454),
 Row(eventCode='71.02.ED', count=378),
 Row(eventCode='71.00.EA', count=358),
 Row(eventCode='71.00.EF', count=354),
 Row(eventCode='71.02.11', count=317),
 Row(eventCode='71.02.0D', count=256),
 Row(eventCode='71.02.31', count=239),
 Row(eventCode='71.02.51', count=204),
 Row(eventCode='71.02.0C', count=200),
 Row(eventCode='71.00.14', count=168),
 Row(eventCode='71.00.F9', count=116),
 Row(eventCode='71.02.52', count=108),
 Row(eventCode='71.00.0D', coun

In [19]:
systemErrors.show(2,truncate=200)

+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|                                                                                                                                                                                                   event|
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[2018-08-09T23:18:47.0Z, systemError, [,,,,,,,,,,,,,,,,, hardwareFailure,,,, 71.00.E9,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, 1.0.0,], com.hp.cdm.service.even...|
|[2001-04-28T16:46:26.0Z, systemError, [,,,,,,,,,,,,,,,,, hardwareFailure,,,, 71.00.E9,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, 1.0.0,], com.hp.cdm.se

In [20]:
x.printSchema()

root
 |-- event: struct (nullable = true)
 |    |-- dateTime: string (nullable = true)
 |    |-- eventCategory: string (nullable = true)
 |    |-- eventDetail: struct (nullable = true)
 |    |    |-- brand: string (nullable = true)
 |    |    |-- connectionInfo: struct (nullable = true)
 |    |    |    |-- localIpv4Address: struct (nullable = true)
 |    |    |    |    |-- addr: string (nullable = true)
 |    |    |    |    |-- subnetMask: string (nullable = true)
 |    |    |    |-- sourceInSameSubnet: string (nullable = true)
 |    |    |    |-- sourceIpv4Address: struct (nullable = true)
 |    |    |    |    |-- addr: string (nullable = true)
 |    |    |    |    |-- subnetMask: string (nullable = true)
 |    |    |    |-- sourceIpv6Address: struct (nullable = true)
 |    |    |    |    |-- addr: string (nullable = true)
 |    |    |    |-- sourceMacAddress: string (nullable = true)
 |    |    |-- connectionState: string (nullable = true)
 |    |    |-- connectionStateCurrent: long 

In [32]:
y = x.select("event.eventCategory").groupBy("eventCategory").count().sort("count",ascending=False)

In [33]:
y.show(200)

+-----------------+------+
|    eventCategory| count|
+-----------------+------+
|           supply|744227|
|         jobError|235575|
|      wifiNetwork|229454|
|        jobStatus| 73112|
|      systemError| 64419|
|    errorRecovery| 25194|
|         fwUpdate| 24206|
|        printHead| 12552|
|             test|  2924|
|     system_check|   491|
|       systemTest|   305|
|         testInfo|    92|
|userConfigChanged|     4|
|             info|     2|
+-----------------+------+



In [23]:
for t in y.groupby(['eventCategory']).count().collect():
    print(t)

Row(eventCategory='system_check', count=491)
Row(eventCategory='systemError', count=64419)
Row(eventCategory='printHead', count=12552)
Row(eventCategory='fwUpdate', count=24206)
Row(eventCategory='wifiNetwork', count=229454)
Row(eventCategory='jobStatus', count=73112)
Row(eventCategory='errorRecovery', count=25194)
Row(eventCategory='systemTest', count=305)
Row(eventCategory='jobError', count=235575)
Row(eventCategory='testInfo', count=92)
Row(eventCategory='info', count=2)
Row(eventCategory='test', count=2924)
Row(eventCategory='supply', count=744227)
Row(eventCategory='userConfigChanged', count=4)


In [24]:
fwall = x.select(["event.eventCategory","event.eventDetail.firmwareVersion","event.eventDetail.firmwareAssert.code"])

In [25]:
fwasserts = fwall.filter("eventCategory == 'systemError'")

In [27]:
fwasserts.printSchema()

root
 |-- eventCategory: string (nullable = true)
 |-- firmwareVersion: string (nullable = true)
 |-- code: string (nullable = true)



In [28]:
fwasserts.count()

64419