In [1]:
from pyspark.sql import SparkSession

# Spark session & context
# Start from a local-mode builder named after this notebook.
session_builder = (
    SparkSession.builder
    .master('local')
    .appName('wiki-changes-dataviz')
)
# Stream dataframe infers schema, so no explicit schema has to be supplied to readStream.
session_builder = session_builder.config("spark.sql.streaming.schemaInference", True)

# Reuse an already-running session if there is one, otherwise create it.
spark = session_builder.getOrCreate()
sc = spark.sparkContext

In [2]:
# Read parquet stream
parquet_reader = spark.readStream.format("parquet")
# Optional throttle (disabled): parquet_reader.option("maxFilesPerTrigger", 1) reads 1 file by trigger
df_stream = parquet_reader.load("/home/jovyan/work/data-lake/wiki-changes")

In [3]:
# Print the schema of the streaming DataFrame (no explicit schema was passed to the reader)
df_stream.printSchema()

root
 |-- event_key: string (nullable = true)
 |-- event_topic: string (nullable = true)
 |-- event_timestamp: timestamp (nullable = true)
 |-- schema: string (nullable = true)
 |-- bot: boolean (nullable = true)
 |-- comment: string (nullable = true)
 |-- id: string (nullable = true)
 |-- length_new: integer (nullable = true)
 |-- length_old: integer (nullable = true)
 |-- minor: boolean (nullable = true)
 |-- namespace: integer (nullable = true)
 |-- parsedcomment: string (nullable = true)
 |-- patrolled: boolean (nullable = true)
 |-- revision_new: integer (nullable = true)
 |-- revision_old: integer (nullable = true)
 |-- server_script_path: string (nullable = true)
 |-- server_url: string (nullable = true)
 |-- change_timestamp: timestamp (nullable = true)
 |-- title: string (nullable = true)
 |-- type: string (nullable = true)
 |-- user: string (nullable = true)
 |-- wiki: string (nullable = true)
 |-- meta_domain: string (nullable = true)
 |-- meta_dt: string (nullable = true)
 

In [4]:
# Create dataframe grouping by window 
from pyspark.sql.functions import window, col, current_timestamp

# 10 minute window, updating every 10 minutes (slide equals duration)
change_window = window(col("change_timestamp"), "10 minutes", "10 minutes")

# Don't aggregate events arriving more than 10 minutes late, and stamp each
# row with the time at which it is being aggregated.
stamped_stream = (
    df_stream
    .withWatermark("change_timestamp", "10 minutes")
    .withColumn("agg_timestamp", current_timestamp())
)

# Row count per (window, user, aggregation timestamp).
grouped_changes = stamped_stream.groupBy(
    change_window,
    col("user"),
    col("agg_timestamp"),
)
df_count = grouped_changes.count()
# Optional ordering (disabled): df_count.orderBy(col("count").desc())

In [5]:
# Print the schema of the windowed count DataFrame
df_count.printSchema()

root
 |-- window: struct (nullable = false)
 |    |-- start: timestamp (nullable = true)
 |    |-- end: timestamp (nullable = true)
 |-- user: string (nullable = true)
 |-- agg_timestamp: timestamp (nullable = false)
 |-- count: long (nullable = false)



In [6]:
# Create query stream with memory sink
# The sink is exposed as an in-memory table named after the query ("wiki_changes").
memory_writer = (
    df_count.writeStream
    .format("memory")
    .queryName("wiki_changes")
)
# Output mode in use is "append"; "complete" and "update" were the alternatives tried.
memory_writer = memory_writer.outputMode("append")
# Optional trigger (disabled): memory_writer.trigger(processingTime='2 seconds')
queryStream = memory_writer.start()

In [13]:
from time import sleep
from IPython.display import clear_output

# Poll the in-memory "wiki_changes" table every 10 seconds and show the latest
# stream metrics plus the rows for one user, until interrupted from the kernel.
try:
    i = 1
    while True:
        clear_output(wait=True)
        print("Run:{}".format(i))

        # Take ONE snapshot of the progress report per iteration. Each access to
        # lastProgress fetches a new report, so reading it repeatedly could mix
        # values that belong to different micro-batches.
        progress = queryStream.lastProgress
        if progress:
            print("Watermark:{}".format(progress["eventTime"]["watermark"]))
            # The list of stateful operators can be empty (e.g. when nothing has
            # been executed yet); skip the metrics instead of raising IndexError.
            state_operators = progress["stateOperators"]
            if state_operators:
                state = state_operators[0]
                print("Total Rows:{}".format(state["numRowsTotal"]))
                print("Updated Rows:{}".format(state["numRowsUpdated"]))
                # Divide rather than multiply by 0.000001 to avoid float noise in the output
                print("Memory used MB:{}".format(state["memoryUsedBytes"] / 1e6))

        # Alternative queries kept for reference:
        #df = spark.sql(
        #        """
        #            select
        #                *
        #            from
        #                (
        #                    select
        #                        agg_timestamp
        #                        ,window.start
        #                        ,window.end
        #                        ,user
        #                        ,count
        #                        ,row_number() over (partition by user order by agg_timestamp desc) rn
        #                    from
        #                        wiki_changes
        #                    where
        #                        window.start = (select max(window.start) from wiki_changes)
        #                ) a
        #            where
        #                rn = 1
        #            order by
        #                count desc
        #           limit 10
        #        """
        #).toPandas()
        #df = spark.sql("select window.start, window.end, user, count from wiki_changes order by window.start desc limit 10").toPandas()
        #df = spark.sql("select window.start, window.end, count(1) from wiki_changes group by window.start, window.end order by window.start desc").toPandas()
        df = spark.sql("select * from wiki_changes where user = 'SuccuBot' order by agg_timestamp desc, window.start desc").toPandas()
        display(df)

        sleep(10)
        i = i + 1
except KeyboardInterrupt:
    # Interrupting the kernel is the intended way to leave the loop
    print("process interrupted.")

Run:163
Watermark:2020-07-25T10:36:23.000Z
Total Rows:1278
Updated Rows:0
Memory used MB:0.45369499999999996
process interrupted.


In [17]:
from time import sleep
from IPython.display import clear_output
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt

matplotlib.rc('font', family='DejaVu Sans')
sns.set(style="whitegrid")
# Palette setup does not change between refreshes, so do it once up front
sns.set_color_codes("muted")


# Refresh a small dashboard every 10 seconds (stream metrics, a bar chart of the
# top users in the latest window, and two tables) until the kernel is interrupted.
try:
    i = 1
    while True:
        clear_output(wait=True)
        print("**********************")
        print("General Info")
        print("**********************")
        print("Run:{}".format(i))

        # Take ONE snapshot of the progress report per iteration. Each access to
        # lastProgress fetches a new report, so reading it repeatedly could mix
        # values from different micro-batches. It is None until a report exists.
        progress = queryStream.lastProgress
        if progress:
            print("Stream timestamp:{}".format(progress["timestamp"]))
            print("Watermark:{}".format(progress["eventTime"]["watermark"]))
            # The list of stateful operators can be empty (e.g. when nothing has
            # been executed yet); skip the metrics instead of raising IndexError.
            state_operators = progress["stateOperators"]
            if state_operators:
                state = state_operators[0]
                print("Total Rows:{}".format(state["numRowsTotal"]))
                print("Updated Rows:{}".format(state["numRowsUpdated"]))
                # Divide rather than multiply by 0.000001 to avoid float noise in the output
                print("Memory used MB:{}".format(state["memoryUsedBytes"] / 1e6))

        # Top 10 rows by count within the most recent window
        df = spark.sql(
                """
                    select
                        window.start
                        ,window.end
                        ,user
                        ,count
                    from
                        wiki_changes
                    where
                        window.start = (select max(window.start) from wiki_changes)
                    order by
                        count desc
                    limit 10
                """
        ).toPandas()

        print("**********************")
        print("Graph")
        print("**********************")
        # Only create a figure when there is something to draw. Previously an
        # empty figure was created on every refresh and left open whenever the
        # DataFrame was empty, so figures accumulated for as long as the loop ran.
        if not df.empty:
            fig, ax = plt.subplots(figsize=(8, 6))
            try:
                sns.barplot(x="count", y="user", data=df, ax=ax)
                ax.set_title("Top 10 counts per user in the latest window")
                plt.show()
            finally:
                # Release the figure even if plotting fails
                plt.close(fig)

        print("**********************")
        print("Table")
        print("**********************")
        display(df)

        print("**********************")
        print("Table 2")
        print("**********************")
        # Per-window totals: summed counts and number of rows, newest window first
        df1 = spark.sql(
                """
                    select
                        window.start
                        ,window.end
                        ,sum(count) sum_count
                        ,count(1) qty
                    from
                        wiki_changes
                    group by
                        window.start
                        ,window.end
                    order by
                        window.start desc
                """
        ).toPandas()

        display(df1)

        sleep(10)
        i = i + 1
except KeyboardInterrupt:
    # Interrupting the kernel is the intended way to leave the loop
    print("process interrupted.")

**********************
General Info
**********************
Run:1
**********************
Graph
**********************
**********************
Table
**********************


Unnamed: 0,start,end,user,count


**********************
Table 2
**********************
process interrupted.


<Figure size 576x432 with 0 Axes>

In [9]:
# Check active streams
# One line per streaming query currently active on this session: its id and name.
line_template = "ID:{} | NAME:{}"
for active_query in spark.streams.active:
    print(line_template.format(active_query.id, active_query.name))

ID:b12a5d5b-1bab-41ec-bf12-ef8d2939e824 | NAME:wiki_changes


In [10]:
# Stop stream
# Stops the streaming query held in queryStream (the memory-sink query named "wiki_changes")
queryStream.stop()