In [None]:
from pyspark.sql import SparkSession
from pyspark.streaming import StreamingContext

# Build (or reuse, if one already exists) the SparkSession for this notebook,
# named 'MostViewedURL' and running against the 'local[*]' master.
spark = (
    SparkSession.builder
    .appName('MostViewedURL')
    .master('local[*]')
    .getOrCreate()
)

In [None]:
# DStream-style streaming context on top of the session's SparkContext,
# with a batch interval of 1 second.
# NOTE(review): `ssc` is not referenced by any later cell shown in this notebook;
# the pipeline below is built with spark.readStream instead - confirm it is still needed.
ssc = StreamingContext(spark.sparkContext, batchDuration=1)

In [None]:
from pyspark.sql.functions import regexp_extract
# Watch the logs directory as a streaming text source; the parsing below reads
# each raw log line from the 'value' column of accessLines.
accessLines = spark.readStream.text("../SparkCourse/logs")

# Regular expressions used to pull the individual fields out of a log line.
contentSizeExp = r'\s(\d+)$'
statusExp = r'\s(\d{3})\s'
generalExp = r'\"(\S+)\s(\S+)\s*(\S*)\"'
# The UTC offset may carry either sign (e.g. '+0000' or '-0400'). The previous
# pattern hard-coded '-', so lines with a positive offset produced an empty timestamp.
timeExp = r'\[(\d{2}/\w{3}/\d{4}:\d{2}:\d{2}:\d{2} [+-]\d{4})]'
hostExp = r'(^\S+\.[\S+\.]+\S+)\s'

# Parse every line into one column per field. regexp_extract yields an empty
# string when a pattern does not match; status and content_size are then cast
# to integer.
logsDF = accessLines.select(regexp_extract('value', hostExp, 1).alias('host'),
                         regexp_extract('value', timeExp, 1).alias('timestamp'),
                         regexp_extract('value', generalExp, 1).alias('method'),
                         regexp_extract('value', generalExp, 2).alias('endpoint'),
                         regexp_extract('value', generalExp, 3).alias('protocol'),
                         regexp_extract('value', statusExp, 1).cast('integer').alias('status'),
                         regexp_extract('value', contentSizeExp, 1).cast('integer').alias('content_size'))

In [None]:
from pyspark.sql import functions as F

In [None]:
# Add a 'timestampNew' column holding the current timestamp: the timestamps in
# the test log data are old, so the windowing below uses this column instead.
logsDF = logsDF.withColumn(colName='timestampNew', col=F.current_timestamp())

In [None]:
# Count rows per endpoint within sliding windows over 'timestampNew':
# each window is 30 seconds long and a new one starts every 10 seconds.
endpointsCount = (
    logsDF
    .groupby(
        F.window(
            F.col('timestampNew'),
            windowDuration='30 seconds',
            slideDuration='10 seconds',
        ),
        F.col('endpoint'),
    )
    .count()
)

In [None]:
# Start the streaming query named "counts": in "complete" output mode the whole
# table of per-window endpoint counts, ordered by count (highest first), is
# written to the console sink.
# truncate=false stops the console sink from cutting cell values at 20 characters,
# which would hide the seconds of each window boundary and the end of long endpoints.
query = (
    endpointsCount
    .orderBy(F.col('count').desc())
    .writeStream
    .outputMode("complete")
    .format("console")
    .option("truncate", "false")
    .queryName("counts")
    .start()
)


In [None]:
# Block this cell until the streaming query terminates.
# If the wait is interrupted (e.g. the kernel is interrupted) or the query fails,
# the finally clause stops the query so it does not keep running in the background;
# the original exception still propagates.
try:
    query.awaitTermination()
finally:
    query.stop()


In [13]:
# Cleanly shut down the SparkSession created at the top of the notebook.
spark.stop()