# Structured Streaming using Sliding Window

In [1]:
# findspark is initialised before any pyspark import in this notebook.
# NOTE(review): presumably this is what makes the local Spark installation
# importable from the notebook kernel — confirm for your environment.
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
import pyspark.sql.functions as f

## 1. Spark Configuration

In [3]:
# Build (or reuse) the SparkSession for this notebook: local master
# "local[4]", application name "SlidingWindow", and explicit memory settings
# for the driver and the executor.
spark = (
    SparkSession.builder
    .appName("SlidingWindow")
    .master("local[4]")
    .config("spark.driver.memory", "2g")
    .config("spark.executor.memory", "4g")
    .getOrCreate()
)

## 2. Define the schema of the streaming DataFrame

In [4]:
# Explicit schema for the incoming film CSV files. Columns are declared in
# file order and every column is nullable.
schema = (
    StructType()
    .add("Name", StringType(), True)
    .add("Genre", StringType(), True)
    .add("Length", IntegerType(), True)
    .add("Score", FloatType(), True)
    .add("Country", StringType(), True)
    .add("Year", IntegerType(), True)
    .add("Budget", FloatType(), True)
)

## 3. Configure the input directory and file format

#### Newly arriving data is read from the "streaming" folder

In [5]:
# Streaming source: CSV files with a header row and "," as the separator,
# parsed with the explicit schema and loaded from the "streaming" directory.
film_data = (
    spark.readStream
    .schema(schema)
    .options(header=True, sep=",")
    .format("csv")
    .load("streaming")
)

#### The current timestamp is added to the streaming data using the current_timestamp() function

In [6]:
# Tag every row with the current timestamp in a "processingTime" column;
# the windowed aggregations in this notebook group on that column.
currentTimeDf = film_data.withColumn(
    colName="processingTime",
    col=f.current_timestamp(),
)

##  4. Sliding Window

#### Average length per genre is calculated over a 5-second window

In [7]:
# Average film length per genre over 5-second windows of "processingTime".
# No slide duration is passed to f.window here.
# NOTE: the next cell assigns this same variable name again, so whichever of
# the two cells ran last decides what the streaming query aggregates.
average_length_slidingWindow = (
    currentTimeDf
    .groupBy(
        f.window(timeColumn="processingTime", windowDuration="5 seconds"),
        "Genre",
    )
    .agg(f.avg("Length").alias("Average_Length"))
)

#### Average length per genre is calculated over a 3-second window with a 1-second sliding interval

In [8]:
# Average film length per genre over 3-second windows of "processingTime"
# that advance by the given 1-second slide duration.
# NOTE: this rebinds the name assigned by the previous cell.
average_length_slidingWindow = (
    currentTimeDf
    .groupBy(
        f.window(
            timeColumn="processingTime",
            windowDuration="3 seconds",
            slideDuration="1 seconds",
        ),
        "Genre",
    )
    .agg(f.avg("Length").alias("Average_Length"))
)

#### Average Length is calculated according to Genre and is sorted.

In [9]:
# Average film length per genre across the whole stream (no time window),
# ordered by that average.
# The original expression never applied an ordering although both the heading
# and the variable name promise a sorted result. Descending order is chosen
# here so the longest genres come first; switch to f.asc(...) if ascending
# order is wanted.
# NOTE: Spark only supports sorting a streaming aggregate when the query is
# written with outputMode("complete").
average_length_sorting = (
    currentTimeDf
    .groupBy("Genre")
    .agg(f.avg("Length").alias("Average_Length"))
    .orderBy(f.desc("Average_Length"))
)

## 5. Run the query; awaitTermination() blocks until the query is stopped or fails with an error

In [10]:
# Start the streaming query: the windowed averages are written to the console
# sink in "complete" output mode, with the "truncate" option set to "false".
query = (
    average_length_slidingWindow.writeStream
    .format("console")
    .outputMode("complete")
    .option("truncate", "false")
    .start()
)

In [None]:
# Block this cell until the streaming query terminates (stopped or failed).
# While it blocks, no later cell can run; interrupt the kernel to regain
# control, then run the stop() cell.
query.awaitTermination()

In [None]:
# Stop the streaming query started above.
query.stop()