# Spark Streaming: Reading Data from CSV Files Using the Structured Streaming API

In [1]:
import findspark
# Initialise findspark before the pyspark imports in the next cell;
# it locates the Spark installation and makes pyspark importable.
findspark.init()

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
import pyspark.sql.functions as f

In [3]:
# Create (or reuse) the SparkSession for this notebook: local master with
# four threads, application name "ReadFromCsv", and explicit memory settings.
spark = (
    SparkSession.builder
    .master("local[4]")
    .appName("ReadFromCsv")
    .config("spark.driver.memory", "2g")
    .config("spark.executor.memory", "4g")
    .getOrCreate()
)

### Define the schema for the DataFrame

In [4]:
# Column layout of the film CSV files; every column is declared nullable.
schema = (
    StructType()
    .add("Name", StringType(), True)
    .add("Genre", StringType(), True)
    .add("Length", IntegerType(), True)
    .add("Score", FloatType(), True)
    .add("Country", StringType(), True)
    .add("Year", IntegerType(), True)
    .add("Budget", FloatType(), True)
)

### Specify the source directory and file format to read

In [5]:
# Streaming DataFrame over comma-separated CSV files (with a header row)
# loaded from the "streaming" path, parsed with the schema defined above.
film_data = (
    spark.readStream
    .format("csv")
    .schema(schema)
    .option("header", True)
    .option("sep", ",")
    .load("streaming")
)

### Calculate the average film length per genre using groupBy()

In [6]:
# Average "Length" per "Genre", ordered from the longest average to the shortest.
genre_average_length = (
    film_data
    .groupBy("Genre")
    .agg(f.avg("Length").alias("Average_Length"))
    .orderBy(f.col("Average_Length").desc())
)

### Run the query with awaitTermination(), which blocks until the query is stopped or an error occurs

The `writeStream` format is set to `console` so that the results are printed to the console. The query begins running when `start()` is called; `awaitTermination()` then blocks until the query is stopped or fails.

In [9]:
# Start the streaming query: the full aggregated result ("complete" mode)
# is written to the console sink.
query = (
    genre_average_length.writeStream
    .format("console")
    .outputMode("complete")
    .start()
)

In [None]:
# Block this cell until the streaming query is stopped or fails with an error;
# the cell does not finish on its own while the query is still active.
query.awaitTermination()

In [None]:
# Stop the streaming query started above.
query.stop()