In [1]:
import findspark
findspark.init()

import pyspark
from pyspark.sql.types import *
from pyspark.sql import SparkSession
import pyspark.sql.functions as f

from IPython.display import clear_output

In [2]:
ratingSchema = StructType([
    StructField("user", IntegerType()),
    StructField("movie", IntegerType()),
    StructField("rating", FloatType())
])

In [3]:
def foreach_batch_function(df, epoch_id):
    mostPopularMovies = df.limit(10).toPandas()
    clear_output()
    print(mostPopularMovies)
    #import plotly.express as px
    #fig = px.bar(mostPopularMovies, x='movie', y='num_ratings')
    #fig.show()

In [4]:
#setup spark session
sparkSession = (SparkSession.builder
                .appName("Movie ratings streaming")
                .master("local[*]")
                .config("spark.scheduler.mode", "FAIR")
                .getOrCreate())
sparkSession.sparkContext.setLogLevel("ERROR")

dataset = (sparkSession
        .readStream
        .format("kafka")
        .option("kafka.bootstrap.servers", "localhost:29092")
        .option("subscribe", "ratings")
        .load())

In [5]:
dataset = dataset.selectExpr("CAST(value AS STRING)")
dataset = dataset.select(f.from_json(f.col("value"), ratingSchema).alias("data")).select("data.*")

In [6]:
dataset = dataset.select("movie", "rating") \
                .groupBy("movie") \
                .agg(f.count("rating").alias("num_ratings"), f.avg("rating").alias("avg_rating")) \
                .sort(f.desc("num_ratings"))

In [7]:
query = dataset \
    .writeStream \
    .outputMode("complete") \
    .format("console") \
    .trigger(processingTime='5 seconds') \
    .foreachBatch(foreach_batch_function) \
    .start()

In [8]:
query.explain()
query.awaitTermination()

   movie  num_ratings  avg_rating
0    592           29    3.241379
1    434           28    3.321429
2    380           27    3.629630
3    593           26    4.307692
4    344           26    3.000000
5    296           25    3.800000
6    590           25    3.840000
7    150           24    3.916667
8    349           24    3.833333
9    588           23    3.826087


KeyboardInterrupt: 