# Spark Session

In [1]:
import os

from pyspark.sql import SparkSession

# Create (or reuse) a local-mode Spark session named "read-postgres" and keep
# a handle to its SparkContext in `sc`.
spark = (
    SparkSession.builder
    .appName("read-postgres")
    .master("local")
    # Put the PostgreSQL JDBC jar on the driver classpath.
    .config(
        "spark.driver.extraClassPath",
        "/home/jovyan/work/jars/postgresql-9.4.1207.jar",
    )
    .getOrCreate()
)
sc = spark.sparkContext

# Read Postgres

In [2]:
# Load the `public.movies` table from Postgres over JDBC.
# The connection URL, user and password are read from the environment
# (POSTGRES_JDBC_URL / POSTGRES_USER / POSTGRES_PASSWORD) so real credentials
# do not have to be stored in the notebook. When a variable is unset, the
# original demo value is used, so the default behaviour is unchanged.
df_movies = (
    spark.read
    .format("jdbc")
    .option(
        "url",
        os.environ.get(
            "POSTGRES_JDBC_URL",
            "jdbc:postgresql://host.docker.internal:15432/rainbow_database",
        ),
    )
    .option("dbtable", "public.movies")
    .option("user", os.environ.get("POSTGRES_USER", "unicorn_user"))
    .option("password", os.environ.get("POSTGRES_PASSWORD", "magical_password"))
    .load()
)

In [3]:
# Load the `public.ratings` table from Postgres over JDBC.
# The connection URL, user and password are read from the environment
# (POSTGRES_JDBC_URL / POSTGRES_USER / POSTGRES_PASSWORD) so real credentials
# do not have to be stored in the notebook. When a variable is unset, the
# original demo value is used, so the default behaviour is unchanged.
df_ratings = (
    spark.read
    .format("jdbc")
    .option(
        "url",
        os.environ.get(
            "POSTGRES_JDBC_URL",
            "jdbc:postgresql://host.docker.internal:15432/rainbow_database",
        ),
    )
    .option("dbtable", "public.ratings")
    .option("user", os.environ.get("POSTGRES_USER", "unicorn_user"))
    .option("password", os.environ.get("POSTGRES_PASSWORD", "magical_password"))
    .load()
)

# Top 10 movies with the most ratings

In [4]:
# Alias both frames so their columns can be referenced as "m.<col>" and
# "r.<col>" in the select below.
df_movies = df_movies.alias("m")
df_ratings = df_ratings.alias("r")

# Inner-join ratings to movies on movieId, keeping every rating column plus
# the movie title.
df_join = (
    df_ratings
    .join(
        df_movies,
        on=df_ratings["movieId"] == df_movies["movieId"],
        how="inner",
    )
    .select("r.*", "m.title")
)

In [5]:
from pyspark.sql import functions as F

# Per title: count the ratings (non-null `timestamp` values) and average the
# `rating` column, then keep the 10 titles with the highest rating count.
df_result = (
    df_join
    .groupBy("title")
    .agg(
        F.count("timestamp").alias("qty_ratings"),
        F.mean("rating").alias("avg_rating"),
    )
    .sort("qty_ratings", ascending=False)
    .limit(10)
)

In [6]:
# Write the result as CSV with a header row, replacing any previous output at
# this path. coalesce(1) reduces the frame to a single partition before the write.
(
    df_result
    .coalesce(1)
    .write
    .mode("overwrite")
    .option("header", True)
    .csv("/home/jovyan/work/data/output_postgres")
)

In [7]:
# Print the result as a text table (defaults spelled out: up to 20 rows, long
# strings truncated).
df_result.show(n=20, truncate=True)

+--------------------+-----------+-----------------+
|               title|qty_ratings|       avg_rating|
+--------------------+-----------+-----------------+
| Forrest Gump (1994)|        329|4.164133738601824|
|Shawshank Redempt...|        317|4.429022082018927|
| Pulp Fiction (1994)|        307|4.197068403908795|
|Silence of the La...|        279|4.161290322580645|
|  Matrix, The (1999)|        278|4.192446043165468|
|Star Wars: Episod...|        251|4.231075697211155|
|Jurassic Park (1993)|        238|             3.75|
|   Braveheart (1995)|        237|4.031645569620253|
|Terminator 2: Jud...|        224|3.970982142857143|
|Schindler's List ...|        220|            4.225|
+--------------------+-----------+-----------------+



In [8]:
# Print the same result table a second time (same call as the previous cell).
df_result.show(20, True)

+--------------------+-----------+-----------------+
|               title|qty_ratings|       avg_rating|
+--------------------+-----------+-----------------+
| Forrest Gump (1994)|        329|4.164133738601824|
|Shawshank Redempt...|        317|4.429022082018927|
| Pulp Fiction (1994)|        307|4.197068403908795|
|Silence of the La...|        279|4.161290322580645|
|  Matrix, The (1999)|        278|4.192446043165468|
|Star Wars: Episod...|        251|4.231075697211155|
|Jurassic Park (1993)|        238|             3.75|
|   Braveheart (1995)|        237|4.031645569620253|
|Terminator 2: Jud...|        224|3.970982142857143|
|Schindler's List ...|        220|            4.225|
+--------------------+-----------+-----------------+

