import modules

In [73]:
import os
from dotenv import load_dotenv
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql import functions as F
from pyspark.sql.window import Window


read the enviroments

In [3]:
def get_config() -> dict:
    load_dotenv()
    host = os.getenv("PG_HOST")
    user = os.getenv("PG_USER")
    password = os.getenv("PG_PASSWORD")
    db_name = os.getenv("PG_DB")
    port = os.getenv("PG_HOST_PORT")
    url = f"jdbc:postgresql://{host}:{port}/{db_name}"
    res = {
        "host" : host,
        "user" : user,
        "password" : password,
        "db_name" : db_name,
        "port" : port,
        "db_url" : url
    }
    return res

Create the session

In [5]:
spark = SparkSession.builder.appName("spark-practice")\
            .config("spark.jars.packages", "org.postgresql:postgresql:42.7.4")\
            .config("spark.sql.shuffle.partitions", "8")\
            .getOrCreate() 

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/11/07 12:35:49 WARN Utils: Your hostname, MacBook-Pro.local, resolves to a loopback address: 127.0.0.1; using 10.231.32.43 instead (on interface en0)
25/11/07 12:35:49 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
:: loading settings :: url = jar:file:/Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/pyspark/jars/ivy-2.5.3.jar!/org/apache/ivy/core/settings/ivysettings.xml
Ivy Default Cache set to: /Users/m/.ivy2.5.2/cache
The jars for the packages stored in: /Users/m/.ivy2.5.2/jars
org.postgresql#postgresql added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-e85055da-4b72-463e-8a97-6fe0314b2e02;1.0
	confs: [default]
	found org.postgresql#postgresql;42.7.4 in central
	found org.checkerframework#checker-qual;3.42.0 in central
:: resolution report :: resolve 68ms :: artifacts dl 3ms
	:: modules in use:
	org.checkerfra

create the function read_table

In [9]:
def read_table(spark : SparkSession, table_name: str)-> DataFrame:
    config = get_config()

    reader = (spark.read.format("jdbc")\
             .option("url", config["db_url"])\
             .option("dbtable", table_name)\
             .option("user", config["user"])\
             .option("password", config["password"])\
             .option("driver", "org.postgresql.Driver"))
    return reader.load()

Create the dataframes of needable tables 

In [11]:
df_film = read_table(spark, "film")
df_film_category = read_table(spark, "film_category")
df_category = read_table(spark, "category")
df_actor = read_table(spark, "actor")
df_film_actor = read_table(spark, "film_actor")
df_inventory = read_table(spark, "inventory")
df_rental = read_table(spark, 'rental')
df_payment = read_table(spark, 'payment')
df_customer = read_table(spark, 'customer')
df_address = read_table(spark, 'address')
df_city = read_table(spark, "city")


make the query: Output the number of movies in each category, sorted in descending order. 

In [48]:
result1 = df_category.alias("c")\
    .join(df_film_category.alias("fc"), F.col("c.category_id") == F.col("fc.category_id"), "inner")\
    .join(df_film.alias("f"), F.col("f.film_id") == F.col("fc.film_id"), "inner")\
    .groupBy("c.name")\
    .agg(F.count("fc.film_id").alias("amount"))\
    .withColumnRenamed("name", "category")\
    .select("category", "amount")\
    .orderBy(F.desc("amount"))

result1.show()

+-----------+------+
|   category|amount|
+-----------+------+
|      Music|   152|
|      Drama|   152|
|     Travel|   151|
|      Games|   150|
|    Foreign|   150|
|   Children|   150|
|     Sci-Fi|   149|
|     Action|   149|
|  Animation|   148|
|   Classics|   147|
|     Family|   147|
|        New|   147|
|Documentary|   145|
|     Sports|   145|
|     Comedy|   143|
|     Horror|   142|
+-----------+------+



make the query: Output the 10 actors whose movies rented the most, sorted in descending order. 

In [45]:
result2 = df_actor.alias("a")\
                .join(df_film_actor.alias("fa"), F.col("a.actor_id") == F.col("fa.actor_id"), "inner")\
                .join(df_inventory.alias("i"), F.col("fa.film_id") == F.col("i.film_id"), "inner")\
                .join(df_rental.alias("r"), F.col("i.inventory_id") == F.col("r.inventory_id"), "inner")\
                .groupBy('a.actor_id', F.concat_ws(' ', F.col("a.last_name"), F.col("a.first_name")).alias('actor_name'))\
                .agg(F.count(F.expr("*")).alias('amount'))\
                .orderBy(F.desc("amount"), F.col('actor_name'))\
                .limit(10)
result2.show()

+--------+------------------+------+
|actor_id|        actor_name|amount|
+--------+------------------+------+
|     107|    DEGENERES GINA|   753|
|     181|    CARREY MATTHEW|   678|
|     198|       KEITEL MARY|   674|
|     144|WITHERSPOON ANGELA|   654|
|     102|       TORN WALTER|   640|
|      60|       BERRY HENRY|   612|
|     150|       NOLTE JAYNE|   611|
|      37|        BOLGER VAL|   605|
|      23|     KILMER SANDRA|   604|
|      90|      GUINESS SEAN|   599|
+--------+------------------+------+



make the query: Output the category of movies on which the most money was spent. 

In [58]:
result3 = df_category.alias("c")\
                .join(df_film_category.alias('fc'), F.col("c.category_id") == F.col("fc.category_id"), "inner")\
                .join(df_inventory.alias('i'), F.col("fc.film_id") == F.col("i.film_id"), "inner")\
                .join(df_rental.alias('r'), F.col("i.inventory_id") == F.col("r.inventory_id"), "inner")\
                .join(df_payment.alias('p'), F.col("r.rental_id") == F.col("p.rental_id"), "inner")\
                .filter(F.col("p.amount") > 0)\
                .groupBy('c.category_id', 'c.name')\
                .agg(F.sum("p.amount").alias("price"))\
                .orderBy(F.desc('price'))\
                .limit(1)
result3.show()

+-----------+-------+--------+
|category_id|   name|   price|
+-----------+-------+--------+
|          9|Foreign|10507.67|
+-----------+-------+--------+



make the query: Output the names of movies that are not in the inventory. 

In [71]:
result4 = df_film.alias('f')\
                .join(df_inventory.alias('i'), F.col("f.film_id") == F.col("i.film_id"), 'left')\
                .filter(F.col("i.film_id").isNull())\
                .select('f.film_id', 'f.title')
result4.show(100, False, True)

-RECORD 0-------------------------
 film_id | 14                     
 title   | ALICE FANTASIA         
-RECORD 1-------------------------
 film_id | 38                     
 title   | ARK RIDGEMONT          
-RECORD 2-------------------------
 film_id | 148                    
 title   | CHOCOLATE DUCK         
-RECORD 3-------------------------
 film_id | 171                    
 title   | COMMANDMENTS EXPRESS   
-RECORD 4-------------------------
 film_id | 198                    
 title   | CRYSTAL BREAKING       
-RECORD 5-------------------------
 film_id | 221                    
 title   | DELIVERANCE MULHOLLAND 
-RECORD 6-------------------------
 film_id | 802                    
 title   | SKY MIRACLE            
-RECORD 7-------------------------
 film_id | 712                    
 title   | RAIDERS ANTITRUST      
-RECORD 8-------------------------
 film_id | 742                    
 title   | ROOF CHAMPION          
-RECORD 9-------------------------
 film_id | 860      

make the query: Output the top 3 actors who have appeared most in movies in the “Children” category. If several actors have the same number of movies, output all of them. 

In [99]:
counts = df_actor.alias('a')\
            .join(df_film_actor.alias('fa'), F.col("a.actor_id") == F.col("fa.actor_id"), "inner")\
            .join(df_film_category.alias('fc'), F.col("fa.film_id") == F.col("fc.film_id"), "inner")\
            .join(df_category.alias("c"), F.col("fc.category_id") == F.col("c.category_id"), 'inner')\
            .where(F.col("c.name") == "Children")\
            .groupBy('a.actor_id', F.concat_ws(' ', "a.last_name", "a.first_name").alias("actor_name"))\
            .agg(F.count('fa.actor_id').alias("films_cnt"))
ranked = counts.withColumn("rnk", F.dense_rank().over(Window.orderBy(F.desc("films_cnt"))))
result5 = ranked.filter(F.col("rnk") <= 3)\
          .select("actor_id", "actor_name", "films_cnt")\
          .orderBy(F.desc("films_cnt"), "actor_name")
result5.show()

+--------+--------------+---------+
|actor_id|    actor_name|films_cnt|
+--------+--------------+---------+
|     105|  CROWE SIDNEY|        9|
|     139|  GOODING EWAN|        9|
|     133|  PENN RICHARD|        9|
|     145|     ALLEN KIM|        8|
|     181|CARREY MATTHEW|        8|
|      56|    HARRIS DAN|        8|
|     131|  JACKMAN JANE|        8|
|      87|  PECK SPENCER|        8|
|     142|    RYDER JADA|        8|
|      66|    TANDY MARY|        8|
|     149|TEMPLE RUSSELL|        8|
|      29|    WAYNE ALEC|        8|
|     123|DENCH JULIANNE|        7|
|      65| HUDSON ANGELA|        7|
|     108|  NOLTE WARREN|        7|
|      34|OLIVIER AUDREY|        7|
|      84|    PITT JAMES|        7|
|      94|  TORN KENNETH|        7|
|      17|  VOIGHT HELEN|        7|
|      95|WAHLBERG DARYL|        7|
+--------+--------------+---------+
only showing top 20 rows


25/11/07 15:54:12 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/07 15:54:12 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/07 15:54:12 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/07 15:54:12 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/07 15:54:12 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/07 15:54:12 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/07 1