import modules

In [13]:
import os
from dotenv import load_dotenv
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql import functions as F


Read the environment variables

In [3]:
def get_config() -> dict:
    """Collect the PostgreSQL connection settings from the environment.

    ``load_dotenv()`` is called first, then ``PG_HOST``, ``PG_USER``,
    ``PG_PASSWORD``, ``PG_DB`` and ``PG_HOST_PORT`` are read with
    ``os.getenv``.

    Returns:
        dict: the keys ``host``, ``user``, ``password``, ``db_name``,
        ``port`` and ``db_url`` (a ``jdbc:postgresql://host:port/db_name``
        string built from the other values).

    Raises:
        ValueError: if any of the variables is not set. Without this check
            the literal text "None" would be formatted into the JDBC URL.
    """
    load_dotenv()

    # Result key -> name of the environment variable that provides it.
    env_names = {
        "host": "PG_HOST",
        "user": "PG_USER",
        "password": "PG_PASSWORD",
        "db_name": "PG_DB",
        "port": "PG_HOST_PORT",
    }
    res = {key: os.getenv(name) for key, name in env_names.items()}

    # Only unset variables are rejected; an empty string is passed through
    # unchanged (e.g. a deliberately empty password).
    missing = [env_names[key] for key, value in res.items() if value is None]
    if missing:
        raise ValueError(
            "Missing required environment variable(s): " + ", ".join(missing)
        )

    res["db_url"] = f"jdbc:postgresql://{res['host']}:{res['port']}/{res['db_name']}"
    return res

Create the session

In [5]:
# Build (or reuse) the Spark session used by every cell below.
spark = (
    SparkSession.builder
    .appName("spark-practice")
    # The PostgreSQL JDBC driver is pulled in as a package dependency.
    .config("spark.jars.packages", "org.postgresql:postgresql:42.7.4")
    .config("spark.sql.shuffle.partitions", "8")
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/11/07 12:35:49 WARN Utils: Your hostname, MacBook-Pro.local, resolves to a loopback address: 127.0.0.1; using 10.231.32.43 instead (on interface en0)
25/11/07 12:35:49 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
:: loading settings :: url = jar:file:/Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/pyspark/jars/ivy-2.5.3.jar!/org/apache/ivy/core/settings/ivysettings.xml
Ivy Default Cache set to: /Users/m/.ivy2.5.2/cache
The jars for the packages stored in: /Users/m/.ivy2.5.2/jars
org.postgresql#postgresql added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-e85055da-4b72-463e-8a97-6fe0314b2e02;1.0
	confs: [default]
	found org.postgresql#postgresql;42.7.4 in central
	found org.checkerframework#checker-qual;3.42.0 in central
:: resolution report :: resolve 68ms :: artifacts dl 3ms
	:: modules in use:
	org.checkerfra

create the function read_table

In [9]:
def read_table(spark: SparkSession, table_name: str) -> DataFrame:
    """Load a database table as a Spark DataFrame through the JDBC source.

    Args:
        spark: session whose reader is used for the load.
        table_name: value passed to the JDBC ``dbtable`` option.

    Returns:
        The DataFrame produced by the JDBC reader.
    """
    settings = get_config()

    # Connection details come from get_config(); the driver class is fixed.
    jdbc_options = {
        "url": settings["db_url"],
        "dbtable": table_name,
        "user": settings["user"],
        "password": settings["password"],
        "driver": "org.postgresql.Driver",
    }
    return spark.read.format("jdbc").options(**jdbc_options).load()

Create the DataFrames for the required tables

In [11]:
# Film-related tables.
df_film = read_table(spark=spark, table_name="film")
df_film_category = read_table(spark=spark, table_name="film_category")
df_category = read_table(spark=spark, table_name="category")
df_actor = read_table(spark=spark, table_name="actor")
df_film_actor = read_table(spark=spark, table_name="film_actor")

# Inventory, rental and payment tables.
df_inventory = read_table(spark=spark, table_name="inventory")
df_rental = read_table(spark=spark, table_name="rental")
df_payment = read_table(spark=spark, table_name="payment")

# Customer, address and city tables.
df_customer = read_table(spark=spark, table_name="customer")
df_address = read_table(spark=spark, table_name="address")
df_city = read_table(spark=spark, table_name="city")


Query: output the number of movies in each category, sorted in descending order.

In [None]:
# Join category -> film_category -> film, count the film ids per category
# name, and list the categories from the highest count to the lowest.
result1 = (
    df_category.alias("c")
    .join(
        df_film_category.alias("fc"),
        on=F.col("c.category_id") == F.col("fc.category_id"),
        how="inner",
    )
    .join(
        df_film.alias("f"),
        on=F.col("f.film_id") == F.col("fc.film_id"),
        how="inner",
    )
    .groupBy("c.name")
    .agg(F.count("fc.film_id").alias("amount"))
    # Expose the grouped "name" column under the label "category".
    .select(F.col("name").alias("category"), F.col("amount"))
    .orderBy(F.col("amount").desc())
)

result1.show()

+-----------+------+
|   category|amount|
+-----------+------+
|      Music|   152|
|      Drama|   152|
|     Travel|   151|
|      Games|   150|
|    Foreign|   150|
|   Children|   150|
|     Sci-Fi|   149|
|     Action|   149|
|  Animation|   148|
|   Classics|   147|
|     Family|   147|
|        New|   147|
|Documentary|   145|
|     Sports|   145|
|     Comedy|   143|
|     Horror|   142|
+-----------+------+

