# 05 Joins (inner) 

- Read 3 dataframes from 3 CSV files and write them to tables
- Using spark.sql() perform a query that joins (inner) all 3 tables
- Ditto using the DataFrame API

---

## The model of the problem


![](../../datasets/club_data_model.jpg)

In [1]:
# 1. Create a spark session

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
spark.version

'4.0.1'

In [None]:
# 2. Read 3 csv files to dataframes, save dataframes to tables
# File1: ../../datasets/facilities.csv
# File2: ../../datasets/members.csv
# File3: ../../datasets/bookings.csv

facilities_schema = """facility_id INT, facility_name STRING, member_cost DOUBLE, guest_cost DOUBLE, 
    initial_outlay DOUBLE, monthly_maintenance DOUBLE"""

members_schema = """member_id INT, last_name STRING, first_name STRING, address STRING, zip_code STRING, 
    telephone STRING, recommended_by STRING, joining_date DATE"""

bookings_schema = "booking_id INT, facility_id INT, member_id INT, start_time TIMESTAMP, slots INT"


facilities_df = (
    spark.read.format("csv")
        .option("header", "true")
        .schema(facilities_schema)
        .load("../../datasets/facilities.csv")
)

members_df = (
    spark.read.format("csv")
        .option("header", "true")
        .schema(members_schema)
        .load("../../datasets/members.csv")
)

bookings_df = (
    spark.read.format("csv")
        .option("header", "true")
        .schema(bookings_schema)
        .load("../../datasets/bookings.csv")
)


facilities_df.write.mode("overwrite").saveAsTable("facilities")
members_df.write.mode("overwrite").saveAsTable("members")
bookings_df.write.mode("overwrite").saveAsTable("bookings")

In [7]:
# 4. Create a facility bookings report as follows:
# member_id | first_name | last_name | facility_name | slots | booking_amount | start_time
# --------------------------------------------------------------------------------------------
# Criteria:
# Facility bookings are made by a person whose last name is Smith
# He has booked more than 5 slots in a single booking
# The report should be sorted by first name of the member in ascending order and booking amount in descending order


sql = """
SELECT
    m.member_id,
    m.first_name,
    m.last_name,
    f.facility_name,
    b.slots,
    b.slots * f.member_cost as booking_amount,
    b.start_time
FROM members AS m
INNER JOIN bookings AS b
    ON m.member_id = b.member_id
INNER JOIN facilities AS f
    ON f.facility_id = b.facility_id
WHERE m.last_name = 'Smith'
  AND b.slots > 5
ORDER BY m.first_name ASC, b.slots DESC;
"""

spark.sql(sql).show()

+---------+----------+---------+---------------+-----+--------------+-------------------+
|member_id|first_name|last_name|  facility_name|slots|booking_amount|         start_time|
+---------+----------+---------+---------------+-----+--------------+-------------------+
|        1|    Darren|    Smith|Badminton Court|    9|           0.0|2022-08-28 16:30:00|
|        1|    Darren|    Smith|Badminton Court|    6|           0.0|2022-07-09 12:00:00|
|        1|    Darren|    Smith|Badminton Court|    6|           0.0|2022-07-27 15:00:00|
|        1|    Darren|    Smith|Badminton Court|    6|           0.0|2022-07-29 15:00:00|
|        1|    Darren|    Smith|Badminton Court|    6|           0.0|2022-08-01 12:30:00|
|        1|    Darren|    Smith|Badminton Court|    6|           0.0|2022-08-07 12:00:00|
|        1|    Darren|    Smith|Badminton Court|    6|           0.0|2022-08-20 18:00:00|
|        1|    Darren|    Smith|Badminton Court|    6|           0.0|2022-09-07 17:00:00|
|        1

In [11]:
# 5. Ditto using the DataFrame API

from pyspark.sql.functions import col, expr

bookings_df = spark.table("bookings").alias("b")
members_df = spark.table("members").alias("m")
facilities_df = spark.table("facilities").alias("f")

report_df = (
    bookings_df
        .join(members_df, expr("b.member_id = m.member_id"), "inner")
        .join(facilities_df, expr("b.facility_id = f.facility_id"), "inner")
        .filter("m.last_name == 'Smith' and b.slots > 5")
        .selectExpr("m.member_id", "m.first_name", "m.last_name", "f.facility_name", "b.slots",
            "b.slots * f.member_cost as booking_amount", "b.start_time")
        .orderBy(col("m.first_name").asc(),
                 col("booking_amount").desc())
)

report_df.show()

+---------+----------+---------+---------------+-----+--------------+-------------------+
|member_id|first_name|last_name|  facility_name|slots|booking_amount|         start_time|
+---------+----------+---------+---------------+-----+--------------+-------------------+
|        1|    Darren|    Smith|Badminton Court|    6|           0.0|2022-07-09 12:00:00|
|        1|    Darren|    Smith|Badminton Court|    6|           0.0|2022-07-27 15:00:00|
|        1|    Darren|    Smith|Badminton Court|    6|           0.0|2022-07-29 15:00:00|
|        1|    Darren|    Smith|Badminton Court|    6|           0.0|2022-08-01 12:30:00|
|        1|    Darren|    Smith|Badminton Court|    6|           0.0|2022-08-07 12:00:00|
|        1|    Darren|    Smith|Badminton Court|    6|           0.0|2022-08-20 18:00:00|
|        1|    Darren|    Smith|Badminton Court|    9|           0.0|2022-08-28 16:30:00|
|        1|    Darren|    Smith|Badminton Court|    6|           0.0|2022-09-07 17:00:00|
|        1