## Joins

In [57]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
from pyspark.sql.functions import col, coalesce, lit

# Create a Spark session
spark = SparkSession.builder.appName("day3").getOrCreate()



Question: You are given two DataFrames: employees_df and departments_df, which contain information about employees and their respective departments. The schema for the DataFrames is as follows:

employees_df schema:
|-- employee_id: integer (nullable = true)
|-- employee_name: string (nullable = true)
|-- department_id: integer (nullable = true)

departments_df schema:

|-- department_id: integer (nullable = true)
|-- department_name: string (nullable = true)

Employees DataFrame:
                                                                                
+-----------+-------------+-------------+
|employee_id|employee_name|department_id|
+-----------+-------------+-------------+
|1          |Pallavi mam  |101          |
|2          |Bob          |102          |
|3          |Cathy        |101          |
|4          |David        |103          |
|5          |Amrit Sir    |104          |
|6          |Alice        |null         |
|7          |Eva          |null         |
|8          |Frank        |110          |
|9          |Grace        |109          |
|10         |Henry        |null         |
+-----------+-------------+-------------+



Departments DataFrame:
+-------------+------------------------+
|department_id|department_name         |
+-------------+------------------------+
|101          |HR                      |
|102          |Engineering             |
|103          |Finance                 |
|104          |Marketing               |
|105          |Operations              |
|106          |null                    |
|107          |Operations              |
|108          |Production              |
|null         |Finance                 |
|110          |Research and Development|
+-------------+----------------------


In [15]:
# Define the schema using StructType and StructField
custom_employee_schema = StructType([
    StructField("employee_id", IntegerType(), True),   # True allows null values
    StructField("employee_name", StringType(), True),
    StructField("department_id", IntegerType(), True)
])

# Define the data
data = [
    (1, "Pallavi mam", 101),
    (2, "Bob", 102),
    (3, "Cathy", 101),
    (4, "David", 103),
    (5, "Amrit Sir", 104),
    (6, "Alice", None),  # Use None for null values
    (7, "Eva", None),
    (8, "Frank", 110),
    (9, "Grace", 109),
    (10, "Henry", None)]

# Create a DataFrame with the custom schema and data
employee_df =spark.createDataFrame(data,schema=custom_employee_schema)

# Show the DataFrame
employee_df.show()
employee_df.printSchema()

+-----------+-------------+-------------+
|employee_id|employee_name|department_id|
+-----------+-------------+-------------+
|          1|  Pallavi mam|          101|
|          2|          Bob|          102|
|          3|        Cathy|          101|
|          4|        David|          103|
|          5|    Amrit Sir|          104|
|          6|        Alice|         null|
|          7|          Eva|         null|
|          8|        Frank|          110|
|          9|        Grace|          109|
|         10|        Henry|         null|
+-----------+-------------+-------------+

root
 |-- employee_id: integer (nullable = true)
 |-- employee_name: string (nullable = true)
 |-- department_id: integer (nullable = true)



In [26]:
# Define the schema using StructType and StructField
custom_department_schema= StructType([
    StructField("department_id", IntegerType(), True),
    StructField("department_name", StringType(), True)
])

# Define the  data as a list of tuples
department_data = [
    (101, "HR"),
    (102, "Engineering"),
    (103, "Finance"),
    (104, "Marketing"),
    (105, "Operations"),
    (106, None),  # Use None for null values
    (107, "Operations"),
    (108, "Production"),
    (None, "Finance"),  
    (110, "Research and Development")
]

# Create a DataFrame for departments
department_df= spark.createDataFrame(department_data, custom_department_schema)

# Show the DataFrame
department_df.show()
department_df.printSchema()


+-------------+--------------------+
|department_id|     department_name|
+-------------+--------------------+
|          101|                  HR|
|          102|         Engineering|
|          103|             Finance|
|          104|           Marketing|
|          105|          Operations|
|          106|                null|
|          107|          Operations|
|          108|          Production|
|         null|             Finance|
|          110|Research and Deve...|
+-------------+--------------------+

root
 |-- department_id: integer (nullable = true)
 |-- department_name: string (nullable = true)



In [86]:
# Register the DataFrames as temporary SQL tables
employee_df.createOrReplaceTempView("employees")
department_df.createOrReplaceTempView("departments")

### Join Expressions

Question: How can you combine the employees_df and departments_df DataFrames based on the common "department_id" column to get a combined DataFrame with employee names and their respective department names?

In [48]:
# Perform an inner join between the employees_df and departments_df DataFrames based on the "department_id" column.
combined_df= employee_df.join(department_df,
                              on="department_id",
                              how="inner")

# Select and display specific columns
combined_df.select(col("department_id"), col("employee_id"), col("employee_name"), col("department_name")).show()

+-------------+-----------+-------------+--------------------+
|department_id|employee_id|employee_name|     department_name|
+-------------+-----------+-------------+--------------------+
|          101|          1|  Pallavi mam|                  HR|
|          101|          3|        Cathy|                  HR|
|          102|          2|          Bob|         Engineering|
|          103|          4|        David|             Finance|
|          104|          5|    Amrit Sir|           Marketing|
|          110|          8|        Frank|Research and Deve...|
+-------------+-----------+-------------+--------------------+



In [93]:
# Perforn innerjoin operation 
query = """
    SELECT e.department_id, e.employee_id, e.employee_name, d.department_name
    FROM employees  e
    INNER JOIN departments  d ON e.department_id = d.department_id
"""

# Execute the SQL query display result
combined_df = spark.sql(query).show()


+-------------+-----------+-------------+--------------------+
|department_id|employee_id|employee_name|     department_name|
+-------------+-----------+-------------+--------------------+
|          101|          1|  Pallavi mam|                  HR|
|          101|          3|        Cathy|                  HR|
|          102|          2|          Bob|         Engineering|
|          103|          4|        David|             Finance|
|          104|          5|    Amrit Sir|           Marketing|
|          110|          8|        Frank|Research and Deve...|
+-------------+-----------+-------------+--------------------+



### Inner Joins

Question: How can you retrieve employee names and their respective department names for employees belonging to the "Engineering" department?

In [92]:
# Perform an inner join to retrieve names and departments
retrieve_name_department= employee_df.join(department_df, 
                                            on="department_id", 
                                            how="inner")

# Display employees belonging to the "Engineering" department
retrieve_name_department.filter(col('department_name')=="Engineering"
                                ).select("employee_name","department_name"
                                         ).show()

+-------------+---------------+
|employee_name|department_name|
+-------------+---------------+
|          Bob|    Engineering|
+-------------+---------------+



In [97]:
# Query to retrieve employee names and department names in the "Engineering" department
query2 = """
    SELECT employee_name, department_name
    FROM employees e
    INNER JOIN departments d ON e.department_id = d.department_id
    WHERE d.department_name = 'Engineering'
"""

# Execute the SQL query and store the result in a DataFrame
engineering_employees = spark.sql(query2).show()


+-------------+---------------+
|employee_name|department_name|
+-------------+---------------+
|          Bob|    Engineering|
+-------------+---------------+



### Outer Joins

Question: Retrieve a DataFrame that contains all employees along with their department names. If an employee doesn't have a department assigned, display "No Department".

In [46]:
#Perform an outer join to retrieve all employees and departments,
# and fill missing department values with "No Department"
retrieve_all_employee= employee_df.join(department_df, 
                                        on="department_id", 
                                        how="outer").na.fill("No Department")

# Select and display employee names and department names
retrieve_all_employee.select(col("employee_name"), col("department_name")).show()

+-------------+--------------------+
|employee_name|     department_name|
+-------------+--------------------+
|        Alice|       No Department|
|          Eva|       No Department|
|        Henry|       No Department|
|No Department|             Finance|
|  Pallavi mam|                  HR|
|        Cathy|                  HR|
|          Bob|         Engineering|
|        David|             Finance|
|    Amrit Sir|           Marketing|
|No Department|          Operations|
|No Department|       No Department|
|No Department|          Operations|
|No Department|          Production|
|        Grace|       No Department|
|        Frank|Research and Deve...|
+-------------+--------------------+



In [100]:
# Query to retrieve all employees and departments,
# and fill missing department values with "No Department"
query3 = """
    SELECT e.employee_name, COALESCE(d.department_name, 'No Department') AS department_name
    FROM employees e
    FULL OUTER JOIN departments d ON e.department_id = d.department_id
"""

# Execute the SQL query and store the result in a DataFrame
all_employees = spark.sql(query3).show()

+-------------+--------------------+
|employee_name|     department_name|
+-------------+--------------------+
|        Alice|       No Department|
|          Eva|       No Department|
|        Henry|       No Department|
|         null|             Finance|
|  Pallavi mam|                  HR|
|        Cathy|                  HR|
|          Bob|         Engineering|
|        David|             Finance|
|    Amrit Sir|           Marketing|
|         null|          Operations|
|         null|       No Department|
|         null|          Operations|
|         null|          Production|
|        Grace|       No Department|
|        Frank|Research and Deve...|
+-------------+--------------------+



### Left Outer Joins

Question: List all employees along with their department names. If an employee doesn't have a department assigned, display "No Department".

In [101]:
# Perform a left outer join to retrieve all employees and their departments,
# and fill missing department values with "No Department"
all_employee= employee_df.join(department_df,
                               on="department_id",
                               how="left_outer")

# Select and display employee names and department names, filling missing department values
all_employee.select(
    col("employee_name"),
    coalesce(col("department_name"), lit("No Department")).alias("department_name")
).show()

+-------------+--------------------+
|employee_name|     department_name|
+-------------+--------------------+
|  Pallavi mam|                  HR|
|          Bob|         Engineering|
|        Cathy|                  HR|
|        David|             Finance|
|        Alice|       No Department|
|    Amrit Sir|           Marketing|
|          Eva|       No Department|
|        Henry|       No Department|
|        Grace|       No Department|
|        Frank|Research and Deve...|
+-------------+--------------------+



In [105]:
# Write a Spark SQL query to retrieve all employees and their departments,
# and fill missing department values with "No Department"
query4 = """
    SELECT e.employee_name, COALESCE(d.department_name, 'No Department') AS department_name
    FROM employees e
    LEFT JOIN departments d ON e.department_id = d.department_id
"""

# Execute the SQL query and display the result 
all_employees = spark.sql(query4).show()

+-------------+--------------------+
|employee_name|     department_name|
+-------------+--------------------+
|  Pallavi mam|                  HR|
|          Bob|         Engineering|
|        Cathy|                  HR|
|        David|             Finance|
|        Alice|       No Department|
|    Amrit Sir|           Marketing|
|          Eva|       No Department|
|        Henry|       No Department|
|        Grace|       No Department|
|        Frank|Research and Deve...|
+-------------+--------------------+



### Right Outer Joins

Question: Display a list of departments along with employee names. If a department has no employees, display "No Employees".



In [106]:
# Perform a right outer join to retrieve all departments and their employees
all_department= employee_df.join(department_df,
                               on="department_id",
                               how="right_outer")

# Select and display department names and employee names, filling missing employee names
all_department.select(
    col("department_name"), 
    coalesce(col("employee_name"), 
             lit("No Employee")
             ).alias("employee_name")
    ).show()

+--------------------+-------------+
|     department_name|employee_name|
+--------------------+-------------+
|                  HR|        Cathy|
|                  HR|  Pallavi mam|
|         Engineering|          Bob|
|             Finance|        David|
|           Marketing|    Amrit Sir|
|          Operations|  No Employee|
|                null|  No Employee|
|          Production|  No Employee|
|             Finance|  No Employee|
|          Operations|  No Employee|
|Research and Deve...|        Frank|
+--------------------+-------------+



In [108]:
# Query to retrieve all departments and their employees
query5 = """
    SELECT d.department_name, COALESCE(e.employee_name, 'No Employee') AS employee_name
    FROM departments d
    LEFT JOIN employees e ON d.department_id = e.department_id
"""

# Execute the SQL query and display the result in a DataFrame
all_departments = spark.sql(query5).show()

+--------------------+-------------+
|     department_name|employee_name|
+--------------------+-------------+
|                  HR|        Cathy|
|                  HR|  Pallavi mam|
|         Engineering|          Bob|
|             Finance|        David|
|           Marketing|    Amrit Sir|
|          Operations|  No Employee|
|                null|  No Employee|
|          Production|  No Employee|
|             Finance|  No Employee|
|          Operations|  No Employee|
|Research and Deve...|        Frank|
+--------------------+-------------+



### Left Semi Joins

Question: Retrieve a DataFrame that includes employee names for departments that have employees.



In [73]:
# Perform a left semi-join to retrieve employees who have a department
employee_having_department= employee_df.join(department_df,
                                             on="department_id",
                                             how="left_semi")

# Select and display employee names
employee_having_department.select(col("employee_name")).show()

+-------------+
|employee_name|
+-------------+
|  Pallavi mam|
|        Cathy|
|          Bob|
|        David|
|    Amrit Sir|
|        Frank|
+-------------+



In [111]:
#  Query to retrieve employees who have a department
query5 = """
    SELECT e.employee_name
    FROM employees e
    WHERE EXISTS (
        SELECT 1
        FROM departments d
        WHERE e.department_id = d.department_id
    )
"""

# Execute the SQL query and dispaly the result in a DataFrame
employees_with_departments = spark.sql(query5).show()


+-------------+
|employee_name|
+-------------+
|  Pallavi mam|
|        Cathy|
|          Bob|
|        David|
|    Amrit Sir|
|        Frank|
+-------------+



### Left Anti Joins

Question: Find the employees who don't belong to any department.

In [110]:
# Perform a left anti-join to retrieve employees who do not have a department
employee_without_department= employee_df.join(department_df,
                                             on="department_id",
                                             how="left_anti")

# Select and display employee names not belonging to any deppartment
employee_without_department.select(col("employee_name")).show()

+-------------+
|employee_name|
+-------------+
|        Alice|
|          Eva|
|        Henry|
|        Grace|
+-------------+



In [114]:
# Query to retrieve employees who do not have a department
query6 = """
    SELECT e.employee_name
    FROM employees e
    WHERE NOT EXISTS (
        SELECT 1
        FROM departments d
        WHERE e.department_id = d.department_id
    )
"""

# Execute the SQL query and DISPLAY the result in a DataFrame
employees_without_department = spark.sql(query6).show()

+-------------+
|employee_name|
+-------------+
|        Alice|
|          Eva|
|        Henry|
|        Grace|
+-------------+



### Cross (Cartesian) Joins

Question: Create a DataFrame that contains all possible combinations of employees and departments.

In [115]:
# Perform a cross-join to create all possible combinations of employees and departments
all_possible_combo= employee_df.crossJoin(department_df).orderBy("employee_id")

# Show the result DataFrame
all_possible_combo.show()



+-----------+-------------+-------------+-------------+--------------------+
|employee_id|employee_name|department_id|department_id|     department_name|
+-----------+-------------+-------------+-------------+--------------------+
|          1|  Pallavi mam|          101|          101|                  HR|
|          1|  Pallavi mam|          101|          103|             Finance|
|          1|  Pallavi mam|          101|          107|          Operations|
|          1|  Pallavi mam|          101|          104|           Marketing|
|          1|  Pallavi mam|          101|          105|          Operations|
|          1|  Pallavi mam|          101|          108|          Production|
|          1|  Pallavi mam|          101|          102|         Engineering|
|          1|  Pallavi mam|          101|         null|             Finance|
|          1|  Pallavi mam|          101|          106|                null|
|          1|  Pallavi mam|          101|          110|Research and Deve...|

                                                                                

In [117]:
# Query to perform a cross-join and order the result by "employee_id"
query7 = """
    SELECT *
    FROM employees
    CROSS JOIN departments
    ORDER BY employees.employee_id
"""

# Execute the SQL query and dispaly the result in a DataFrame
all_possible_combinations = spark.sql(query7).show()



+-----------+-------------+-------------+-------------+--------------------+
|employee_id|employee_name|department_id|department_id|     department_name|
+-----------+-------------+-------------+-------------+--------------------+
|          1|  Pallavi mam|          101|          107|          Operations|
|          1|  Pallavi mam|          101|          103|             Finance|
|          1|  Pallavi mam|          101|          101|                  HR|
|          1|  Pallavi mam|          101|          104|           Marketing|
|          1|  Pallavi mam|          101|          105|          Operations|
|          1|  Pallavi mam|          101|          102|         Engineering|
|          1|  Pallavi mam|          101|          108|          Production|
|          1|  Pallavi mam|          101|          106|                null|
|          1|  Pallavi mam|          101|         null|             Finance|
|          1|  Pallavi mam|          101|          110|Research and Deve...|

                                                                                