## Joins

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, when, lit , avg, coalesce , struct,array , explode, create_map,approx_count_distinct,sumDistinct, sum, mean
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType
# Build (or reuse) the SparkSession that the rest of the notebook runs its
# DataFrame and SQL operations through.
session_builder = SparkSession.builder.appName("day3")
spark = session_builder.getOrCreate()


In [4]:


# Schemas: every column is declared nullable; the sample rows below contain
# None in department_id and department_name.
employee_schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("Employee_Id", IntegerType()),
        ("Employee_name", StringType()),
        ("department_id", IntegerType()),
    )
])

department_schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("department_id", IntegerType()),
        ("department_name", StringType()),
    )
])

# Employee rows as (Employee_Id, Employee_name, department_id).
# Employees 6, 7 and 10 have no department_id, and employee 9 points at
# department 109, which does not appear in the department rows.
employee_data = [
    (1, "Pallavi mam", 101),
    (2, "Bob", 102),
    (3, "Cathy", 101),
    (4, "David", 103),
    (5, "Amrit Sir", 104),
    (6, "Alice", None),
    (7, "Eva", None),
    (8, "Frank", 110),
    (9, "Grace", 109),
    (10, "Henry", None),
]

# Department rows as (department_id, department_name).
# Department 106 has no name, one "Finance" row has no id, and departments
# 105-108 are not referenced by any employee row.
Department_Data = [
    (101, "Hr"),
    (102, "Engineering"),
    (103, "Finance"),
    (104, "Marketing"),
    (105, "Operation"),
    (106, None),
    (107, "Operations"),
    (108, "Production"),
    (None, "Finance"),
    (110, "Research and Development"),
]

# Materialise both row lists as DataFrames with their explicit schemas.
employee_df = spark.createDataFrame(employee_data, employee_schema)
department_df = spark.createDataFrame(Department_Data, department_schema)

def _print_frame(title, frame, **show_options):
    """Print ``title`` followed by the schema and rows of ``frame``.

    Any ``show_options`` are forwarded unchanged to ``frame.show``.
    """
    print(title)
    frame.printSchema()
    frame.show(**show_options)


_print_frame("Employee Dataframe", employee_df)
# truncate=False keeps the long department names fully visible.
_print_frame("Department Dataframe", department_df, truncate=False)

Employee Dataframe
root
 |-- Employee_Id: integer (nullable = true)
 |-- Employee_name: string (nullable = true)
 |-- department_id: integer (nullable = true)



                                                                                

+-----------+-------------+-------------+
|Employee_Id|Employee_name|department_id|
+-----------+-------------+-------------+
|          1|  Pallavi mam|          101|
|          2|          Bob|          102|
|          3|        Cathy|          101|
|          4|        David|          103|
|          5|    Amrit Sir|          104|
|          6|        Alice|         null|
|          7|          Eva|         null|
|          8|        Frank|          110|
|          9|        Grace|          109|
|         10|        Henry|         null|
+-----------+-------------+-------------+

Department Dataframe
root
 |-- department_id: integer (nullable = true)
 |-- department_name: string (nullable = true)

+-------------+------------------------+
|department_id|department_name         |
+-------------+------------------------+
|101          |Hr                      |
|102          |Engineering             |
|103          |Finance                 |
|104          |Marketing               |
|105

In [24]:
# Expose both DataFrames to Spark SQL under the table names the queries use.
for view_name, frame in (("employees", employee_df), ("departments", department_df)):
    frame.createOrReplaceTempView(view_name)

### Join Expressions

Question: How can you combine the `employee_df` and `department_df` DataFrames on their common `department_id` column to get a single DataFrame containing employee names and their respective department names?

In [9]:
# A plain JOIN is an inner join: only employees whose department_id matches a
# row in departments appear in the result.
combined_query = """
    SELECT e.employee_id , d.department_id ,e.Employee_name, d.department_name
    FROM employees e
    JOIN departments d
    ON e.department_id = d.department_id
"""
combined_df = spark.sql(combined_query)

# Print the joined rows without truncating long department names.
combined_df.show(truncate=False)

+-----------+-------------+-------------+------------------------+
|employee_id|department_id|Employee_name|department_name         |
+-----------+-------------+-------------+------------------------+
|1          |101          |Pallavi mam  |Hr                      |
|3          |101          |Cathy        |Hr                      |
|2          |102          |Bob          |Engineering             |
|4          |103          |David        |Finance                 |
|5          |104          |Amrit Sir    |Marketing               |
|8          |110          |Frank        |Research and Development|
+-----------+-------------+-------------+------------------------+



### Inner Joins

Question: How can you retrieve employee names and their respective department names for employees belonging to the "Engineering" department?

In [10]:
# Inner join on department_id, then keep only rows whose department is
# named 'Engineering'.
engineering_query = """
    SELECT e.Employee_name, d.department_name
    FROM employees e
    JOIN departments d
    ON e.department_id = d.department_id
    WHERE d.department_name = 'Engineering'
"""
result_df = spark.sql(engineering_query)

# Print the matching employees.
result_df.show(truncate=False)

+-------------+---------------+
|Employee_name|department_name|
+-------------+---------------+
|Bob          |Engineering    |
+-------------+---------------+



### Outer Joins

Question: Retrieve a DataFrame that contains all employees and all departments. If an employee has no matching department, display "No Department"; if a department has no matching employee, display "No Employee".

In [14]:
# FULL OUTER JOIN keeps unmatched rows from both tables. COALESCE swaps the
# resulting NULLs for placeholder labels; a department row whose name is
# itself NULL is therefore also shown as 'No Department'.
full_outer_query = """
    SELECT COALESCE(e.Employee_name, 'No Employee') AS employee_name, 
           COALESCE(d.department_name, 'No Department') AS department_name
    FROM employees e
    FULL OUTER JOIN departments d
    ON d.department_id = e.department_id
"""
outer_join_sql = spark.sql(full_outer_query)

# Print every employee/department pairing, matched or not.
outer_join_sql.show(truncate=False)


+-------------+------------------------+
|employee_name|department_name         |
+-------------+------------------------+
|Alice        |No Department           |
|Eva          |No Department           |
|Henry        |No Department           |
|No Employee  |Finance                 |
|Pallavi mam  |Hr                      |
|Cathy        |Hr                      |
|Bob          |Engineering             |
|David        |Finance                 |
|Amrit Sir    |Marketing               |
|No Employee  |Operation               |
|No Employee  |No Department           |
|No Employee  |Operations              |
|No Employee  |Production              |
|Grace        |No Department           |
|Frank        |Research and Development|
+-------------+------------------------+



### Left Outer Joins

Question: List all employees along with their department names. If an employee doesn't have a department assigned, display "No Department".

In [15]:
# LEFT JOIN keeps every employee; employees without a matching department
# get the 'No Department' placeholder via COALESCE.
left_outer_query = """
    SELECT e.Employee_name, COALESCE(d.department_name, 'No Department') AS department_name
    FROM employees e
    LEFT JOIN departments d
    ON e.department_id = d.department_id
"""
left_outer_join_sql = spark.sql(left_outer_query)

# Print the result (default show() settings, so long names are truncated).
left_outer_join_sql.show()

+-------------+--------------------+
|Employee_name|     department_name|
+-------------+--------------------+
|  Pallavi mam|                  Hr|
|          Bob|         Engineering|
|        Cathy|                  Hr|
|        David|             Finance|
|    Amrit Sir|           Marketing|
|        Alice|       No Department|
|          Eva|       No Department|
|        Frank|Research and Deve...|
|        Grace|       No Department|
|        Henry|       No Department|
+-------------+--------------------+



### Right Outer Joins

Question: Display a list of departments along with employee names. If a department has no employees, display "No Employees".



In [18]:
# RIGHT JOIN keeps every department; departments without a matching employee
# get the 'No Employees' placeholder. department_name is selected as-is, so
# a department with a NULL name still shows as null.
right_outer_query = """
    SELECT COALESCE(e.Employee_name, 'No Employees') AS employee_name, d.department_name
    FROM employees e
    RIGHT JOIN departments d
    ON e.department_id = d.department_id
"""
right_outer_join_sql = spark.sql(right_outer_query)

# Print the result (default show() settings, so long names are truncated).
right_outer_join_sql.show()

+-------------+--------------------+
|employee_name|     department_name|
+-------------+--------------------+
|        Cathy|                  Hr|
|  Pallavi mam|                  Hr|
|          Bob|         Engineering|
|        David|             Finance|
|    Amrit Sir|           Marketing|
| No Employees|           Operation|
| No Employees|                null|
| No Employees|          Operations|
| No Employees|          Production|
| No Employees|             Finance|
|        Frank|Research and Deve...|
+-------------+--------------------+



### Left Semi Joins

Question: Retrieve a DataFrame that includes only the names of employees whose department_id matches a department in the departments table.



In [30]:

# LEFT SEMI JOIN returns only employee columns, and only for employees whose
# department_id has a match in departments.
left_semi_query = """
    SELECT e.Employee_name
    FROM employees e
    LEFT SEMI JOIN departments d
    ON e.department_id = d.department_id
"""
left_semi_join_sql = spark.sql(left_semi_query)

# Print the employees that have a matching department.
left_semi_join_sql.show()


+-------------+
|Employee_name|
+-------------+
|  Pallavi mam|
|        Cathy|
|          Bob|
|        David|
|    Amrit Sir|
|        Frank|
+-------------+



### Left Anti Joins

Question: Find the employees who don't belong to any department in the departments table (their department_id is missing or has no match).

In [32]:

# LEFT ANTI JOIN is the complement of the semi join: it returns the
# employees whose department_id has no match in departments, including
# those whose department_id is NULL.
left_anti_query = """
    SELECT e.Employee_name
    FROM employees e
    LEFT ANTI JOIN departments d
    ON e.department_id = d.department_id
"""
left_anti_join_sql = spark.sql(left_anti_query)

# Print the employees without a matching department.
left_anti_join_sql.show()

+-------------+
|Employee_name|
+-------------+
|        Alice|
|          Eva|
|        Grace|
|        Henry|
+-------------+



### Cross (Cartesian) Joins

Question: Create a DataFrame that contains all possible combinations of employees and departments.

In [37]:
# CROSS JOIN pairs every employee row with every department row (no ON
# clause). SELECT * keeps all columns from both tables, so the result has
# two department_id columns, one from each side.
cross_join_query = """
    SELECT *
    FROM employees e
    CROSS JOIN departments d
"""
cross_join_sql = spark.sql(cross_join_query)

# Print the first rows of the Cartesian product (show() defaults apply).
cross_join_sql.show()




+-----------+-------------+-------------+-------------+--------------------+
|Employee_Id|Employee_name|department_id|department_id|     department_name|
+-----------+-------------+-------------+-------------+--------------------+
|          1|  Pallavi mam|          101|          101|                  Hr|
|          1|  Pallavi mam|          101|          102|         Engineering|
|          1|  Pallavi mam|          101|          103|             Finance|
|          1|  Pallavi mam|          101|          104|           Marketing|
|          1|  Pallavi mam|          101|          105|           Operation|
|          1|  Pallavi mam|          101|          106|                null|
|          1|  Pallavi mam|          101|          107|          Operations|
|          1|  Pallavi mam|          101|          108|          Production|
|          1|  Pallavi mam|          101|         null|             Finance|
|          1|  Pallavi mam|          101|          110|Research and Deve...|

                                                                                