In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, when, lit , avg, coalesce , struct,array , explode, create_map
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType


# Create a Spark session
spark = SparkSession.builder.appName("day4").getOrCreate()


23/09/03 09:34:29 WARN Utils: Your hostname, rojesh-Predator-PHN16-71 resolves to a loopback address: 127.0.1.1; using 192.168.254.218 instead (on interface wlp0s20f3)
23/09/03 09:34:29 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/09/03 09:34:30 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:

#defining the schema for employees_df
employees_schema = StructType([
    StructField("employee_id", IntegerType(), True),
    StructField("employee_name", StringType(), True),
    StructField("department_id", IntegerType(), True)
])

#creating employees_df DataFrame
employees_data = [
    (1, "Pallavi mam", 101),
    (2, "Bob", 102),
    (3, "Cathy", 101),
    (4, "David", 103),
    (5, "Amrit Sir", 104),
    (6, "Alice", None),
    (7, "Eva", None),
    (8, "Frank", 110),
    (9, "Grace", 109),
    (10, "Henry", None)
]

employees_df = spark.createDataFrame(employees_data, schema=employees_schema)

#defining the schema for departments_df
departments_schema = StructType([
    StructField("department_id", IntegerType(), True),
    StructField("department_name", StringType(), True)
])

#creating departments_df DataFrame
departments_data = [
    (101, "HR"),
    (102, "Engineering"),
    (103, "Finance"),
    (104, "Marketing"),
    (105, "Operations"),
    (106, None),
    (107, "Operations"),
    (108, "Production"),
    (None, "Finance"),
    (110, "Research and Development")
]

departments_df = spark.createDataFrame(departments_data, schema=departments_schema)

#showing the DataFrames
employees_df.show()
departments_df.show()



+-----------+-------------+-------------+
|employee_id|employee_name|department_id|
+-----------+-------------+-------------+
|          1|  Pallavi mam|          101|
|          2|          Bob|          102|
|          3|        Cathy|          101|
|          4|        David|          103|
|          5|    Amrit Sir|          104|
|          6|        Alice|         null|
|          7|          Eva|         null|
|          8|        Frank|          110|
|          9|        Grace|          109|
|         10|        Henry|         null|
+-----------+-------------+-------------+

+-------------+--------------------+
|department_id|     department_name|
+-------------+--------------------+
|          101|                  HR|
|          102|         Engineering|
|          103|             Finance|
|          104|           Marketing|
|          105|          Operations|
|          106|                null|
|          107|          Operations|
|          108|          Production|
|   

In [4]:
employees_df.printSchema()


departments_df.printSchema()

root
 |-- employee_id: integer (nullable = true)
 |-- employee_name: string (nullable = true)
 |-- department_id: integer (nullable = true)

root
 |-- department_id: integer (nullable = true)
 |-- department_name: string (nullable = true)



### Join Expressions

Question: How can you combine the employees_df and departments_df DataFrames based on the common "department_id" column to get a combined DataFrame with employee names and their respective department names?

In [5]:

#combining the DataFrames using an inner join on the "department_id" column
combined_df = employees_df.join(departments_df, on='department_id', how='inner')

#selecting the columns you want to keep in the combined DataFrame
selected_columns = ["employee_name", "department_name"]

#selecting only the desired columns
result_df = combined_df.select(selected_columns)

#showing the result DataFrame
result_df.show()

+-------------+--------------------+
|employee_name|     department_name|
+-------------+--------------------+
|  Pallavi mam|                  HR|
|        Cathy|                  HR|
|          Bob|         Engineering|
|        David|             Finance|
|    Amrit Sir|           Marketing|
|        Frank|Research and Deve...|
+-------------+--------------------+



### Inner Joins

Question: How can you retrieve employee names and their respective department names for employees belonging to the "Engineering" department?

In [6]:
#combining the DataFrames using an inner join on the "department_id" column
combined_df = employees_df.join(departments_df, on='department_id', how='inner')

#filtering the combined DataFrame to get employees in the "Engineering" department
engineering_employees_df = combined_df.filter(combined_df.department_name == "Engineering")

#selecting employee names and department names
result_df = engineering_employees_df.select("employee_name", "department_name")

#showing the result DataFrame
result_df.show()

+-------------+---------------+
|employee_name|department_name|
+-------------+---------------+
|          Bob|    Engineering|
+-------------+---------------+



In [7]:
#renaming the "department_id" column in employees_df to match departments_df
employees_df = employees_df.withColumnRenamed("department_id", "dept_id")

#performing a right outer join on the renamed "dept_id" column
all_departments_df = departments_df.join(employees_df, departments_df["department_id"] == employees_df["dept_id"], "right_outer")

#replacing null employee names with "No Employees"
result_df = all_departments_df.withColumn(
    "employee_name",
    coalesce(all_departments_df["employee_name"], lit("No Employees"))
)

#selecting department names and employee names
result_df = result_df.select("department_name", "employee_name")

#showing the result DataFrame
result_df.show()

+--------------------+-------------+
|     department_name|employee_name|
+--------------------+-------------+
|                  HR|  Pallavi mam|
|         Engineering|          Bob|
|                  HR|        Cathy|
|             Finance|        David|
|           Marketing|    Amrit Sir|
|                null|        Alice|
|                null|          Eva|
|Research and Deve...|        Frank|
|                null|        Grace|
|                null|        Henry|
+--------------------+-------------+



### Left Outer Joins

Question: List all employees along with their department names. If an employee doesn't have a department assigned, display "No Department".

In [8]:


#performing a left outer join on the renamed "dept_id" column
all_employees_df = employees_df.join(departments_df, employees_df["dept_id"] == departments_df["department_id"], "left_outer")

#replacing null department names with "No Department"
result_df = all_employees_df.withColumn(
    "department_name",
    coalesce(all_employees_df["department_name"], lit("No Department"))
)

#selecting employee names and department names
result_df2 = result_df.select("employee_name", "department_name")

#shwoing the result DataFrame
result_df2.show()

+-------------+--------------------+
|employee_name|     department_name|
+-------------+--------------------+
|  Pallavi mam|                  HR|
|          Bob|         Engineering|
|        Cathy|                  HR|
|        David|             Finance|
|    Amrit Sir|           Marketing|
|        Alice|       No Department|
|          Eva|       No Department|
|        Frank|Research and Deve...|
|        Grace|       No Department|
|        Henry|       No Department|
+-------------+--------------------+



                                                                                

### Right Outer Joins

Question: Display a list of departments along with employee names. If a department has no employees, display "No Employees".



In [9]:
#renaming the "department_id" column in employees_df to match departments_df
employees_df = employees_df.withColumnRenamed("department_id", "dept_id")

#performing a right outer join on the renamed "dept_id" column
all_departments_df = departments_df.join(employees_df, departments_df["department_id"] == employees_df["dept_id"], "right_outer")

#replacing null department names with "No Employees" and null employee names with "No Employee"
result_df = all_departments_df.withColumn(
    "department_name",
    coalesce(all_departments_df["department_name"], lit("No Employees"))
).withColumn(
    "employee_name",
    coalesce(all_departments_df["employee_name"], lit("No Employee"))
)

#selecting department names and employee names
result_df = result_df.select("department_name", "employee_name")

#shweing the result DataFrame

result_df.show()


+--------------------+-------------+
|     department_name|employee_name|
+--------------------+-------------+
|                  HR|  Pallavi mam|
|         Engineering|          Bob|
|                  HR|        Cathy|
|             Finance|        David|
|           Marketing|    Amrit Sir|
|        No Employees|        Alice|
|        No Employees|          Eva|
|Research and Deve...|        Frank|
|        No Employees|        Grace|
|        No Employees|        Henry|
+--------------------+-------------+



### Left Semi Joins

Question: Retrieve a DataFrame that includes employee names for departments that have employees.



In [10]:

#renaming columns in the DataFrames using alias
departments_df = departments_df.withColumnRenamed("department_id", "dept_id")
employees_df = employees_df.withColumnRenamed("department_id", "dept_id")

#performing a left semi join using the "dept_id" column from both DataFrames
departments_with_employees_df = departments_df.join(
    employees_df,
    departments_df["dept_id"] == employees_df["dept_id"],
    "left_semi"
)

#selecting department names and employee names
result_df = departments_with_employees_df.select(col("department_name"), col("employee_name"))

#showing the result DataFrame
result_df.show()

AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `employee_name` cannot be resolved. Did you mean one of the following? [`department_name`, `dept_id`].;
'Project [department_name#39, 'employee_name]
+- Join LeftSemi, (dept_id#188 = dept_id#94)
   :- Project [department_id#38 AS dept_id#188, department_name#39]
   :  +- LogicalRDD [department_id#38, department_name#39], false
   +- Project [employee_id#32, employee_name#33, department_id#34 AS dept_id#94]
      +- LogicalRDD [employee_id#32, employee_name#33, department_id#34], false


### Left Anti Joins

Question: Find the employees who don't belong to any department.

In [11]:
#performing a left anti join to find employees who don't belong to any department
employees_without_departments_df = employees_df.join(
    departments_df,
    employees_df["dept_id"] == departments_df["dept_id"],
    "left_anti"
)

#selecting the columns you want to keep
result_df = employees_without_departments_df.select(col("employee_name"))

#shwoing the employees without departments
result_df.show()

+-------------+
|employee_name|
+-------------+
|        Alice|
|          Eva|
|        Grace|
|        Henry|
+-------------+



### Cross (Cartesian) Joins

Question: Create a DataFrame that contains all possible combinations of employees and departments.

In [12]:
#creating a new column with a constant value of 1 in both DataFrames
employees_df = employees_df.withColumn("join_key", lit(1))
departments_df = departments_df.withColumn("join_key", lit(1))

#performing a cross join to create all possible combinations
cross_joined_df = employees_df.crossJoin(departments_df)

#selecting the columns you want to keep
result_df = cross_joined_df.select("employee_id", "department_id", "employee_name", "department_name")

#shwoing the result DataFrame
result_df.show()

AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `department_id` cannot be resolved. Did you mean one of the following? [`department_name`, `dept_id`, `dept_id`, `employee_id`, `employee_name`].;
'Project [employee_id#32, 'department_id, employee_name#33, department_name#39]
+- Join Cross
   :- Project [employee_id#32, employee_name#33, dept_id#94, 1 AS join_key#210]
   :  +- Project [employee_id#32, employee_name#33, department_id#34 AS dept_id#94]
   :     +- LogicalRDD [employee_id#32, employee_name#33, department_id#34], false
   +- Project [dept_id#188, department_name#39, 1 AS join_key#215]
      +- Project [department_id#38 AS dept_id#188, department_name#39]
         +- LogicalRDD [department_id#38, department_name#39], false
