In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, when, lit , avg, coalesce , struct,array , explode, create_map
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType


# Create a Spark session
spark = SparkSession.builder.appName("day4").getOrCreate()


23/09/03 19:57:16 WARN Utils: Your hostname, rojesh-Predator-PHN16-71 resolves to a loopback address: 127.0.1.1; using 192.168.18.4 instead (on interface wlp0s20f3)
23/09/03 19:57:16 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/09/03 19:57:17 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/09/03 19:57:17 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
23/09/03 19:57:17 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [5]:


employee_data = [(1,'Pallavi mam',101),
                 (2,'Bob',102),
                 (3,'Cathy',101),
                 (4,'David',103),
                 (5,'Amrit Sir',104),
                 (6,'Alice',None),
                 (7,'Eva',None),
                 (8,'Frank',110),
                 (9,'Grace',109),
                 (10,'Henry',None)]

Department_Data = [(101,'Hr'),
                   (102,'Engineering'),
                   (103,'Finance'),
                   (104,'Marketing'),
                   (105,'Operation'),
                   (106,None),
                   (107,'Operations'),
                   (108,'Production'),
                   (None,'Finance'),
                   (110,'Research and Development')]

employee_schema = StructType([
    StructField("Employee_Id",IntegerType(),True),
    StructField("Employee_name",StringType(),True),
    StructField("department_id",IntegerType(),True)
])

department_schema = StructType([
    StructField("department_id",IntegerType(),True),
    StructField("department_name",StringType(),True)
])

employee_df  = spark.createDataFrame(data=employee_data,schema=employee_schema)
department_df = spark.createDataFrame(data=Department_Data, schema=department_schema)

print("Employee Dataframe")
employee_df.printSchema()
employee_df.show()

print("Department Dataframe")
department_df.printSchema()
department_df.show(truncate=False)

Employee Dataframe
root
 |-- Employee_Id: integer (nullable = true)
 |-- Employee_name: string (nullable = true)
 |-- department_id: integer (nullable = true)



                                                                                

+-----------+-------------+-------------+
|Employee_Id|Employee_name|department_id|
+-----------+-------------+-------------+
|          1|  Pallavi mam|          101|
|          2|          Bob|          102|
|          3|        Cathy|          101|
|          4|        David|          103|
|          5|    Amrit Sir|          104|
|          6|        Alice|         null|
|          7|          Eva|         null|
|          8|        Frank|          110|
|          9|        Grace|          109|
|         10|        Henry|         null|
+-----------+-------------+-------------+

Department Dataframe
root
 |-- department_id: integer (nullable = true)
 |-- department_name: string (nullable = true)

+-------------+------------------------+
|department_id|department_name         |
+-------------+------------------------+
|101          |Hr                      |
|102          |Engineering             |
|103          |Finance                 |
|104          |Marketing               |
|105

In [6]:
employee_df.printSchema()


department_df.printSchema()

root
 |-- Employee_Id: integer (nullable = true)
 |-- Employee_name: string (nullable = true)
 |-- department_id: integer (nullable = true)

root
 |-- department_id: integer (nullable = true)
 |-- department_name: string (nullable = true)



### Join Expressions

Question: How can you combine the employees_df and departments_df DataFrames based on the common "department_id" column to get a combined DataFrame with employee names and their respective department names?

In [7]:

#combining the DataFrames using an inner join on the "department_id" column
combined_df = employee_df.join(department_df, on='department_id', how='inner')

#selecting the columns you want to keep in the combined DataFrame
selected_columns = ["employee_name", "department_name"]

#selecting only the desired columns
result_df = combined_df.select(selected_columns)

#showing the result DataFrame
result_df.show()

+-------------+--------------------+
|employee_name|     department_name|
+-------------+--------------------+
|  Pallavi mam|                  Hr|
|        Cathy|                  Hr|
|          Bob|         Engineering|
|        David|             Finance|
|    Amrit Sir|           Marketing|
|        Frank|Research and Deve...|
+-------------+--------------------+



### Inner Joins

Question: How can you retrieve employee names and their respective department names for employees belonging to the "Engineering" department?

In [9]:
#combining the DataFrames using an inner join on the "department_id" column
combined_df = employee_df.join(department_df, on='department_id', how='inner')

#filtering the combined DataFrame to get employees in the "Engineering" department
engineering_employees_df = combined_df.filter(combined_df.department_name == "Engineering")

#selecting employee names and department names
result_df = engineering_employees_df.select("employee_name", "department_name")

#showing the result DataFrame
result_df.show()

+-------------+---------------+
|employee_name|department_name|
+-------------+---------------+
|          Bob|    Engineering|
+-------------+---------------+



### Outer Joins

Question: Retrieve a DataFrame that contains all employees along with their department names. If an employee doesn't have a department assigned, display "No Department".

In [12]:
joinExpression = department_df['department_id'] == employee_df['department_id']
joinType = 'outer'

#Outer joined two dataframes which returned every value possible and null if any value is not present
outer_joined_df = department_df.join(employee_df, joinExpression, joinType).select(employee_df['employee_name'],department_df['department_name'])

#filled null with default value as No Department
No_dept_filled = outer_joined_df.na.fill('No Department')
No_dept_filled.show()

+-------------+--------------------+
|employee_name|     department_name|
+-------------+--------------------+
|No Department|             Finance|
|        Alice|       No Department|
|          Eva|       No Department|
|        Henry|       No Department|
|  Pallavi mam|                  Hr|
|        Cathy|                  Hr|
|          Bob|         Engineering|
|        David|             Finance|
|    Amrit Sir|           Marketing|
|No Department|           Operation|
|No Department|       No Department|
|No Department|          Operations|
|No Department|          Production|
|        Grace|       No Department|
|        Frank|Research and Deve...|
+-------------+--------------------+



### Left Outer Joins

Question: List all employees along with their department names. If an employee doesn't have a department assigned, display "No Department".

In [14]:
joinType = 'left_outer'
employee_df.join(department_df,joinExpression,joinType).select(employee_df['employee_name'],department_df['department_name']).na.fill('no Department').show()

+-------------+--------------------+
|employee_name|     department_name|
+-------------+--------------------+
|  Pallavi mam|                  Hr|
|          Bob|         Engineering|
|        Cathy|                  Hr|
|        David|             Finance|
|    Amrit Sir|           Marketing|
|        Alice|       no Department|
|          Eva|       no Department|
|        Frank|Research and Deve...|
|        Grace|       no Department|
|        Henry|       no Department|
+-------------+--------------------+



### Right Outer Joins

Question: Display a list of departments along with employee names. If a department has no employees, display "No Employees".



In [18]:
joinType = 'right_outer'

employee_df.join(department_df,joinExpression,joinType).select(department_df['department_name'],employee_df['employee_name']).na.fill('No Employee', subset=['employee_name']).show()


+--------------------+-------------+
|     department_name|employee_name|
+--------------------+-------------+
|                  Hr|        Cathy|
|                  Hr|  Pallavi mam|
|         Engineering|          Bob|
|             Finance|        David|
|           Marketing|    Amrit Sir|
|           Operation|  No Employee|
|                null|  No Employee|
|          Operations|  No Employee|
|          Production|  No Employee|
|             Finance|  No Employee|
|Research and Deve...|        Frank|
+--------------------+-------------+



### Left Semi Joins

Question: Retrieve a DataFrame that includes employee names for departments that have employees.



In [19]:
joinType = 'left_semi'

employee_df.join(department_df,joinExpression,joinType).select(employee_df['employee_name']).show()

+-------------+
|employee_name|
+-------------+
|  Pallavi mam|
|        Cathy|
|          Bob|
|        David|
|    Amrit Sir|
|        Frank|
+-------------+



### Left Anti Joins

Question: Find the employees who don't belong to any department.

In [22]:
#performing a left anti join to find employees who don't belong to any department
employees_without_departments_df = employee_df.join(
    department_df,
    joinExpression,
    "left_anti"
)

#selecting the columns you want to keep
result_df = employees_without_departments_df.select(col("employee_name"))

#shwoing the employees without departments
result_df.show()

+-------------+
|employee_name|
+-------------+
|        Alice|
|          Eva|
|        Grace|
|        Henry|
+-------------+



### Cross (Cartesian) Joins

Question: Create a DataFrame that contains all possible combinations of employees and departments.

In [24]:
joinType = 'cross'
employee_df.crossJoin(department_df).show()




+-----------+-------------+-------------+-------------+--------------------+
|Employee_Id|Employee_name|department_id|department_id|     department_name|
+-----------+-------------+-------------+-------------+--------------------+
|          1|  Pallavi mam|          101|          101|                  Hr|
|          1|  Pallavi mam|          101|          102|         Engineering|
|          1|  Pallavi mam|          101|          103|             Finance|
|          1|  Pallavi mam|          101|          104|           Marketing|
|          1|  Pallavi mam|          101|          105|           Operation|
|          1|  Pallavi mam|          101|          106|                null|
|          1|  Pallavi mam|          101|          107|          Operations|
|          1|  Pallavi mam|          101|          108|          Production|
|          1|  Pallavi mam|          101|         null|             Finance|
|          1|  Pallavi mam|          101|          110|Research and Deve...|

                                                                                