In [2]:
'''
Write a PySpark query to retrieve employees who earn more than the average salary of their respective department. The query should output the employee's name, department name, and salary.

Use the provided employee and department dataframes to solve this challenge.

Use display(df) to show the final DataFrame.

Starter Code:
from pyspark.sql import SparkSession

# Start a Spark session
spark = SparkSession.builder \
    .appName("Employees Above Average Salary") \
    .master("local[*]") \
    .getOrCreate()

# Employee DataFrame
employee_data = [
    (1, "Alice", 5000, 1),
    (2, "Bob", 7000, 2),
    (3, "Charlie", 4000, 1),
    (4, "David", 6000, 2),
    (5, "Eve", 8000, 3),
    (6, "Kev", 9000, 3),
    (7, "Mev", 10000, 3),
    (8, "Mob", 12000, 2)
]
employee_columns = ["employee_id", "employee_name", "salary", "department_id"]
emp_df = spark.createDataFrame(employee_data, employee_columns)

# Department DataFrame
department_data = [
    (1, "HR"),
    (2, "Engineering"),
    (3, "Finance")
]
department_columns = ["department_id", "department_name"]
dept_df = spark.createDataFrame(department_data, department_columns)

# Display dataframes (optional)
emp_df.show()
dept_df.show()
Output Schema:
Column Name	Type	Description
employee_name	string	Name of the employee
department_name	string	Name of the department
salary	int	Salary of the employee
Example Output:
employee_name	department_name	salary
Alice	HR	5000
Mob	Engineering	12000
Mev	Finance	10000
'''
# Create the Spark session for this notebook, or reuse one that already exists
from pyspark.sql import SparkSession, functions as F
spark = (
    SparkSession.builder
    .appName('Spark Playground')
    .getOrCreate()
)

# Employee fixture: column names first, then one tuple per employee in that order
employee_columns = ["employee_id", "employee_name", "salary", "department_id"]
employee_data = [
    (1, "Alice", 5000, 1),
    (2, "Bob", 7000, 2),
    (3, "Charlie", 4000, 1),
    (4, "David", 6000, 2),
    (5, "Eve", 8000, 3),
    (6, "Kev", 9000, 3),
    (7, "Mev", 10000, 3),
    (8, "Mob", 12000, 2),
]
# Only column names are supplied as the schema
emp_df = spark.createDataFrame(data=employee_data, schema=employee_columns)

# Department fixture: column names first, then one tuple per department in that order
department_columns = ["department_id", "department_name"]
department_data = [
    (1, "HR"),
    (2, "Engineering"),
    (3, "Finance"),
]
# Only column names are supplied as the schema
dept_df = spark.createDataFrame(data=department_data, schema=department_columns)

# One row per department_id, carrying that department's mean salary as avg_salary
df_avg_dept_salary = emp_df.groupBy("department_id").agg(
    F.mean("salary").alias("avg_salary")
)

# Step 1: attach each employee's department average to their row
emp_with_avg = emp_df.join(df_avg_dept_salary, on=["department_id"], how="inner")

# Step 2: keep only employees paid strictly more than their department average
emp_above_avg = emp_with_avg.where("salary > avg_salary")

# Step 3: look up the department name and keep only the requested output columns
df_result = (
    emp_above_avg
    .join(dept_df, on=["department_id"], how="inner")
    .select("employee_name", "department_name", "salary")
)

# Print the final result table
df_result.show()

+-------------+---------------+------+
|employee_name|department_name|salary|
+-------------+---------------+------+
|        Alice|             HR|  5000|
|          Mob|    Engineering| 12000|
|          Mev|        Finance| 10000|
+-------------+---------------+------+

