In [4]:
'''
You are given a dataset containing employee attendance records across various departments in a company. Each row contains:

Employee ID
Department Name
Date
Status (Present / Absent / Leave)
Your task is to pivot the data such that for each employee_id, you display the count of each attendance status (Present, Absent, Leave) as separate columns.

Input Schema & Example
Column Name	Data Type
employee_id	Integer
department	String
date	String
status	String
Example Input Table
employee_id	department	date	status
1	HR	2025-07-01	Present
1	HR	2025-07-02	Absent
2	Finance	2025-07-01	Leave
Output Schema
Column Name	Data Type
employee_id	Integer
Present	Integer
Absent	Integer
Leave	Integer
Example Output Table
employee_id	Present	Absent	Leave
1	1	1	0
2	0	0	1
💡 Explanation
Employee 1 has 2 records: 1 Present and 1 Absent.
Employee 2 has 1 Leave record.
The output shows total counts of each status (as columns) per employee.
Starter Code
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

data = [
    (1, "HR", "2025-07-01", "Present"),
    (1, "HR", "2025-07-02", "Absent"),
    (1, "HR", "2025-07-03", "Present"),
    (2, "Finance", "2025-07-01", "Leave"),
    (2, "Finance", "2025-07-02", "Present"),
    (3, "IT", "2025-07-01", "Absent"),
    (3, "IT", "2025-07-02", "Absent"),
    (3, "IT", "2025-07-03", "Present"),
    (4, "IT", "2025-07-01", "Leave"),
    (4, "IT", "2025-07-02", "Leave"),
    (4, "IT", "2025-07-03", "Present"),
]

columns = ["employee_id", "department", "date", "status"]

df = spark.createDataFrame(data, columns)

# Your logic goes here to create df_result

display(df_result)
'''

# Initialize Spark session
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Obtain the Spark session for this exercise; getOrCreate() hands back an
# already-running session when one exists instead of starting a second one.
session_builder = SparkSession.builder.appName('Spark Playground')
spark = session_builder.getOrCreate()

# Column names, in the same order as the fields of each tuple in `data`.
columns = ["employee_id", "department", "date", "status"]

# One tuple per attendance record: (employee_id, department, date, status).
data = [
    # Employee 1 - HR
    (1, "HR", "2025-07-01", "Present"),
    (1, "HR", "2025-07-02", "Absent"),
    (1, "HR", "2025-07-03", "Present"),
    # Employee 2 - Finance
    (2, "Finance", "2025-07-01", "Leave"),
    (2, "Finance", "2025-07-02", "Present"),
    # Employee 3 - IT
    (3, "IT", "2025-07-01", "Absent"),
    (3, "IT", "2025-07-02", "Absent"),
    (3, "IT", "2025-07-03", "Present"),
    # Employee 4 - IT
    (4, "IT", "2025-07-01", "Leave"),
    (4, "IT", "2025-07-02", "Leave"),
    (4, "IT", "2025-07-03", "Present"),
]

# Column types are inferred from the Python values; only names are supplied.
df = spark.createDataFrame(data, schema=columns)

# Using pivot: one output row per employee, one count column per status.
#
# The status values are listed explicitly so the output columns come out in
# this exact order and Spark does not have to scan the data first to discover
# the distinct values of `status`.
status_values = ["Present", "Absent", "Leave"]

per_employee = df.groupBy("employee_id")
status_counts = per_employee.pivot("status", status_values).count()

# An employee with no rows for a given status gets null from the pivot;
# report those cells as 0 instead.
df_result = status_counts.na.fill(0)

'''
Alternate solution - conditional aggregation (no pivot):
df_result = (
  df.groupBy("employee_id")
  .agg(
    F.sum(F.when(F.col("status") == "Present", 1).otherwise(0)).alias("Present"),
    F.sum(F.when(F.col("status") == "Absent", 1).otherwise(0)).alias("Absent"),
    F.sum(F.when(F.col("status") == "Leave", 1).otherwise(0)).alias("Leave")
  )
)
'''

'''
Bonus Challenge: Can you solve this using Spark SQL and temporary views?

df.createOrReplaceTempView("attendance")

df_result = spark.sql("""
  SELECT
    employee_id,
    SUM(CASE WHEN status = 'Present' THEN 1 ELSE 0 END) AS Present,
    SUM(CASE WHEN status = 'Absent' THEN 1 ELSE 0 END) AS Absent,
    SUM(CASE WHEN status = 'Leave' THEN 1 ELSE 0 END) AS Leave
  FROM attendance
  GROUP BY employee_id
""")
'''

'''
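🧠 Alternative: SQL PIVOT (Spark-only)

# The query below reads the "attendance" temp view, so register it first.
df.createOrReplaceTempView("attendance")
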
df_result = spark.sql("""
SELECT *
FROM (
    SELECT employee_id, status
    FROM attendance
)
PIVOT (
    COUNT(status)
    FOR status IN ('Present', 'Absent', 'Leave')
)
ORDER BY employee_id
""")
'''

# Display result: print the pivoted per-employee counts as a text table.
# NOTE(review): df_result is not sorted anywhere above, so the printed row
# order is presumably not guaranteed (the captured output below lists
# employees as 1, 3, 2, 4) - confirm before relying on it.
df_result.show()

+-----------+-------+------+-----+
|employee_id|Present|Absent|Leave|
+-----------+-------+------+-----+
|          1|      2|     1|    0|
|          3|      1|     2|    0|
|          2|      1|     0|    1|
|          4|      1|     0|    2|
+-----------+-------+------+-----+

