In [2]:
'''
You are given records of students, a list of available subjects, and an examinations log that records each time a student attended an exam for a subject.

Your task is to generate a report that shows, for every student and every subject, how many exams the student attended for that subject (use 0 when none).

Return the results ordered by student_id and then subject_name for readability.

Input Schema & Example
Students
Column Name	Data Type
student_id	Integer
student_name	String
Example Input Table (Students):

student_id	student_name
1	Alice
2	Bob
13	John
6	Alex
Subjects
Column Name	Data Type
subject_name	String
Example Input Table (Subjects):

subject_name
Math
Physics
Programming
Examinations
Column Name	Data Type
student_id	Integer
subject_name	String
Example Input Table (Examinations):

student_id	subject_name
1	Math
1	Physics
1	Programming
2	Programming
1	Physics
1	Math
13	Math
13	Programming
13	Physics
2	Math
1	Math
Output Schema
Column Name	Data Type
student_id	Integer
student_name	String
subject_name	String
attended_exams	Integer
Example Output Table
student_id	student_name	subject_name	attended_exams
1	Alice	Math	3
1	Alice	Physics	2
1	Alice	Programming	1
2	Bob	Math	1
2	Bob	Physics	0
2	Bob	Programming	1
6	Alex	Math	0
6	Alex	Physics	0
6	Alex	Programming	0
13	John	Math	1
13	John	Physics	1
13	John	Programming	1
Starter Code (PySpark)
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()

# Students
students_data = [
    (1, "Alice"),
    (2, "Bob"),
    (13, "John"),
    (6, "Alex"),
]
students_cols = ["student_id", "student_name"]
df_students = spark.createDataFrame(students_data, students_cols)

# Subjects
subjects_data = [
    ("Math",),
    ("Physics",),
    ("Programming",),
]
subjects_cols = ["subject_name"]
df_subjects = spark.createDataFrame(subjects_data, subjects_cols)

# Examinations
exams_data = [
    (1, "Math"),
    (1, "Physics"),
    (1, "Programming"),
    (2, "Programming"),
    (1, "Physics"),
    (1, "Math"),
    (13, "Math"),
    (13, "Programming"),
    (13, "Physics"),
    (2, "Math"),
    (1, "Math"),
]
exams_cols = ["student_id", "subject_name"]
df_exams = spark.createDataFrame(exams_data, exams_cols)

# Your logic goes here to create df_result
# df_result should have: student_id, student_name, subject_name, attended_exams
# Sort by student_id, subject_name.

display(df_result)
'''

# Get (or create) the Spark session used to build every DataFrame in this cell
from pyspark.sql import SparkSession, functions as F
spark = (
    SparkSession.builder
    .appName('Spark Playground')
    .getOrCreate()
)

# Students: one (student_id, student_name) row per student
students_cols = ["student_id", "student_name"]
students_data = [(1, "Alice"), (2, "Bob"), (13, "John"), (6, "Alex")]
df_students = spark.createDataFrame(data=students_data, schema=students_cols)

# Subjects: a single-column table, so each row is a one-element tuple
subjects_cols = ["subject_name"]
subjects_data = [(name,) for name in ("Math", "Physics", "Programming")]
df_subjects = spark.createDataFrame(data=subjects_data, schema=subjects_cols)

# Examinations: one (student_id, subject_name) row per attended exam;
# a repeated pair means the student sat that subject more than once
exams_cols = ["student_id", "subject_name"]
exams_data = [
    (1, "Math"), (1, "Physics"), (1, "Programming"),
    (2, "Programming"), (1, "Physics"), (1, "Math"),
    (13, "Math"), (13, "Programming"), (13, "Physics"),
    (2, "Math"), (1, "Math"),
]
df_exams = spark.createDataFrame(data=exams_data, schema=exams_cols)

# Pair every student with every subject so that combinations with no
# exam-log entries still produce a row in the report
df_student_subjects = df_students.crossJoin(df_subjects)

# Tally exam-log rows per (student, subject); pairs that never appear in
# the log simply have no row in this aggregate
df_count_student_exams = df_exams.groupBy("student_id", "subject_name").agg(
    F.count("*").alias("attended_exams")
)

# Left-join the tallies onto the full student x subject grid. Pairs without
# a tally come back as null, which the projection below turns into 0.
df_result = (
    df_student_subjects
    .join(df_count_student_exams, on=["student_id", "subject_name"], how="left")
    .select(
        "student_id",
        "student_name",
        "subject_name",
        F.coalesce(F.col("attended_exams"), F.lit(0)).alias("attended_exams"),
    )
    .orderBy("student_id", "subject_name")
)

# Print the report as a text table
df_result.show()

+----------+------------+------------+--------------+
|student_id|student_name|subject_name|attended_exams|
+----------+------------+------------+--------------+
|         1|       Alice|        Math|             3|
|         1|       Alice|     Physics|             2|
|         1|       Alice| Programming|             1|
|         2|         Bob|        Math|             1|
|         2|         Bob|     Physics|             0|
|         2|         Bob| Programming|             1|
|         6|        Alex|        Math|             0|
|         6|        Alex|     Physics|             0|
|         6|        Alex| Programming|             0|
|        13|        John|        Math|             1|
|        13|        John|     Physics|             1|
|        13|        John| Programming|             1|
+----------+------------+------------+--------------+

