<a href="https://colab.research.google.com/github/codingniket/Python-Training/blob/main/19_12_2025/Excericise_1_19_12_2025.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

REAL-TIME CASE STUDY

In [55]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, when,regexp_replace, split, trim, array_compact, transform, get_json_object
spark = SparkSession.builder.appName("Excercise1").getOrCreate()

In [4]:
user_data = [
("U001","Amit","28","Hyderabad","AI,ML,Cloud"),
("U002","Neha","Thirty","Delhi","Testing"),
("U003","Ravi",None,"Bangalore",["Data","Spark"]),
("U004","Pooja","29","Mumbai","AI|ML"),
("U005","", "31","Chennai",None)
]

In [5]:
from pyspark.sql.types import (StructType, StructField, StringType,LongType,IntegerType,ArrayType,MapType)

In [6]:
user_schema = StructType([
    StructField("user_id", StringType(), nullable=False),
    StructField("name", StringType(), nullable=True),
    StructField("age", StringType(), nullable=True),
    StructField("city", StringType(), nullable=True),
    StructField("skills", StringType(), nullable=True)
])

df_data = spark.createDataFrame(user_data, user_schema)
df_data.show(truncate=False)

+-------+-----+------+---------+-------------+
|user_id|name |age   |city     |skills       |
+-------+-----+------+---------+-------------+
|U001   |Amit |28    |Hyderabad|AI,ML,Cloud  |
|U002   |Neha |Thirty|Delhi    |Testing      |
|U003   |Ravi |NULL  |Bangalore|[Data, Spark]|
|U004   |Pooja|29    |Mumbai   |AI|ML        |
|U005   |     |31    |Chennai  |NULL         |
+-------+-----+------+---------+-------------+



In [42]:
clean_df = df_data.withColumn("age", when(col("age") == "", None)
    .when(col("age").rlike(r"^\d+$"),
          col("age").cast(IntegerType()))
    .otherwise(None))



In [46]:
clean_data = clean_df.withColumn(
    "skills",
    (when(
        col("skills").isNull(),
        None
    ).otherwise(
        array_compact(
            transform(
                split(
                    regexp_replace(
                        regexp_replace(col("skills"), r"\[|\]", ""),
                        r"'|\|", ","),
                    ","),lambda x: trim(x)
            )
        )
    )).cast(ArrayType(StringType()))
)

clean_data.show(truncate=False)

+-------+-----+----+---------+---------------+
|user_id|name |age |city     |skills         |
+-------+-----+----+---------+---------------+
|U001   |Amit |28  |Hyderabad|[AI, ML, Cloud]|
|U002   |Neha |NULL|Delhi    |[Testing]      |
|U003   |Ravi |NULL|Bangalore|[Data, Spark]  |
|U004   |Pooja|29  |Mumbai   |[AI, ML]       |
|U005   |     |31  |Chennai  |NULL           |
+-------+-----+----+---------+---------------+



In [47]:
users_df=clean_data

In [10]:
courses_data = [
("C001","PySpark Mastery","Data Engineering","Advanced","₹9999"),
("C002","AI for Testers","QA","Beginner","8999"),
("C003","ML Foundations","AI","Intermediate",None),
("C004","Data Engineering Bootcamp","Data","Advanced","₹14999")
]

In [12]:
course_schema = StructType([
    StructField("course_id", StringType(), nullable=False),
    StructField("course_name", StringType(), nullable=True),
    StructField("skills", StringType(), nullable=True),
     StructField("level", StringType(), nullable=True),
    StructField("amount", StringType(), nullable=True),
])

df_course = spark.createDataFrame(courses_data, course_schema)
df_course.show(truncate=False)

+---------+-------------------------+----------------+------------+------+
|course_id|course_name              |skills          |level       |amount|
+---------+-------------------------+----------------+------------+------+
|C001     |PySpark Mastery          |Data Engineering|Advanced    |₹9999 |
|C002     |AI for Testers           |QA              |Beginner    |8999  |
|C003     |ML Foundations           |AI              |Intermediate|NULL  |
|C004     |Data Engineering Bootcamp|Data            |Advanced    |₹14999|
+---------+-------------------------+----------------+------------+------+



In [14]:
course_clean_data = df_course.withColumn(
    "amount",
    when(col("amount").isNull() , 0 ).otherwise(regexp_replace(col("amount"), "₹", ""))
    .cast('int')
)
course_clean_data.show()

+---------+--------------------+----------------+------------+------+
|course_id|         course_name|          skills|       level|amount|
+---------+--------------------+----------------+------------+------+
|     C001|     PySpark Mastery|Data Engineering|    Advanced|  9999|
|     C002|      AI for Testers|              QA|    Beginner|  8999|
|     C003|      ML Foundations|              AI|Intermediate|     0|
|     C004|Data Engineering ...|            Data|    Advanced| 14999|
+---------+--------------------+----------------+------------+------+



In [48]:
courses_df=course_clean_data

In [15]:
user_enrollment_data = [
("U001","C001","2024-01-05"),
("U002","C002","05/01/2024"),
("U003","C001","2024/01/06"),
("U004","C003","invalid_date"),
("U001","C004","2024-01-10"),
("U005","C002","2024-01-12")
]

In [16]:
enrollment_schema = StructType([
    StructField("user_id", StringType(), nullable=False),
    StructField("course_id", StringType(), nullable=False),
    StructField("enrollment_date", StringType(), nullable=True),
])

df_enrollment = spark.createDataFrame(user_enrollment_data, enrollment_schema)
df_enrollment.show(truncate=False)

+-------+---------+---------------+
|user_id|course_id|enrollment_date|
+-------+---------+---------------+
|U001   |C001     |2024-01-05     |
|U002   |C002     |05/01/2024     |
|U003   |C001     |2024/01/06     |
|U004   |C003     |invalid_date   |
|U001   |C004     |2024-01-10     |
|U005   |C002     |2024-01-12     |
+-------+---------+---------------+



In [28]:
from pyspark.sql.functions import coalesce, try_to_timestamp,array
from pyspark.sql.types import DateType

df_enrollment_clean = df_enrollment.withColumn(
    "enrollment_date",
    coalesce(
        try_to_timestamp(col("enrollment_date"), lit("yyyy-MM-dd")).cast(DateType()),
        try_to_timestamp(col("enrollment_date"), lit("dd/MM/yyyy")).cast(DateType()),
        try_to_timestamp(col("enrollment_date"), lit("yyyy/MM/dd")).cast(DateType())
    )
)

df_enrollment_clean.show(truncate=False)

+-------+---------+---------------+
|user_id|course_id|enrollment_date|
+-------+---------+---------------+
|U001   |C001     |2024-01-05     |
|U002   |C002     |2024-01-05     |
|U003   |C001     |2024-01-06     |
|U004   |C003     |NULL           |
|U001   |C004     |2024-01-10     |
|U005   |C002     |2024-01-12     |
+-------+---------+---------------+



In [49]:
enrollments_df=df_enrollment_clean

In [25]:
user_activity_log = [
("U001","login,watch,logout","{'device':'mobile'}",120),
("U002",["login","watch"],"device=laptop",90),
("U003","login|logout",None,30),
("U004",None,"{'device':'tablet'}",60),
("U005","login","{'device':'mobile'}",15)
]

In [27]:
user_activity_schema = StructType([
    StructField("user_id", StringType(), nullable=False),
    StructField("activity_log", StringType(), nullable=True),
    StructField("device_info", StringType(), nullable=True),
    StructField("time",IntegerType(), nullable=True)
])

df_activity = spark.createDataFrame(user_activity_log, user_activity_schema)
df_activity.show(truncate=False)

+-------+------------------+-------------------+----+
|user_id|activity_log      |device_info        |time|
+-------+------------------+-------------------+----+
|U001   |login,watch,logout|{'device':'mobile'}|120 |
|U002   |[login, watch]    |device=laptop      |90  |
|U003   |login|logout      |NULL               |30  |
|U004   |NULL              |{'device':'tablet'}|60  |
|U005   |login             |{'device':'mobile'}|15  |
+-------+------------------+-------------------+----+



In [57]:
df_activity_clean = df_activity.withColumn(
    "activity_log",
    (when(
        col("activity_log").isNull(),
        None
    ).otherwise(
        array_compact(
            transform(
                split(
                    regexp_replace(
                        regexp_replace(col("activity_log"), r"\[|\]", ""),
                        r"'|\|", ","),
                    ","),lambda x: trim(x)
            )
        )
    )).cast(ArrayType(StringType()))
).withColumn(
    "device_info",
    when(col("device_info").isNull(), None)
    .when(col("device_info").like("{'device':%}"), get_json_object(col("device_info"), "$.device"))
    .when(col("device_info").like("device=%"), split(col("device_info"), "=").getItem(1))
    .otherwise(None)
)

df_activity_clean.show(truncate=False)
df_activity_clean.printSchema()

+-------+----------------------+-----------+----+
|user_id|activity_log          |device_info|time|
+-------+----------------------+-----------+----+
|U001   |[login, watch, logout]|mobile     |120 |
|U002   |[login, watch]        |laptop     |90  |
|U003   |[login, logout]       |NULL       |30  |
|U004   |NULL                  |tablet     |60  |
|U005   |[login]               |mobile     |15  |
+-------+----------------------+-----------+----+

root
 |-- user_id: string (nullable = false)
 |-- activity_log: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- device_info: string (nullable = true)
 |-- time: integer (nullable = true)



In [62]:
activity_df = df_activity_clean

PART A — DATA CLEANING & STRUCTURING

1. Design explicit schemas for all datasets
2. Normalize data types (age, price, dates)
3. Convert skills and actions into arrays
4. Handle missing and invalid records gracefully
5. Produce clean DataFrames:
users_df
courses_df
enrollments_df
activity_df

In [58]:
users_df.show()

+-------+-----+----+---------+---------------+
|user_id| name| age|     city|         skills|
+-------+-----+----+---------+---------------+
|   U001| Amit|  28|Hyderabad|[AI, ML, Cloud]|
|   U002| Neha|NULL|    Delhi|      [Testing]|
|   U003| Ravi|NULL|Bangalore|  [Data, Spark]|
|   U004|Pooja|  29|   Mumbai|       [AI, ML]|
|   U005|     |  31|  Chennai|           NULL|
+-------+-----+----+---------+---------------+



In [59]:
courses_df.show()

+---------+--------------------+----------------+------------+------+
|course_id|         course_name|          skills|       level|amount|
+---------+--------------------+----------------+------------+------+
|     C001|     PySpark Mastery|Data Engineering|    Advanced|  9999|
|     C002|      AI for Testers|              QA|    Beginner|  8999|
|     C003|      ML Foundations|              AI|Intermediate|     0|
|     C004|Data Engineering ...|            Data|    Advanced| 14999|
+---------+--------------------+----------------+------------+------+



In [60]:
enrollments_df.show()

+-------+---------+---------------+
|user_id|course_id|enrollment_date|
+-------+---------+---------------+
|   U001|     C001|     2024-01-05|
|   U002|     C002|     2024-01-05|
|   U003|     C001|     2024-01-06|
|   U004|     C003|           NULL|
|   U001|     C004|     2024-01-10|
|   U005|     C002|     2024-01-12|
+-------+---------+---------------+



In [63]:
activity_df.show()

+-------+--------------------+-----------+----+
|user_id|        activity_log|device_info|time|
+-------+--------------------+-----------+----+
|   U001|[login, watch, lo...|     mobile| 120|
|   U002|      [login, watch]|     laptop|  90|
|   U003|     [login, logout]|       NULL|  30|
|   U004|                NULL|     tablet|  60|
|   U005|             [login]|     mobile|  15|
+-------+--------------------+-----------+----+



PART B — DATA INTEGRATION (JOINS)

6. Join users with enrollments
7. Join enrollments with courses
8. Decide which table(s) should be broadcast
9. Justify your decision using explain(True)
10. Eliminate orphan records

In [67]:
users_enrollments_df = users_df.join(enrollments_df, "user_id", "inner")
users_enrollments_df.show(truncate=False)

+-------+-----+----+---------+---------------+---------+---------------+
|user_id|name |age |city     |skills         |course_id|enrollment_date|
+-------+-----+----+---------+---------------+---------+---------------+
|U001   |Amit |28  |Hyderabad|[AI, ML, Cloud]|C001     |2024-01-05     |
|U001   |Amit |28  |Hyderabad|[AI, ML, Cloud]|C004     |2024-01-10     |
|U002   |Neha |NULL|Delhi    |[Testing]      |C002     |2024-01-05     |
|U003   |Ravi |NULL|Bangalore|[Data, Spark]  |C001     |2024-01-06     |
|U004   |Pooja|29  |Mumbai   |[AI, ML]       |C003     |NULL           |
|U005   |     |31  |Chennai  |NULL           |C002     |2024-01-12     |
+-------+-----+----+---------+---------------+---------+---------------+



In [68]:
from pyspark.sql.functions import broadcast

final_df = users_enrollments_df.join(broadcast(courses_df), "course_id", "inner")
final_df.show(truncate=False)
final_df.explain(True)

+---------+-------+-----+----+---------+---------------+---------------+-------------------------+----------------+------------+------+
|course_id|user_id|name |age |city     |skills         |enrollment_date|course_name              |skills          |level       |amount|
+---------+-------+-----+----+---------+---------------+---------------+-------------------------+----------------+------------+------+
|C001     |U001   |Amit |28  |Hyderabad|[AI, ML, Cloud]|2024-01-05     |PySpark Mastery          |Data Engineering|Advanced    |9999  |
|C004     |U001   |Amit |28  |Hyderabad|[AI, ML, Cloud]|2024-01-10     |Data Engineering Bootcamp|Data            |Advanced    |14999 |
|C002     |U002   |Neha |NULL|Delhi    |[Testing]      |2024-01-05     |AI for Testers           |QA              |Beginner    |8999  |
|C001     |U003   |Ravi |NULL|Bangalore|[Data, Spark]  |2024-01-06     |PySpark Mastery          |Data Engineering|Advanced    |9999  |
|C003     |U004   |Pooja|29  |Mumbai   |[AI, ML]

In [69]:
from pyspark.sql.functions import broadcast

users_enrollments_broadcast_df = users_df.join(broadcast(enrollments_df), "user_id", "inner")
users_enrollments_broadcast_df.show(truncate=False)

+-------+-----+----+---------+---------------+---------+---------------+
|user_id|name |age |city     |skills         |course_id|enrollment_date|
+-------+-----+----+---------+---------------+---------+---------------+
|U001   |Amit |28  |Hyderabad|[AI, ML, Cloud]|C004     |2024-01-10     |
|U001   |Amit |28  |Hyderabad|[AI, ML, Cloud]|C001     |2024-01-05     |
|U002   |Neha |NULL|Delhi    |[Testing]      |C002     |2024-01-05     |
|U003   |Ravi |NULL|Bangalore|[Data, Spark]  |C001     |2024-01-06     |
|U004   |Pooja|29  |Mumbai   |[AI, ML]       |C003     |NULL           |
|U005   |     |31  |Chennai  |NULL           |C002     |2024-01-12     |
+-------+-----+----+---------+---------------+---------+---------------+



In [70]:
final_df_with_both_broadcasts = users_enrollments_broadcast_df.join(broadcast(courses_df), "course_id", "inner")
final_df_with_both_broadcasts.show(truncate=False)
final_df_with_both_broadcasts.explain(True)

+---------+-------+-----+----+---------+---------------+---------------+-------------------------+----------------+------------+------+
|course_id|user_id|name |age |city     |skills         |enrollment_date|course_name              |skills          |level       |amount|
+---------+-------+-----+----+---------+---------------+---------------+-------------------------+----------------+------------+------+
|C004     |U001   |Amit |28  |Hyderabad|[AI, ML, Cloud]|2024-01-10     |Data Engineering Bootcamp|Data            |Advanced    |14999 |
|C001     |U001   |Amit |28  |Hyderabad|[AI, ML, Cloud]|2024-01-05     |PySpark Mastery          |Data Engineering|Advanced    |9999  |
|C002     |U002   |Neha |NULL|Delhi    |[Testing]      |2024-01-05     |AI for Testers           |QA              |Beginner    |8999  |
|C001     |U003   |Ravi |NULL|Bangalore|[Data, Spark]  |2024-01-06     |PySpark Mastery          |Data Engineering|Advanced    |9999  |
|C003     |U004   |Pooja|29  |Mumbai   |[AI, ML]

PART C — ANALYTICS & AGGREGATIONS

11. Total enrollments per course
12. Total revenue per course
13. Average engagement time per course
14. Total courses enrolled per user
15. Identify users with zero activity

In [71]:
enrollments_per_course = enrollments_df.groupBy("course_id").count()
enrollments_per_course.show()

+---------+-----+
|course_id|count|
+---------+-----+
|     C001|    2|
|     C002|    2|
|     C003|    1|
|     C004|    1|
+---------+-----+



In [73]:
from pyspark.sql.functions import sum

total_revenue_per_course = enrollments_df.join(courses_df, "course_id", "inner") \
                                         .groupBy("course_id") \
                                         .agg(sum("amount").alias("total_revenue"))

total_revenue_per_course.show()

+---------+-------------+
|course_id|total_revenue|
+---------+-------------+
|     C001|        19998|
|     C002|        17998|
|     C003|            0|
|     C004|        14999|
+---------+-------------+



In [74]:
from pyspark.sql.functions import avg

average_engagement_per_course = enrollments_df.join(activity_df, "user_id", "inner") \
                                               .groupBy("course_id") \
                                               .agg(avg("time").alias("average_engagement_time"))

average_engagement_per_course.show()

+---------+-----------------------+
|course_id|average_engagement_time|
+---------+-----------------------+
|     C003|                   60.0|
|     C004|                  120.0|
|     C001|                   75.0|
|     C002|                   52.5|
+---------+-----------------------+



In [75]:
from pyspark.sql.functions import countDistinct

total_courses_per_user = enrollments_df.groupBy("user_id").agg(countDistinct("course_id").alias("total_courses_enrolled"))
total_courses_per_user.show()

+-------+----------------------+
|user_id|total_courses_enrolled|
+-------+----------------------+
|   U004|                     1|
|   U005|                     1|
|   U002|                     1|
|   U003|                     1|
|   U001|                     2|
+-------+----------------------+



PART D — WINDOW FUNCTIONS

16. Rank users by total time spent
17. Calculate running revenue per course by enrollment date
18. Identify top 2 users per course by engagement
19. Compare GroupBy vs Window results for at least one metric

In [79]:
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number, col

window_spec = Window.partitionBy("user_id").orderBy(col("time").desc())
ranked_df = activity_df.withColumn("row_num", row_number().over(window_spec))
ranked_df.show()

+-------+--------------------+-----------+----+-------+
|user_id|        activity_log|device_info|time|row_num|
+-------+--------------------+-----------+----+-------+
|   U001|[login, watch, lo...|     mobile| 120|      1|
|   U002|      [login, watch]|     laptop|  90|      1|
|   U003|     [login, logout]|       NULL|  30|      1|
|   U004|                NULL|     tablet|  60|      1|
|   U005|             [login]|     mobile|  15|      1|
+-------+--------------------+-----------+----+-------+



In [86]:
from pyspark.sql.window import Window
from pyspark.sql.functions import sum

enrollments_with_revenue = enrollments_df.join(courses_df, "course_id", "inner")

window_spec_running_revenue = Window.partitionBy("course_id").orderBy("enrollment_date")

running_revenue_per_course = enrollments_with_revenue.withColumn(
    "running_revenue",
    sum("amount").over(window_spec_running_revenue)
)

running_revenue_per_course.orderBy("course_id", "enrollment_date").show(truncate=False)

+---------+-------+---------------+-------------------------+----------------+------------+------+---------------+
|course_id|user_id|enrollment_date|course_name              |skills          |level       |amount|running_revenue|
+---------+-------+---------------+-------------------------+----------------+------------+------+---------------+
|C001     |U001   |2024-01-05     |PySpark Mastery          |Data Engineering|Advanced    |9999  |9999           |
|C001     |U003   |2024-01-06     |PySpark Mastery          |Data Engineering|Advanced    |9999  |19998          |
|C002     |U002   |2024-01-05     |AI for Testers           |QA              |Beginner    |8999  |8999           |
|C002     |U005   |2024-01-12     |AI for Testers           |QA              |Beginner    |8999  |17998          |
|C003     |U004   |NULL           |ML Foundations           |AI              |Intermediate|0     |0              |
|C004     |U001   |2024-01-10     |Data Engineering Bootcamp|Data            |Ad

In [85]:
activity_df.orderBy(col("time").desc()).limit(2).show()

+-------+--------------------+-----------+----+
|user_id|        activity_log|device_info|time|
+-------+--------------------+-----------+----+
|   U001|[login, watch, lo...|     mobile| 120|
|   U002|      [login, watch]|     laptop|  90|
+-------+--------------------+-----------+----+



PART E — UDF (ONLY IF REQUIRED)

20. Classify users into engagement levels:
High
Medium
Low

Rules:
Use built-in functions where possible
Use UDF only if unavoidable
Explain why UDF was needed (or avoided)

In [81]:
df = activity_df.withColumn("grade",
    when(col("time") >= 80, "High")
    .when(col("time") >= 60, "Medium")
    .otherwise("Low")
)

df.orderBy("grade",df.time.desc()).show()

+-------+--------------------+-----------+----+------+
|user_id|        activity_log|device_info|time| grade|
+-------+--------------------+-----------+----+------+
|   U001|[login, watch, lo...|     mobile| 120|  High|
|   U002|      [login, watch]|     laptop|  90|  High|
|   U003|     [login, logout]|       NULL|  30|   Low|
|   U005|             [login]|     mobile|  15|   Low|
|   U004|                NULL|     tablet|  60|Medium|
+-------+--------------------+-----------+----+------+



PART F — SORTING & ORDERING

21. Sort courses by total revenue (descending)
22. Sort users by engagement within each city

23. Explain why sorting caused a shuffle

In [82]:
from pyspark.sql.functions import desc

total_revenue_per_course.orderBy(desc("total_revenue")).show()

+---------+-------------+
|course_id|total_revenue|
+---------+-------------+
|     C001|        19998|
|     C002|        17998|
|     C004|        14999|
|     C003|            0|
+---------+-------------+



In [90]:
from pyspark.sql.functions import desc, sum, rank
from pyspark.sql.window import Window

final_df_with_check = final_df_with_both_broadcasts.join(broadcast(activity_df), "user_id", "inner")
final_df_with_check.show(truncate=False)

user_engagement_by_city = final_df_with_check.groupBy("city", "user_id").agg(sum("time").alias("total_engagement_time"))

window_spec_city_engagement = Window.partitionBy("city").orderBy(desc("total_engagement_time"))

ranked_users_by_city_engagement = user_engagement_by_city.withColumn("rank_in_city", rank().over(window_spec_city_engagement))

ranked_users_by_city_engagement.orderBy("city", "rank_in_city").show(truncate=False)

+-------+---------+-----+----+---------+---------------+---------------+-------------------------+----------------+------------+------+----------------------+-----------+----+
|user_id|course_id|name |age |city     |skills         |enrollment_date|course_name              |skills          |level       |amount|activity_log          |device_info|time|
+-------+---------+-----+----+---------+---------------+---------------+-------------------------+----------------+------------+------+----------------------+-----------+----+
|U001   |C004     |Amit |28  |Hyderabad|[AI, ML, Cloud]|2024-01-10     |Data Engineering Bootcamp|Data            |Advanced    |14999 |[login, watch, logout]|mobile     |120 |
|U001   |C001     |Amit |28  |Hyderabad|[AI, ML, Cloud]|2024-01-05     |PySpark Mastery          |Data Engineering|Advanced    |9999  |[login, watch, logout]|mobile     |120 |
|U002   |C002     |Neha |NULL|Delhi    |[Testing]      |2024-01-05     |AI for Testers           |QA              |Begin

PART G — SET OPERATIONS

Create two DataFrames:
Users who enrolled
Users who completed activity



24. Find users who enrolled but never became active
25. Find users who are both enrolled and active
26. Explain why set operations are different from joins

In [93]:
users_enrolled = enrollments_df.select("user_id").distinct()
users_active = activity_df.select("user_id").distinct()

users_enrolled_but_inactive = users_enrolled.join(users_active, "user_id", "left_anti")
users_enrolled_but_inactive.show()

+-------+
|user_id|
+-------+
+-------+



In [96]:
users_enrolled = enrollments_df.select("user_id").distinct()
users_active = activity_df.select("user_id").distinct()

users_enrolled_and_active = users_enrolled.join(users_active, "user_id", "inner")
users_enrolled_and_active.show()

+-------+
|user_id|
+-------+
|   U002|
|   U003|
|   U001|
|   U004|
|   U005|
+-------+



PART H — DAG & PERFORMANCE ANALYSIS

27. For at least three operations, run explain(True)
28. Identify:
Shuffles
Broadcast joins
Sort operations
29. Suggest one performance improvement

In [95]:
users_df.explain(True)
courses_df.explain(True)
enrollments_df.explain(True)
activity_df.explain(True)

== Parsed Logical Plan ==
'Project [unresolvedstarwithcolumns(skills, cast(CASE WHEN 'isNull('skills) THEN null ELSE 'array_compact('transform('split('regexp_replace('regexp_replace('skills, \[|\], ), '|\|, ,), ,, -1), lambdafunction('trim(lambda 'x_8), lambda 'x_8, false))) END as array<string>), None)]
+- Project [user_id#0, name#1, CASE WHEN (age#2 = ) THEN cast(null as int) WHEN RLIKE(age#2, ^\d+$) THEN cast(age#2 as int) ELSE cast(null as int) END AS age#343, city#3, skills#4]
   +- LogicalRDD [user_id#0, name#1, age#2, city#3, skills#4], false

== Analyzed Logical Plan ==
user_id: string, name: string, age: int, city: string, skills: array<string>
Project [user_id#0, name#1, age#343, city#3, cast(CASE WHEN isnull(skills#4) THEN cast(null as array<string>) ELSE array_compact(transform(split(regexp_replace(regexp_replace(skills#4, \[|\], , 1), '|\|, ,, 1), ,, -1), lambdafunction(trim(lambda x_8#345, None), lambda x_8#345, false))) END as array<string>) AS skills#344]
+- Project [us