In [1]:
from pyspark.sql import SparkSession, functions as F, types as T
# Obtain the notebook-wide Spark session under a fixed application name.
spark = (
    SparkSession.builder
    .appName("LearningPlatformCaseStudy")
    .getOrCreate()
)

In [2]:
# Deliberately messy user records: age may be numeric text, a word, or None,
# and skills may be a delimited string, a Python list, or None.
raw_users = [
    ("U001", "Amit", "28", "Hyderabad", "AI,ML,Cloud"),
    ("U002", "Neha", "Thirty", "Delhi", "Testing"),
    ("U003", "Ravi", None, "Bangalore", ["Data", "Spark"]),
    ("U004", "Pooja", "29", "Mumbai", "AI|ML"),
    ("U005", "", "31", "Chennai", None),
]

# Flatten list-valued skills into one comma-separated string so every row fits
# a plain string column; string and None values pass through untouched.
raw_users_rows = [
    (*leading, ",".join(skills) if isinstance(skills, list) else skills)
    for (*leading, skills) in raw_users
]

# Raw users schema: every column is a string; only user_id is non-nullable.
users_raw_schema = T.StructType([
    T.StructField("user_id", T.StringType(), nullable=False),
    T.StructField("name", T.StringType(), nullable=True),
    T.StructField("age_raw", T.StringType(), nullable=True),
    T.StructField("city", T.StringType(), nullable=True),
    T.StructField("skills_raw", T.StringType(), nullable=True),
])

# Build the raw users DataFrame from the flattened rows.
users_raw_df = spark.createDataFrame(data=raw_users_rows, schema=users_raw_schema)

# Raw course catalogue: price text may carry a currency symbol or be None.
raw_courses = [
    ("C001", "PySpark Mastery", "Data Engineering", "Advanced", "₹9999"),
    ("C002", "AI for Testers", "QA", "Beginner", "8999"),
    ("C003", "ML Foundations", "AI", "Intermediate", None),
    ("C004", "Data Engineering Bootcamp", "Data", "Advanced", "₹14999"),
]

# Raw courses schema: every column is a string; only course_id is non-nullable.
courses_raw_schema = T.StructType([
    T.StructField("course_id", T.StringType(), nullable=False),
    T.StructField("course_name", T.StringType(), nullable=True),
    T.StructField("category", T.StringType(), nullable=True),
    T.StructField("level", T.StringType(), nullable=True),
    T.StructField("price_raw", T.StringType(), nullable=True),
])

courses_raw_df = spark.createDataFrame(data=raw_courses, schema=courses_raw_schema)

# Raw enrollments: the date text comes in several layouts, including one
# value that is not a date at all.
raw_enrollments = [
    ("U001", "C001", "2024-01-05"),
    ("U002", "C002", "05/01/2024"),
    ("U003", "C001", "2024/01/06"),
    ("U004", "C003", "invalid_date"),
    ("U001", "C004", "2024-01-10"),
    ("U005", "C002", "2024-01-12"),
]

# Both key columns are non-nullable; the date stays a nullable string here.
enrollments_raw_schema = T.StructType([
    T.StructField("user_id", T.StringType(), nullable=False),
    T.StructField("course_id", T.StringType(), nullable=False),
    T.StructField("enroll_date_raw", T.StringType(), nullable=True),
])

enrollments_raw_df = spark.createDataFrame(data=raw_enrollments, schema=enrollments_raw_schema)

# Raw activity log: actions may be a delimited string, a Python list, or None;
# metadata may be a dict-like string, a key=value string, or None.
raw_activity = [
    ("U001", "login,watch,logout", "{'device':'mobile'}", 120),
    ("U002", ["login", "watch"], "device=laptop", 90),
    ("U003", "login|logout", None, 30),
    ("U004", None, "{'device':'tablet'}", 60),
    ("U005", "login", "{'device':'mobile'}", 15),
]

def normalize_actions(a):
    """Return a list of actions as one comma-joined string.

    Any value that is not a ``list`` (for example a string or ``None``) is
    returned unchanged.
    """
    return ",".join(a) if isinstance(a, list) else a

# Flatten list-valued actions so every row fits a plain string column.
raw_activity_rows = [
    (user_id, normalize_actions(actions), metadata, duration)
    for user_id, actions, metadata, duration in raw_activity
]

# Raw activity schema: actions and metadata stay as strings; duration is an int.
activity_raw_schema = T.StructType([
    T.StructField("user_id", T.StringType(), nullable=False),
    T.StructField("actions_raw", T.StringType(), nullable=True),
    T.StructField("metadata_raw", T.StringType(), nullable=True),
    T.StructField("duration_seconds", T.IntegerType(), nullable=True),
])

activity_raw_df = spark.createDataFrame(data=raw_activity_rows, schema=activity_raw_schema)


In [3]:

# Final "clean" schemas

# Users: age becomes an integer and skills an array of strings.
users_schema = T.StructType([
    T.StructField("user_id", T.StringType(), nullable=False),
    T.StructField("name", T.StringType(), nullable=True),
    T.StructField("age", T.IntegerType(), nullable=True),
    T.StructField("city", T.StringType(), nullable=True),
    T.StructField("skills", T.ArrayType(T.StringType()), nullable=True),
])

# Courses: price becomes an integer.
courses_schema = T.StructType([
    T.StructField("course_id", T.StringType(), nullable=False),
    T.StructField("course_name", T.StringType(), nullable=True),
    T.StructField("category", T.StringType(), nullable=True),
    T.StructField("level", T.StringType(), nullable=True),
    T.StructField("price", T.IntegerType(), nullable=True),  # price in INR
])

# Enrollments: the enrollment date becomes a real date.
enrollments_schema = T.StructType([
    T.StructField("user_id", T.StringType(), nullable=False),
    T.StructField("course_id", T.StringType(), nullable=False),
    T.StructField("enroll_date", T.DateType(), nullable=True),
])

# Activity: actions become an array and metadata a struct with one `device` field.
activity_schema = T.StructType([
    T.StructField("user_id", T.StringType(), nullable=False),
    T.StructField("actions", T.ArrayType(T.StringType()), nullable=True),
    T.StructField(
        "metadata",
        T.StructType([
            T.StructField("device", T.StringType(), nullable=True),
        ]),
        nullable=True,
    ),
    T.StructField("duration_seconds", T.IntegerType(), nullable=True),
])


In [4]:

# Normalize age: digits or known words -> int
# regexp_extract returns "" when the value has no digit run (and null for a
# null input), so the first branch only fires for values containing digits.
age_digits = F.regexp_extract(F.col("age_raw"), r"(\d+)", 1)
age_clean = (
    F.when(age_digits != "", age_digits.cast("int"))
    .when(F.lower(F.col("age_raw")) == F.lit("thirty"), F.lit(30))
    .otherwise(F.lit(None).cast("int"))
)

# Normalize skills to an array: support comma and pipe delimiters
skills_str = F.regexp_replace(F.col("skills_raw"), r"\|", ",")
# A null skills_raw is coalesced to "" so split() always yields an array.
# Splitting "" produces [""], not an empty array, so blanks are removed below.
skills_arr = F.split(F.coalesce(skills_str, F.lit("")), r"\s*,\s*")
# Trim first, then drop blank entries: a missing skills value becomes [] and
# inputs such as "AI,,ML" do not leave an empty-string skill behind.
# The expression reads the temporary `skills_arr` column added in the chain below.
skills_nonempty = F.expr("filter(transform(skills_arr, x -> trim(x)), x -> x <> '')")
# Kept under its original name in case later cells reference it.
skills_trimmed = skills_nonempty.alias("skills")

users_df = (
    users_raw_df
    .withColumn("age", age_clean)
    .withColumn("skills_arr", skills_arr)
    # Use the blank-filtered expression; the unfiltered transform left [""]
    # for users with no skills.
    .withColumn("skills", F.array_distinct(skills_nonempty))
    .drop("skills_arr")
    # An empty name is treated as missing.
    .withColumn("name", F.when(F.length(F.col("name")) == 0, F.lit(None)).otherwise(F.col("name")))
    .select("user_id", "name", "age", "city", "skills")
)

# Optionally enforce schema (not strictly necessary if columns already typed)
users_df = users_df.select(
    F.col("user_id").cast("string"),
    F.col("name").cast("string"),
    F.col("age").cast("int"),
    F.col("city").cast("string"),
    F.col("skills").cast(T.ArrayType(T.StringType()))
)


In [5]:

# NOTE(review): this cell repeats the users-cleaning step of the preceding
# cell. It rebuilds users_df from users_raw_df, so re-running is harmless, but
# one of the two cells can likely be removed.

# Normalize age: digits or known words -> int
# regexp_extract returns "" when the value has no digit run (and null for a
# null input), so the first branch only fires for values containing digits.
age_digits = F.regexp_extract(F.col("age_raw"), r"(\d+)", 1)
age_clean = (
    F.when(age_digits != "", age_digits.cast("int"))
    .when(F.lower(F.col("age_raw")) == F.lit("thirty"), F.lit(30))
    .otherwise(F.lit(None).cast("int"))
)

# Normalize skills to an array: support comma and pipe delimiters
skills_str = F.regexp_replace(F.col("skills_raw"), r"\|", ",")
# A null skills_raw is coalesced to "" so split() always yields an array.
# Splitting "" produces [""], not an empty array, so blanks are removed below.
skills_arr = F.split(F.coalesce(skills_str, F.lit("")), r"\s*,\s*")
# Trim first, then drop blank entries: a missing skills value becomes [] and
# inputs such as "AI,,ML" do not leave an empty-string skill behind.
# The expression reads the temporary `skills_arr` column added in the chain below.
skills_nonempty = F.expr("filter(transform(skills_arr, x -> trim(x)), x -> x <> '')")
# Kept under its original name in case later cells reference it.
skills_trimmed = skills_nonempty.alias("skills")

users_df = (
    users_raw_df
    .withColumn("age", age_clean)
    .withColumn("skills_arr", skills_arr)
    # Use the blank-filtered expression; the unfiltered transform left [""]
    # for users with no skills.
    .withColumn("skills", F.array_distinct(skills_nonempty))
    .drop("skills_arr")
    # An empty name is treated as missing.
    .withColumn("name", F.when(F.length(F.col("name")) == 0, F.lit(None)).otherwise(F.col("name")))
    .select("user_id", "name", "age", "city", "skills")
)

# Optionally enforce schema (not strictly necessary if columns already typed)
users_df = users_df.select(
    F.col("user_id").cast("string"),
    F.col("name").cast("string"),
    F.col("age").cast("int"),
    F.col("city").cast("string"),
    F.col("skills").cast(T.ArrayType(T.StringType()))
)
