<a href="https://colab.research.google.com/github/codingniket/Python-Training/blob/main/19_12_2025/Excericise_1_19_12_2025.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

REAL-TIME CASE STUDY

In [8]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, when,regexp_replace, split, trim, array_compact, transform
spark = SparkSession.builder.appName("Excercise1").getOrCreate()

In [4]:
user_data = [
("U001","Amit","28","Hyderabad","AI,ML,Cloud"),
("U002","Neha","Thirty","Delhi","Testing"),
("U003","Ravi",None,"Bangalore",["Data","Spark"]),
("U004","Pooja","29","Mumbai","AI|ML"),
("U005","", "31","Chennai",None)
]

In [5]:
from pyspark.sql.types import (StructType, StructField, StringType,LongType,IntegerType,ArrayType,MapType)

In [6]:
user_schema = StructType([
    StructField("user_id", StringType(), nullable=False),
    StructField("name", StringType(), nullable=True),
    StructField("age", StringType(), nullable=True),
    StructField("city", StringType(), nullable=True),
    StructField("skills", StringType(), nullable=True)
])

df_data = spark.createDataFrame(user_data, user_schema)
df_data.show(truncate=False)

+-------+-----+------+---------+-------------+
|user_id|name |age   |city     |skills       |
+-------+-----+------+---------+-------------+
|U001   |Amit |28    |Hyderabad|AI,ML,Cloud  |
|U002   |Neha |Thirty|Delhi    |Testing      |
|U003   |Ravi |NULL  |Bangalore|[Data, Spark]|
|U004   |Pooja|29    |Mumbai   |AI|ML        |
|U005   |     |31    |Chennai  |NULL         |
+-------+-----+------+---------+-------------+



In [7]:
clean_df = df_data.withColumn("age", when(col("age") == "", None)
    .when(col("age").rlike(r"^\d+$"),
          col("age").cast(IntegerType()))
    .otherwise(None))

clean_data = clean_df.withColumn(
    "skills",
    (when(
        col("skills").isNull(),
        None
    ).otherwise(
        array_compact(
            transform(
                split(
                    regexp_replace(col("skills"), "[']", ""),","),lambda x: trim(x)
            )
        )
    )).cast(ArrayType(StringType()))
)

clean_data.show(truncate=False)

+-------+-----+----+---------+-------------+
|user_id|name |age |city     |skills       |
+-------+-----+----+---------+-------------+
|U001   |Amit |28  |Hyderabad|AI,ML,Cloud  |
|U002   |Neha |NULL|Delhi    |Testing      |
|U003   |Ravi |NULL|Bangalore|[Data, Spark]|
|U004   |Pooja|29  |Mumbai   |AI|ML        |
|U005   |     |31  |Chennai  |NULL         |
+-------+-----+----+---------+-------------+



In [24]:
#Skills Column needs to be fixed

In [10]:
courses_data = [
("C001","PySpark Mastery","Data Engineering","Advanced","₹9999"),
("C002","AI for Testers","QA","Beginner","8999"),
("C003","ML Foundations","AI","Intermediate",None),
("C004","Data Engineering Bootcamp","Data","Advanced","₹14999")
]

In [12]:
course_schema = StructType([
    StructField("course_id", StringType(), nullable=False),
    StructField("course_name", StringType(), nullable=True),
    StructField("skills", StringType(), nullable=True),
     StructField("level", StringType(), nullable=True),
    StructField("amount", StringType(), nullable=True),
])

df_course = spark.createDataFrame(courses_data, course_schema)
df_course.show(truncate=False)

+---------+-------------------------+----------------+------------+------+
|course_id|course_name              |skills          |level       |amount|
+---------+-------------------------+----------------+------------+------+
|C001     |PySpark Mastery          |Data Engineering|Advanced    |₹9999 |
|C002     |AI for Testers           |QA              |Beginner    |8999  |
|C003     |ML Foundations           |AI              |Intermediate|NULL  |
|C004     |Data Engineering Bootcamp|Data            |Advanced    |₹14999|
+---------+-------------------------+----------------+------------+------+



In [14]:
course_clean_data = df_course.withColumn(
    "amount",
    when(col("amount").isNull() , 0 ).otherwise(regexp_replace(col("amount"), "₹", ""))
    .cast('int')
)
course_clean_data.show()

+---------+--------------------+----------------+------------+------+
|course_id|         course_name|          skills|       level|amount|
+---------+--------------------+----------------+------------+------+
|     C001|     PySpark Mastery|Data Engineering|    Advanced|  9999|
|     C002|      AI for Testers|              QA|    Beginner|  8999|
|     C003|      ML Foundations|              AI|Intermediate|     0|
|     C004|Data Engineering ...|            Data|    Advanced| 14999|
+---------+--------------------+----------------+------------+------+



In [15]:
user_enrollment_data = [
("U001","C001","2024-01-05"),
("U002","C002","05/01/2024"),
("U003","C001","2024/01/06"),
("U004","C003","invalid_date"),
("U001","C004","2024-01-10"),
("U005","C002","2024-01-12")
]

In [16]:
enrollment_schema = StructType([
    StructField("user_id", StringType(), nullable=False),
    StructField("course_id", StringType(), nullable=False),
    StructField("enrollment_date", StringType(), nullable=True),
])

df_enrollment = spark.createDataFrame(user_enrollment_data, enrollment_schema)
df_enrollment.show(truncate=False)

+-------+---------+---------------+
|user_id|course_id|enrollment_date|
+-------+---------+---------------+
|U001   |C001     |2024-01-05     |
|U002   |C002     |05/01/2024     |
|U003   |C001     |2024/01/06     |
|U004   |C003     |invalid_date   |
|U001   |C004     |2024-01-10     |
|U005   |C002     |2024-01-12     |
+-------+---------+---------------+



In [28]:
from pyspark.sql.functions import coalesce, try_to_timestamp,array
from pyspark.sql.types import DateType

df_enrollment_clean = df_enrollment.withColumn(
    "enrollment_date",
    coalesce(
        try_to_timestamp(col("enrollment_date"), lit("yyyy-MM-dd")).cast(DateType()),
        try_to_timestamp(col("enrollment_date"), lit("dd/MM/yyyy")).cast(DateType()),
        try_to_timestamp(col("enrollment_date"), lit("yyyy/MM/dd")).cast(DateType())
    )
)

df_enrollment_clean.show(truncate=False)

+-------+---------+---------------+
|user_id|course_id|enrollment_date|
+-------+---------+---------------+
|U001   |C001     |2024-01-05     |
|U002   |C002     |2024-01-05     |
|U003   |C001     |2024-01-06     |
|U004   |C003     |NULL           |
|U001   |C004     |2024-01-10     |
|U005   |C002     |2024-01-12     |
+-------+---------+---------------+



In [25]:
user_activity_log = [
("U001","login,watch,logout","{'device':'mobile'}",120),
("U002",["login","watch"],"device=laptop",90),
("U003","login|logout",None,30),
("U004",None,"{'device':'tablet'}",60),
("U005","login","{'device':'mobile'}",15)
]

In [27]:
user_activity_schema = StructType([
    StructField("user_id", StringType(), nullable=False),
    StructField("activity_log", StringType(), nullable=True),
    StructField("device_info", StringType(), nullable=True),
    StructField("time",IntegerType(), nullable=True)
])

df_activity = spark.createDataFrame(user_activity_log, user_activity_schema)
df_activity.show(truncate=False)

+-------+------------------+-------------------+----+
|user_id|activity_log      |device_info        |time|
+-------+------------------+-------------------+----+
|U001   |login,watch,logout|{'device':'mobile'}|120 |
|U002   |[login, watch]    |device=laptop      |90  |
|U003   |login|logout      |NULL               |30  |
|U004   |NULL              |{'device':'tablet'}|60  |
|U005   |login             |{'device':'mobile'}|15  |
+-------+------------------+-------------------+----+



In [32]:
df_activity_clean = df_activity.withColumn(
    "activity_log_clean",
    when(col("activity_log").isNull(), array())  # Empty array for null
    .when(col("activity_log").contains("['"),
          # Handle Python list format: ['login', 'watch']
          split(
              regexp_replace(
                  regexp_replace(col("activity_log"), r"\[|\]|'", ""),
                  " ", ""
              ),
              ","
          )
    )
    .when(col("activity_log").contains("|"),
          # Handle pipe delimiter
          split(col("activity_log"), r"\|")
    )
    .otherwise(
          # Handle comma delimiter (default)
          split(col("activity_log"), ",")
    )
).drop("activity_log")

df_activity_clean.show(truncate=False)

{"ts": "2025-12-19 05:22:28.440", "level": "ERROR", "logger": "DataFrameQueryContextLogger", "msg": "[UNRESOLVED_COLUMN.WITH_SUGGESTION] A column, variable, or function parameter with name `[]'` cannot be resolved. Did you mean one of the following? [`time`, `user_id`, `device_info`, `activity_log`]. SQLSTATE: 42703", "context": {"file": "jdk.internal.reflect.GeneratedMethodAccessor59.invoke(Unknown Source)", "line": "", "fragment": "col", "errorClass": "UNRESOLVED_COLUMN.WITH_SUGGESTION"}, "exception": {"class": "Py4JJavaError", "msg": "An error occurred while calling o261.withColumn.\n: org.apache.spark.sql.AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column, variable, or function parameter with name `[]'` cannot be resolved. Did you mean one of the following? [`time`, `user_id`, `device_info`, `activity_log`]. SQLSTATE: 42703;\n'Project [user_id#130, activity_log#131, device_info#132, time#133, CASE WHEN isnull(activity_log#131) THEN array() WHEN Contains(activity_log#13

AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column, variable, or function parameter with name `[]'` cannot be resolved. Did you mean one of the following? [`time`, `user_id`, `device_info`, `activity_log`]. SQLSTATE: 42703;
'Project [user_id#130, activity_log#131, device_info#132, time#133, CASE WHEN isnull(activity_log#131) THEN array() WHEN Contains(activity_log#131, [') THEN 'split('regexp_replace('trim('[]', activity_log#131),  , ), ,, -1) WHEN Contains(activity_log#131, |) THEN split(activity_log#131, \|, -1) ELSE split(activity_log#131, ,, -1) END AS activity_log_clean#187]
+- LogicalRDD [user_id#130, activity_log#131, device_info#132, time#133], false


In [None]:
df_activity_