<a href="https://colab.research.google.com/github/codingniket/Python-Training/blob/main/22_12_2025/Milestone_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Raw driver records in the order (driverid, name, age, city, vehicle types).
# The values are deliberately inconsistent: a spelled-out age, a missing age,
# an empty name, and vehicle types given as a string, a list, or None.
raw_drivers = [
    ("D001", "Ramesh", "35", "Hyderabad", "Car,Bike"),
    ("D002", "Suresh", "Forty", "Bangalore", "Auto"),
    ("D003", "Anita", None, "Mumbai", ["Car"]),
    ("D004", "Kiran", "29", "Delhi", "Car|Bike"),
    ("D005", "", "42", "Chennai", None),
]

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, when,regexp_replace, split, trim, array_compact, transform, get_json_object, lower
spark = SparkSession.builder.appName("MileStone1").getOrCreate()
from pyspark.sql.types import (StructType, StructField, StringType,LongType,IntegerType,ArrayType,MapType)

In [None]:
# Explicit schema for the raw drivers: every field is ingested as a string and
# typed later by the cleaning cell. Only driverid is declared non-nullable.
# NOTE(review): "vechile" looks like a misspelling of "vehicle"; it is kept
# as-is because the cleaning cell refers to the column by this exact name.
driver_schema = (
    StructType()
    .add("driverid", StringType(), nullable=False)
    .add("name", StringType(), nullable=True)
    .add("age", StringType(), nullable=True)
    .add("city", StringType(), nullable=True)
    .add("vechile", StringType(), nullable=True)
)

# Build the raw drivers DataFrame and display it.
df = spark.createDataFrame(raw_drivers, schema=driver_schema)
df.show()

+--------+------+-----+---------+--------+
|driverid|  name|  age|     city| vechile|
+--------+------+-----+---------+--------+
|    D001|Ramesh|   35|Hyderabad|Car,Bike|
|    D002|Suresh|Forty|Bangalore|    Auto|
|    D003| Anita| NULL|   Mumbai|   [Car]|
|    D004| Kiran|   29|    Delhi|Car|Bike|
|    D005|      |   42|  Chennai|    NULL|
+--------+------+-----+---------+--------+



Fixing Given Issue Below

Known Issues
Age in mixed formats

Vehicle types in string / array / multiple delimiters

Missing names

Null value

In [None]:
# --- age ---------------------------------------------------------------
# Digit-only strings are cast to integers; empty strings and anything else
# (for example a spelled-out age) become NULL.
age_expr = (
    when(col("age") == "", None)
    .when(col("age").rlike(r"^\d+$"), col("age").cast(IntegerType()))
    .otherwise(None)
)
clean_age = df.withColumn("age", age_expr)

# --- vechile -----------------------------------------------------------
# Brackets, single quotes and pipes are all rewritten to commas so that one
# split on "," handles every input format; blank tokens are nulled and then
# removed by array_compact. A NULL input stays NULL.
vehicle_delimited = regexp_replace(col("vechile"), r"\[|\]|'|\|", ",")
vehicle_tokens = transform(
    split(vehicle_delimited, ","),
    lambda token: when(trim(token) != lit(""), trim(token)).otherwise(lit(None)),
)
vehicle_expr = (
    when(col("vechile").isNull(), None)
    .otherwise(array_compact(vehicle_tokens))
    .cast(ArrayType(StringType()))
)

# --- name / city / vechile ----------------------------------------------
# Empty names become NULL and city values are trimmed.
name_expr = when(col("name") == "", None).otherwise(col("name"))

clean_name_city_vechile = (
    clean_age
    .withColumn("name", name_expr)
    .withColumn("city", trim(col("city")))
    .withColumn("vechile", vehicle_expr)
)

clean_name_city_vechile.show()

# driver_df is the cleaned drivers frame used by the later cells.
driver_df = clean_name_city_vechile

+--------+------+----+---------+-----------+
|driverid|  name| age|     city|    vechile|
+--------+------+----+---------+-----------+
|    D001|Ramesh|  35|Hyderabad|[Car, Bike]|
|    D002|Suresh|NULL|Bangalore|     [Auto]|
|    D003| Anita|NULL|   Mumbai|      [Car]|
|    D004| Kiran|  29|    Delhi|[Car, Bike]|
|    D005|  NULL|  42|  Chennai|       NULL|
+--------+------+----+---------+-----------+



In [None]:
# City -> region reference rows, in the order (city, region).
raw_cities = [
    ("Hyderabad", "South"),
    ("Bangalore", "South"),
    ("Mumbai", "West"),
    ("Delhi", "North"),
    ("Chennai", "South"),
]

In [None]:
# Explicit schema for the city reference data: two nullable string columns.
city_schema = (
    StructType()
    .add("city", StringType(), nullable=True)
    .add("region", StringType(), nullable=True)
)

# Build the city lookup DataFrame and display it.
city_df = spark.createDataFrame(raw_cities, schema=city_schema)
city_df.show()

+---------+------+
|     city|region|
+---------+------+
|Hyderabad| South|
|Bangalore| South|
|   Mumbai|  West|
|    Delhi| North|
|  Chennai| South|
+---------+------+



Notes

Small reference dataset

Intended for broadcast join

In [None]:
from pyspark.sql.functions import broadcast

In [None]:
# Attach each driver's region. city_df is the small reference table, so it is
# the side that carries the broadcast hint.
driver_join = driver_df.join(other=broadcast(city_df), on="city", how="inner")
driver_join.show()

+---------+--------+------+----+-----------+------+
|     city|driverid|  name| age|    vechile|region|
+---------+--------+------+----+-----------+------+
|Hyderabad|    D001|Ramesh|  35|[Car, Bike]| South|
|Bangalore|    D002|Suresh|NULL|     [Auto]| South|
|   Mumbai|    D003| Anita|NULL|      [Car]|  West|
|    Delhi|    D004| Kiran|  29|[Car, Bike]| North|
|  Chennai|    D005|  NULL|  42|       NULL| South|
+---------+--------+------+----+-----------+------+



In [None]:
# Raw trip records. The date field mixes several formats (plus one invalid
# value) and the last field is a numeric amount stored as a string.
raw_trips = [
    ("T001", "D001", "Hyderabad", "2024-01-05", "Completed", "450"),
    ("T002", "D002", "Bangalore", "05/01/2024", "Cancelled", "0"),
    ("T003", "D003", "Mumbai", "2024/01/06", "Completed", "620"),
    ("T004", "D004", "Delhi", "invalid_date", "Completed", "540"),
    ("T005", "D001", "Hyderabad", "2024-01-10", "Completed", "700"),
    ("T006", "D005", "Chennai", "2024-01-12", "Completed", "350"),
]

In [None]:
# Explicit schema for the raw trips: everything is ingested as a string and
# typed later by the cleaning cell.
# NOTE(review): the first column is named "userid" but the sample rows hold
# trip identifiers (T001, T002, ...); confirm whether "tripid" was intended.
trips_schema = (
    StructType()
    .add("userid", StringType(), nullable=False)
    .add("driverid", StringType(), nullable=False)
    .add("city", StringType(), nullable=True)
    .add("date", StringType(), nullable=True)
    .add("status", StringType(), nullable=True)
    .add("amount", StringType(), nullable=True)
)

# Build the raw trips DataFrame and display it.
trips_df = spark.createDataFrame(raw_trips, schema=trips_schema)
trips_df.show()

+------+--------+---------+------------+---------+------+
|userid|driverid|     city|        date|   status|amount|
+------+--------+---------+------------+---------+------+
|  T001|    D001|Hyderabad|  2024-01-05|Completed|   450|
|  T002|    D002|Bangalore|  05/01/2024|Cancelled|     0|
|  T003|    D003|   Mumbai|  2024/01/06|Completed|   620|
|  T004|    D004|    Delhi|invalid_date|Completed|   540|
|  T005|    D001|Hyderabad|  2024-01-10|Completed|   700|
|  T006|    D005|  Chennai|  2024-01-12|Completed|   350|
+------+--------+---------+------------+---------+------+



In [None]:
from pyspark.sql.functions import col, to_date, coalesce, split, lit, array_remove, try_to_timestamp

In [None]:
# Date formats tried in order; the first one that parses wins. A value that
# matches none of them ends up as NULL.
accepted_date_formats = ("yyyy-MM-dd", "dd/MM/yyyy", "yyyy/MM/dd")
parsed_date = coalesce(
    *[
        to_date(try_to_timestamp(col("date"), lit(fmt)))
        for fmt in accepted_date_formats
    ]
)

# Cast amount to an integer, normalise the date, then keep only rows with a
# positive amount.
clean_date_amount = (
    trips_df
    .withColumn("amount", col("amount").cast(IntegerType()))
    .withColumn("date", parsed_date)
    .filter(col("amount") > 0)
)

clean_date_amount.show()

# tripsdf is the cleaned trips frame used by the later cells.
tripsdf = clean_date_amount

+------+--------+---------+----------+---------+------+
|userid|driverid|     city|      date|   status|amount|
+------+--------+---------+----------+---------+------+
|  T001|    D001|Hyderabad|2024-01-05|Completed|   450|
|  T003|    D003|   Mumbai|2024-01-06|Completed|   620|
|  T004|    D004|    Delhi|      NULL|Completed|   540|
|  T005|    D001|Hyderabad|2024-01-10|Completed|   700|
|  T006|    D005|  Chennai|2024-01-12|Completed|   350|
+------+--------+---------+----------+---------+------+



In [None]:
# Raw driver-activity records: an id, the actions (string, list or None),
# a device descriptor in one of two textual formats (or None), and an integer.
raw_activity = [
    ("D001", "login,accept_trip,logout", "{'device':'mobile'}", 180),
    ("D002", ["login", "logout"], "device=laptop", 60),
    ("D003", "login|accept_trip", None, 120),
    ("D004", None, "{'device':'tablet'}", 90),
    ("D005", "login", "{'device':'mobile'}", 30),
]

In [None]:
# Explicit schema for the raw activity data.
# NOTE(review): "userid" holds values such as D001 in the sample rows, which
# look like driver ids; "amount" holds a bare integer whose unit is not stated
# anywhere in this notebook. Confirm both names.
activity_schema = (
    StructType()
    .add("userid", StringType(), nullable=False)
    .add("actions", StringType(), nullable=True)
    .add("device", StringType(), nullable=True)
    .add("amount", IntegerType(), nullable=True)
)

# Build the raw activity DataFrame and display it.
activity_df = spark.createDataFrame(raw_activity, schema=activity_schema)
activity_df.show()

+------+--------------------+-------------------+------+
|userid|             actions|             device|amount|
+------+--------------------+-------------------+------+
|  D001|login,accept_trip...|{'device':'mobile'}|   180|
|  D002|     [login, logout]|      device=laptop|    60|
|  D003|   login|accept_trip|               NULL|   120|
|  D004|                NULL|{'device':'tablet'}|    90|
|  D005|               login|{'device':'mobile'}|    30|
+------+--------------------+-------------------+------+



Known Issues
Actions in multiple formats
Metadata as JSON-like strings
Missing actions

In [None]:
# --- actions -------------------------------------------------------------
# Brackets, single quotes and pipes are rewritten to commas so that one split
# on "," covers every input format; blank tokens are nulled and then removed
# by array_compact. A NULL input stays NULL.
actions_delimited = regexp_replace(col("actions"), r"\[|\]|'|\|", ",")
action_tokens = transform(
    split(actions_delimited, ","),
    lambda token: when(trim(token) != lit(""), trim(token)).otherwise(lit(None)),
)
actions_expr = (
    when(col("actions").isNull(), None)
    .otherwise(array_compact(action_tokens))
    .cast(ArrayType(StringType()))
)

# --- device --------------------------------------------------------------
# Two textual layouts are recognised: {'device':...} is read as JSON and
# device=... is split on "="; anything else (including NULL) yields NULL.
device_expr = (
    when(col("device").isNull(), None)
    .when(col("device").like("{'device':%}"), get_json_object(col("device"), "$.device"))
    .when(col("device").like("device=%"), split(col("device"), "=").getItem(1))
    .otherwise(None)
)

df_activity_clean = (
    activity_df
    .withColumn("actions", actions_expr)
    .withColumn("device", device_expr)
)

df_activity_clean.show(truncate=False)
df_activity_clean.printSchema()

+------+----------------------------+------+------+
|userid|actions                     |device|amount|
+------+----------------------------+------+------+
|D001  |[login, accept_trip, logout]|mobile|180   |
|D002  |[login, logout]             |laptop|60    |
|D003  |[login, accept_trip]        |NULL  |120   |
|D004  |NULL                        |tablet|90    |
|D005  |[login]                     |mobile|30    |
+------+----------------------------+------+------+

root
 |-- userid: string (nullable = false)
 |-- actions: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- device: string (nullable = true)
 |-- amount: integer (nullable = true)



All clean data

PART A — DATA CLEANING & STRUCTURING

. Design explicit schemas for all datasets
. Normalize:

Age

Fare

Dates

. Convert vehicle types and actions into arrays

. Handle missing and invalid records gracefully

. Produce clean DataFrames:

drivers_df

cities_df

trips_df

activity_df

In [None]:
# Display every cleaned / joined frame produced so far, in a fixed order.
for frame in (df_activity_clean, tripsdf, driver_join, driver_df, city_df):
    frame.show()

+------+--------------------+------+------+
|userid|             actions|device|amount|
+------+--------------------+------+------+
|  D001|[login, accept_tr...|mobile|   180|
|  D002|     [login, logout]|laptop|    60|
|  D003|[login, accept_trip]|  NULL|   120|
|  D004|                NULL|tablet|    90|
|  D005|             [login]|mobile|    30|
+------+--------------------+------+------+

+------+--------+---------+----------+---------+------+
|userid|driverid|     city|      date|   status|amount|
+------+--------+---------+----------+---------+------+
|  T001|    D001|Hyderabad|2024-01-05|Completed|   450|
|  T003|    D003|   Mumbai|2024-01-06|Completed|   620|
|  T004|    D004|    Delhi|      NULL|Completed|   540|
|  T005|    D001|Hyderabad|2024-01-10|Completed|   700|
|  T006|    D005|  Chennai|2024-01-12|Completed|   350|
+------+--------+---------+----------+---------+------+

+---------+--------+------+----+-----------+------+
|     city|driverid|  name| age|    vechile|re

PART B — DATA INTEGRATION (JOINS)

. Join trips with drivers

. Join trips with cities

. Decide which dataset should be broadcast

. Prove your decision using explain(True)

. Remove orphan trips (drivers not in master)

In [None]:
city_df.show()

# Trips joined to their city's region. city_df is the small reference table,
# so it is the side that carries the broadcast hint.
trips_city_join = tripsdf.join(broadcast(city_df), "city", "inner")
trips_city_join.show()

# Extended explain prints the logical plans and the physical plan, which is
# where the join strategy chosen for the broadcast hint can be checked.
trips_city_join.explain(True)

# Remove orphan trips, i.e. trips whose driverid does not exist in the drivers
# master. A left_semi join keeps only the trip rows that have a matching
# driver and adds no driver columns. (The previous version filtered on NULL
# dates instead, which is a data-quality filter and does not detect orphans.)
trips_without_orphans = trips_city_join.join(
    driver_df.select("driverid"), on="driverid", how="left_semi"
)
trips_without_orphans.show()

# Original variable name, kept so any later reference keeps working.
ophan = trips_without_orphans

+---------+------+
|     city|region|
+---------+------+
|Hyderabad| South|
|Bangalore| South|
|   Mumbai|  West|
|    Delhi| North|
|  Chennai| South|
+---------+------+

+---------+------+--------+----------+---------+------+------+
|     city|userid|driverid|      date|   status|amount|region|
+---------+------+--------+----------+---------+------+------+
|Hyderabad|  T001|    D001|2024-01-05|Completed|   450| South|
|   Mumbai|  T003|    D003|2024-01-06|Completed|   620|  West|
|    Delhi|  T004|    D004|      NULL|Completed|   540| North|
|Hyderabad|  T005|    D001|2024-01-10|Completed|   700| South|
|  Chennai|  T006|    D005|2024-01-12|Completed|   350| South|
+---------+------+--------+----------+---------+------+------+

== Parsed Logical Plan ==
'Join UsingJoin(Inner, [city])
:- Filter (amount#366 > 0)
:  +- Project [userid#246, driverid#247, city#248, coalesce(to_date(try_to_timestamp(date#249, Some(yyyy-MM-dd), TimestampType, Some(Etc/UTC), false), None, Some(Etc/UTC), true

PART C — ANALYTICS & AGGREGATIONS
. Total trips per city
. Total revenue per city
. Average fare per driver
. Total completed trips per driver
. Identify drivers with no completed trip

PART D — WINDOW FUNCTIONS
. Rank drivers by total revenue (overall)
. Rank drivers by revenue within each city
. Calculate running revenue per city by date
. Compare GroupBy vs Window for one metric

In [None]:
# Display the raw trips frame for reference; its date and amount columns are
# still plain strings (see trips_schema).
trips_df.show()

+------+--------+---------+------------+---------+------+
|userid|driverid|     city|        date|   status|amount|
+------+--------+---------+------------+---------+------+
|  T001|    D001|Hyderabad|  2024-01-05|Completed|   450|
|  T002|    D002|Bangalore|  05/01/2024|Cancelled|     0|
|  T003|    D003|   Mumbai|  2024/01/06|Completed|   620|
|  T004|    D004|    Delhi|invalid_date|Completed|   540|
|  T005|    D001|Hyderabad|  2024-01-10|Completed|   700|
|  T006|    D005|  Chennai|  2024-01-12|Completed|   350|
+------+--------+---------+------------+---------+------+



In [None]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Total revenue per driver with an overall rank.
# The aggregation runs on the cleaned trips (tripsdf), where amount is an
# integer and non-positive amounts were filtered out. The raw trips_df stores
# amount as a string, so summing it only works through an implicit cast.
# The ranking window has no partition because the rank is global; this is
# acceptable here because it runs over one row per driver.
overall_revenue_window = Window.orderBy(F.desc("total_revenue"))

driver_revenue = (
    tripsdf
    .groupBy("driverid")
    .agg(F.sum("amount").alias("total_revenue"))
    .withColumn("rank", F.rank().over(overall_revenue_window))
    .orderBy(F.desc("total_revenue"))
)

driver_revenue.show()

+--------+-------------+
|driverid|total_revenue|
+--------+-------------+
|    D001|       1150.0|
|    D003|        620.0|
|    D004|        540.0|
|    D005|        350.0|
|    D002|          0.0|
+--------+-------------+



In [None]:
from pyspark.sql.window import Window

In [None]:
# Revenue per (city, driver), ranked within each city (highest revenue first).
# The aggregation runs on the cleaned trips (tripsdf), where amount is an
# integer and non-positive amounts were filtered out; the raw trips_df stores
# amount as a string.
city_revenue_window = Window.partitionBy("city").orderBy(F.desc("city_revenue"))

city_driver_rank = (
    tripsdf
    .groupBy("city", "driverid")
    .agg(F.sum("amount").alias("city_revenue"))
    .withColumn("rank", F.rank().over(city_revenue_window))
)
city_driver_rank.show()

+---------+--------+------------+----+
|     city|driverid|city_revenue|rank|
+---------+--------+------------+----+
|Bangalore|    D002|         0.0|   1|
|  Chennai|    D005|       350.0|   1|
|    Delhi|    D004|       540.0|   1|
|Hyderabad|    D001|      1150.0|   1|
|   Mumbai|    D003|       620.0|   1|
+---------+--------+------------+----+



In [None]:
from pyspark.sql.window import Window

# Running revenue per city in date order: the frame spans from the first row
# of the city's partition up to the current row.
city_date_window = (
    Window.partitionBy("city")
    .orderBy("date")
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

# The cleaned trips (tripsdf) are used because their date column is a parsed
# date; the raw trips_df holds mixed-format date strings, so ordering by it
# would not be chronological. Rows whose date could not be parsed are dropped
# because they cannot be placed on the timeline.
running_revenue = (
    tripsdf
    .filter(F.col("date").isNotNull())
    .groupBy("city", "date")
    .agg(F.sum("amount").alias("daily_revenue"))
    .withColumn("running_revenue", F.sum("daily_revenue").over(city_date_window))
    .orderBy("city", "date")
)

running_revenue.show()

. Compare GroupBy vs Window for one metric
GroupBy: Aggregates data into fewer rows (e.g., total revenue per driver).
Window: Keeps original granularity but adds computed columns (e.g., rank, cumulative sum).
Use Case:
GroupBy → summary reports.
Window → analytics like ranking, running totals without collapsing rows

PART E — UDF (ONLY IF REQUIRED)
. Classify drivers into performance levels:
High
Medium
Low
Rules:
Prefer built-in functions
Use UDF only if unavoidable
Justify your choice

In [None]:

from pyspark.sql.functions import udf, col
from pyspark.sql.types import StringType

# Define classification based on revenue
def classify_revenue(revenue):
    """Map a revenue figure to a performance level.

    Args:
        revenue: Numeric revenue, or None when the value is missing.

    Returns:
        "High" for revenue >= 1000, "Medium" for revenue >= 500, "Low" for
        anything smaller, and None when revenue is None.
    """
    # A missing revenue cannot be graded; comparing None with an int would
    # raise TypeError, so it is passed through as None instead.
    if revenue is None:
        return None
    if revenue >= 1000:
        return "High"
    if revenue >= 500:
        return "Medium"
    return "Low"

# Register UDF. It is kept so the name stays available, but it is no longer
# applied below: the thresholds are simple comparisons, so a Python UDF is not
# required.
classify_revenue_udf = udf(classify_revenue, StringType())

# Same thresholds expressed with built-in column functions, as the rules above
# ask for ("Prefer built-in functions"). A NULL revenue produces a NULL grade.
revenue_grade = (
    when(col("city_revenue").isNull(), lit(None))
    .when(col("city_revenue") >= 1000, "High")
    .when(col("city_revenue") >= 500, "Medium")
    .otherwise("Low")
)

# city_driver_rank must be the per-city ranking frame that exposes the
# city_revenue column; re-run the cell that builds it if the column is missing.
driver_performance = city_driver_rank.withColumn("revenue_grade", revenue_grade)
driver_performance.show()


{"ts": "2025-12-22 12:10:02.819", "level": "ERROR", "logger": "DataFrameQueryContextLogger", "msg": "[UNRESOLVED_COLUMN.WITH_SUGGESTION] A column, variable, or function parameter with name `city_revenue` cannot be resolved. Did you mean one of the following? [`city`, `driverid`, `age`, `name`, `vechile`]. SQLSTATE: 42703", "context": {"file": "line 17 in cell [60]", "line": "", "fragment": "col", "errorClass": "UNRESOLVED_COLUMN.WITH_SUGGESTION"}, "exception": {"class": "Py4JJavaError", "msg": "An error occurred while calling o49.withColumn.\n: org.apache.spark.sql.AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column, variable, or function parameter with name `city_revenue` cannot be resolved. Did you mean one of the following? [`city`, `driverid`, `age`, `name`, `vechile`]. SQLSTATE: 42703;\n'Project [driverid#0, name#1, age#2, city#3, vechile#4, classify_revenue('city_revenue)#761 AS revenue_grade#762]\n+- LogicalRDD [driverid#0, name#1, age#2, city#3, vechile#4], false\n\

AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column, variable, or function parameter with name `city_revenue` cannot be resolved. Did you mean one of the following? [`city`, `driverid`, `age`, `name`, `vechile`]. SQLSTATE: 42703;
'Project [driverid#0, name#1, age#2, city#3, vechile#4, classify_revenue('city_revenue)#761 AS revenue_grade#762]
+- LogicalRDD [driverid#0, name#1, age#2, city#3, vechile#4], false
