In [1]:
from pyspark.sql import SparkSession
spark=SparkSession.builder\
    .appName("assessment")\
    .getOrCreate()

In [2]:
raw_drivers = [
("D001","Ramesh","35","Hyderabad","Car,Bike"),
("D002","Suresh","Forty","Bangalore","Auto"),
("D003","Anita",None,"Mumbai",["Car"]),
("D004","Kiran","29","Delhi","Car|Bike"),
("D005","", "42","Chennai",None)
]

raw_cities = [
("Hyderabad","South"),
("Bangalore","South"),
("Mumbai","West"),
("Delhi","North"),
("Chennai","South")
]
raw_trips = [
("T001","D001","Hyderabad","2024-01-05","Completed","450"),
("T002","D002","Bangalore","05/01/2024","Cancelled","0"),
("T003","D003","Mumbai","2024/01/06","Completed","620"),
("T004","D004","Delhi","invalid_date","Completed","540"),
("T005","D001","Hyderabad","2024-01-10","Completed","700"),
("T006","D005","Chennai","2024-01-12","Completed","350")
]
raw_activity = [
("D001","login,accept_trip,logout","{'device':'mobile'}",180),
("D002",["login","logout"],"device=laptop",60),
("D003","login|accept_trip",None,120),
("D004",None,"{'device':'tablet'}",90),
("D005","login","{'device':'mobile'}",30)
]

In [4]:

from pyspark.sql import types as T

drivers_schema = T.StructType([
    T.StructField("driver_id",    T.StringType(), True),
    T.StructField("driver_name",  T.StringType(), True),
    T.StructField("age_raw",      T.StringType(), True),
    T.StructField("city",         T.StringType(), True),
    T.StructField("vehicle_types",T.StringType(), True)
])

cities_schema = T.StructType([
    T.StructField("city",   T.StringType(), False),
    T.StructField("region", T.StringType(), False)
])

trips_schema = T.StructType([
    T.StructField("trip_id",    T.StringType(), False),
    T.StructField("driver_id",  T.StringType(), False),
    T.StructField("city",       T.StringType(), False),
    T.StructField("order_date", T.StringType(), False),
    T.StructField("status",     T.StringType(), False),
    T.StructField("fare_raw",   T.StringType(), True)
])

activity_schema = T.StructType([
    T.StructField("driver_id",    T.StringType(), False),
    T.StructField("actions_raw",  T.StringType(), True),
    T.StructField("metadata_raw", T.StringType(), True),
    T.StructField("duration_sec", T.IntegerType(), True)
])


In [5]:

raw_drivers = [
    ("D001","Ramesh","35","Hyderabad","Car,Bike"),
    ("D002","Suresh","Forty","Bangalore","Auto"),
    ("D003","Anita",None,"Mumbai",["Car"]),
    ("D004","Kiran","29","Delhi","Car|Bike"),
    ("D005","", "42","Chennai",None)
]

raw_cities = [
    ("Hyderabad","South"),
    ("Bangalore","South"),
    ("Mumbai","West"),
    ("Delhi","North"),
    ("Chennai","South")
]

raw_trips = [
    ("T001","D001","Hyderabad","2024-01-05","Completed","450"),
    ("T002","D002","Bangalore","05/01/2024","Cancelled","0"),
    ("T003","D003","Mumbai","2024/01/06","Completed","620"),
    ("T004","D004","Delhi","invalid_date","Completed","540"),
    ("T005","D001","Hyderabad","2024-01-10","Completed","700"),
    ("T006","D005","Chennai","2024-01-12","Completed","350")
]

raw_activity = [
    ("D001","login,accept_trip,logout","{'device':'mobile'}",180),
    ("D002",["login","logout"],"device=laptop",60),
    ("D003","login|accept_trip",None,120),
    ("D004",None,"{'device':'tablet'}",90),
    ("D005","login","{'device':'mobile'}",30)
]


def to_str_list_or_none(x):
    if x is None:
        return None
    if isinstance(x, list):
        return ",".join(map(str, x))
    return str(x)

drivers_sanitized = [
    (d_id, name, age, city, to_str_list_or_none(veh))
    for (d_id, name, age, city, veh) in raw_drivers
]

activity_sanitized = [
    (d_id,
     to_str_list_or_none(actions),
     to_str_list_or_none(meta),
     duration)
    for (d_id, actions, meta, duration) in raw_activity
]


In [6]:

drivers_df  = spark.createDataFrame(drivers_sanitized,  schema=drivers_schema)
cities_df   = spark.createDataFrame(raw_cities,         schema=cities_schema)
trips_df    = spark.createDataFrame(raw_trips,          schema=trips_schema)
activity_df = spark.createDataFrame(activity_sanitized, schema=activity_schema)

drivers_df.printSchema(); drivers_df.show(truncate=False)
activity_df.printSchema(); activity_df.show(truncate=False)


root
 |-- driver_id: string (nullable = true)
 |-- driver_name: string (nullable = true)
 |-- age_raw: string (nullable = true)
 |-- city: string (nullable = true)
 |-- vehicle_types: string (nullable = true)

+---------+-----------+-------+---------+-------------+
|driver_id|driver_name|age_raw|city     |vehicle_types|
+---------+-----------+-------+---------+-------------+
|D001     |Ramesh     |35     |Hyderabad|Car,Bike     |
|D002     |Suresh     |Forty  |Bangalore|Auto         |
|D003     |Anita      |NULL   |Mumbai   |Car          |
|D004     |Kiran      |29     |Delhi    |Car|Bike     |
|D005     |           |42     |Chennai  |NULL         |
+---------+-----------+-------+---------+-------------+

root
 |-- driver_id: string (nullable = false)
 |-- actions_raw: string (nullable = true)
 |-- metadata_raw: string (nullable = true)
 |-- duration_sec: integer (nullable = true)

+---------+------------------------+-------------------+------------+
|driver_id|actions_raw             

In [7]:
from pyspark.sql import functions as F
age_map = F.create_map(
    F.lit("zero"), F.lit(0),   F.lit("one"), F.lit(1),
    F.lit("two"), F.lit(2),    F.lit("three"), F.lit(3),
    F.lit("four"), F.lit(4),   F.lit("five"), F.lit(5),
    F.lit("six"), F.lit(6),    F.lit("seven"), F.lit(7),
    F.lit("eight"), F.lit(8),  F.lit("nine"), F.lit(9),
    F.lit("ten"), F.lit(10),   F.lit("twenty"), F.lit(20),
    F.lit("thirty"), F.lit(30),F.lit("forty"), F.lit(40),
    F.lit("fifty"), F.lit(50)
)
drivers_df = drivers_df.withColumn(
    "age",
    F.when(F.col("age_raw").isNull(), F.lit(None).cast("int"))
     .when(F.col("age_raw").rlike(r"^\d+$"), F.col("age_raw").cast("int"))
     .otherwise(F.element_at(age_map, F.lower(F.col("age_raw"))).cast("int"))
).drop("age_raw")


In [8]:
trips_df = trips_df.withColumn("fare", F.expr("try_cast(fare_raw as int)")).drop("fare_raw")

In [18]:

from pyspark.sql import functions as F

def normalize_vehicles(df):
    if "vehicle_types" in df.columns:
        return df.withColumn(
            "vehicle_arr",
            F.filter(
                F.transform(
                    F.split(F.regexp_replace(F.coalesce(F.col("vehicle_types"), F.lit("")), r"[|]", ","), ","),
                    lambda x: F.initcap(F.trim(x))
                ),
                lambda x: x != ""
            )
        ).drop("vehicle_types")
    else:

        return df

drivers_df = normalize_vehicles(drivers_df)


In [19]:

def normalize_actions(df):
    if "actions_raw" in df.columns:
        return df.withColumn(
            "actions_arr",
            F.filter(
                F.transform(
                    F.split(F.regexp_replace(F.coalesce(F.col("actions_raw"), F.lit("")), r"[|]", ","), ","),
                    lambda x: F.lower(F.trim(x))
                ),
                lambda x: x != ""
            )
        ).drop("actions_raw")
    else:
        return df

activity_df = normalize_actions(activity_df)


In [31]:
drivers_df=drivers_df.dropna(subset=["driver_id"])
drivers_df.show()

+---------+-----------+---------+----+-----------+
|driver_id|driver_name|     city| age|vehicle_arr|
+---------+-----------+---------+----+-----------+
|     D001|     Ramesh|Hyderabad|  35|[Car, Bike]|
|     D002|     Suresh|Bangalore|  40|     [Auto]|
|     D003|      Anita|   Mumbai|NULL|      [Car]|
|     D004|      Kiran|    Delhi|  29|[Car, Bike]|
|     D005|           |  Chennai|  42|         []|
+---------+-----------+---------+----+-----------+



part b

In [21]:

clean = F.regexp_replace(F.trim(F.col("order_date")), r"[./]", "-")
trips_df = trips_df.withColumn(
    "order_dt",
    F.when(clean.rlike(r"^\d{4}-\d{2}-\d{2}$"), F.to_date(clean, "yyyy-MM-dd"))
     .when(clean.rlike(r"^\d{2}-\d{2}-\d{4}$"), F.to_date(clean, "dd-MM-yyyy"))
     .otherwise(F.lit(None).cast("date"))
)
trips_clean = trips_df.filter(
    (F.col("status") == "Completed") &
    F.col("order_dt").isNotNull() &
    F.col("fare").isNotNull() & (F.col("fare") > 0)
)


In [24]:
joined_trips_drivers = trips_clean.join(drivers_df, on="driver_id", how="inner") #6
joined_trips_drivers.show()

+---------+-------+---------+----------+---------+----+----------+-----------+---------+----+-----------+
|driver_id|trip_id|     city|order_date|   status|fare|  order_dt|driver_name|     city| age|vehicle_arr|
+---------+-------+---------+----------+---------+----+----------+-----------+---------+----+-----------+
|     D001|   T001|Hyderabad|2024-01-05|Completed| 450|2024-01-05|     Ramesh|Hyderabad|  35|[Car, Bike]|
|     D001|   T005|Hyderabad|2024-01-10|Completed| 700|2024-01-10|     Ramesh|Hyderabad|  35|[Car, Bike]|
|     D003|   T003|   Mumbai|2024/01/06|Completed| 620|2024-01-06|      Anita|   Mumbai|NULL|      [Car]|
|     D005|   T006|  Chennai|2024-01-12|Completed| 350|2024-01-12|           |  Chennai|  42|         []|
+---------+-------+---------+----------+---------+----+----------+-----------+---------+----+-----------+



In [29]:


from pyspark.sql.functions import broadcast #7
joined_trips_cities = trips_clean.join(broadcast(cities_df), on="city", how="left")
joined_trips_cities.select("trip_id", "driver_id", "city", "region", "fare", "order_dt").show()




+-------+---------+---------+------+----+----------+
|trip_id|driver_id|     city|region|fare|  order_dt|
+-------+---------+---------+------+----+----------+
|   T001|     D001|Hyderabad| South| 450|2024-01-05|
|   T003|     D003|   Mumbai|  West| 620|2024-01-06|
|   T005|     D001|Hyderabad| South| 700|2024-01-10|
|   T006|     D005|  Chennai| South| 350|2024-01-12|
+-------+---------+---------+------+----+----------+



In [28]:

from pyspark.sql.functions import broadcast

joined_final = joined_trips_drivers.join(broadcast(cities_df), on="city", how="left") #8


In [26]:
joined_final.explain(True)  #9

== Parsed Logical Plan ==
'Join UsingJoin(LeftOuter, [city])
:- Project [driver_id#8, trip_id#7, city#9, order_date#10, status#11, fare#47, order_dt#61, driver_name#1, city#3, age#46, vehicle_arr#48]
:  +- Join Inner, (driver_id#8 = driver_id#0)
:     :- Filter ((((status#11 = Completed) AND isnotnull(order_dt#61)) AND isnotnull(fare#47)) AND (fare#47 > 0))
:     :  +- Project [trip_id#7, driver_id#8, city#9, order_date#10, status#11, fare#47, CASE WHEN RLIKE(regexp_replace(trim(order_date#10, None), [./], -, 1), ^\d{4}-\d{2}-\d{2}$) THEN to_date(regexp_replace(trim(order_date#10, None), [./], -, 1), Some(yyyy-MM-dd), Some(Etc/UTC), true) WHEN RLIKE(regexp_replace(trim(order_date#10, None), [./], -, 1), ^\d{2}-\d{2}-\d{4}$) THEN to_date(regexp_replace(trim(order_date#10, None), [./], -, 1), Some(dd-MM-yyyy), Some(Etc/UTC), true) ELSE cast(null as date) END AS order_dt#61]
:     :     +- Project [trip_id#7, driver_id#8, city#9, order_date#10, status#11, fare#47]
:     :        +- Projec

In [27]:

# 10
trips_no_orphans = trips_clean.join(drivers_df.select("driver_id"), on="driver_id", how="left_anti")


part c

In [33]:

from pyspark.sql.functions import broadcast

jtd = trips_clean.alias("t").join(drivers_df.alias("d"), "driver_id", "inner") \
                 .selectExpr("t.*", "d.driver_name", "d.age", "d.vehicle_arr")
joined_final = jtd.join(broadcast(cities_df), "city", "left")


In [34]:
from pyspark.sql import functions as F #11
total_trips_city = joined_final.groupBy("city").agg(F.countDistinct("trip_id").alias("total_trips"))

total_trips_city.orderBy(F.desc("total_trips")).show()


+---------+-----------+
|     city|total_trips|
+---------+-----------+
|Hyderabad|          2|
|  Chennai|          1|
|   Mumbai|          1|
+---------+-----------+



In [38]:

revenue_city = joined_final.groupBy("city") \
    .agg(F.sum("fare").alias("total_revenue"))

revenue_city.orderBy(F.desc("total_revenue")).show() #12


+---------+-------------+
|     city|total_revenue|
+---------+-------------+
|Hyderabad|         1150|
|   Mumbai|          620|
|  Chennai|          350|
+---------+-------------+



In [39]:

avg_fare_driver = joined_final.groupBy("driver_id", "driver_name") \
    .agg(F.avg("fare").alias("avg_fare"))

avg_fare_driver.orderBy(F.desc("avg_fare")).show() #13


+---------+-----------+--------+
|driver_id|driver_name|avg_fare|
+---------+-----------+--------+
|     D003|      Anita|   620.0|
|     D001|     Ramesh|   575.0|
|     D005|           |   350.0|
+---------+-----------+--------+



In [40]:

completed_trips_driver = joined_final.groupBy("driver_id", "driver_name") \
    .agg(F.countDistinct("trip_id").alias("completed_trips"))

completed_trips_driver.orderBy(F.desc("completed_trips")).show()#14


+---------+-----------+---------------+
|driver_id|driver_name|completed_trips|
+---------+-----------+---------------+
|     D001|     Ramesh|              2|
|     D003|      Anita|              1|
|     D005|           |              1|
+---------+-----------+---------------+



In [41]:

drivers_no_completed = drivers_df.join(
    trips_clean.select("driver_id").distinct(), on="driver_id", how="left_anti"
)
drivers_no_completed.select("driver_id", "driver_name", "city", "age", "vehicle_arr").show(truncate=False) #15


+---------+-----------+---------+---+-----------+
|driver_id|driver_name|city     |age|vehicle_arr|
+---------+-----------+---------+---+-----------+
|D002     |Suresh     |Bangalore|40 |[Auto]     |
|D004     |Kiran      |Delhi    |29 |[Car, Bike]|
+---------+-----------+---------+---+-----------+



part d

In [42]:
from pyspark.sql import functions as F
from pyspark.sql import Window


In [43]:
revenue_by_driver = joined_final.groupBy("driver_id", "driver_name") \
    .agg(F.sum("fare").alias("driver_revenue"))
w_overall = Window.orderBy(F.desc("driver_revenue"))
rank_overall = revenue_by_driver.withColumn("rank_overall", F.dense_rank().over(w_overall))

rank_overall.orderBy("rank_overall").show(truncate=False) #16


+---------+-----------+--------------+------------+
|driver_id|driver_name|driver_revenue|rank_overall|
+---------+-----------+--------------+------------+
|D001     |Ramesh     |1150          |1           |
|D003     |Anita      |620           |2           |
|D005     |           |350           |3           |
+---------+-----------+--------------+------------+



In [None]:

revenue_driver_city = joined_final.groupBy("city", "driver_id", "driver_name") \
    .agg(F.sum("fare").alias("driver_revenue"))

w_city = Window.partitionBy("city").orderBy(F.desc("driver_revenue"))
rank_by_city = revenue_driver_city.withColumn("rank_in_city", F.dense_rank().over(w_city))

rank_by_city.orderBy("city", "rank_in_city").show(truncate=False)
