In [2]:
'''
You are given two datasets:

rides – containing ride-level data such as driver and rider IDs, fare amount, and distance.
drivers – containing driver information including name, status, and rating.
Your task is to:

Join both datasets using the driver_id field.
Select the following columns from the joined data: ride_id, driver_id, driver_name (the drivers table's name column, renamed), fare_amount, status.
Input Schema
rides schema
Column Name	Data Type
ride_id	Integer
driver_id	Integer
rider_id	Integer
fare_amount	Double
distance_km	Double
Example rides table
ride_id	driver_id	rider_id	fare_amount	distance_km
1	101	201	300.0	12.4
2	102	202	150.0	8.0
3	101	203	220.0	10.5
drivers schema
Column Name	Data Type
driver_id	Integer
name	String
status	String
rating	Double
Example drivers table
driver_id	name	status	rating
101	Alex	available	4.8
102	Sam	off_duty	4.6
Output Schema
Column Name	Data Type
ride_id	Integer
driver_id	Integer
driver_name	String
fare_amount	Double
status	String
Example Output Table
ride_id	driver_id	driver_name	fare_amount	status
1	101	Alex	300.0	available
2	102	Sam	150.0	off_duty
3	101	Alex	220.0	available
Starter Code
from pyspark.sql import SparkSession
from pyspark.sql.types import (
    StructType, StructField, IntegerType,
    StringType, DoubleType
)

spark = SparkSession.builder.getOrCreate()

rides_data = [
    (1, 101, 201, 300.0, 12.4),
    (2, 102, 202, 150.0, 8.0),
    (3, 101, 203, 220.0, 10.5),
    (4, 103, 204, 500.0, 15.6),
    (5, 104, 205, 100.0, 5.0),
    (6, 102, 206, 180.0, 9.2),
    (7, 101, 207, 275.0, 11.8),
    (8, 103, 208, 330.0, 13.0),
    (9, 105, 209, 400.0, 14.0),
    (10, 106, 210, 210.0, 9.0),
]

drivers_data = [
    (101, "Alex", "available", 4.8),
    (102, "Sam", "off_duty", 4.6),
    (103, "Rita", "available", 4.9),
    (104, "John", "suspended", 3.2),
    (105, "Priya", "available", 4.5),
    (106, "Ramesh", "off_duty", 4.3),
]

rides_schema = StructType([
    StructField("ride_id", IntegerType(), True),
    StructField("driver_id", IntegerType(), True),
    StructField("rider_id", IntegerType(), True),
    StructField("fare_amount", DoubleType(), True),
    StructField("distance_km", DoubleType(), True),
])

drivers_schema = StructType([
    StructField("driver_id", IntegerType(), True),
    StructField("name", StringType(), True),
    StructField("status", StringType(), True),
    StructField("rating", DoubleType(), True),
])

rides_df = spark.createDataFrame(rides_data, schema=rides_schema)
drivers_df = spark.createDataFrame(drivers_data, schema=drivers_schema)

# Your logic here
# Save the final output DataFrame as df_result

display(df_result)
'''
# Imports: Spark session entry point, schema types, and column functions
from pyspark.sql import SparkSession
from pyspark.sql.types import (
    StructType, StructField, IntegerType,
    StringType, DoubleType
)
from pyspark.sql import functions as F

# Obtain the Spark session for this notebook (getOrCreate returns an existing
# session when one is already active, otherwise it builds a new one).
spark = (
    SparkSession.builder
    .appName('Spark Playground')
    .getOrCreate()
)

# Ride records, one tuple per ride, in rides_schema column order:
#   (ride_id, driver_id, rider_id, fare_amount, distance_km)
rides_data = [
    (1,  101, 201, 300.0, 12.4),
    (2,  102, 202, 150.0, 8.0),
    (3,  101, 203, 220.0, 10.5),
    (4,  103, 204, 500.0, 15.6),
    (5,  104, 205, 100.0, 5.0),
    (6,  102, 206, 180.0, 9.2),
    (7,  101, 207, 275.0, 11.8),
    (8,  103, 208, 330.0, 13.0),
    (9,  105, 209, 400.0, 14.0),
    (10, 106, 210, 210.0, 9.0),
]

# Driver records, one tuple per driver, in drivers_schema column order:
#   (driver_id, name, status, rating)
drivers_data = [
    (101, "Alex",   "available", 4.8),
    (102, "Sam",    "off_duty",  4.6),
    (103, "Rita",   "available", 4.9),
    (104, "John",   "suspended", 3.2),
    (105, "Priya",  "available", 4.5),
    (106, "Ramesh", "off_duty",  4.3),
]

# Explicit schema for the rides records; every column is declared nullable.
rides_schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("ride_id", IntegerType()),
        ("driver_id", IntegerType()),
        ("rider_id", IntegerType()),
        ("fare_amount", DoubleType()),
        ("distance_km", DoubleType()),
    )
])

# Explicit schema for the drivers records; every column is declared nullable.
drivers_schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("driver_id", IntegerType()),
        ("name", StringType()),
        ("status", StringType()),
        ("rating", DoubleType()),
    )
])

# Turn the in-memory records into Spark DataFrames, applying the explicit
# schemas rather than letting Spark infer column types.
rides_df = spark.createDataFrame(rides_data, rides_schema)
drivers_df = spark.createDataFrame(drivers_data, drivers_schema)

# Attach each ride to its driver's record via the shared driver_id key.
# The inner join keeps only rides whose driver_id also appears in drivers_df;
# joining on the column name (rather than an expression) leaves a single
# driver_id column in the result, so it can be selected unambiguously.
df_result = rides_df.join(drivers_df, on="driver_id", how="inner").select(
    F.col("ride_id"),
    F.col("driver_id"),
    F.col("name").alias("driver_name"),  # drivers' `name` is exposed as driver_name
    F.col("fare_amount"),
    F.col("status"),
)

# Print the joined result as a text table
df_result.show()

+-------+---------+-----------+-----------+---------+
|ride_id|driver_id|driver_name|fare_amount|   status|
+-------+---------+-----------+-----------+---------+
|      1|      101|       Alex|      300.0|available|
|      3|      101|       Alex|      220.0|available|
|      7|      101|       Alex|      275.0|available|
|      2|      102|        Sam|      150.0| off_duty|
|      6|      102|        Sam|      180.0| off_duty|
|      4|      103|       Rita|      500.0|available|
|      8|      103|       Rita|      330.0|available|
|      5|      104|       John|      100.0|suspended|
|      9|      105|      Priya|      400.0|available|
|     10|      106|     Ramesh|      210.0| off_duty|
+-------+---------+-----------+-----------+---------+

