In [2]:

import ConnectionConfig as cc
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

In [3]:

# Initialize Spark session
# Environment setup and session creation are delegated to the project's
# ConnectionConfig helper (imported as `cc`).
cc.setupEnvironment()
# NOTE(review): "RidesWithVehicles" is presumably the Spark application name —
# confirm against ConnectionConfig.startLocalCluster.
spark = cc.startLocalCluster("RidesWithVehicles")

# Last expression of the cell: display the currently active session.
SparkSession.getActiveSession()

In [4]:

# Step 1: Extract rides data from db
# The read is range-partitioned on `rideid` into at most 4 JDBC partitions.
# NOTE(review): per the Spark JDBC docs, lowerBound/upperBound only decide the
# partition stride and do not filter rows — confirm that 0..5000 still matches
# the real rideid range, otherwise the partitions will be unevenly sized.
ride_src_df = (
    spark.read
    .format("jdbc")
    .options(
        url=cc.create_jdbc(),
        driver=cc.get_Property("driver"),
        dbtable="rides",
        user=cc.get_Property("username"),
        password=cc.get_Property("password"),
        partitionColumn="rideid",
        numPartitions=4,
        lowerBound=0,
        upperBound=5000,
    )
    .load()
)

In [5]:

# Step 2: Extract vehicles data
# Plain (non-partitioned) JDBC read of the `vehicles` table, using the same
# connection settings from ConnectionConfig as the other source reads.
vehicles_src_df = (
    spark.read
    .format("jdbc")
    .options(
        url=cc.create_jdbc(),
        driver=cc.get_Property("driver"),
        dbtable="vehicles",
        user=cc.get_Property("username"),
        password=cc.get_Property("password"),
    )
    .load()
)

In [6]:
# Extract bikelots data
# Plain (non-partitioned) JDBC read of the `bikelots` table; the join further
# down uses it to look up `biketypeid` for each vehicle via `bikelotid`.
bikelots_src_df = (
    spark.read
    .format("jdbc")
    .options(
        url=cc.create_jdbc(),
        driver=cc.get_Property("driver"),
        dbtable="bikelots",
        user=cc.get_Property("username"),
        password=cc.get_Property("password"),
    )
    .load()
)

In [7]:

# Step 3: Create temporary views
# Register every source DataFrame under the name the SQL query refers to.
# createOrReplaceTempView makes this cell safe to re-run.
for view_name, source_df in (
    ("rides_source", ride_src_df),
    ("vehicles_source", vehicles_src_df),
    ("bikelots_source", bikelots_src_df),
):
    source_df.createOrReplaceTempView(view_name)

In [8]:

# Step 4: Use SQL to join the sources and build the nested structure.
# The nested parts are kept as real struct columns (no to_json), so the JSON
# writer can emit them as nested objects.
#
# - start_coordinates / end_coordinates: the point text is split on ',';
#   element [0] with '(' stripped becomes `longitude` and element [1] with ')'
#   stripped becomes `latitude`, both cast to DOUBLE.
#   NOTE(review): assumes startpoint/endpoint arrive as text shaped like
#   "(x,y)" with x = longitude — confirm against the source column type.
# - vehicle: vehicle id from `vehicles_source` plus the bike type id looked up
#   through `bikelots_source`.
# - Both joins are LEFT JOINs, so rides without a matching vehicle or bikelot
#   are kept and get NULLs in the corresponding `vehicle` fields.
df_struct = spark.sql("""
    SELECT
        r.rideid AS ride_id,
        r.starttime AS start_time,
        r.endtime AS end_time,
        struct(
            CAST(REGEXP_REPLACE(SPLIT(r.startpoint, ',')[1], '[)]', '') AS DOUBLE) AS latitude,
            CAST(REGEXP_REPLACE(SPLIT(r.startpoint, ',')[0], '[(]', '') AS DOUBLE) AS longitude
        ) AS start_coordinates,
        struct(
            CAST(REGEXP_REPLACE(SPLIT(r.endpoint, ',')[1], '[)]', '') AS DOUBLE) AS latitude,
            CAST(REGEXP_REPLACE(SPLIT(r.endpoint, ',')[0], '[(]', '') AS DOUBLE) AS longitude
        ) AS end_coordinates,
        struct(
            v.vehicleid AS vehicle_id,
            b.biketypeid AS bike_type_id
        ) AS vehicle
    FROM rides_source r
    LEFT JOIN vehicles_source v ON r.vehicleid = v.vehicleid
    LEFT JOIN bikelots_source b ON v.bikelotid = b.bikelotid
""")

In [9]:
# Persist the nested result as JSON, replacing any previous output.
# repartition(1) collapses the data to a single partition before writing.
# NOTE(review): Spark treats the target path as an output directory, so
# "rides_with_vehicles.json" will be a folder holding the part file(s).
(
    df_struct
    .repartition(1)
    .write
    .mode("overwrite")
    .json("output/rides_with_vehicles.json")
)

In [10]:
# Sanity check: print the resulting schema to confirm the three nested structs
# (start_coordinates, end_coordinates, vehicle) and their field types.
df_struct.printSchema()

root
 |-- ride_id: integer (nullable = true)
 |-- start_time: timestamp (nullable = true)
 |-- end_time: timestamp (nullable = true)
 |-- start_coordinates: struct (nullable = false)
 |    |-- latitude: double (nullable = true)
 |    |-- longitude: double (nullable = true)
 |-- end_coordinates: struct (nullable = false)
 |    |-- latitude: double (nullable = true)
 |    |-- longitude: double (nullable = true)
 |-- vehicle: struct (nullable = false)
 |    |-- vehicle_id: integer (nullable = true)
 |    |-- bike_type_id: integer (nullable = true)

