In [8]:
import ConnectionConfig as cc
from pyspark.sql.functions import col
from pyspark.sql.functions import to_date, count
from pyspark.sql.functions import date_format

cc.setupEnvironment()
cc.listEnvironment()

spark = cc.startLocalCluster("neo4j")
spark.getActiveSession()

df_fact_rides = spark.read.parquet("spark-warehouse/factrides_pq")
df_dim_user = spark.read.parquet("spark-warehouse/dimuser_pq")
df_dim_vehicle = spark.read.parquet("spark-warehouse/dimvehicle_pq")
df_dim_date = spark.read.parquet("spark-warehouse/dimdate_pq")
df_dim_lock = spark.read.parquet("spark-warehouse/dimlock_pq")

df_fact_rides.show(5)
df_dim_user.show(5)
df_dim_vehicle.show(5)
df_dim_date.show(5)
df_dim_lock.show(5)

ALLUSERSPROFILE: C:\ProgramData
APPDATA: C:\Users\dobis\AppData\Roaming
COMMONPROGRAMFILES: C:\Program Files\Common Files
COMMONPROGRAMFILES(X86): C:\Program Files (x86)\Common Files
COMMONPROGRAMW6432: C:\Program Files\Common Files
COMPUTERNAME: VIKI
COMSPEC: C:\WINDOWS\system32\cmd.exe
DRIVERDATA: C:\Windows\System32\Drivers\DriverData
GOPATH: C:\Users\dobis\go
HOMEDRIVE: C:
HOMEPATH: \Users\dobis
IGCCSVC_DB: AQAAANCMnd8BFdERjHoAwE/Cl+sBAAAAAdGjjLHLGEWRGn9vRsCSowQAAAACAAAAAAAQZgAAAAEAACAAAAD7UhTq8CVvkaUfJ5fXTR5kgkvcIed3OfwPabt1yHYIgAAAAAAOgAAAAAIAACAAAABSFZMhyRZv+fj9Q44MNd0sMMQVbnBwNGmcsxiNFFrAcmAAAAAQjo+0swEYFhn4kypkFiEe0Z+EUeRh+XkMWaxY6J5h885R6WUpGPQjsBjQtDBiDzTJJu/Eu8HKO9rNDQ2HtHCLXjOrbciSueB80zvNehaNnexWcFALkN4Q37FMwos4go9AAAAAsBkth/vA4x8SgTkTjgM6mv3GKidgi5oDWFMyb92y29Ab+MuztrSDRGCJMXxOGeO0p8LJ8WZBYOU66GQTsD0WFw==
IPY_INTERRUPT_EVENT: 1784
JPY_INTERRUPT_EVENT: 1784
JPY_PARENT_PID: 1788
JPY_SESSION_NAME: Neo4J_Data_Preparation.ipynb
LANG: en_US.UTF-8
LANGUAGE: 
LC_ALL: en_US.UTF-8

In [9]:
df_operational_rides_for_time = (spark.read
    .format("jdbc")
    .option("driver", cc.get_Property("driver"))
    .option("url", cc.create_jdbc())
    .option("dbtable", "rides")
    .option("user", cc.get_Property("username"))
    .option("password", cc.get_Property("password"))
    .load())

In [15]:
# Convert date_sk to a date
df_dim_date = df_dim_date.withColumnRenamed("date", "full_date")
df_operational_time_start = df_operational_rides_for_time.withColumn("start_time", date_format("starttime", "HH:mm:ss"))
df_operational_time_end = df_operational_rides_for_time.withColumn("end_time", date_format("endtime", "HH:mm:ss"))
df_operational_time_start = df_operational_time_start.withColumnRenamed("rideid", "ride_id")
df_operational_time_end = df_operational_time_end.withColumnRenamed("rideid", "ride_id")

# Join to get the actual dates
df_rides_enriched_v0 = df_fact_rides.join(df_dim_date, "date_sk")
df_rides_enriched_v1 = df_rides_enriched_v0.join(df_operational_time_start, "ride_id")
df_rides_enriched = df_rides_enriched_v1.join(df_operational_time_end, "ride_id")

# Filter for 2 days
top_days = ["2021-02-14", "2021-02-15"]
df_filtered = df_rides_enriched.filter(df_rides_enriched.full_date.isin(top_days))

df_filtered.show(5)

+-------+--------+--------------------+-------------+-----------+----------+----------+-----------------+-------------+--------------------+----------+----+----------+--------+--------+------+----------+-------+-----------------+-----------------+-------------------+-------------------+---------+--------------+-----------+---------+----------+-----------------+-----------------+-------------------+-------------------+---------+--------------+-----------+---------+--------+
|ride_id| date_sk|             user_sk|start_lock_id|end_lock_id|vehicle_id|weather_id|    ride_distance|ride_duration|                 md5| full_date|year|month_name|month_nr|day_name|day_nr|is_weekday|quarter|       startpoint|         endpoint|          starttime|            endtime|vehicleid|subscriptionid|startlockid|endlockid|start_time|       startpoint|         endpoint|          starttime|            endtime|vehicleid|subscriptionid|startlockid|endlockid|end_time|
+-------+--------+--------------------+-----

In [16]:
# Join all relevant tables
df_joined = (
    df_filtered
    .join(df_dim_user, "user_sk", "left")
    .join(df_dim_vehicle.withColumnRenamed("biketypeid", "vehicle_id"), "vehicle_id", "left")
    .join(df_dim_lock.withColumnRenamed("lock_id", "start_lock_id"), "start_lock_id", "left")
    .withColumnRenamed("station_id", "start_station_id")
    .withColumnRenamed("district", "start_district")
    .join(df_dim_lock.withColumnRenamed("lock_id", "end_lock_id"), "end_lock_id", "left")
    .withColumnRenamed("station_id", "end_station_id")
    .withColumnRenamed("district", "end_district")
)

# Select ride-level info
df_rides_json = df_joined.select(
    "ride_id", "full_date", "ride_distance", "ride_duration",
    "user_sk", "vehicle_id",
    "start_lock_id", "start_station_id", "start_district",
    "end_lock_id", "end_station_id", "end_district", "start_time", "end_time"
)

df_rides_json.coalesce(1).write.mode("overwrite").json("output_json/rides.json")

In [17]:
df_check = spark.read.json("output_json/rides.json")
df_check.show(truncate=False)
df_check.printSchema()

+------------+-----------+--------------+--------+----------+------------------+-------------+-------+--------------+-------------+----------------+----------+------------------------------------+----------+
|end_district|end_lock_id|end_station_id|end_time|full_date |ride_distance     |ride_duration|ride_id|start_district|start_lock_id|start_station_id|start_time|user_sk                             |vehicle_id|
+------------+-----------+--------------+--------+----------+------------------+-------------+-------+--------------+-------------+----------------+----------+------------------------------------+----------+
|ANTWERPEN   |1800       |85            |21:36:32|2021-02-15|2.473831156859431 |13.02        |1398115|ANTWERPEN     |664          |32              |21:23:31  |8442487d-e801-4603-985e-34f9f02249e6|1         |
|ANTWERPEN   |3166       |149           |22:05:50|2021-02-15|2.403725037952006 |27.08        |1398118|BORGERHOUT    |4642         |196             |21:38:45  |39e55f17-

In [18]:
df_users_json = df_dim_user.select("user_sk", "name", "email").dropDuplicates()
df_users_json.write.mode("overwrite").json("output_json/users.json")

df_stations_json = df_dim_lock.select("station_id", "station_nr", "gps_coord").dropDuplicates()
df_stations_json.write.mode("overwrite").json("output_json/stations.json")

df_neighborhoods_json = df_dim_lock.select("station_id", "district").dropDuplicates()
df_neighborhoods_json.write.mode("overwrite").json("output_json/neighborhoods.json")

df_vehicles_json = df_dim_vehicle.select("biketypeid", "biketypedescription").dropDuplicates()
df_vehicles_json.write.mode("overwrite").json("output_json/vehicles.json")