In [15]:
##Extract
import random
import pyspark
from pyspark.sql import SparkSession, functions
import ConnectionConfig as cc
from pyspark.sql.functions import *

# Set up environment
cc.setupEnvironment()
cc.listEnvironment()

# Start local Spark cluster
spark = cc.startLocalCluster("vehicle_dim")
spark.getActiveSession()  # To get the active Spark session

# Extract data from the "vehicles" table
cc.set_connectionProfile("veloDB")

# EXTRACT: Loading data from the vehicles table into df_operational_vehicle_dim
df_operational_vehicle_dim = (spark.read
    .format("jdbc")
    .option("driver", cc.get_Property("driver")) 
    .option("url", cc.create_jdbc()) 
    .option("dbtable", "vehicles") 
    .option("user", cc.get_Property("username")) 
    .option("password", cc.get_Property("password")) 
    .option("partitionColumn", "vehicleid") 
    .option("numPartitions", 4) 
    .option("lowerBound", 1000) 
    .option("upperBound", 5000) 
    .load())

# Show the first 20 rows of the operational data to confirm extraction
df_operational_vehicle_dim.show(20)


ALLUSERSPROFILE: C:\ProgramData
APPDATA: C:\Users\dobis\AppData\Roaming
COMMONPROGRAMFILES: C:\Program Files\Common Files
COMMONPROGRAMFILES(X86): C:\Program Files (x86)\Common Files
COMMONPROGRAMW6432: C:\Program Files\Common Files
COMPUTERNAME: VIKI
COMSPEC: C:\WINDOWS\system32\cmd.exe
DRIVERDATA: C:\Windows\System32\Drivers\DriverData
GOPATH: C:\Users\dobis\go
HOMEDRIVE: C:
HOMEPATH: \Users\dobis
IGCCSVC_DB: AQAAANCMnd8BFdERjHoAwE/Cl+sBAAAAAdGjjLHLGEWRGn9vRsCSowQAAAACAAAAAAAQZgAAAAEAACAAAAD7UhTq8CVvkaUfJ5fXTR5kgkvcIed3OfwPabt1yHYIgAAAAAAOgAAAAAIAACAAAABSFZMhyRZv+fj9Q44MNd0sMMQVbnBwNGmcsxiNFFrAcmAAAAAQjo+0swEYFhn4kypkFiEe0Z+EUeRh+XkMWaxY6J5h885R6WUpGPQjsBjQtDBiDzTJJu/Eu8HKO9rNDQ2HtHCLXjOrbciSueB80zvNehaNnexWcFALkN4Q37FMwos4go9AAAAAsBkth/vA4x8SgTkTjgM6mv3GKidgi5oDWFMyb92y29Ab+MuztrSDRGCJMXxOGeO0p8LJ8WZBYOU66GQTsD0WFw==
IPY_INTERRUPT_EVENT: 3108
JPY_INTERRUPT_EVENT: 3108
JPY_PARENT_PID: 3188
JPY_SESSION_NAME: DWH_VehicleDim.ipynb
LANG: en_US.UTF-8
LANGUAGE: 
LC_ALL: en_US.UTF-8
LOCALAP

In [16]:
#Trasform
# Load tables from JDBC for bike lots and bike types
df_bikelots = spark.read \
    .format("jdbc") \
    .option("url", cc.create_jdbc()) \
    .option("driver", cc.get_Property("driver")) \
    .option("dbtable", "bikelots") \
    .option("user", cc.get_Property("username")) \
    .option("password", cc.get_Property("password")) \
    .load()

df_biketypes = spark.read \
    .format("jdbc") \
    .option("url", cc.create_jdbc()) \
    .option("driver", cc.get_Property("driver")) \
    .option("dbtable", "bike_types") \
    .option("user", cc.get_Property("username")) \
    .option("password", cc.get_Property("password")) \
    .load()

df_vehicles = spark.read \
    .format("jdbc") \
    .option("url", cc.create_jdbc()) \
    .option("driver", cc.get_Property("driver")) \
    .option("dbtable", "vehicles") \
    .option("user", cc.get_Property("username")) \
    .option("password", cc.get_Property("password")) \
    .load()

# Perform the JOIN operations to combine data
# Join vehicles -> bikelots -> biketypes
df_result = df_vehicles \
    .join(df_bikelots, df_vehicles.bikelotid == df_bikelots.bikelotid, "inner") \
    .join(df_biketypes, df_bikelots.biketypeid == df_biketypes.biketypeid, "inner") \
    .select(df_vehicles.vehicleid.alias("vehicle_id"), df_biketypes.biketypedescription)

# Show the resulting transformed DataFrame
df_result.show()
df_result.printSchema()  # Check the schema after transformation


+----------+-------------------+
|vehicle_id|biketypedescription|
+----------+-------------------+
|      3083|            Scooter|
|      3084|            Scooter|
|      3085|            Scooter|
|      3086|            Scooter|
|      3087|            Scooter|
|      3088|            Scooter|
|      3089|            Scooter|
|      3090|            Scooter|
|      3091|            Scooter|
|      3092|            Scooter|
|      3093|            Scooter|
|      3094|            Scooter|
|      3095|            Scooter|
|      3096|            Scooter|
|      3097|            Scooter|
|      3098|            Scooter|
|      3099|            Scooter|
|      3100|            Scooter|
|      3101|            Scooter|
|      3102|            Scooter|
+----------+-------------------+
only showing top 20 rows

root
 |-- vehicle_id: integer (nullable = true)
 |-- biketypedescription: string (nullable = true)



In [17]:
#Load
# Save the transformed data to a Delta table
spark.sql("DROP TABLE IF EXISTS vehicle_dim")

df_result.write \
    .format("delta") \
    .mode("overwrite") \
    .option("overwriteSchema", "true") \
    .option("path", "./spark-warehouse") \
    .saveAsTable("vehicle_dim")


# Confirmation that the data has been saved to the Delta table
print("Data successfully written to Delta table: vehicle_dim")

Data successfully written to Delta table: vehicle_dim


In [18]:
#Test
# Query the saved Delta table
spark.sql("SELECT * FROM vehicle_dim").show()


+----------+-------------------+
|vehicle_id|biketypedescription|
+----------+-------------------+
|      3083|            Scooter|
|      3084|            Scooter|
|      3085|            Scooter|
|      3086|            Scooter|
|      3087|            Scooter|
|      3088|            Scooter|
|      3089|            Scooter|
|      3090|            Scooter|
|      3091|            Scooter|
|      3092|            Scooter|
|      3093|            Scooter|
|      3094|            Scooter|
|      3095|            Scooter|
|      3096|            Scooter|
|      3097|            Scooter|
|      3098|            Scooter|
|      3099|            Scooter|
|      3100|            Scooter|
|      3101|            Scooter|
|      3102|            Scooter|
+----------+-------------------+
only showing top 20 rows



In [19]:
# Using Spark SQL to count rows
spark.sql("SELECT COUNT(*) FROM vehicle_dim").show()

+--------+
|count(1)|
+--------+
|    7000|
+--------+



In [20]:
spark.sql("DESCRIBE FORMATTED vehicle_dim").show(truncate=False)


+----------------------------+--------------------------------------------------------------------------+-------+
|col_name                    |data_type                                                                 |comment|
+----------------------------+--------------------------------------------------------------------------+-------+
|vehicle_id                  |int                                                                       |NULL   |
|biketypedescription         |string                                                                    |NULL   |
|                            |                                                                          |       |
|# Detailed Table Information|                                                                          |       |
|Name                        |spark_catalog.default.vehicle_dim                                         |       |
|Type                        |EXTERNAL                                                  

In [21]:
# Create a temporary view from the Delta table
spark.sql("CREATE OR REPLACE TEMP VIEW vehicle_dim_view AS SELECT * FROM vehicle_dim")


DataFrame[]