In [57]:
import ConnectionConfig as cc
from delta import DeltaTable
from datetime import datetime

In [58]:
cc.setupEnvironment()
spark = cc.startLocalCluster("dimUser")
spark.getActiveSession()

In [59]:
run_timestamp =datetime.now()

In [60]:
dt_dimUserCurrent = DeltaTable.forPath(spark,"./spark-warehouse/user_dim_current")

In [61]:
dt_dimUserCurrent.toDF().createOrReplaceTempView("user_dim_current")

In [62]:
spark.sql("select * from user_dim_current").show()

+------+-------------------+--------------------+--------------------+--------+-------+--------------------+------------+--------------------+-------------------+-------------------+--------------------+-------+
|userid|               name|               email|              street|  number|zipcode|                city|country_code|             user_sk|          scd_start|            scd_end|                 md5|current|
+------+-------------------+--------------------+--------------------+--------+-------+--------------------+------------+--------------------+-------------------+-------------------+--------------------+-------+
| 30001|   de Ruijter Maria|Maria.de.Ruijter@...|            Kallodam| 50 0802|   9120|Beveren-Waas/Haas...|          BE|e408b9fe-27d8-4ff...|1990-01-01 00:00:00|2100-12-12 00:00:00|ca6acc87cdd2dbf67...|   true|
| 30002|    Dijkstra Myrthe|Myrthe.Dijkstra@t...|    Lange Repeldreef|    539 |   2970|s Gravenwezel/Sch...|          BE|9ef0cf0d-b32e-406...|1990-01-01

In [63]:
cc.set_connectionProfile("veloDB")

df_operational_users_new = spark.read \
    .format("jdbc") \
    .option("driver", cc.get_Property("driver")) \
    .option("url", cc.create_jdbc()) \
    .option("dbtable", "velo_users") \
    .option("user", cc.get_Property("username")) \
    .option("password", cc.get_Property("password")) \
    .option("partitionColumn", "userid") \
    .option("numPartitions", 4) \
    .option("lowerBound", 1) \
    .option("upperBound", 60000) \
    .load()

df_operational_users_new.createOrReplaceTempView("operational_users")

df_operational_users_new.show(5)


+------+-----------------+--------------------+--------------------+--------+-------+-------------------+------------+
|userid|             name|               email|              street|  number|zipcode|               city|country_code|
+------+-----------------+--------------------+--------------------+--------+-------+-------------------+------------+
|     2|van der Zee Julia|Julia.van.der.Zee...|         Groenplaats|     43 |   2610|Wilrijk (Antwerpen)|          BE|
|     3|  de Boer Ricardo|Ricardo.de.Boer@g...|   Maria Clarastraat|     80 |   2160|          Wommelgem|          BE|
|     4|   Willems Angela|Angela.Willems@te...|Graaf Joseph de P...|     15 |   2900|            Schoten|          BE|
|     5| Heijnen Patricia|Patricia.Heijnen@...|          Meylstraat|    111 |   2540|               Hove|          BE|
|     6|   Driessen Anouk|Anouk.Driessen@sc...|   Jan Ockegemstraat|168 0107|   2650|             Edegem|          BE|
+------+-----------------+--------------------+-

In [64]:
#Transform
df_user_dim_new = spark.sql( """
    SELECT uuid() as source_user_sk,
           userid as source_user_id,
           name as source_user_name,
           email as source_user_email,
           street as source_street,
           number as source_number,
           zipcode as source_zipcode,  
           city as source_city,
           country_code as source_country_code,
           md5(concat(name, email)) as source_md5
    FROM operational_users
""")

df_user_dim_new.createOrReplaceTempView("user_dim_new")

# Debugging
df_user_dim_new.show()


+--------------------+--------------+--------------------+--------------------+--------------------+-------------+--------------+--------------------+-------------------+--------------------+
|      source_user_sk|source_user_id|    source_user_name|   source_user_email|       source_street|source_number|source_zipcode|         source_city|source_country_code|          source_md5|
+--------------------+--------------+--------------------+--------------------+--------------------+-------------+--------------+--------------------+-------------------+--------------------+
|2f33f20c-41eb-433...|             2|   van der Zee Julia|Julia.van.der.Zee...|         Groenplaats|          43 |          2610| Wilrijk (Antwerpen)|                 BE|4e2e54541cbdd9474...|
|235726d0-5211-4b9...|             3|     de Boer Ricardo|Ricardo.de.Boer@g...|   Maria Clarastraat|          80 |          2160|           Wommelgem|                 BE|12a0a84de074b278c...|
|b1ccc4e7-8429-4d5...|             4|   

In [65]:
detectedChanges = spark.sql("""
    select * from user_dim_new source \
    left outer join user_dim_current dwh on dwh.userid == source.source_user_id and dwh.current == true \
    where dwh.userid is null or dwh.md5 <> source.source_md5
""")

detectedChanges.createOrReplaceTempView("detectedChanges")
detectedChanges.show()


+--------------+--------------+----------------+-----------------+-------------+-------------+--------------+-----------+-------------------+----------+------+----+-----+------+------+-------+----+------------+-------+---------+-------+---+-------+
|source_user_sk|source_user_id|source_user_name|source_user_email|source_street|source_number|source_zipcode|source_city|source_country_code|source_md5|userid|name|email|street|number|zipcode|city|country_code|user_sk|scd_start|scd_end|md5|current|
+--------------+--------------+----------------+-----------------+-------------+-------------+--------------+-----------+-------------------+----------+------+----+-----+------+------+-------+----+------------+-------+---------+-------+---+-------+
+--------------+--------------+----------------+-----------------+-------------+-------------+--------------+-----------+-------------------+----------+------+----+-----+------+------+-------+----+------------+-------+---------+-------+---+-------+



In [66]:
df_user_upserts = spark.sql(f"""
    select source_user_sk as user_sk, \
    source_user_id as userid, \
    source_user_name as name, \
    source_user_email as email, \
    source_street as street, \
    source_number as number, \
    source_zipcode as zipcode, \
    source_city as city, \
    source_country_code as country_code, \
    to_timestamp('{run_timestamp}') as scd_start, \
    to_timestamp('2100-12-12', 'yyyy-MM-dd') as scd_end, \
    source_md5 as md5, \
    true as current \
    from detectedChanges \
    union \
    select user_sk, \
    userid, \
    name, \
    email, \
    street,\
    number,\
    zipcode,\
    city,\
    country_code,\
    scd_start,\
    to_timestamp('{run_timestamp}') as scd_end, \
    md5, \
    false \
    from detectedChanges \
    where current is not null
""")

df_user_upserts.createOrReplaceTempView("upserts")
df_user_upserts.show()

+-------+------+----+-----+------+------+-------+----+------------+---------+-------+---+-------+
|user_sk|userid|name|email|street|number|zipcode|city|country_code|scd_start|scd_end|md5|current|
+-------+------+----+-----+------+------+-------+----+------------+---------+-------+---+-------+
+-------+------+----+-----+------+------+-------+----+------------+---------+-------+---+-------+



In [67]:

# DEBUG CODE TO SHOW CONTENT OF UPSERTS
spark.sql("select * from upserts").show()

+-------+------+----+-----+------+------+-------+----+------------+---------+-------+---+-------+
|user_sk|userid|name|email|street|number|zipcode|city|country_code|scd_start|scd_end|md5|current|
+-------+------+----+-----+------+------+-------+----+------------+---------+-------+---+-------+
+-------+------+----+-----+------+------+-------+----+------------+---------+-------+---+-------+



In [68]:

spark.sql("""
    MERGE INTO user_dim_current AS target
    USING upserts AS source
    ON target.userid = source.userid AND source.current = false and target.current = true

    WHEN MATCHED THEN UPDATE SET scd_end = source.scd_end, current = source.current
    WHEN NOT MATCHED THEN INSERT (user_sk, userid, name, email, street, number, zipcode, city, country_code, scd_start, scd_end, md5, current) values (source.user_sk, source.userid, source.name, source.email, source.street, source.number, source.zipcode, source.city, source.country_code, source.scd_start, source.scd_end, source.md5, source.current)
""")


DataFrame[num_affected_rows: bigint, num_updated_rows: bigint, num_deleted_rows: bigint, num_inserted_rows: bigint]

In [69]:
dt_dimUserCurrent.toDF().sort("userid", "scd_start").show(100)

+------+--------------------+--------------------+--------------------+--------+-------+--------------------+------------+--------------------+-------------------+-------------------+--------------------+-------+
|userid|                name|               email|              street|  number|zipcode|                city|country_code|             user_sk|          scd_start|            scd_end|                 md5|current|
+------+--------------------+--------------------+--------------------+--------+-------+--------------------+------------+--------------------+-------------------+-------------------+--------------------+-------+
|     1|         Bouman Lars|Lars.Bouman@gmail...|         Somméstraat|    156 |   2060|           Antwerpen|          BE|6d2ac2be-e3bd-484...|1990-01-01 00:00:00|2100-12-12 00:00:00|26b6b7d4dd32d5cd0...|   true|
|     2|   van der Zee Julia|Julia.van.der.Zee...|          Europalaan|     43 |   2610| Wilrijk (Antwerpen)|          BE|cdd69744-9095-476...|1990-

In [70]:
spark.stop()