In [1]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
import os

# Drop PYSPARK_SUBMIT_ARGS (when present) so that the 'spark.jars.packages'
# setting below works correctly
os.environ.pop('PYSPARK_SUBMIT_ARGS', None)

# Maven coordinates of the extra packages to load: the PostgreSQL JDBC
# driver and the Spark SQL Kafka source
jdbc_driver_packages = 'org.postgresql:postgresql:42.2.9,org.apache.spark:spark-sql-kafka-0-10_2.11:2.2.3'

# Collect every setting in a single configuration object: the package list
# plus some basic cluster sizing parameters
spark_conf = SparkConf()
spark_conf.setAll([
    ('spark.jars.packages', jdbc_driver_packages),
    ('spark.cores.max', 2),
    ('spark.executor.cores', '1'),
])

# Hostname or IP address of the Spark cluster, taken from the environment
spark_cluster = os.environ['SPARK_CLUSTER']

# Connect to the standalone master on port 7077, reusing an existing session
# if one is already running
spark = (
    SparkSession.builder
    .master('spark://{cluster}:7077'.format(cluster=spark_cluster))
    .appName('Spark-Demo')
    .config(conf=spark_conf)
    .getOrCreate()
)

In [2]:
# Sanity check: the PostgreSQL driver should be listed in 'spark.jars'.
# Defaulting to '' makes the check evaluate to False, instead of raising
# TypeError on None, when 'spark.jars' is not set at all.
'postgresql' in spark.sparkContext.getConf().get('spark.jars', '')

True

In [3]:
import pandas as pd
# Read the CSV with pandas first, then convert the pandas frame into a
# Spark DataFrame
fraud_pdf = pd.read_csv("fraud.csv")
transactionDF = spark.createDataFrame(fraud_pdf)

# Preview the first five rows
transactionDF.show(n=5)

+----------+----------+-------+------+-----------+-----------+-------+
| timestamp|     label|user_id|amount|merchant_id| trans_type|foreign|
+----------+----------+-------+------+-----------+-----------+-------+
|1591150620|legitimate|      8|  9.42|          4|     online|  false|
|1591150669|legitimate|      9| 36.38|         11|contactless|  false|
|1591152891|legitimate|      7| 10.56|          8|     online|  false|
|1591157902|legitimate|      9| 25.09|         11|      swipe|  false|
|1591157988|legitimate|      8| 17.73|          4|     manual|  false|
+----------+----------+-------+------+-----------+-----------+-------+
only showing top 5 rows



In [4]:
# User lookup table: a user's id is the position of the name in this list
names = ["Jake", "Sherman", "Morgan", "Bodie", "Ben", "Elwood", "Sandy", "Clover", "Molly", "Linsey"]
# enumerate() yields (id, name) pairs for however many names there are, so
# the row count can never drift out of sync with the list
raw = list(enumerate(names))
tdf = spark.createDataFrame(raw, ['user_id', 'user_name'])
# Cast user_id to int before publishing the frame
userDF = tdf.select(tdf.user_id.cast("int"), tdf.user_name)
userDF.show(5)

+-------+---------+
|user_id|user_name|
+-------+---------+
|      0|     Jake|
|      1|  Sherman|
|      2|   Morgan|
|      3|    Bodie|
|      4|      Ben|
+-------+---------+
only showing top 5 rows



In [5]:
# Merchant lookup table: a merchant's id is the position of the name in this list
names = ["Changing Hands", "First Draft", "Anaya's", "SBUX", "Ike's", "Bob's Burgers", "Lovin Spoonful", "Ayse's", "Dawn Treader", "Taco Hut", "Johnny's Hots", "Pavement", "Victrola", "The Barn", "Clockwork"]
# enumerate() yields (id, name) pairs for however many names there are, so
# the row count can never drift out of sync with the list
raw = list(enumerate(names))
tdf = spark.createDataFrame(raw, ['merchant_id', 'merchant_name'])
# Cast merchant_id to int before publishing the frame
merchantDF = tdf.select(tdf.merchant_id.cast("int"), tdf.merchant_name)
merchantDF.show(5)

+-----------+--------------+
|merchant_id| merchant_name|
+-----------+--------------+
|          0|Changing Hands|
|          1|   First Draft|
|          2|       Anaya's|
|          3|          SBUX|
|          4|         Ike's|
+-----------+--------------+
only showing top 5 rows



In [6]:
# Expose the three DataFrames to Spark SQL under fixed names.
# createOrReplaceTempView replaces registerTempTable, which has been
# deprecated since Spark 2.0; re-running the cell simply replaces the views.
transactionDF.createOrReplaceTempView("transactions")
userDF.createOrReplaceTempView("users")
merchantDF.createOrReplaceTempView("merchants")

In [7]:
# Replace the numeric user and merchant ids of every transaction with the
# matching names. Both joins are LEFT joins, so a transaction is kept even
# when its user_id or merchant_id has no match in the lookup tables.
joinq = """
SELECT timestamp, user_name, merchant_name, foreign, trans_type, amount FROM
transactions
left join users on transactions.user_id = users.user_id
left join merchants on transactions.merchant_id = merchants.merchant_id
"""

# Run the query against the temp tables named transactions, users and merchants
joinDF = spark.sql(joinq)

In [8]:
# Preview the first five joined rows
joinDF.show(n=5)

+----------+---------+--------------+-------+------------+------+
| timestamp|user_name| merchant_name|foreign|  trans_type|amount|
+----------+---------+--------------+-------+------------+------+
|1592832148|   Clover|Changing Hands|  false| contactless| 20.61|
|1592885322|   Clover|Changing Hands|   true|      online| 26.98|
|1592966814|   Clover|Changing Hands|   true|      online| 16.64|
|1593151665|   Clover|Changing Hands|  false|chip_and_pin| 38.89|
|1594246175|   Clover|Changing Hands|  false|      online| 20.65|
+----------+---------+--------------+-------+------------+------+
only showing top 5 rows



In [9]:
from pyspark.sql import functions as F
# Mean transaction amount per user, listed from highest to lowest
g = (
    joinDF
    .groupBy("user_name")
    .agg(F.mean("amount").alias("avg"))
)
g.orderBy(F.col("avg").desc()).show()

+---------+------------------+
|user_name|               avg|
+---------+------------------+
|   Linsey| 42.80322639780018|
|    Sandy| 40.50369382022472|
|   Morgan| 39.76930656934306|
|    Molly| 36.44709313264344|
|   Elwood| 35.53930167597765|
|    Bodie| 34.17701121794873|
|   Clover|  33.9838043478261|
|  Sherman|33.636657608695636|
|     Jake|30.170527369826427|
|      Ben|26.919920948616625|
+---------+------------------+



In [10]:
# Mean transaction amount per merchant, listed from highest to lowest
g = (
    joinDF
    .groupBy("merchant_name")
    .agg(F.mean("amount").alias("avg"))
)
g.orderBy(F.col("avg").desc()).show()

+--------------+------------------+
| merchant_name|               avg|
+--------------+------------------+
|        Ayse's|52.653333333333336|
| Bob's Burgers| 47.96741935483872|
|      Victrola| 44.00807692307693|
|          SBUX| 42.44407407407408|
|      Pavement|38.924850213980044|
|     Clockwork| 38.50538082191775|
|Lovin Spoonful|34.724619771863075|
|         Ike's| 34.19377874186547|
|  Dawn Treader|31.942931896883348|
| Johnny's Hots| 30.46848484848485|
|      The Barn|27.206857142857146|
|      Taco Hut| 26.23961538461538|
|   First Draft|23.740312500000005|
|Changing Hands|23.066451612903226|
|       Anaya's|22.162608695652178|
+--------------+------------------+



In [11]:
# Connection settings for the demo PostgreSQL database. Every value can be
# overridden through an environment variable so that real hosts and
# credentials never need to be written into the notebook; the defaults are
# the values the notebook has always used.
spark_jdbc_url = 'jdbc:postgresql://{host}:{port}/{database}'.format(
    host=os.environ.get('DEMO_DB_HOST', 'postgresql'),
    port=os.environ.get('DEMO_DB_PORT', '5432'),
    database=os.environ.get('DEMO_DB_NAME', 'demo'),
)

# Properties passed to the JDBC reader/writer: login and driver class
spark_jdbc_prop = {
    'user': os.environ.get('DEMO_DB_USER', 'demo'),
    'password': os.environ.get('DEMO_DB_PASSWORD', 'demo'),
    'driver': 'org.postgresql.Driver',
}

In [12]:
# Store the transactions in the database table 'transactions', replacing
# whatever the table held before
transactionDF.write.mode('overwrite').jdbc(
    url=spark_jdbc_url,
    table='transactions',
    properties=spark_jdbc_prop,
)

In [13]:
# Read the transactions back over JDBC. The SELECT statement is wrapped in
# parentheses and given the alias 'tmp' so that it can be passed as `table`.
jdbc_source = '({q}) tmp'.format(q='select * from transactions')
transactionSQLDF = spark.read.jdbc(
    url=spark_jdbc_url,
    table=jdbc_source,
    properties=spark_jdbc_prop,
)

# Preview the first five rows
transactionSQLDF.show(n=5)

+----------+----------+-------+------+-----------+------------+-------+
| timestamp|     label|user_id|amount|merchant_id|  trans_type|foreign|
+----------+----------+-------+------+-----------+------------+-------+
|1598608959|legitimate|      8| 18.76|          4|      online|  false|
|1598609843|legitimate|      3| 129.7|          8|      online|   true|
|1598610775|legitimate|      9| 42.84|         14|       swipe|  false|
|1598610872|legitimate|      1| 28.08|          6|chip_and_pin|  false|
|1598610890|legitimate|      2| 10.29|         11|      online|  false|
+----------+----------+-------+------+-----------+------------+-------+
only showing top 5 rows



In [14]:
# Expose the JDBC-backed DataFrame to Spark SQL. createOrReplaceTempView
# replaces registerTempTable, which has been deprecated since Spark 2.0.
transactionSQLDF.createOrReplaceTempView("transactionsSQL")

# Same name-lookup join as for the CSV data, this time reading the
# transactions from the database-backed view. LEFT joins keep transactions
# whose user_id or merchant_id has no match.
joinq = """
SELECT timestamp, user_name, merchant_name, foreign, trans_type, amount FROM
transactionsSQL
left join users on transactionsSQL.user_id = users.user_id
left join merchants on transactionsSQL.merchant_id = merchants.merchant_id
"""
joinSQLDF = spark.sql(joinq)
joinSQLDF.show(5)

+----------+---------+--------------+-------+------------+------+
| timestamp|user_name| merchant_name|foreign|  trans_type|amount|
+----------+---------+--------------+-------+------------+------+
|1593029512|     Jake|Changing Hands|  false|chip_and_pin|  16.2|
|1601491845|     Jake|Changing Hands|  false|chip_and_pin| 16.37|
|1595585679|     Jake|Changing Hands|  false|      manual| 15.51|
|1596822054|     Jake|Changing Hands|  false| contactless| 15.15|
|1592832148|   Clover|Changing Hands|  false| contactless| 20.61|
+----------+---------+--------------+-------+------------+------+
only showing top 5 rows



In [15]:
# Mean transaction amount per user from the database-backed join,
# listed from highest to lowest
g = (
    joinSQLDF
    .groupBy("user_name")
    .agg(F.mean("amount").alias("avg"))
)
g.orderBy(F.col("avg").desc()).show()

+---------+------------------+
|user_name|               avg|
+---------+------------------+
|   Linsey|42.803226397800174|
|    Sandy| 40.50369382022472|
|   Morgan| 39.76930656934306|
|    Molly| 36.44709313264344|
|   Elwood|35.539301675977654|
|    Bodie| 34.17701121794872|
|   Clover|33.983804347826094|
|  Sherman| 33.63665760869564|
|     Jake|30.170527369826434|
|      Ben|26.919920948616618|
+---------+------------------+



In [16]:
# Mean transaction amount per merchant from the database-backed join,
# listed from highest to lowest
g = (
    joinSQLDF
    .groupBy("merchant_name")
    .agg(F.mean("amount").alias("avg"))
)
g.orderBy(F.col("avg").desc()).show()

+--------------+------------------+
| merchant_name|               avg|
+--------------+------------------+
|        Ayse's|52.653333333333336|
| Bob's Burgers|47.967419354838704|
|      Victrola|44.008076923076935|
|          SBUX|42.444074074074074|
|      Pavement|38.924850213980044|
|     Clockwork| 38.50538082191772|
|Lovin Spoonful| 34.72461977186314|
|         Ike's|34.193778741865515|
|  Dawn Treader| 31.94293189688334|
| Johnny's Hots| 30.46848484848485|
|      The Barn|27.206857142857135|
|      Taco Hut| 26.23961538461538|
|   First Draft|23.740312500000005|
|Changing Hands|23.066451612903226|
|       Anaya's|22.162608695652178|
+--------------+------------------+



In [17]:
# Subscribe to the Kafka topic, starting from the latest offsets
kafka_messages = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "odh-message-bus-kafka-bootstrap:9092")
    .option("subscribe", "demotopic")
    .option("startingOffsets", "latest")
    .load()
)

# Each message value is decoded as a JSON document using the schema of
# transactionDF and kept under the struct column 'json'
parsed_messages = kafka_messages.select(
    F.from_json(F.col("value").cast("string"), transactionDF.schema).alias("json")
)

# Flatten the struct into the transaction columns needed by the joins below
transactionsStreamDF = parsed_messages.select(
    "json.timestamp",
    "json.user_id",
    "json.merchant_id",
    "json.trans_type",
    "json.foreign",
    "json.amount",
)

In [18]:
# Expose the streaming DataFrame to Spark SQL. createOrReplaceTempView
# replaces registerTempTable, which has been deprecated since Spark 2.0.
transactionsStreamDF.createOrReplaceTempView("transactionsStream")

# Same name-lookup join as for the batch data, applied to the stream.
# LEFT joins keep transactions whose user_id or merchant_id has no match.
joinq = """
SELECT timestamp, user_name, merchant_name, foreign, trans_type, amount FROM
transactionsStream
left join users on transactionsStream.user_id = users.user_id
left join merchants on transactionsStream.merchant_id = merchants.merchant_id
"""
joinStreamDF = spark.sql(joinq)

In [19]:
def _start_avg_amount_query(stream_df, group_col, query_name):
    """Start a streaming query that keeps the mean amount per group in memory.

    Args:
        stream_df: streaming DataFrame with an ``amount`` column and the
            column named by ``group_col``.
        group_col: name of the column to group by.
        query_name: name of the query; the in-memory result table is
            readable under this name via ``spark.table(query_name)``.

    Returns:
        The started streaming query; call ``.stop()`` on it when done.
    """
    averages = (
        stream_df
        .groupBy(group_col)
        .agg(F.mean("amount").alias("avg"))
        .sort(F.desc("avg"))
    )
    return (
        averages.writeStream
        .trigger(processingTime='1 seconds')
        # 'complete' rewrites the whole aggregated table on every trigger
        .outputMode("complete")
        .format("memory")
        .queryName(query_name)
        .start()
    )


# One query per grouping; both read from the same joined stream
streamquery_user = _start_avg_amount_query(joinStreamDF, "user_name", "stream_results_user")
streamquery_merchant = _start_avg_amount_query(joinStreamDF, "merchant_name", "stream_results_merchant")

In [24]:
# Show the current contents of both in-memory result tables; re-run this
# cell to see newer averages
for result_table in ("stream_results_user", "stream_results_merchant"):
    spark.table(result_table).show()

+---------+------------------+
|user_name|               avg|
+---------+------------------+
|   Linsey| 68.18384615384616|
|   Morgan| 60.42649999999999|
|   Elwood|           46.9708|
|    Bodie| 37.15765957446808|
|    Molly| 33.91945945945945|
|     Jake| 31.00166666666667|
|   Clover|            30.175|
|  Sherman| 28.55115384615385|
|      Ben|            28.025|
|    Sandy|23.291999999999998|
+---------+------------------+

+--------------+------------------+
| merchant_name|               avg|
+--------------+------------------+
|        Ayse's|            126.72|
|      Pavement| 48.94244897959183|
|         Ike's|46.383939393939414|
|     Clockwork|37.025238095238095|
| Bob's Burgers|             34.19|
|      Taco Hut|            28.965|
|  Dawn Treader|27.689655172413794|
|Lovin Spoonful| 24.64275862068966|
|          SBUX|             20.08|
| Johnny's Hots|             14.71|
|Changing Hands|             11.75|
+--------------+------------------+



In [25]:
# Shut down both streaming queries, user query first
for running_query in (streamquery_user, streamquery_merchant):
    running_query.stop()