In [14]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

In [10]:
# Obtain the Spark session for this notebook; getOrCreate() reuses a session
# that already exists instead of building a second one.
spark = (
    SparkSession.builder
    .appName("Retail Test")
    .getOrCreate()
)

In [45]:
# Set the session's "spark.sql.shuffle.partitions" option to 5.
spark.conf.set("spark.sql.shuffle.partitions", "5")

In [46]:
# Batch-read every per-day retail CSV. The first row of each file is treated as
# a header and column types are inferred from the data.
staticDataFrame = (
    spark.read
    .format("csv")
    .option("inferSchema", "true")
    .option("header", "true")
    .load("../../data/retail-data/by-day/*.csv")
)

In [47]:
# Expose the static DataFrame to Spark SQL under the name "retail_data".
# createOrReplaceTempView is the supported replacement for registerTempTable
# (deprecated since Spark 2.0); it replaces any existing view of the same name,
# so this cell can be re-run safely.
staticDataFrame.createOrReplaceTempView("retail_data")

# Keep the inferred schema so the streaming reader further down can be given an
# explicit schema.
staticSchema = staticDataFrame.schema

In [48]:
# Rank customers by the mean value of their invoice lines
# (UnitPrice * Quantity) and keep the IDs of the five highest.
best_customer_df = (
    staticDataFrame
    .withColumn("TOTAL", F.col("UnitPrice") * F.col("Quantity"))
    .groupBy("CustomerID")
    .agg(F.avg("TOTAL").alias("AvgSpending"))
    .orderBy(F.col("AvgSpending").desc())
    .select("CustomerID")
    .limit(5)
)

# Pull the (at most five) rows to the driver and unwrap them into a plain list.
best_customers = [row["CustomerID"] for row in best_customer_df.collect()]

In [49]:
# Display the collected list of top customer IDs.
best_customers

[15195.0, 13135.0, 17846.0, 16532.0, 15749.0]

In [50]:
# Batch version of the aggregation: total spend per customer per 1-day window
# of InvoiceDate, printed with show()'s default row limit.
(
    staticDataFrame
    .selectExpr(
        "CustomerId",
        "(UnitPrice * Quantity) as total_cost",
        "InvoiceDate",
    )
    .groupBy(
        F.col("CustomerId"),
        F.window(F.col("InvoiceDate"), "1 day"),
    )
    .sum("total_cost")
    .show()
)

+----------+--------------------+------------------+
|CustomerId|              window|   sum(total_cost)|
+----------+--------------------+------------------+
|   14075.0|[2011-12-05 00:00...|316.78000000000003|
|   18180.0|[2011-12-05 00:00...|            310.73|
|   15358.0|[2011-12-05 00:00...| 830.0600000000003|
|   15392.0|[2011-12-05 00:00...|304.40999999999997|
|   15290.0|[2011-12-05 00:00...|263.02000000000004|
|   16811.0|[2011-12-05 00:00...|             232.3|
|   12748.0|[2011-12-05 00:00...| 363.7899999999999|
|   16500.0|[2011-12-05 00:00...| 52.74000000000001|
|   16873.0|[2011-12-05 00:00...|1854.8300000000002|
|   14060.0|[2011-12-05 00:00...|297.47999999999996|
|   14649.0|[2011-12-05 00:00...| 513.9899999999998|
|   16904.0|[2011-12-05 00:00...| 349.0200000000001|
|   17857.0|[2011-12-05 00:00...|            2979.6|
|   14083.0|[2011-12-05 00:00...| 446.5700000000001|
|   14777.0|[2011-12-05 00:00...|             -2.95|
|   16684.0|[2011-12-05 00:00...| 5401.9799999

In [51]:
# Streaming read of the same CSV files, using the schema captured from the
# static read. maxFilesPerTrigger=1 limits each trigger to one new file.
streaminDataFrame = (
    spark.readStream
    .format("csv")
    .schema(staticSchema)
    .option("header", "true")
    .option("maxFilesPerTrigger", 1)
    .load("../../data/retail-data/by-day/*.csv")
)

In [52]:
# Check whether this DataFrame is a streaming DataFrame.
streaminDataFrame.isStreaming

True

In [53]:
# Streaming counterpart of the static aggregation: total spend per customer per
# window of InvoiceDate.
# NOTE: despite "PerHour" in the variable name, the window below is "1 day".
purchaseByCustomerPerHour = (
    streaminDataFrame
    .selectExpr(
        "CustomerId",
        "(UnitPrice * Quantity) as total_cost",
        "InvoiceDate",
    )
    .groupBy(
        F.col("CustomerId"),
        F.window(F.col("InvoiceDate"), "1 day"),
    )
    .sum("total_cost")
)

In [54]:
# A query name can only be active once per session, so re-running this cell
# would otherwise fail. Stop any earlier run of this query before starting a
# new one.
for active_query in spark.streams.active:
    if active_query.name == "customer_purchases":
        active_query.stop()

# Start the streaming aggregation, writing the complete result table to an
# in-memory table named "customer_purchases" that can be queried with Spark SQL.
# The StreamingQuery handle is kept so the query can later be inspected
# (.status, .lastProgress) or stopped (.stop()).
customer_purchases_query = (
    purchaseByCustomerPerHour.writeStream
    .format("memory")
    .queryName("customer_purchases")
    .outputMode("complete")
    .start()
)

# Leave the handle as the cell's last expression so it is still displayed.
customer_purchases_query

<pyspark.sql.streaming.StreamingQuery at 0x7f37b19d95b0>

In [66]:
# Query the in-memory table fed by the streaming query and print the five
# largest per-customer, per-window totals. The result reflects whatever the
# stream has processed at the moment this cell runs.
top_purchases = spark.sql("""
SELECT *
FROM customer_purchases
ORDER BY `sum(total_cost)` DESC
""")
top_purchases.show(5)

+----------+--------------------+------------------+
|CustomerId|              window|   sum(total_cost)|
+----------+--------------------+------------------+
|      null|[2011-03-29 00:00...| 33521.39999999998|
|      null|[2010-12-21 00:00...|31347.479999999938|
|   18102.0|[2010-12-07 00:00...|          25920.37|
|      null|[2010-12-10 00:00...|25399.560000000012|
|      null|[2010-12-17 00:00...|25371.769999999768|
+----------+--------------------+------------------+
only showing top 5 rows



In [65]:
# The cell previously held the incomplete attribute reference `.st`; the
# recorded output (True) corresponds to the isStreaming flag, so the full
# attribute name is restored here.
streaminDataFrame.isStreaming

True