In [17]:
# Use findspark to locate the pyspark installation so that the pyspark imports
# in the following cells can be resolved.
# NOTE(review): presumably this relies on a local Spark install being discoverable
# (e.g. via SPARK_HOME) — confirm for the target machine.
import findspark
findspark.init()

In [18]:
from pyspark.sql import SparkSession

In [19]:
# Create the notebook's Spark session (or reuse one that already exists),
# registered under the application name 'Ops'.
spark = (
    SparkSession.builder
    .appName('Ops')
    .getOrCreate()
)

In [20]:
# Batch-read every per-day retail CSV: the first row of each file is treated as a
# header and column types are inferred from the data.
# NOTE(review): the path is an absolute, machine-specific location.
staticDataFrame = (
    spark.read
    .format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load(r"C:\Users\pohch\Desktop\Spark\spark-2.4.4-bin-hadoop2.7\mnt\defg\retail-data\by-day\*.csv")
)

# Keep the inferred schema so the streaming reader further down can reuse it.
staticSchema = staticDataFrame.schema

# Make the batch data queryable from SQL under the name "retail_data".
staticDataFrame.createOrReplaceTempView("retail_data")



In [21]:
from pyspark.sql.functions import window, column, desc, col
# Per customer and per one-day window of InvoiceDate, sum UnitPrice * Quantity
# and print the five largest totals.
(
    staticDataFrame
    .selectExpr(
        "CustomerId",
        "(UnitPrice * Quantity) as total_cost",
        "InvoiceDate",
    )
    .groupBy(col("CustomerId"), window(col("InvoiceDate"), "1 day"))
    .sum("total_cost")
    .sort(desc("sum(total_cost)"))
    .show(5)
)

+----------+--------------------+------------------+
|CustomerId|              window|   sum(total_cost)|
+----------+--------------------+------------------+
|   17450.0|[2011-09-19 20:00...|          71601.44|
|      null|[2011-11-13 19:00...|          55316.08|
|      null|[2011-11-06 19:00...|          42939.17|
|      null|[2011-03-28 20:00...| 33521.39999999998|
|      null|[2011-12-07 19:00...|31975.590000000007|
+----------+--------------------+------------------+
only showing top 5 rows



In [22]:
# Streaming counterpart of the batch read above: same CSV files and the schema
# captured from the static DataFrame, with maxFilesPerTrigger set to 1.
streamingDataFrame = (
    spark.readStream
    .schema(staticSchema)
    .option("maxFilesPerTrigger", 1)
    .format("csv")
    .option("header", "true")
    .load(r"C:\Users\pohch\Desktop\Spark\spark-2.4.4-bin-hadoop2.7\mnt\defg\retail-data\by-day\*.csv")
)

In [25]:
# Streaming version of the per-customer spend aggregation.
# NOTE: despite the "PerHour" in the variable name, the window used here is "1 day".
purchaseByCustomerPerHour1 = (
    streamingDataFrame
    .selectExpr(
        "CustomerId",
        "(UnitPrice * Quantity) as total_cost",
        "InvoiceDate",
    )
    .groupBy(col("CustomerId"), window(col("InvoiceDate"), "1 day"))
    .sum("total_cost")
)

In [27]:
# Start the streaming aggregation, writing its complete result to an in-memory
# table named "customer_purchases1". Left as a bare expression so the cell
# displays the returned StreamingQuery.
(
    purchaseByCustomerPerHour1.writeStream
    .format("memory")
    .queryName("customer_purchases1")
    .outputMode("complete")
    .start()
)

<pyspark.sql.streaming.StreamingQuery at 0x1b8e9760f98>

In [35]:
# Show the five largest totals currently present in the in-memory table written
# by the streaming query started above.
(
    spark.sql("""
  SELECT *
  FROM customer_purchases1
  ORDER BY `sum(total_cost)` DESC
  """)
    .show(5)
)

+----------+--------------------+------------------+
|CustomerId|              window|   sum(total_cost)|
+----------+--------------------+------------------+
|      null|[2011-03-28 20:00...| 33521.39999999998|
|      null|[2010-12-20 19:00...|31347.479999999938|
|   18102.0|[2010-12-06 19:00...|          25920.37|
|      null|[2010-12-09 19:00...|25399.560000000012|
|      null|[2010-12-16 19:00...|25375.189999999766|
+----------+--------------------+------------------+
only showing top 5 rows



In [36]:
from pyspark.sql.functions import date_format, col
# Prepare the batch data for modelling: fill nulls with 0, add the full weekday
# name of InvoiceDate as "day_of_week", and coalesce to 5 partitions.
preppedDataFrame = (
    staticDataFrame
    .na.fill(0)
    .withColumn("day_of_week", date_format(col("InvoiceDate"), "EEEE"))
    .coalesce(5)
)

In [None]:
# Date-based split at 2011-07-01: rows before the cutoff are used for training,
# rows on or after it for testing.
trainDataFrame = preppedDataFrame.where("InvoiceDate < '2011-07-01'")
testDataFrame = preppedDataFrame.where("InvoiceDate >= '2011-07-01'")

from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.clustering import KMeans

# Stage 1: map each weekday name to a numeric index.
indexer = StringIndexer(
    inputCol="day_of_week",
    outputCol="day_of_week_index",
)

# Stage 2: one-hot encode the weekday index.
encoder = OneHotEncoder(
    inputCol="day_of_week_index",
    outputCol="day_of_week_encoded",
)

# Stage 3: combine price, quantity and the encoded weekday into a single
# "features" vector column.
vectorAssembler = VectorAssembler(
    inputCols=["UnitPrice", "Quantity", "day_of_week_encoded"],
    outputCol="features",
)

# K-means estimator with 20 clusters and a fixed seed.
# The seed is written as a plain int: the Python 2 long-literal form `1L` is a
# SyntaxError on Python 3 and would prevent this whole cell from running.
Kmeansmodelr = KMeans().setK(20).setSeed(1)

In [None]:
from pyspark.ml import Pipeline
# Chain the feature-preparation stages and the K-means estimator into one pipeline,
# in the order they must run.
Kmeansppl = Pipeline(
    stages=[
        indexer,
        encoder,
        vectorAssembler,
        Kmeansmodelr,
    ]
)

In [None]:
# Fit the whole pipeline on the training rows.
Kmeansfitppl = Kmeansppl.fit(trainDataFrame)
# Run the same training rows through the fitted pipeline.
Kmeanstrainmdl = Kmeansfitppl.transform(trainDataFrame)

In [44]:
# Apply the pipeline fitted on the training rows to the test rows
# (InvoiceDate >= '2011-07-01').
Kmeanstestmdl = Kmeansfitppl.transform(testDataFrame)

In [45]:
# Print the first 20 transformed test rows, including the intermediate feature
# columns and the "prediction" column.
Kmeanstestmdl.show(20)

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+-----------+-----------------+-------------------+--------------------+----------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|day_of_week|day_of_week_index|day_of_week_encoded|            features|prediction|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+-----------+-----------------+-------------------+--------------------+----------+
|   580538|    23084|  RABBIT NIGHT LIGHT|      48|2011-12-05 08:38:00|     1.79|   14075.0|United Kingdom|     Monday|              2.0|      (5,[2],[1.0])|(7,[0,1,4],[1.79,...|         7|
|   580538|    23077| DOUGHNUT LIP GLOSS |      20|2011-12-05 08:38:00|     1.25|   14075.0|United Kingdom|     Monday|              2.0|      (5,[2],[1.0])|(7,[0,1,4],[1.25,...|         0|
|   580538|    22906|12 MESSAGE CARDS ...|      24

In [None]:
# Shut Spark down at the end of the notebook.
# No `sc` variable is created anywhere in the cells above (only `spark`), so
# `sc.stop()` fails with NameError on a fresh kernel. Stopping the SparkSession
# also stops its underlying SparkContext.
spark.stop()