# structured streaming


work in progress

In [None]:
from pyspark.sql import SparkSession
# Relative path to the root of the sample data used throughout the notebook.
datapath = "../../data/sdg/"

# Reuse the active Spark session if one exists, otherwise start a new one,
# and keep a handle on its underlying SparkContext.
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

In [None]:
# Load every by-day retail CSV into one static DataFrame. The first row of each
# file is treated as the header and column types are inferred from the data.
staticDataFrame = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load(datapath + "/retail-data/by-day/*.csv")
)

# Keep the inferred schema; the streaming reader further down is given it explicitly.
staticSchema = staticDataFrame.schema

# Make the data queryable from Spark SQL under the name "retail_data".
staticDataFrame.createOrReplaceTempView("retail_data")

In [None]:
from pyspark.sql.functions import window, column, desc, col
# Total spend per customer within one-day windows of InvoiceDate,
# computed on the static data; preview the first five rows.
daily_totals = (
    staticDataFrame
    .selectExpr(
        "CustomerId",
        "(UnitPrice * Quantity) as total_cost",
        "InvoiceDate",
    )
    .groupBy(col("CustomerId"), window(col("InvoiceDate"), "1 day"))
    .sum("total_cost")
)
daily_totals.show(5)

In [None]:
# Streaming counterpart of the static read: same files, same schema.
# The schema is passed in explicitly (taken from the static DataFrame), and
# maxFilesPerTrigger=1 limits each trigger to a single new file.
streamingDataFrame = (
    spark.readStream
    .schema(staticSchema)
    .option("maxFilesPerTrigger", 1)
    .format("csv")
    .option("header", "true")
    .load(datapath + "/retail-data/by-day/*.csv")
)

In [None]:
# Confirm that this DataFrame is backed by a streaming source.
streamingDataFrame.isStreaming

In [None]:
# Same aggregation as on the static data, expressed on the stream.
# Despite the "PerHour" in the name, the window is one day wide, so this
# yields total spend per customer per day. Nothing runs until a sink is started.
purchaseByCustomerPerHour = (
    streamingDataFrame
    .selectExpr(
        "CustomerId",
        "(UnitPrice * Quantity) as total_cost",
        "InvoiceDate",
    )
    .groupBy(col("CustomerId"), window(col("InvoiceDate"), "1 day"))
    .sum("total_cost")
)


**NOTE:**<br>You shouldn’t use the in-memory sink (used in the next cell) or the console sink in production, but they do make for a
convenient demonstration of Structured Streaming’s power.

In [None]:
# Start the streaming query. Results go to an in-memory table named
# "customer_purchases"; "complete" mode rewrites the full aggregate on every trigger.
(
    purchaseByCustomerPerHour.writeStream
    .format("memory")
    .queryName("customer_purchases")
    .outputMode("complete")
    .start()
)

In [None]:
# Query the in-memory table populated by the streaming query and show the
# five rows with the largest summed cost.
top_purchases_sql = """
SELECT *
FROM customer_purchases
ORDER BY `sum(total_cost)` DESC
"""
spark.sql(top_purchases_sql).show(5)

# Using Machine Learning (MLlib)

## example: using K-Means clustering

In [None]:
from pyspark.sql.functions import date_format, col
# Prepare the static data for ML: replace nulls with 0, add the weekday name
# of each invoice ("EEEE" is the full day name), and reduce to 5 partitions.
preppedDataFrame = (
    staticDataFrame
    .na.fill(0)
    .withColumn("day_of_week", date_format(col("InvoiceDate"), "EEEE"))
    .coalesce(5)
)

In [None]:
# Split by date: invoices before 2011-07-01 are used for training,
# invoices on or after that date are held out for testing.
trainDataFrame = preppedDataFrame.where("InvoiceDate < '2011-07-01'")
testDataFrame = preppedDataFrame.where("InvoiceDate >= '2011-07-01'")

In [None]:
# Row counts of the two splits, shown as a (train, test) pair.
train_rows = trainDataFrame.count()
test_rows = testDataFrame.count()
train_rows, test_rows

### Convert types to numeric

String indexer and One Hot Encoding



In [None]:
from pyspark.ml.feature import StringIndexer
# Map each weekday name in "day_of_week" to a numeric index.
indexer = StringIndexer(
    inputCol="day_of_week",
    outputCol="day_of_week_index",
)

In [None]:
from pyspark.ml.feature import OneHotEncoder
# One-hot encode the weekday index so it is not treated as an ordinal number.
encoder = OneHotEncoder(
    inputCol="day_of_week_index",
    outputCol="day_of_week_encoded",
)

In [None]:
from pyspark.ml.feature import VectorAssembler
# Combine the numeric columns and the encoded weekday into the single
# "features" vector column used for clustering.
vectorAssembler = VectorAssembler(
    inputCols=["UnitPrice", "Quantity", "day_of_week_encoded"],
    outputCol="features",
)

Create a pipeline so that any new data goes through exactly the same transformations.

In a real scenario we will need to scale the features.

In [None]:
from pyspark.ml import Pipeline
# Chain the three feature stages in the order they must run:
# index the weekday, one-hot encode it, then assemble the feature vector.
transformationPipeline = Pipeline(
    stages=[indexer, encoder, vectorAssembler],
)

In [None]:
%%time
# Fit the pipeline's stages on the training split; the %%time magic above
# reports how long the fit takes.
fittedPipeline = transformationPipeline.fit(trainDataFrame)

In [None]:
# Echo the fitted pipeline model.
fittedPipeline

In [None]:
# Apply the fitted feature pipeline to the training split.
transformedTraining = fittedPipeline.transform(trainDataFrame)

In [None]:
# Mark the transformed training data for caching so later passes over it can
# reuse the stored result instead of recomputing the pipeline.
transformedTraining.cache()

In [None]:
from pyspark.ml.clustering import KMeans
# K-Means estimator with 20 clusters; the fixed seed makes runs repeatable.
kmeans = KMeans(k=20, seed=1)

In [None]:
# Train the K-Means model on the transformed training data.
kmModel = kmeans.fit(transformedTraining)

In [None]:
# Echo the predictions DataFrame held by the model's training summary.
kmModel.summary.predictions

In [None]:
# NOTE: the [SDG] book is written with spark 2.x. 
# In spark 3.0 the cost computation has changed.
from pyspark.ml.evaluation import ClusteringEvaluator

# Evaluate clustering by computing Silhouette score.
# The evaluator has to read the column KMeans writes its cluster ids to, so
# take that name from the estimator rather than hard-coding a column name
# that none of the DataFrames in this notebook contains.
evaluator = ClusteringEvaluator()
evaluator.setPredictionCol(kmeans.getPredictionCol())

# Run the held-out split through the same fitted feature pipeline as the training data.
transformedTest = fittedPipeline.transform(testDataFrame)

In [None]:
# Echo the test DataFrame (before the feature pipeline is applied).
testDataFrame

In [None]:
# Echo the training DataFrame after the feature pipeline.
transformedTraining

In [None]:
# Echo the test DataFrame after the feature pipeline.
transformedTest

In [None]:
# The silhouette score needs both the feature vectors and the cluster
# assignments, so score the model's output on the held-out split rather than
# a bare "features" projection (which has no prediction column at all).
predictions = kmModel.transform(transformedTest)

# Override the evaluator's prediction column for this call so it always
# matches the column the KMeans estimator was configured to write.
silhouette = evaluator.evaluate(
    predictions,
    {evaluator.predictionCol: kmeans.getPredictionCol()},
)
print(f"Silhouette with squared euclidean distance = {silhouette}")

# Check yourself
* disable 'transformedTraining.cache()' and repeat the run. How long is the run now?