## **Machine Learning and Advanced Analytics**

In [2]:
// Batch-read all per-day retail CSV files into a single DataFrame. The first line of each
// file is treated as a header row and column types are inferred from the data.
val staticDataFrame = spark.read.
  format("csv").
  options(Map("header" -> "true", "inferSchema" -> "true")).
  load("../data/retail-data/by-day/*.csv")

// Capture the inferred schema as its own value.
val staticSchema = staticDataFrame.schema

// Expose the data to Spark SQL under the view name "retail_data".
staticDataFrame.createOrReplaceTempView("retail_data")

staticDataFrame: org.apache.spark.sql.DataFrame = [InvoiceNo: string, StockCode: string ... 6 more fields]
staticSchema: org.apache.spark.sql.types.StructType = StructType(StructField(InvoiceNo,StringType,true), StructField(StockCode,StringType,true), StructField(Description,StringType,true), StructField(Quantity,IntegerType,true), StructField(InvoiceDate,TimestampType,true), StructField(UnitPrice,DoubleType,true), StructField(CustomerID,DoubleType,true), StructField(Country,StringType,true))


In [3]:
staticDataFrame.isStreaming // returns false: this DataFrame was built with the batch reader (spark.read)

res0: Boolean = false


In [4]:
// Print the first 5 rows as a quick look at the loaded columns and values.
staticDataFrame.show(5)

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|   580538|    23084|  RABBIT NIGHT LIGHT|      48|2011-12-05 08:38:00|     1.79|   14075.0|United Kingdom|
|   580538|    23077| DOUGHNUT LIP GLOSS |      20|2011-12-05 08:38:00|     1.25|   14075.0|United Kingdom|
|   580538|    22906|12 MESSAGE CARDS ...|      24|2011-12-05 08:38:00|     1.65|   14075.0|United Kingdom|
|   580538|    21914|BLUE HARMONICA IN...|      24|2011-12-05 08:38:00|     1.25|   14075.0|United Kingdom|
|   580538|    22467|   GUMBALL COAT RACK|       6|2011-12-05 08:38:00|     2.55|   14075.0|United Kingdom|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
only showing top 5 rows



In [5]:
import org.apache.spark.sql.functions.date_format

// Prepare the raw data for modelling:
//   1. na.fill(0)  - replace null values with 0 (which columns are affected is defined by
//                    Spark's na.fill; presumably the numeric ones - confirm)
//   2. day_of_week - InvoiceDate formatted with the "EEEE" date pattern
//   3. coalesce(5) - coalesce the result into 5 partitions
val preppedDataFrame =
  staticDataFrame.na.fill(0).
    withColumn("day_of_week", date_format($"InvoiceDate", "EEEE")).
    coalesce(5)

import org.apache.spark.sql.functions.date_format
preppedDataFrame: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [InvoiceNo: string, StockCode: string ... 7 more fields]


In [9]:
// Time-based split on InvoiceDate: rows before 2011-07-01 form the training set,
// rows on or after that date are held out as the test set.
val trainDataFrame = preppedDataFrame.filter("InvoiceDate < '2011-07-01'")
val testDataFrame = preppedDataFrame.filter("InvoiceDate >= '2011-07-01'")

trainDataFrame: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [InvoiceNo: string, StockCode: string ... 7 more fields]
testDataFrame: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [InvoiceNo: string, StockCode: string ... 7 more fields]


In [10]:
// Row count of the training split (InvoiceDate before 2011-07-01).
trainDataFrame.count()

res4: Long = 245903


In [11]:
// Row count of the test split (InvoiceDate on or after 2011-07-01).
testDataFrame.count()

res5: Long = 296006


In [12]:
import org.apache.spark.ml.feature.StringIndexer
// Pipeline stage that turns the day_of_week string column into a numeric index column,
// day_of_week_index.
val indexer = new StringIndexer().
  setInputCol("day_of_week").
  setOutputCol("day_of_week_index")

import org.apache.spark.ml.feature.StringIndexer
indexer: org.apache.spark.ml.feature.StringIndexer = strIdx_de4beb49ddc5


In [13]:
import org.apache.spark.ml.feature.OneHotEncoder
// Pipeline stage that one-hot encodes day_of_week_index into day_of_week_encoded.
val encoder = new OneHotEncoder().
  setInputCol("day_of_week_index").
  setOutputCol("day_of_week_encoded")

import org.apache.spark.ml.feature.OneHotEncoder
encoder: org.apache.spark.ml.feature.OneHotEncoder = oneHot_ea2d7b8458bb


In [14]:
import org.apache.spark.ml.feature.VectorAssembler
// Pipeline stage that packs the model inputs (price, quantity and the encoded day of week)
// into a single vector column named "features".
val vectorAssembler = {
  val featureInputs = Array("UnitPrice", "Quantity", "day_of_week_encoded")
  new VectorAssembler().setInputCols(featureInputs).setOutputCol("features")
}

import org.apache.spark.ml.feature.VectorAssembler
vectorAssembler: org.apache.spark.ml.feature.VectorAssembler = vecAssembler_36ba342772da


## **PIPELINES**

In [24]:
import org.apache.spark.ml.Pipeline
// Chain the feature stages in execution order: index the day name, one-hot encode the
// index, then assemble the feature vector.
val transformationPipeline =
  new Pipeline().setStages(Array(indexer, encoder, vectorAssembler))

import org.apache.spark.ml.Pipeline
transformationPipeline: org.apache.spark.ml.Pipeline = pipeline_d1ead3f31d2f


In [25]:
// Fit the pipeline's stages (indexer, encoder, vector assembler) on the training split only.
val fittedPipeline = transformationPipeline.fit(trainDataFrame)

fittedPipeline: org.apache.spark.ml.PipelineModel = pipeline_d1ead3f31d2f


In [26]:
// Apply the fitted pipeline to the training rows; per the stage definitions this adds the
// day_of_week_index, day_of_week_encoded and features columns.
val transformedTraining = fittedPipeline.transform(trainDataFrame)

transformedTraining: org.apache.spark.sql.DataFrame = [InvoiceNo: string, StockCode: string ... 10 more fields]


In [27]:
// Mark the transformed training set for caching; it is reused below for both model
// fitting and cost computation.
transformedTraining.cache()

res8: transformedTraining.type = [InvoiceNo: string, StockCode: string ... 10 more fields]


In [28]:
import org.apache.spark.ml.clustering.KMeans
// K-means estimator configured for 20 clusters, with a fixed seed of 1.
val kmeans = new KMeans().setK(20).setSeed(1L)

import org.apache.spark.ml.clustering.KMeans
kmeans: org.apache.spark.ml.clustering.KMeans = kmeans_887b4ae49487


In [29]:
// Train the k-means model on the transformed training data. No features column is set on
// the estimator, so it uses its default - presumably "features", matching the assembler's
// output column (confirm).
val kmModel = kmeans.fit(transformedTraining)

kmModel: org.apache.spark.ml.clustering.KMeansModel = kmeans_887b4ae49487


In [30]:
// Clustering cost of the fitted model evaluated on the training data.
// NOTE(review): computeCost is believed to be deprecated since Spark 2.4 (in favour of
// ClusteringEvaluator / the model's training summary) and unavailable in Spark 3.x -
// confirm against the Spark version this notebook targets before upgrading.
kmModel.computeCost(transformedTraining)

res9: Double = 1.0350348110517502E8


In [31]:
// Transform the held-out split with the pipeline that was fitted on the training split.
val transformedTest = fittedPipeline.transform(testDataFrame)

transformedTest: org.apache.spark.sql.DataFrame = [InvoiceNo: string, StockCode: string ... 10 more fields]


## **MLlib**

In [36]:
// Read the simple-ml JSON dataset and display it ordered by the value2 column.
// NOTE(review): the recorded run failed with "Unable to infer schema for JSON" - possibly the
// relative path did not resolve to any readable JSON files; verify the path or supply an
// explicit schema.
var df = spark.read.json("../data/simple-ml") 
df.orderBy("value2").show()

org.apache.spark.sql.AnalysisException:  Unable to infer schema for JSON. It must be specified manually.;