# Intro to ML with Spark

## Setup

#### Import Libraries

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

#### Configure Spark

In [2]:
# Build (or reuse, via getOrCreate) a SparkSession named "ML Example"
# that runs against the local[*] master.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("ML Example")
    .getOrCreate()
)

# Runtime settings: log at WARN level and use 5 shuffle partitions for
# Spark SQL shuffles.
spark.sparkContext.setLogLevel('WARN')
spark.conf.set("spark.sql.shuffle.partitions", "5")

In [3]:
# Bare expression: the notebook renders the SparkSession object as this cell's output.
spark

## Load Data

In [4]:
# Load every CSV matching the by-day glob into one DataFrame, reading the
# header row for column names and letting Spark infer the column types.
staticDataFrame = (
    spark.read
    .format('csv')
    .options(header="true", inferSchema="true")
    .load("../../data/retail-data/by-day/*.csv")
)

In [5]:
# Print each column's name, inferred type and nullability for the loaded data.
staticDataFrame.printSchema()

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: string (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: double (nullable = true)
 |-- Country: string (nullable = true)



## Process Data

#### Fill NAs and Format Day of Week

In [6]:
# Prepare the raw frame for modelling:
#   1. fill nulls with 0 (a numeric fill value only applies to numeric columns),
#   2. add `day_of_week`, formatted from InvoiceDate with the "EEEE" pattern,
#   3. coalesce the result down to 5 partitions.
prepped_data_frame = (
    staticDataFrame
    .fillna(0)
    .withColumn("day_of_week", F.date_format("InvoiceDate", "EEEE"))
    .coalesce(5)
)

#### Split Data

In [7]:
# Chronological train/test split around a single cutoff date.
# The cutoff lives in one constant so the two filters can never drift apart
# and leave a gap or an overlap between the splits.
# NOTE(review): InvoiceDate is a string column (see the printed schema), so
# this is a string comparison; it assumes the values are ISO-formatted
# (yyyy-MM-dd ...) so that string order matches date order — TODO confirm.
SPLIT_DATE = "2011-07-01"

# Rows strictly before the cutoff are used for fitting ...
train_data = prepped_data_frame.where(f"InvoiceDate < '{SPLIT_DATE}'")
# ... and rows on or after the cutoff are held out.
test_data = prepped_data_frame.where(f"InvoiceDate >= '{SPLIT_DATE}'")

In [8]:
# Report the row count of each split; each count() call runs against its own DataFrame.
print(f"Train Data: {train_data.count()}, Test Data: {test_data.count()}")

Train Data: 245903, Test Data: 296006


## ML Prep

In [9]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline

#### Set up Stages and Pipeline

In [10]:
# Stage 1: map each day_of_week string to a numeric index.
indexer = StringIndexer(
    inputCol="day_of_week",
    outputCol="day_of_week_index",
)

# Stage 2: one-hot encode the numeric day index.
encoder = OneHotEncoder(
    inputCol="day_of_week_index",
    outputCol="day_of_week_encoded",
)

# Stage 3: combine the two numeric columns and the encoded day into a single
# `features` vector column.
vector_assembler = VectorAssembler(
    inputCols=["UnitPrice", "Quantity", "day_of_week_encoded"],
    outputCol="features",
)

In [12]:
# Chain the three stages in order: index -> one-hot encode -> assemble features.
transformation_pipeline = Pipeline(
    stages=[indexer, encoder, vector_assembler],
)

#### Fit Pipeline

In [13]:
# Fit the pipeline on the training split only; the same fitted pipeline is
# reused later in the notebook to transform the test split.
fitted_pipe = transformation_pipeline.fit(train_data)

#### Transform Data

In [14]:
# Apply the fitted pipeline to the training split, adding the index, encoded
# and `features` columns.
transformed_training = fitted_pipe.transform(train_data)

In [15]:
# Mark the transformed training data for caching (presumably so the model fit
# below does not recompute the pipeline on every pass — confirm). cache()
# returns the DataFrame, which the notebook shows as the cell output.
transformed_training.cache()

DataFrame[InvoiceNo: string, StockCode: string, Description: string, Quantity: int, InvoiceDate: string, UnitPrice: double, CustomerID: double, Country: string, day_of_week: string, day_of_week_index: double, day_of_week_encoded: vector, features: vector]

## Training

In [16]:
from pyspark.ml.clustering import KMeans

In [17]:
# K-means estimator with 20 clusters and a fixed seed.
kmeans = KMeans(k=20, seed=42)

In [18]:
# Fit the K-means model on the transformed training data.
kModel = kmeans.fit(transformed_training)

In [19]:
# Transform the test split with the pipeline that was fitted on the training split.
transformed_test = fitted_pipe.transform(test_data)


In [20]:
# Assign each test row to a cluster and show how many rows fall in each one.
(
    kModel
    .transform(transformed_test)
    .groupBy('prediction')
    .count()
    .show()
)

+----------+------+
|prediction| count|
+----------+------+
|         0|259427|
|        18|  4160|
|        10|     1|
|         7|    16|
|        15|     2|
|         4|     5|
|        13| 31488|
|        11|     2|
|         8|    29|
|        14|    97|
|         1|     1|
|         6|   582|
|         9|   130|
|        17|    41|
|        12|    15|
|        16|     8|
|        19|     1|
|         2|     1|
+----------+------+

