# Chapter 29: Unsupervised learning

In [1]:
from pyspark.sql import SparkSession

# Session used by every cell in this chapter: "local" master, app name "Chapter29".
spark = (
    SparkSession.builder
    .master("local")
    .appName("Chapter29")
    .getOrCreate()
)

In [2]:
from pyspark.ml.feature import VectorAssembler

# Combine the two numeric columns into the single 'features' vector column
# that the clustering estimators in the cells below are fitted on.
va = VectorAssembler(inputCols=['UnitPrice', 'Quantity'], outputCol='features')

# Read the per-day retail CSVs (header row, inferred schema), take 50 rows,
# collapse them into one partition, and only then filter out rows that have
# no description -- the limit is applied before the filter.
retail_rows = (
    spark.read.format("csv")
    .option("header", "True")
    .option("inferSchema", "True")
    .load("../data/retail-data/by-day/*.csv")
    .limit(50)
    .coalesce(1)
    .where("Description IS NOT NULL")
)

sales = va.transform(retail_rows)
sales.cache()  # the same DataFrame is fitted and scored several times below
sales.show(5)

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+-----------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|   features|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+-----------+
|   580538|    23084|  RABBIT NIGHT LIGHT|      48|2011-12-05 08:38:00|     1.79|   14075.0|United Kingdom|[1.79,48.0]|
|   580538|    23077| DOUGHNUT LIP GLOSS |      20|2011-12-05 08:38:00|     1.25|   14075.0|United Kingdom|[1.25,20.0]|
|   580538|    22906|12 MESSAGE CARDS ...|      24|2011-12-05 08:38:00|     1.65|   14075.0|United Kingdom|[1.65,24.0]|
|   580538|    21914|BLUE HARMONICA IN...|      24|2011-12-05 08:38:00|     1.25|   14075.0|United Kingdom|[1.25,24.0]|
|   580538|    22467|   GUMBALL COAT RACK|       6|2011-12-05 08:38:00|     2.55|   14075.0|United Kingdom| [2.55,6.0]|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+-----------+

## K-means

In [3]:
from pyspark.ml.clustering import KMeans

# Configure k-means for five clusters; every other parameter keeps its default.
km = KMeans().setK(5)

# Show the full parameter list (defaults and current values) before fitting.
km_param_doc = km.explainParams()
print(km_param_doc)

# Fit on the assembled 'features' column of the sales sample.
kmModel = km.fit(sales)

featuresCol: features column name. (default: features)
initMode: The initialization algorithm. This can be either "random" to choose random points as initial cluster centers, or "k-means||" to use a parallel variant of k-means++ (default: k-means||)
initSteps: The number of steps for k-means|| initialization mode. Must be > 0. (default: 2)
k: The number of clusters to create. Must be > 1. (default: 2, current: 5)
maxIter: max number of iterations (>= 0). (default: 20)
predictionCol: prediction column name. (default: prediction)
seed: random seed. (default: 2896968671297020976)
tol: the convergence tolerance for iterative algorithms (>= 0). (default: 0.0001)


In [4]:
# Report the fitted k-means model: number of points per cluster, the model
# cost evaluated on `sales`, and the learned cluster centers.
km_sizes = kmModel.summary.clusterSizes
print("Cluster sizes = {}".format(km_sizes))

# NOTE(review): computeCost is deprecated in newer Spark releases -- confirm
# the target Spark version before relying on it.
km_cost = kmModel.computeCost(sales)
print("KMeans model cost = {}".format(km_cost))

km_centers = kmModel.clusterCenters()
print("Centers = {}".format(km_centers))

Cluster sizes = [10, 20, 5, 12, 3]
KMeans model cost = 273.6364283333328
Centers = [array([ 0.956, 23.2  ]), array([4.5965, 4.55  ]), array([13.04,  2.4 ]), array([ 1.1       , 11.33333333]), array([ 1.16333333, 44.        ])]


## Bisecting K-means

In [5]:
from pyspark.ml.clustering import BisectingKMeans

# Configure bisecting k-means for five leaf clusters and at most five iterations.
bkm = BisectingKMeans().setK(5).setMaxIter(5)

# Show the full parameter list (defaults and current values) before fitting.
bkm_param_doc = bkm.explainParams()
print(bkm_param_doc)

# Fit on the assembled 'features' column of the sales sample.
bkmModel = bkm.fit(sales)

featuresCol: features column name. (default: features)
k: The desired number of leaf clusters. Must be > 1. (default: 4, current: 5)
maxIter: max number of iterations (>= 0). (default: 20, current: 5)
minDivisibleClusterSize: The minimum number of points (if >= 1.0) or the minimum proportion of points (if < 1.0) of a divisible cluster. (default: 1.0)
predictionCol: prediction column name. (default: prediction)
seed: random seed. (default: 5498043195691022326)


In [6]:
# Report the fitted bisecting k-means model: number of points per cluster,
# the model cost evaluated on `sales`, and the learned cluster centers.
bkm_sizes = bkmModel.summary.clusterSizes
print("Cluster sizes = {}".format(bkm_sizes))

# The printed label is kept exactly as in the k-means cell so the recorded
# output still matches.
# NOTE(review): computeCost is deprecated in newer Spark releases -- confirm
# the target Spark version before relying on it.
bkm_cost = bkmModel.computeCost(sales)
print("KMeans model cost = {}".format(bkm_cost))

bkm_centers = bkmModel.clusterCenters()
print("Centers = {}".format(bkm_centers))

Cluster sizes = [17, 10, 10, 10, 3]
KMeans model cost = 298.17485078431366
Centers = [array([3.55176471, 5.41176471]), array([ 0.93, 12.  ]), array([10.065,  2.7  ]), array([ 0.956, 23.2  ]), array([ 1.16333333, 44.        ])]


## Gaussian Mixture Models

In [7]:
# Import the estimator class `GaussianMixture`, not `GaussianMixtureModel`:
# the latter is the fitted model that `fit` returns.
from pyspark.ml.clustering import GaussianMixture

# Configure a mixture of five Gaussians; every other parameter keeps its default.
gmm = GaussianMixture().setK(5)

# Show the full parameter list (defaults and current values) before fitting.
gmm_param_doc = gmm.explainParams()
print(gmm_param_doc)

# Fit on the assembled 'features' column of the sales sample.
gmmModel = gmm.fit(sales)

featuresCol: features column name. (default: features)
k: Number of independent Gaussians in the mixture model. Must be > 1. (default: 2, current: 5)
maxIter: max number of iterations (>= 0). (default: 100)
predictionCol: prediction column name. (default: prediction)
probabilityCol: Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! These probabilities should be treated as confidences, not precise probabilities. (default: probability)
seed: random seed. (default: -2238121618897628805)
tol: the convergence tolerance for iterative algorithms (>= 0). (default: 0.01)


In [8]:
# Mixture weights, one per Gaussian component.
gmm_weights = gmmModel.weights
print("Model weights = {}".format(gmm_weights))

# Mean and covariance of each component, shown as a DataFrame.
gmmModel.gaussiansDF.show()

# Training summary: predicted cluster for the first rows, points per cluster,
# and the per-component membership probabilities of the first ten rows.
gmm_summary = gmmModel.summary
gmm_summary.cluster.show(10)

gmm_sizes = gmm_summary.clusterSizes
print("Cluster sizes = {}".format(gmm_sizes))

gmm_first_probabilities = gmm_summary.probability.take(10)
print(gmm_first_probabilities)

Model weights = [0.3764628488521458, 0.06003637026340386, 0.1999636297366328, 0.1999999999095532, 0.16353715123826437]
+--------------------+--------------------+
|                mean|                 cov|
+--------------------+--------------------+
|[3.73111693575823...|1.411795180927723...|
|[1.16290775444429...|0.196554200997605...|
|[0.95609006359898...|0.242797910224151...|
|[0.93000000003671...|0.324399999992557...|
|[11.1043458644732...|6.691868107826227...|
+--------------------+--------------------+

+----------+
|prediction|
+----------+
|         1|
|         2|
|         2|
|         2|
|         0|
|         1|
|         0|
|         2|
|         0|
|         0|
+----------+
only showing top 10 rows

Cluster sizes = [19, 3, 10, 10, 8]
[Row(probability=DenseVector([0.0, 1.0, 0.0, 0.0, 0.0])), Row(probability=DenseVector([0.0, 0.0, 1.0, 0.0, 0.0])), Row(probability=DenseVector([0.0, 0.0, 1.0, 0.0, 0.0])), Row(probability=DenseVector([0.0, 0.0, 1.0, 0.0, 0.0])), Row(probabil