# Chapter 6

## Tuning hyperparameters of clustering algorithms

In [7]:
from pyspark.sql import SparkSession 

spark = SparkSession.builder \
    .master('local[*]') \
    .appName("Intro") \
    .getOrCreate()

In [8]:
from pyspark.sql.types import StructField, StructType, StringType, DoubleType

custom_schema = StructType([
    StructField("Make", StringType(), True),
    StructField("Model", StringType(), True),
    StructField("Vehicle Class", StringType(), True),
    StructField("Cylinders", DoubleType(), True),
    StructField("Transmission", StringType(), True),
    StructField("Fuel Type", StringType(), True),
    StructField("Fuel Consumption City (L/100 km)", DoubleType(), True),
    StructField("Fuel Consumption Hwy (L/100 km)", DoubleType(), True),
    StructField("Fuel Consumption Comb (L/100 km)", DoubleType(), True),
    StructField("Fuel Consumption Comb (mpg)", DoubleType(), True),
    StructField("CO2", DoubleType(), True)])


In [None]:
co2_data = spark.read.format("csv")\
    .schema(custom_schema) \
    .option("header", True) \
    .load("./static/CO2_Emissions_Canada.csv")

In [11]:
co2_data.take(2)

25/10/11 19:12:01 WARN CSVHeaderChecker: Number of column in CSV header is not equal to number of fields in the schema:
 Header length: 12, schema size: 11
CSV file: file:///Users/gafnts/Documents/Github/ml-with-spark/ml-with-spark/static/CO2_Emissions_Canada.csv


[Row(Make='ACURA', Model='ILX', Vehicle Class='COMPACT', Cylinders=2.0, Transmission='4', Fuel Type='AS5', Fuel Consumption City (L/100 km)=None, Fuel Consumption Hwy (L/100 km)=9.9, Fuel Consumption Comb (L/100 km)=6.7, Fuel Consumption Comb (mpg)=8.5, CO2=33.0),
 Row(Make='ACURA', Model='ILX', Vehicle Class='COMPACT', Cylinders=2.4, Transmission='4', Fuel Type='M6', Fuel Consumption City (L/100 km)=None, Fuel Consumption Hwy (L/100 km)=11.2, Fuel Consumption Comb (L/100 km)=7.7, Fuel Consumption Comb (mpg)=9.6, CO2=29.0)]

In [12]:
cols_only_continues_values = {'Fuel Consumption City (L/100 km)':0}
#                               "Fuel Consumption Hwy (L/100 km)",
#         "Fuel Consumption Comb (L/100 km)"}

In [13]:
co2_data = co2_data.fillna(0.0)

In [14]:
co2_data.printSchema()

root
 |-- Make: string (nullable = true)
 |-- Model: string (nullable = true)
 |-- Vehicle Class: string (nullable = true)
 |-- Cylinders: double (nullable = false)
 |-- Transmission: string (nullable = true)
 |-- Fuel Type: string (nullable = true)
 |-- Fuel Consumption City (L/100 km): double (nullable = false)
 |-- Fuel Consumption Hwy (L/100 km): double (nullable = false)
 |-- Fuel Consumption Comb (L/100 km): double (nullable = false)
 |-- Fuel Consumption Comb (mpg): double (nullable = false)
 |-- CO2: double (nullable = false)



In [15]:
co2_data.take(2)

25/10/11 19:13:53 WARN CSVHeaderChecker: Number of column in CSV header is not equal to number of fields in the schema:
 Header length: 12, schema size: 11
CSV file: file:///Users/gafnts/Documents/Github/ml-with-spark/ml-with-spark/static/CO2_Emissions_Canada.csv


[Row(Make='ACURA', Model='ILX', Vehicle Class='COMPACT', Cylinders=2.0, Transmission='4', Fuel Type='AS5', Fuel Consumption City (L/100 km)=0.0, Fuel Consumption Hwy (L/100 km)=9.9, Fuel Consumption Comb (L/100 km)=6.7, Fuel Consumption Comb (mpg)=8.5, CO2=33.0),
 Row(Make='ACURA', Model='ILX', Vehicle Class='COMPACT', Cylinders=2.4, Transmission='4', Fuel Type='M6', Fuel Consumption City (L/100 km)=0.0, Fuel Consumption Hwy (L/100 km)=11.2, Fuel Consumption Comb (L/100 km)=7.7, Fuel Consumption Comb (mpg)=9.6, CO2=29.0)]

# Prep the data for regression

turn the feature columns into one indexed column:

In [None]:
from pyspark.ml.feature import FeatureHasher

cols = ["Make", "Model", "Vehicle Class","Cylinders","Transmission","Fuel Type",
        "Fuel Consumption City (L/100 km)", "Fuel Consumption Hwy (L/100 km)",
        "Fuel Consumption Comb (L/100 km)","Fuel Consumption Comb (mpg)"]

cols_only_continues = ["Fuel Consumption City (L/100 km)", "Fuel Consumption Hwy (L/100 km)",
        "Fuel Consumption Comb (L/100 km)"]

hasher = FeatureHasher(outputCol="hashed_features", inputCols=cols_only_continues)
data = hasher.transform(co2_data)

In [18]:
data.select("hashed_features").show(5, truncate=False)

+---------------------------------------------+
|hashed_features                              |
+---------------------------------------------+
|(262144,[38607,109231,228390],[0.0,9.9,6.7]) |
|(262144,[38607,109231,228390],[0.0,11.2,7.7])|
|(262144,[38607,109231,228390],[0.0,6.0,5.8]) |
|(262144,[38607,109231,228390],[0.0,12.7,9.1])|
|(262144,[38607,109231,228390],[0.0,12.1,8.7])|
+---------------------------------------------+
only showing top 5 rows


25/10/11 19:14:43 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: Fuel Type, Fuel Consumption City (L/100 km), Fuel Consumption Hwy (L/100 km)
 Schema: Fuel Consumption City (L/100 km), Fuel Consumption Hwy (L/100 km), Fuel Consumption Comb (L/100 km)
Expected: Fuel Consumption City (L/100 km) but found: Fuel Type
CSV file: file:///Users/gafnts/Documents/Github/ml-with-spark/ml-with-spark/static/CO2_Emissions_Canada.csv


In [19]:
data.select("hashed_features").take(1)

25/10/11 19:14:53 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: Fuel Type, Fuel Consumption City (L/100 km), Fuel Consumption Hwy (L/100 km)
 Schema: Fuel Consumption City (L/100 km), Fuel Consumption Hwy (L/100 km), Fuel Consumption Comb (L/100 km)
Expected: Fuel Consumption City (L/100 km) but found: Fuel Type
CSV file: file:///Users/gafnts/Documents/Github/ml-with-spark/ml-with-spark/static/CO2_Emissions_Canada.csv


[Row(hashed_features=SparseVector(262144, {38607: 0.0, 109231: 9.9, 228390: 6.7}))]

In [20]:
data.select("hashed_features").show(5, truncate=False)

+---------------------------------------------+
|hashed_features                              |
+---------------------------------------------+
|(262144,[38607,109231,228390],[0.0,9.9,6.7]) |
|(262144,[38607,109231,228390],[0.0,11.2,7.7])|
|(262144,[38607,109231,228390],[0.0,6.0,5.8]) |
|(262144,[38607,109231,228390],[0.0,12.7,9.1])|
|(262144,[38607,109231,228390],[0.0,12.1,8.7])|
+---------------------------------------------+
only showing top 5 rows


25/10/11 19:15:03 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: Fuel Type, Fuel Consumption City (L/100 km), Fuel Consumption Hwy (L/100 km)
 Schema: Fuel Consumption City (L/100 km), Fuel Consumption Hwy (L/100 km), Fuel Consumption Comb (L/100 km)
Expected: Fuel Consumption City (L/100 km) but found: Fuel Type
CSV file: file:///Users/gafnts/Documents/Github/ml-with-spark/ml-with-spark/static/CO2_Emissions_Canada.csv


In [21]:
data.printSchema()

root
 |-- Make: string (nullable = true)
 |-- Model: string (nullable = true)
 |-- Vehicle Class: string (nullable = true)
 |-- Cylinders: double (nullable = false)
 |-- Transmission: string (nullable = true)
 |-- Fuel Type: string (nullable = true)
 |-- Fuel Consumption City (L/100 km): double (nullable = false)
 |-- Fuel Consumption Hwy (L/100 km): double (nullable = false)
 |-- Fuel Consumption Comb (L/100 km): double (nullable = false)
 |-- Fuel Consumption Comb (mpg): double (nullable = false)
 |-- CO2: double (nullable = false)
 |-- hashed_features: vector (nullable = true)



# Time for selecting the most meaningful features:

In [22]:
from pyspark.ml.feature import UnivariateFeatureSelector

selector = UnivariateFeatureSelector(outputCol="selectedFeatures", featuresCol="hashed_features", labelCol="CO2")

selector.setFeatureType("continuous")
selector.setLabelType("continuous")

model = selector.fit(data)
data = model.transform(data)

25/10/11 19:19:01 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: Fuel Type, Fuel Consumption City (L/100 km), Fuel Consumption Hwy (L/100 km), Fuel Consumption Comb (mpg)
 Schema: Fuel Consumption City (L/100 km), Fuel Consumption Hwy (L/100 km), Fuel Consumption Comb (L/100 km), CO2
Expected: Fuel Consumption City (L/100 km) but found: Fuel Type
CSV file: file:///Users/gafnts/Documents/Github/ml-with-spark/ml-with-spark/static/CO2_Emissions_Canada.csv
25/10/11 19:19:01 WARN DAGScheduler: Broadcasting large task binary with size 2.0 MiB
25/10/11 19:19:01 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: Fuel Type, Fuel Consumption City (L/100 km), Fuel Consumption Hwy (L/100 km), Fuel Consumption Comb (mpg)
 Schema: Fuel Consumption City (L/100 km), Fuel Consumption Hwy (L/100 km), Fuel Consumption Comb (L/100 km), CO2
Expected: Fuel Consumption City (L/100 km) but found: Fuel Type
CSV file: file:///Users/gafnts/Documents/Github/ml-w

In [29]:
data.select("selectedFeatures").show(5, truncate=False)

+-----------------------+
|selectedFeatures       |
+-----------------------+
|(50,[48,49],[9.9,6.7]) |
|(50,[48,49],[11.2,7.7])|
|(50,[48,49],[6.0,5.8]) |
|(50,[48,49],[12.7,9.1])|
|(50,[48,49],[12.1,8.7])|
+-----------------------+
only showing top 5 rows


25/10/11 19:23:53 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: Fuel Type, Fuel Consumption City (L/100 km), Fuel Consumption Hwy (L/100 km)
 Schema: Fuel Consumption City (L/100 km), Fuel Consumption Hwy (L/100 km), Fuel Consumption Comb (L/100 km)
Expected: Fuel Consumption City (L/100 km) but found: Fuel Type
CSV file: file:///Users/gafnts/Documents/Github/ml-with-spark/ml-with-spark/static/CO2_Emissions_Canada.csv


 ## Tryout LDA clustering algorithm

In [None]:
from pyspark.ml.clustering import LDA


lda = LDA(k=2, seed=1, optimizer="em", featuresCol="selectedFeatures")
lda.setMaxIter(100)


lda.clear(lda.maxIter)
lda_model = lda.fit(data)
lda_model.setSeed(1)

# check if the model itself is distributed across Spark executors
lda_model.isDistributed()

25/10/11 19:24:26 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: Fuel Type, Fuel Consumption City (L/100 km), Fuel Consumption Hwy (L/100 km)
 Schema: Fuel Consumption City (L/100 km), Fuel Consumption Hwy (L/100 km), Fuel Consumption Comb (L/100 km)
Expected: Fuel Consumption City (L/100 km) but found: Fuel Type
CSV file: file:///Users/gafnts/Documents/Github/ml-with-spark/ml-with-spark/static/CO2_Emissions_Canada.csv
25/10/11 19:24:26 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: Fuel Type, Fuel Consumption City (L/100 km), Fuel Consumption Hwy (L/100 km)
 Schema: Fuel Consumption City (L/100 km), Fuel Consumption Hwy (L/100 km), Fuel Consumption Comb (L/100 km)
Expected: Fuel Consumption City (L/100 km) but found: Fuel Type
CSV file: file:///Users/gafnts/Documents/Github/ml-with-spark/ml-with-spark/static/CO2_Emissions_Canada.csv
25/10/11 19:24:26 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: Fuel 

True

In [31]:
lda_model.describeTopics().show()

+-----+-----------+--------------------+
|topic|termIndices|         termWeights|
+-----+-----------+--------------------+
|    0|   [48, 49]|[0.58104675033297...|
|    1|   [48, 49]|[0.58168999987474...|
+-----+-----------+--------------------+



In [32]:
lda_model.vocabSize()

50

In [33]:
lda_predictions = lda_model.transform(data)

In [34]:
lda_predictions.printSchema()

root
 |-- Make: string (nullable = true)
 |-- Model: string (nullable = true)
 |-- Vehicle Class: string (nullable = true)
 |-- Cylinders: double (nullable = false)
 |-- Transmission: string (nullable = true)
 |-- Fuel Type: string (nullable = true)
 |-- Fuel Consumption City (L/100 km): double (nullable = false)
 |-- Fuel Consumption Hwy (L/100 km): double (nullable = false)
 |-- Fuel Consumption Comb (L/100 km): double (nullable = false)
 |-- Fuel Consumption Comb (mpg): double (nullable = false)
 |-- CO2: double (nullable = false)
 |-- hashed_features: vector (nullable = true)
 |-- selectedFeatures: vector (nullable = true)
 |-- topicDistribution: vector (nullable = true)



In [35]:
lda_predictions.select("topicDistribution").show(2,truncate=False)

+----------------------------------------+
|topicDistribution                       |
+----------------------------------------+
|[0.4999989319581025,0.5000010680418975] |
|[0.5000014415937216,0.49999855840627844]|
+----------------------------------------+
only showing top 2 rows


25/10/11 19:24:54 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: Fuel Type, Fuel Consumption City (L/100 km), Fuel Consumption Hwy (L/100 km)
 Schema: Fuel Consumption City (L/100 km), Fuel Consumption Hwy (L/100 km), Fuel Consumption Comb (L/100 km)
Expected: Fuel Consumption City (L/100 km) but found: Fuel Type
CSV file: file:///Users/gafnts/Documents/Github/ml-with-spark/ml-with-spark/static/CO2_Emissions_Canada.csv
25/10/11 19:24:54 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS


# Tryout KMeans

In [36]:
from pyspark.ml.clustering import KMeans


kmeans = KMeans(k=3)
kmeans.setSeed(10)
kmeans.setFeaturesCol("selectedFeatures")

kmeans_model = kmeans.fit(data)
kmeans_model.getDistanceMeasure()

25/10/11 19:25:06 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: Fuel Type, Fuel Consumption City (L/100 km), Fuel Consumption Hwy (L/100 km)
 Schema: Fuel Consumption City (L/100 km), Fuel Consumption Hwy (L/100 km), Fuel Consumption Comb (L/100 km)
Expected: Fuel Consumption City (L/100 km) but found: Fuel Type
CSV file: file:///Users/gafnts/Documents/Github/ml-with-spark/ml-with-spark/static/CO2_Emissions_Canada.csv
25/10/11 19:25:07 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: Fuel Type, Fuel Consumption City (L/100 km), Fuel Consumption Hwy (L/100 km)
 Schema: Fuel Consumption City (L/100 km), Fuel Consumption Hwy (L/100 km), Fuel Consumption Comb (L/100 km)
Expected: Fuel Consumption City (L/100 km) but found: Fuel Type
CSV file: file:///Users/gafnts/Documents/Github/ml-with-spark/ml-with-spark/static/CO2_Emissions_Canada.csv
25/10/11 19:25:07 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: Fuel 

'euclidean'

In [37]:
kmeans_predictions = kmeans_model.transform(data)

In [38]:
kmeans_predictions.select("prediction").show(5, truncate=True)

+----------+
|prediction|
+----------+
|         0|
|         0|
|         0|
|         2|
|         2|
+----------+
only showing top 5 rows


25/10/11 19:25:16 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: Fuel Type, Fuel Consumption City (L/100 km), Fuel Consumption Hwy (L/100 km)
 Schema: Fuel Consumption City (L/100 km), Fuel Consumption Hwy (L/100 km), Fuel Consumption Comb (L/100 km)
Expected: Fuel Consumption City (L/100 km) but found: Fuel Type
CSV file: file:///Users/gafnts/Documents/Github/ml-with-spark/ml-with-spark/static/CO2_Emissions_Canada.csv


In [39]:
kmeans_predictions.select("prediction").distinct().show()

+----------+
|prediction|
+----------+
|         1|
|         2|
|         0|
+----------+



25/10/11 19:25:29 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: Fuel Type, Fuel Consumption City (L/100 km), Fuel Consumption Hwy (L/100 km)
 Schema: Fuel Consumption City (L/100 km), Fuel Consumption Hwy (L/100 km), Fuel Consumption Comb (L/100 km)
Expected: Fuel Consumption City (L/100 km) but found: Fuel Type
CSV file: file:///Users/gafnts/Documents/Github/ml-with-spark/ml-with-spark/static/CO2_Emissions_Canada.csv


In [40]:
summary = kmeans_model.summary

In [41]:
summary.cluster.printSchema()

root
 |-- prediction: integer (nullable = false)



# Tryout GaussianMixture

In [42]:
from pyspark.ml.clustering import GaussianMixture

gm = GaussianMixture(k=42, tol=0.01, seed=10, featuresCol="selectedFeatures", maxIter=100)
gm_model = gm.fit(data)

gm_predictions = gm_model.transform(data)

25/10/11 19:25:44 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: Fuel Type, Fuel Consumption City (L/100 km), Fuel Consumption Hwy (L/100 km)
 Schema: Fuel Consumption City (L/100 km), Fuel Consumption Hwy (L/100 km), Fuel Consumption Comb (L/100 km)
Expected: Fuel Consumption City (L/100 km) but found: Fuel Type
CSV file: file:///Users/gafnts/Documents/Github/ml-with-spark/ml-with-spark/static/CO2_Emissions_Canada.csv
25/10/11 19:25:44 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK
25/10/11 19:25:44 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: Fuel Type, Fuel Consumption City (L/100 km), Fuel Consumption Hwy (L/100 km)
 Schema: Fuel Consumption City (L/100 km), Fuel Consumption Hwy (L/100 km), Fuel Consumption Comb (L/100 km)
Expected: Fuel Consumption City (L/100 km) but found: Fuel Type
CSV file: file:///Users/gafnts/Documents/Github/ml-with-spark/ml-with-spark/static/CO2_Emission

Print the model params using `explainParams()` functionality:

In [43]:
import pprint
pp = pprint.PrettyPrinter(indent=4)
params = gm_model.explainParams()
pp.pprint(params)

('aggregationDepth: suggested depth for treeAggregate (>= 2). (default: 2)\n'
 'featuresCol: features column name. (default: features, current: '
 'selectedFeatures)\n'
 'k: Number of independent Gaussians in the mixture model. Must be > 1. '
 '(default: 2, current: 42)\n'
 'maxIter: max number of iterations (>= 0). (default: 100, current: 100)\n'
 'predictionCol: prediction column name. (default: prediction)\n'
 'probabilityCol: Column name for predicted class conditional probabilities. '
 'Note: Not all models output well-calibrated probability estimates! These '
 'probabilities should be treated as confidences, not precise probabilities. '
 '(default: probability)\n'
 'seed: random seed. (default: 3052518430336294888, current: 10)\n'
 'tol: the convergence tolerance for iterative algorithms (>= 0). (default: '
 '0.01, current: 0.01)\n'
 'weightCol: weight column name. If this is not set or empty, we treat all '
 'instance weights as 1.0. (undefined)')


# Constructing - The Pipeline API

In [44]:
from pyspark.ml import Pipeline


pipeline = Pipeline(stages=[hasher, selector, gm])
pipeline_model = pipeline.fit(co2_data)

25/10/11 19:26:12 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: Fuel Type, Fuel Consumption City (L/100 km), Fuel Consumption Hwy (L/100 km), Fuel Consumption Comb (mpg)
 Schema: Fuel Consumption City (L/100 km), Fuel Consumption Hwy (L/100 km), Fuel Consumption Comb (L/100 km), CO2
Expected: Fuel Consumption City (L/100 km) but found: Fuel Type
CSV file: file:///Users/gafnts/Documents/Github/ml-with-spark/ml-with-spark/static/CO2_Emissions_Canada.csv
25/10/11 19:26:12 WARN DAGScheduler: Broadcasting large task binary with size 2.0 MiB
25/10/11 19:26:12 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: Fuel Type, Fuel Consumption City (L/100 km), Fuel Consumption Hwy (L/100 km), Fuel Consumption Comb (mpg)
 Schema: Fuel Consumption City (L/100 km), Fuel Consumption Hwy (L/100 km), Fuel Consumption Comb (L/100 km), CO2
Expected: Fuel Consumption City (L/100 km) but found: Fuel Type
CSV file: file:///Users/gafnts/Documents/Github/ml-w

In [45]:
transformed_by_pipeline = pipeline_model.transform(co2_data)

In [46]:
transformed_by_pipeline.printSchema()

root
 |-- Make: string (nullable = true)
 |-- Model: string (nullable = true)
 |-- Vehicle Class: string (nullable = true)
 |-- Cylinders: double (nullable = false)
 |-- Transmission: string (nullable = true)
 |-- Fuel Type: string (nullable = true)
 |-- Fuel Consumption City (L/100 km): double (nullable = false)
 |-- Fuel Consumption Hwy (L/100 km): double (nullable = false)
 |-- Fuel Consumption Comb (L/100 km): double (nullable = false)
 |-- Fuel Consumption Comb (mpg): double (nullable = false)
 |-- CO2: double (nullable = false)
 |-- hashed_features: vector (nullable = true)
 |-- selectedFeatures: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: integer (nullable = false)



## Evaluating clustering models

Notice we are not using this evaluator for LDA since it outputs topicDistribution and not one numeric prdiction.



In [49]:
from pyspark.ml.evaluation import ClusteringEvaluator

evaluator = ClusteringEvaluator(featuresCol='selectedFeatures')
evaluator.setPredictionCol("prediction")

#evaluate with eucliden distance
print("kmeans: "+str(evaluator.evaluate(kmeans_predictions)))
print("GM: "+ str(evaluator.evaluate(gm_predictions)))

25/10/11 19:29:07 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: Fuel Type, Fuel Consumption City (L/100 km), Fuel Consumption Hwy (L/100 km)
 Schema: Fuel Consumption City (L/100 km), Fuel Consumption Hwy (L/100 km), Fuel Consumption Comb (L/100 km)
Expected: Fuel Consumption City (L/100 km) but found: Fuel Type
CSV file: file:///Users/gafnts/Documents/Github/ml-with-spark/ml-with-spark/static/CO2_Emissions_Canada.csv
25/10/11 19:29:07 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: Fuel Type, Fuel Consumption City (L/100 km), Fuel Consumption Hwy (L/100 km)
 Schema: Fuel Consumption City (L/100 km), Fuel Consumption Hwy (L/100 km), Fuel Consumption Comb (L/100 km)
Expected: Fuel Consumption City (L/100 km) but found: Fuel Type
CSV file: file:///Users/gafnts/Documents/Github/ml-with-spark/ml-with-spark/static/CO2_Emissions_Canada.csv
25/10/11 19:29:07 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: Fuel 

kmeans: 0.6791002214675337
GM: -0.1517797715036008


25/10/11 19:29:07 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: Fuel Type, Fuel Consumption City (L/100 km), Fuel Consumption Hwy (L/100 km)
 Schema: Fuel Consumption City (L/100 km), Fuel Consumption Hwy (L/100 km), Fuel Consumption Comb (L/100 km)
Expected: Fuel Consumption City (L/100 km) but found: Fuel Type
CSV file: file:///Users/gafnts/Documents/Github/ml-with-spark/ml-with-spark/static/CO2_Emissions_Canada.csv


In [50]:
evaluator.isLargerBetter()

True

In [51]:
evaluator.setDistanceMeasure("cosine")
print("kmeans: "+str(evaluator.evaluate(kmeans_predictions)))
print("GM: "+ str(evaluator.evaluate(gm_predictions)))

25/10/11 19:29:31 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: Fuel Type, Fuel Consumption City (L/100 km), Fuel Consumption Hwy (L/100 km)
 Schema: Fuel Consumption City (L/100 km), Fuel Consumption Hwy (L/100 km), Fuel Consumption Comb (L/100 km)
Expected: Fuel Consumption City (L/100 km) but found: Fuel Type
CSV file: file:///Users/gafnts/Documents/Github/ml-with-spark/ml-with-spark/static/CO2_Emissions_Canada.csv
25/10/11 19:29:31 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: Fuel Type, Fuel Consumption City (L/100 km), Fuel Consumption Hwy (L/100 km)
 Schema: Fuel Consumption City (L/100 km), Fuel Consumption Hwy (L/100 km), Fuel Consumption Comb (L/100 km)
Expected: Fuel Consumption City (L/100 km) but found: Fuel Type
CSV file: file:///Users/gafnts/Documents/Github/ml-with-spark/ml-with-spark/static/CO2_Emissions_Canada.csv
25/10/11 19:29:31 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: Fuel 

kmeans: -0.07958234502129219
GM: -0.19012403274289733


25/10/11 19:29:31 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: Fuel Type, Fuel Consumption City (L/100 km), Fuel Consumption Hwy (L/100 km)
 Schema: Fuel Consumption City (L/100 km), Fuel Consumption Hwy (L/100 km), Fuel Consumption Comb (L/100 km)
Expected: Fuel Consumption City (L/100 km) but found: Fuel Type
CSV file: file:///Users/gafnts/Documents/Github/ml-with-spark/ml-with-spark/static/CO2_Emissions_Canada.csv


In [52]:
evaluator.isLargerBetter()

True

In [53]:
evaluator.explainParams()

"distanceMeasure: The distance measure. Supported options: 'squaredEuclidean' and 'cosine'. (default: squaredEuclidean, current: cosine)\nfeaturesCol: features column name. (default: features, current: selectedFeatures)\nmetricName: metric name in evaluation (silhouette) (default: silhouette)\npredictionCol: prediction column name. (default: prediction, current: prediction)\nweightCol: weight column name. If this is not set or empty, we treat all instance weights as 1.0. (undefined)"

### Since evaluator output for `isLargerBetter` was true, we can define that kmeans algorithm produced a better model than GM.

# Hyperparameters and Tuning experiments


In [None]:
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder

grid = ParamGridBuilder().addGrid(kmeans.maxIter, [20,50,100]).build()

tvs = TrainValidationSplit(estimator=kmeans, estimatorParamMaps=grid, evaluator=evaluator,
                           collectSubModels=True, parallelism=1, seed=42)

tvs_model = tvs.fit(data)
tvs_model.getTrainRatio()


25/10/11 19:30:24 WARN CSVHeaderChecker: Number of column in CSV header is not equal to number of fields in the schema:
 Header length: 12, schema size: 11
CSV file: file:///Users/gafnts/Documents/Github/ml-with-spark/ml-with-spark/static/CO2_Emissions_Canada.csv
25/10/11 19:30:25 WARN CSVHeaderChecker: Number of column in CSV header is not equal to number of fields in the schema:
 Header length: 12, schema size: 11
CSV file: file:///Users/gafnts/Documents/Github/ml-with-spark/ml-with-spark/static/CO2_Emissions_Canada.csv
25/10/11 19:30:26 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: Fuel Type, Fuel Consumption City (L/100 km), Fuel Consumption Hwy (L/100 km)
 Schema: Fuel Consumption City (L/100 km), Fuel Consumption Hwy (L/100 km), Fuel Consumption Comb (L/100 km)
Expected: Fuel Consumption City (L/100 km) but found: Fuel Type
CSV file: file:///Users/gafnts/Documents/Github/ml-with-spark/ml-with-spark/static/CO2_Emissions_Canada.csv
25/10/11 19:30:26 WAR

0.75

In [55]:
tvs_model.validationMetrics

[-0.06270405194965402, -0.06402059325959049, -0.06402059325959049]

In [56]:
from pyspark.ml.tuning import ParamGridBuilder

grid = ParamGridBuilder().addGrid(kmeans.maxIter, [20,50,100]) \
        .addGrid(kmeans.distanceMeasure, ['euclidean','cosine']).build()


In [57]:
tvs_model.validationMetrics

[-0.06270405194965402, -0.06402059325959049, -0.06402059325959049]

In [58]:
from pyspark.ml.tuning import TrainValidationSplit , ParamGridBuilder

grid = ParamGridBuilder().addGrid(kmeans.maxIter, [20,50,100]) \
        .addGrid(kmeans.distanceMeasure, ['euclidean','cosine']) \
        .addGrid(evaluator.distanceMeasure, ['euclidean','cosine']).build()


tvs = TrainValidationSplit(estimator=kmeans, estimatorParamMaps=grid, evaluator=evaluator,
                           collectSubModels=True, parallelism=1, seed=42, trainRatio=0.8)
tvs_model = tvs.fit(data)
tvs_model.validationMetrics

25/10/11 19:30:59 WARN CSVHeaderChecker: Number of column in CSV header is not equal to number of fields in the schema:
 Header length: 12, schema size: 11
CSV file: file:///Users/gafnts/Documents/Github/ml-with-spark/ml-with-spark/static/CO2_Emissions_Canada.csv
25/10/11 19:31:00 WARN CSVHeaderChecker: Number of column in CSV header is not equal to number of fields in the schema:
 Header length: 12, schema size: 11
CSV file: file:///Users/gafnts/Documents/Github/ml-with-spark/ml-with-spark/static/CO2_Emissions_Canada.csv
25/10/11 19:31:04 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: Fuel Type, Fuel Consumption City (L/100 km), Fuel Consumption Hwy (L/100 km)
 Schema: Fuel Consumption City (L/100 km), Fuel Consumption Hwy (L/100 km), Fuel Consumption Comb (L/100 km)
Expected: Fuel Consumption City (L/100 km) but found: Fuel Type
CSV file: file:///Users/gafnts/Documents/Github/ml-with-spark/ml-with-spark/static/CO2_Emissions_Canada.csv
25/10/11 19:31:05 WAR

[-0.06292946960479909,
 -0.06292946960479909,
 0.5520132682136769,
 0.5520132682136769,
 -0.06292946960479909,
 -0.06292946960479909,
 0.5520132682136769,
 0.5520132682136769,
 -0.06292946960479909,
 -0.06292946960479909,
 0.5520132682136769,
 0.5520132682136769]

## Adding evaluator to the grid params:

In [None]:
from pyspark.ml.tuning import TrainValidationSplit , ParamGridBuilder


grid = ParamGridBuilder().addGrid(kmeans.maxIter, [20,50,100]) \
        .addGrid(kmeans.distanceMeasure, ['euclidean','cosine']) \
        .addGrid(evaluator.distanceMeasure, ['euclidean','cosine'])\
        .baseOn({kmeans.featuresCol: 'selectedFeatures'}) \
        .build()

tvs = TrainValidationSplit(estimator=kmeans, estimatorParamMaps=grid, evaluator=evaluator,
                           collectSubModels=True, parallelism=1, seed=42, trainRatio=0.8)

tvs_model = tvs.fit(data)
tvs_model.validationMetrics

25/10/11 19:31:40 WARN CSVHeaderChecker: Number of column in CSV header is not equal to number of fields in the schema:
 Header length: 12, schema size: 11
CSV file: file:///Users/gafnts/Documents/Github/ml-with-spark/ml-with-spark/static/CO2_Emissions_Canada.csv
25/10/11 19:31:41 WARN CSVHeaderChecker: Number of column in CSV header is not equal to number of fields in the schema:
 Header length: 12, schema size: 11
CSV file: file:///Users/gafnts/Documents/Github/ml-with-spark/ml-with-spark/static/CO2_Emissions_Canada.csv
25/10/11 19:31:45 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: Fuel Type, Fuel Consumption City (L/100 km), Fuel Consumption Hwy (L/100 km)
 Schema: Fuel Consumption City (L/100 km), Fuel Consumption Hwy (L/100 km), Fuel Consumption Comb (L/100 km)
Expected: Fuel Consumption City (L/100 km) but found: Fuel Type
CSV file: file:///Users/gafnts/Documents/Github/ml-with-spark/ml-with-spark/static/CO2_Emissions_Canada.csv
25/10/11 19:31:45 WAR

[-0.06292946960479909,
 -0.06292946960479909,
 0.5520132682136769,
 0.5520132682136769,
 -0.06292946960479909,
 -0.06292946960479909,
 0.5520132682136769,
 0.5520132682136769,
 -0.06292946960479909,
 -0.06292946960479909,
 0.5520132682136769,
 0.5520132682136769]

In [60]:
tvs_model.subModels

[KMeansModel: uid=KMeans_875c43040254, k=3, distanceMeasure=euclidean, numFeatures=50,
 KMeansModel: uid=KMeans_875c43040254, k=3, distanceMeasure=euclidean, numFeatures=50,
 KMeansModel: uid=KMeans_875c43040254, k=3, distanceMeasure=cosine, numFeatures=50,
 KMeansModel: uid=KMeans_875c43040254, k=3, distanceMeasure=cosine, numFeatures=50,
 KMeansModel: uid=KMeans_875c43040254, k=3, distanceMeasure=euclidean, numFeatures=50,
 KMeansModel: uid=KMeans_875c43040254, k=3, distanceMeasure=euclidean, numFeatures=50,
 KMeansModel: uid=KMeans_875c43040254, k=3, distanceMeasure=cosine, numFeatures=50,
 KMeansModel: uid=KMeans_875c43040254, k=3, distanceMeasure=cosine, numFeatures=50,
 KMeansModel: uid=KMeans_875c43040254, k=3, distanceMeasure=euclidean, numFeatures=50,
 KMeansModel: uid=KMeans_875c43040254, k=3, distanceMeasure=euclidean, numFeatures=50,
 KMeansModel: uid=KMeans_875c43040254, k=3, distanceMeasure=cosine, numFeatures=50,
 KMeansModel: uid=KMeans_875c43040254, k=3, distanceMeasur

In [61]:
arr_models = tvs_model.subModels

# Advanced Split

the subModels are printed here as an example, do not use for real systems!!!

In [62]:
from pyspark.ml.tuning import CrossValidator


cv = CrossValidator(estimator=kmeans, estimatorParamMaps=grid, evaluator=evaluator,
                           collectSubModels=True,  parallelism=2, numFolds=3)

cv_model = cv.fit(data)
cv_model.subModels

25/10/11 19:32:04 WARN CSVHeaderChecker: Number of column in CSV header is not equal to number of fields in the schema:
 Header length: 12, schema size: 11
CSV file: file:///Users/gafnts/Documents/Github/ml-with-spark/ml-with-spark/static/CO2_Emissions_Canada.csv
25/10/11 19:32:04 WARN CSVHeaderChecker: Number of column in CSV header is not equal to number of fields in the schema:
 Header length: 12, schema size: 11
CSV file: file:///Users/gafnts/Documents/Github/ml-with-spark/ml-with-spark/static/CO2_Emissions_Canada.csv
25/10/11 19:32:06 WARN CSVHeaderChecker: Number of column in CSV header is not equal to number of fields in the schema:
 Header length: 12, schema size: 11
CSV file: file:///Users/gafnts/Documents/Github/ml-with-spark/ml-with-spark/static/CO2_Emissions_Canada.csv
25/10/11 19:32:07 WARN CSVHeaderChecker: Number of column in CSV header is not equal to number of fields in the schema:
 Header length: 12, schema size: 11
CSV file: file:///Users/gafnts/Documents/Github/ml-w

[[KMeansModel: uid=KMeans_875c43040254, k=3, distanceMeasure=euclidean, numFeatures=50,
  KMeansModel: uid=KMeans_875c43040254, k=3, distanceMeasure=euclidean, numFeatures=50,
  KMeansModel: uid=KMeans_875c43040254, k=3, distanceMeasure=cosine, numFeatures=50,
  KMeansModel: uid=KMeans_875c43040254, k=3, distanceMeasure=cosine, numFeatures=50,
  KMeansModel: uid=KMeans_875c43040254, k=3, distanceMeasure=euclidean, numFeatures=50,
  KMeansModel: uid=KMeans_875c43040254, k=3, distanceMeasure=euclidean, numFeatures=50,
  KMeansModel: uid=KMeans_875c43040254, k=3, distanceMeasure=cosine, numFeatures=50,
  KMeansModel: uid=KMeans_875c43040254, k=3, distanceMeasure=cosine, numFeatures=50,
  KMeansModel: uid=KMeans_875c43040254, k=3, distanceMeasure=euclidean, numFeatures=50,
  KMeansModel: uid=KMeans_875c43040254, k=3, distanceMeasure=euclidean, numFeatures=50,
  KMeansModel: uid=KMeans_875c43040254, k=3, distanceMeasure=cosine, numFeatures=50,
  KMeansModel: uid=KMeans_875c43040254, k=3, di

In [63]:
len(cv_model.subModels)

3

In [64]:
len(cv_model.subModels[0])

12

In [65]:
cv_model.avgMetrics

[np.float64(-0.0801749698018508),
 np.float64(-0.0801749698018508),
 np.float64(0.6000525675573857),
 np.float64(0.6000525675573857),
 np.float64(-0.0801749698018508),
 np.float64(-0.0801749698018508),
 np.float64(0.6000525675573857),
 np.float64(0.6000525675573857),
 np.float64(-0.0801749698018508),
 np.float64(-0.0801749698018508),
 np.float64(0.6000525675573857),
 np.float64(0.6000525675573857)]

In [66]:
# Get the best model from cross-validation
best_model = cv_model.bestModel

# Use the best model for predictions
best_predictions = best_model.transform(data)

# Show predictions from the best model
best_predictions.select("selectedFeatures", "prediction").show(5)

+--------------------+----------+
|    selectedFeatures|prediction|
+--------------------+----------+
|(50,[48,49],[9.9,...|         0|
|(50,[48,49],[11.2...|         2|
|(50,[48,49],[6.0,...|         1|
|(50,[48,49],[12.7...|         2|
|(50,[48,49],[12.1...|         2|
+--------------------+----------+
only showing top 5 rows


25/10/11 19:34:01 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: Fuel Type, Fuel Consumption City (L/100 km), Fuel Consumption Hwy (L/100 km)
 Schema: Fuel Consumption City (L/100 km), Fuel Consumption Hwy (L/100 km), Fuel Consumption Comb (L/100 km)
Expected: Fuel Consumption City (L/100 km) but found: Fuel Type
CSV file: file:///Users/gafnts/Documents/Github/ml-with-spark/ml-with-spark/static/CO2_Emissions_Canada.csv


In [67]:
# Get the best average metric across all folds
best_avg_metric = max(cv_model.avgMetrics)
print(f"Best average metric: {best_avg_metric}")

# Find which parameter combination achieved this
best_param_index = cv_model.avgMetrics.index(best_avg_metric)
print(f"Best parameter combination index: {best_param_index}")

Best average metric: 0.6000525675573857
Best parameter combination index: 2


In [68]:
# Compare CrossValidator vs TrainValidationSplit
print("Cross-Validation best:", max(cv_model.avgMetrics))
print("Train-Validation Split best:", max(tvs_model.validationMetrics))

Cross-Validation best: 0.6000525675573857
Train-Validation Split best: 0.5520132682136769
