<style>
    div.container {
      max-width: 800px!important;
    }
</style>

# SparkML Basics

In [1]:
/** Puts every `.jar` file found directly under `/opt/spark/jars` on the
  * interpreter classpath, one file at a time.
  */
def init: Unit = {
  import ammonite.ops._
  val sparkJarDir = root / "opt" / "spark" / "jars"
  val jarFiles = (ls ! sparkJarDir).filter(_.ext == "jar")
  for (jar <- jarFiles) interp.load.cp(jar)
}

init

defined [32mfunction[39m [36minit[39m

In [2]:
import org.apache.log4j.{Level, Logger}
// Raise the threshold of the "org" logger to ERROR so lower-severity log
// lines from packages under org.* do not flood the cell output.
Logger.getLogger("org").setLevel(Level.ERROR)

[32mimport [39m[36morg.apache.log4j.{Level, Logger}
[39m

In [3]:
import org.apache.spark.sql._

[32mimport [39m[36morg.apache.spark.sql._[39m

In [4]:
// Session used by every later cell in this notebook.
//  - hive.metastore.uris:     Hive metastore reached over thrift at localhost:9083
//  - spark.sql.warehouse.dir: warehouse directory set to /data/hive/warehouse
//  - master local[*]:         run Spark inside this JVM rather than on a cluster
//  - enableHiveSupport():     turn on Hive integration for the session
//  - getOrCreate():           reuse an already-running session if there is one
// NOTE(review): the app name says "Spark SQL Basics" while this notebook is
// titled "SparkML Basics" - looks like a leftover from another notebook; confirm.
val spark = SparkSession
  .builder
  .config("hive.metastore.uris","thrift://localhost:9083") 
  .config("spark.sql.warehouse.dir", "/data/hive/warehouse")
  .master("local[*]")
  .appName("Spark SQL Basics")
  .enableHiveSupport()
  .getOrCreate()

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties


[36mspark[39m: [32mSparkSession[39m = org.apache.spark.sql.SparkSession@6e04003e

In [5]:
// Handle to the SparkContext that backs the session created above.
val sc = spark.sparkContext

[36msc[39m: [32morg[39m.[32mapache[39m.[32mspark[39m.[32mSparkContext[39m = org.apache.spark.SparkContext@2e0bcb77

In [43]:
// Read the iris CSV from the local filesystem (header row, comma-delimited,
// column types inferred) and give the four measurement columns camelCase names.
val iris = {
  val csvOptions = Map(
    "header" -> "true",
    "delimiter" -> ",",
    "inferSchema" -> "true"
  )
  val rawIris = spark.read
    .format("csv")
    .options(csvOptions)
    .load("file:///data/csv/iris.csv")

  // The file's headers contain dots (e.g. "Sepal.Length"); rename them in order.
  val renames = Seq(
    "Sepal.Length" -> "sepalLength",
    "Sepal.Width" -> "sepalWidth",
    "Petal.Length" -> "petalLength",
    "Petal.Width" -> "petalWidth"
  )
  renames.foldLeft(rawIris) { case (df, (from, to)) =>
    df.withColumnRenamed(from, to)
  }
}

iris.limit(5).show()

+-----------+----------+-----------+----------+-------+
|sepalLength|sepalWidth|petalLength|petalWidth|Species|
+-----------+----------+-----------+----------+-------+
|        5.1|       3.5|        1.4|       0.2| setosa|
|        4.9|       3.0|        1.4|       0.2| setosa|
|        4.7|       3.2|        1.3|       0.2| setosa|
|        4.6|       3.1|        1.5|       0.2| setosa|
|        5.0|       3.6|        1.4|       0.2| setosa|
+-----------+----------+-----------+----------+-------+



[36miris[39m: [32mDataFrame[39m = [sepalLength: double, sepalWidth: double ... 3 more fields]

In [44]:
// Random 80/20 split of the iris rows with a fixed seed; randomSplit returns
// an Array, which is destructured directly into the two datasets.
val Array(train, test) = iris.randomSplit(Array(0.8, 0.2), 1L)

[36mtrain[39m: [32mDataset[39m[[32mRow[39m] = [sepalLength: double, sepalWidth: double ... 3 more fields]
[36mtest[39m: [32mDataset[39m[[32mRow[39m] = [sepalLength: double, sepalWidth: double ... 3 more fields]

In [12]:
// Number of rows in the training part of the split.
train.count

[36mres11[39m: [32mLong[39m = [32m118L[39m

In [13]:
// Number of rows in the held-out test part of the split.
test.count

[36mres12[39m: [32mLong[39m = [32m32L[39m

In [39]:
// Imported in this cell so it compiles on a fresh kernel when the notebook is
// run top to bottom; no earlier cell brings StringIndexer into scope.
import org.apache.spark.ml.feature.StringIndexer

// Encodes the string-valued "Species" column as a numeric "label" column.
val labelIndexer = new StringIndexer()
  .setInputCol("Species")
  .setOutputCol("label")

// Fit the indexer on the full iris data, then append the label column to it.
val d1 = labelIndexer.fit(iris).transform(iris)

d1.limit(5).show

+-----------+----------+-----------+----------+-------+-----+
|sepalLength|sepalWidth|petalLength|petalWidth|Species|label|
+-----------+----------+-----------+----------+-------+-----+
|        5.1|       3.5|        1.4|       0.2| setosa|  2.0|
|        4.9|       3.0|        1.4|       0.2| setosa|  2.0|
|        4.7|       3.2|        1.3|       0.2| setosa|  2.0|
|        4.6|       3.1|        1.5|       0.2| setosa|  2.0|
|        5.0|       3.6|        1.4|       0.2| setosa|  2.0|
+-----------+----------+-----------+----------+-------+-----+



[36mlabelIndexer[39m: [32mStringIndexer[39m = strIdx_b26f47000635
[36md1[39m: [32mDataFrame[39m = [sepalLength: double, sepalWidth: double ... 4 more fields]

In [41]:
// Imported in this cell so it compiles on a fresh kernel when the notebook is
// run top to bottom; no earlier cell brings VectorAssembler into scope.
import org.apache.spark.ml.feature.VectorAssembler

// Packs the four numeric measurement columns into a single vector column
// named "features".
val featureIndexer = new VectorAssembler()
  .setInputCols(Array("sepalLength", "sepalWidth", "petalLength", "petalWidth"))
  .setOutputCol("features")

// Append the assembled vector column to the labelled data.
val d2 = featureIndexer.transform(d1)

d2.limit(5).show

+-----------+----------+-----------+----------+-------+-----+-----------------+
|sepalLength|sepalWidth|petalLength|petalWidth|Species|label|         features|
+-----------+----------+-----------+----------+-------+-----+-----------------+
|        5.1|       3.5|        1.4|       0.2| setosa|  2.0|[5.1,3.5,1.4,0.2]|
|        4.9|       3.0|        1.4|       0.2| setosa|  2.0|[4.9,3.0,1.4,0.2]|
|        4.7|       3.2|        1.3|       0.2| setosa|  2.0|[4.7,3.2,1.3,0.2]|
|        4.6|       3.1|        1.5|       0.2| setosa|  2.0|[4.6,3.1,1.5,0.2]|
|        5.0|       3.6|        1.4|       0.2| setosa|  2.0|[5.0,3.6,1.4,0.2]|
+-----------+----------+-----------+----------+-------+-----+-----------------+



[36mfeatureIndexer[39m: [32mVectorAssembler[39m = vecAssembler_d4a6aa201603
[36md2[39m: [32mDataFrame[39m = [sepalLength: double, sepalWidth: double ... 5 more fields]

In [42]:
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler, IndexToString}
import org.apache.spark.ml.clustering.KMeans
import org.apache.spark.ml.evaluation.ClusteringEvaluator

// K-means with three clusters and a fixed seed, fitted on the assembled data.
val kmeans = new KMeans()
  .setK(3)
  .setSeed(1L)
val model = kmeans.fit(d2)

// Run the fitted model over the same rows it was trained on.
val predictions = model.transform(d2)

// Score the clustering with a ClusteringEvaluator left at its default settings.
val evaluator = new ClusteringEvaluator()
val silhouette = evaluator.evaluate(predictions)
println(s"Silhouette with squared euclidean distance = $silhouette")

println("Cluster Centers: ")
for (center <- model.clusterCenters) println(center)

19/07/25 21:14:01 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
19/07/25 21:14:01 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS


Silhouette with squared euclidean distance = 0.7344130579787816
Cluster Centers: 
[5.88360655737705,2.7409836065573776,4.388524590163936,1.4344262295081969]
[6.853846153846153,3.0769230769230766,5.715384615384615,2.053846153846153]
[5.005999999999999,3.428000000000001,1.4620000000000002,0.2459999999999999]


[36mkmeans[39m: [32mKMeans[39m = kmeans_44e7f4b624f5
[36mmodel[39m: [32morg[39m.[32mapache[39m.[32mspark[39m.[32mml[39m.[32mclustering[39m.[32mKMeansModel[39m = kmeans_44e7f4b624f5
[36mpredictions[39m: [32mDataFrame[39m = [sepalLength: double, sepalWidth: double ... 6 more fields]
[36mevaluator[39m: [32mClusteringEvaluator[39m = cluEval_d58edc4951d5
[36msilhouette[39m: [32mDouble[39m = [32m0.7344130579787816[39m