Spark MLLib
====

- [Official documentation](http://spark.apache.org/docs/latest/mllib-guide.html): The official documentation is clear, detailed and includes many code examples. You should refer to the official documentation for further exploration of this rich and rapidly growing library.

One challenging aspect of using MLLib is to manipulate data into the types expected by the various algorithms in the library. We show examples of some of these data conversions in this notebook.

In [1]:
from pyspark import SparkConf, SparkContext

# Reuse the running context if this cell is executed again in the same kernel;
# constructing a second SparkContext directly raises an error.
# 'local[*]' is passed as the master URL, exactly as in the direct constructor call.
sc = SparkContext.getOrCreate(SparkConf().setMaster('local[*]'))

In [2]:
from pyspark.sql import SQLContext

# DataFrame entry point, bound to the SparkContext `sc` created above.
sqlc = SQLContext(sparkContext=sc)

In [3]:
# Read the sonar data through the spark-csv reader: the file has no header row,
# and column types are inferred from the values.
csv_reader = sqlc.read.format('com.databricks.spark.csv')
csv_reader = csv_reader.options(header='false', inferschema='true')
df = csv_reader.load('data/sonar.all-data.txt')

In [4]:
# Print the column names and inferred types of the loaded DataFrame.
df.printSchema()

root
 |-- C0: double (nullable = true)
 |-- C1: double (nullable = true)
 |-- C2: double (nullable = true)
 |-- C3: double (nullable = true)
 |-- C4: double (nullable = true)
 |-- C5: double (nullable = true)
 |-- C6: double (nullable = true)
 |-- C7: double (nullable = true)
 |-- C8: double (nullable = true)
 |-- C9: double (nullable = true)
 |-- C10: double (nullable = true)
 |-- C11: double (nullable = true)
 |-- C12: double (nullable = true)
 |-- C13: double (nullable = true)
 |-- C14: double (nullable = true)
 |-- C15: double (nullable = true)
 |-- C16: double (nullable = true)
 |-- C17: double (nullable = true)
 |-- C18: double (nullable = true)
 |-- C19: double (nullable = true)
 |-- C20: double (nullable = true)
 |-- C21: double (nullable = true)
 |-- C22: double (nullable = true)
 |-- C23: double (nullable = true)
 |-- C24: double (nullable = true)
 |-- C25: double (nullable = true)
 |-- C26: double (nullable = true)
 |-- C27: double (nullable = true)
 |-- C28: double (nullabl

In [5]:
# Rename column C60 to 'label'; it is the one column that is not assembled
# into the feature vector below (which uses C0..C59).
df = df.withColumnRenamed(existing="C60", new="label")

In [6]:
from pyspark.ml.feature import VectorAssembler

In [7]:
# Pack the 60 numeric columns C0..C59 into a single vector column 'features'.
assembler = VectorAssembler(
    outputCol="features",
    inputCols=['C{}'.format(idx) for idx in range(60)],
)
output = assembler.transform(df)

In [11]:
# Keep only the assembled feature vector and the label column.
sonar = output.select('features', 'label')

In [74]:
from pyspark.ml.feature import StandardScaler

In [75]:
# Standardize the feature vector (subtract the mean, divide by the standard
# deviation) into a new column 'std_features'.
standardizer = StandardScaler(withMean=True, withStd=True,
                              inputCol='features',
                              outputCol='std_features')

# Start from the two original columns only, so the cell can be re-run safely:
# `sonar` is reassigned below, and transforming a DataFrame that already has
# a 'std_features' column is expected to fail on the duplicate output column.
sonar_unscaled = sonar.select('features', 'label')
model = standardizer.fit(sonar_unscaled)
sonar = model.transform(sonar_unscaled)

In [76]:
# Preview the first five rows (raw features, label, standardized features).
sonar.show(5)

+--------------------+-----+--------------------+
|            features|label|        std_features|
+--------------------+-----+--------------------+
|[0.02,0.0371,0.04...|    R|[-0.3985897356694...|
|[0.0453,0.0523,0....|    R|[0.70184498705605...|
|[0.0262,0.0582,0....|    R|[-0.1289179854363...|
|[0.01,0.0171,0.06...|    R|[-0.8335441715294...|
|[0.0762,0.0666,0....|    R|[2.04585419386314...|
+--------------------+-----+--------------------+
only showing top 5 rows



In [46]:
# The mllib clustering API takes an RDD of vectors/arrays, not a DataFrame.
# Each RDD element here is a Row whose only field is the 'features' vector,
# so pull that field out and convert it to a 1-D NumPy array (converting the
# Row itself would wrap the vector in an extra dimension).
# NOTE(review): assumes the assembled column holds pyspark vectors exposing
# toArray() -- confirm for the Spark version in use.
# This uses the raw 'features' column, not 'std_features'.
import numpy as np  # imported here because no other cell in the notebook imports numpy

features = sonar.select('features').rdd.map(lambda row: np.asarray(row[0].toArray()))

In [50]:
from pyspark.mllib.clustering import KMeans, KMeansModel, GaussianMixture

In [51]:
# Fit a two-component Gaussian mixture to the feature vectors.
# The fit is randomly initialized; a fixed seed keeps the cluster
# assignments reproducible across Restart & Run All.
gmm = GaussianMixture.train(features, k=2, seed=42)

In [71]:
# Assign every feature vector to a mixture component and bring the
# resulting cluster indices back to the driver as a Python list.
predict = (
    gmm
    .predict(features)
    .collect()
)

In [68]:
# Collect the label column to the driver as a plain list of values,
# unwrapping the single field of each returned Row.
labels = [row[0] for row in sonar.select('label').collect()]

In [73]:
# Pair each predicted cluster index with the corresponding true label.
[(cluster, label) for cluster, label in zip(predict, labels)]

[(1, 'R'),
 (0, 'R'),
 (1, 'R'),
 (1, 'R'),
 (0, 'R'),
 (0, 'R'),
 (1, 'R'),
 (0, 'R'),
 (0, 'R'),
 (0, 'R'),
 (1, 'R'),
 (0, 'R'),
 (0, 'R'),
 (0, 'R'),
 (1, 'R'),
 (0, 'R'),
 (1, 'R'),
 (0, 'R'),
 (1, 'R'),
 (1, 'R'),
 (1, 'R'),
 (1, 'R'),
 (1, 'R'),
 (1, 'R'),
 (1, 'R'),
 (1, 'R'),
 (1, 'R'),
 (0, 'R'),
 (0, 'R'),
 (1, 'R'),
 (0, 'R'),
 (1, 'R'),
 (1, 'R'),
 (0, 'R'),
 (1, 'R'),
 (1, 'R'),
 (1, 'R'),
 (1, 'R'),
 (1, 'R'),
 (1, 'R'),
 (1, 'R'),
 (1, 'R'),
 (1, 'R'),
 (1, 'R'),
 (1, 'R'),
 (1, 'R'),
 (1, 'R'),
 (1, 'R'),
 (0, 'R'),
 (0, 'R'),
 (1, 'R'),
 (1, 'R'),
 (0, 'R'),
 (0, 'R'),
 (1, 'R'),
 (0, 'R'),
 (0, 'R'),
 (1, 'R'),
 (0, 'R'),
 (0, 'R'),
 (0, 'R'),
 (0, 'R'),
 (0, 'R'),
 (0, 'R'),
 (1, 'R'),
 (1, 'R'),
 (1, 'R'),
 (1, 'R'),
 (1, 'R'),
 (1, 'R'),
 (1, 'R'),
 (1, 'R'),
 (1, 'R'),
 (0, 'R'),
 (1, 'R'),
 (1, 'R'),
 (1, 'R'),
 (1, 'R'),
 (0, 'R'),
 (0, 'R'),
 (0, 'R'),
 (0, 'R'),
 (0, 'R'),
 (1, 'R'),
 (1, 'R'),
 (0, 'R'),
 (0, 'R'),
 (0, 'R'),
 (1, 'R'),
 (0, 'R'),
 (1, 'R'),

In [64]:
from pyspark.ml.feature import StringIndexer

In [66]:
# Encode the string 'label' column as a numeric index in 'label_idx'.
indexer = StringIndexer(
    outputCol="label_idx",
    inputCol="label",
)
indexed = (
    indexer
    .fit(sonar)
    .transform(sonar)
)
# Show the default preview of 20 rows.
indexed.show(20)

+--------------------+-----+---------+
|            features|label|label_idx|
+--------------------+-----+---------+
|[0.02,0.0371,0.04...|    R|      1.0|
|[0.0453,0.0523,0....|    R|      1.0|
|[0.0262,0.0582,0....|    R|      1.0|
|[0.01,0.0171,0.06...|    R|      1.0|
|[0.0762,0.0666,0....|    R|      1.0|
|[0.0286,0.0453,0....|    R|      1.0|
|[0.0317,0.0956,0....|    R|      1.0|
|[0.0519,0.0548,0....|    R|      1.0|
|[0.0223,0.0375,0....|    R|      1.0|
|[0.0164,0.0173,0....|    R|      1.0|
|[0.0039,0.0063,0....|    R|      1.0|
|[0.0123,0.0309,0....|    R|      1.0|
|[0.0079,0.0086,0....|    R|      1.0|
|[0.009,0.0062,0.0...|    R|      1.0|
|[0.0124,0.0433,0....|    R|      1.0|
|[0.0298,0.0615,0....|    R|      1.0|
|[0.0352,0.0116,0....|    R|      1.0|
|[0.0192,0.0607,0....|    R|      1.0|
|[0.027,0.0092,0.0...|    R|      1.0|
|[0.0126,0.0149,0....|    R|      1.0|
+--------------------+-----+---------+
only showing top 20 rows

