In [3]:
// ML Demo 1: logistic regression

import org.apache.spark.sql
import org.apache.spark.sql._
import org.apache.spark.sql.types._
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.feature.{VectorAssembler, StringIndexer}
import org.apache.spark.ml.evaluation.RegressionEvaluator
/**
 * Converts one raw text field to a Double, flagging missing values.
 *
 * A field counts as missing when, after trimming surrounding whitespace, it is
 * empty or is exactly a single dot ("."). Missing fields map to the sentinel -1
 * so that rows containing them can be filtered out afterwards. Every other field
 * is parsed with `toDouble`, so decimal values such as "3.5" are kept instead of
 * being mistaken for the missing-value marker.
 *
 * @param s raw field text
 * @return the parsed value, or -1 when the field is missing
 * @throws NumberFormatException if the field is neither missing nor numeric
 */
def dot(s: String): Double = {
  val field = s.trim
  if (field.isEmpty || field == ".") -1.0 else field.toDouble
}
// To create DataFrame
// Step 1. create an RDD of Rows
val readingsRDD = sc.textFile("breastcancer.txt")
// A negative limit makes split keep trailing empty fields. The default drops them,
// so a line ending in blank values would come back with fewer than 14 columns and
// fail on s(13) instead of having those blanks flagged as missing by dot().
val RDD = readingsRDD.map(_.split(",", -1))
// Convert fields 0..13 of every line to Double (missing values become -1).
val rowRDD = RDD.map(s => Row.fromSeq((0 to 13).map(i => dot(s(i)))))

// Step 2. create a schema
// Every column is a nullable Double; only the names differ, so the fields are
// generated from the ordered list of column names.
val schema = StructType(
  List("STR", "OBS", "AGMT", "FNDX", "HIGD", "DEG", "CHK",
       "AGP1", "AGMN", "NLV", "LIV", "WT", "AGLP", "MST").map(name => StructField(name, DoubleType, true)))

// Step 3. Apply schema to RDD of Rows
val readingsDF = spark.createDataFrame(rowRDD, schema)
// readingsDF.show(10)
/**
 *  Create a new dataframe dropping all rows that have a missing value.
 *  dot() encodes a missing value as -1, so a row is kept only when every
 *  column is greater than -1.
 */

//var cleanDF = readingsDF.filter(readingsDF("NLV") > -1)
// The predicate is derived from the DataFrame's own columns, so it cannot drift
// out of sync with the schema the way a hand-written list of clauses can.
val noMissingValues = readingsDF.columns.map(c => readingsDF(c) > -1).reduce(_ && _)
val cleanDF = readingsDF.filter(noMissingValues)

//cleanDF.show(10)

/**
 *  The LR algorithm needs a "features" column (a Vector of doubles) and a
 *  "label" column, neither of which exists under the schema's column names.
 *  VectorAssembler packs the columns listed in featureCols into "features".
 *  FNDX is deliberately absent from the list: the label is derived from it.
 */
val featureCols = Array(
  "STR", "OBS", "AGMT", "HIGD", "DEG", "CHK", "AGP1",
  "AGMN", "NLV", "LIV", "WT", "AGLP", "MST")
val assembler = {
  val vectorAssembler = new VectorAssembler()
  vectorAssembler.setInputCols(featureCols)
  vectorAssembler.setOutputCol("features")
}
val df2 = assembler.transform(cleanDF)


// df2.show(10)

// Index the FNDX values into the "label" column.
val labelIndexer = new StringIndexer()
  .setInputCol("FNDX")
  .setOutputCol("label")
val df3 = {
  val indexerModel = labelIndexer.fit(df2)
  indexerModel.transform(df2)
}

// Roughly 70% of the rows go to training and 30% to testing. No seed is passed,
// so the split differs from run to run.
val Array(dfTrain, dfTest) = df3.randomSplit(Array(0.7, 0.3))
/**
 *   Fit a logistic regression on the training split, score the test split,
 *   and report the area under the ROC curve.
 */
val lrModel = {
  val lr = new LogisticRegression()
  lr.fit(dfTrain)
}
val predictions = lrModel.transform(dfTest)

// Print the feature vector next to the true and predicted label.
predictions.select("features", "label", "prediction").show()

val lrEvaluator = {
  val rocEvaluator = new BinaryClassificationEvaluator()
  rocEvaluator.setRawPredictionCol("rawPrediction")
}
val p = lrEvaluator.evaluate(predictions)
println(s"Area under ROC: $p")


org.apache.hadoop.mapred.InvalidInputException:  Input path does not exist: file:/home/jovyan/work/nbs/breastcancer.txt

In [16]:
// ML Demo 2: Decision Tree
// with pipeline
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.DecisionTreeClassificationModel
import org.apache.spark.ml.classification.DecisionTreeClassifier
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.{IndexToString, StringIndexer, VectorIndexer}

// Read the LIBSVM-formatted sample file into a DataFrame.
val data = spark.read
  .format("libsvm")
  .load("file:///home/dr_wang1982/infs3208/data/mllib/sample_libsvm_data.txt")
//data.collect.foreach(println)

// Both indexers are fitted on the complete dataset, before it is split,
// so that every label value is included in the index.
val labelIndexer = {
  val indexer = new StringIndexer()
  indexer.setInputCol("label")
  indexer.setOutputCol("indexedLabel")
  indexer.fit(data)
}
// Features with more than 4 distinct values are treated as continuous;
// the remaining ones are indexed as categorical.
val featureIndexer = {
  val indexer = new VectorIndexer()
  indexer.setInputCol("features")
  indexer.setOutputCol("indexedFeatures")
  indexer.setMaxCategories(4)
  indexer.fit(data)
}

// Hold out roughly 30% of the rows for testing; the rest is used for training.
val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3))
//trainingData.collect.foreach(println)

// Decision tree that learns from the indexed label and indexed feature columns.
val dt = {
  val classifier = new DecisionTreeClassifier()
  classifier.setLabelCol("indexedLabel")
  classifier.setFeaturesCol("indexedFeatures")
}

// Maps the predicted index back to the original label value.
val labelConverter = {
  val converter = new IndexToString()
  converter.setInputCol("prediction")
  converter.setOutputCol("predictedLabel")
  converter.setLabels(labelIndexer.labels)
}

// Stage order: label indexer, feature indexer, tree, label converter.
val pipeline = {
  val chain = new Pipeline()
  chain.setStages(Array(labelIndexer, featureIndexer, dt, labelConverter))
}

// Fit the whole pipeline on the training split; this also runs the indexers.
val model = pipeline.fit(trainingData)

// Score the held-out rows.
val predictions = model.transform(testData)

// Display the first 5 predictions next to the true label and the features.
predictions.select("predictedLabel", "label", "features").show(5)

// Accuracy compares "prediction" with "indexedLabel"; the test error is its complement.
val evaluator = {
  val accuracyEvaluator = new MulticlassClassificationEvaluator()
  accuracyEvaluator.setLabelCol("indexedLabel")
  accuracyEvaluator.setPredictionCol("prediction")
  accuracyEvaluator.setMetricName("accuracy")
}
val accuracy = evaluator.evaluate(predictions)
println(s"Test Error = ${(1.0 - accuracy)}")

// Stage index 2 is the decision tree (third entry of the pipeline's stage array).
val treeModel = model.stages(2).asInstanceOf[DecisionTreeClassificationModel]
println(s"Learned classification tree model:\n ${treeModel.toDebugString}")

strIdx_5ae239708d0f
+--------------+-----+--------------------+
|predictedLabel|label|            features|
+--------------+-----+--------------------+
|           0.0|  0.0|(692,[98,99,100,1...|
|           0.0|  0.0|(692,[121,122,123...|
|           0.0|  0.0|(692,[122,123,148...|
|           0.0|  0.0|(692,[124,125,126...|
|           1.0|  0.0|(692,[125,126,127...|
+--------------+-----+--------------------+
only showing top 5 rows

Test Error = 0.08333333333333337
Learned classification tree model:
 DecisionTreeClassificationModel (uid=dtc_56e670285563) of depth 2 with 5 nodes
  If (feature 405 <= 21.0)
   If (feature 99 in {2.0})
    Predict: 0.0
   Else (feature 99 not in {2.0})
    Predict: 1.0
  Else (feature 405 > 21.0)
   Predict: 0.0



data = [label: double, features: vector]
labelIndexer = strIdx_5ae239708d0f
featureIndexer = vecIdx_db66e2ab4645
trainingData = [label: double, features: vector]
testData = [label: double, features: vector]


dt: org.apache.spark.m...


[label: double, features: vector]

In [5]:
// ML Demo 3: Clustering
import org.apache.spark.ml.clustering.KMeans
import org.apache.spark.ml.evaluation.ClusteringEvaluator

// Read the LIBSVM-formatted sample data.
val dataset = spark.read
  .format("libsvm")
  .load("mllib/sample_kmeans_data.txt")

// Fit k-means with 3 clusters and a fixed seed.
val kmeans = {
  val estimator = new KMeans()
  estimator.setK(3)
  estimator.setSeed(1L)
}
val model = kmeans.fit(dataset)

// Assign every row to a cluster.
val predictions = model.transform(dataset)

// Score the clustering with the evaluator's default settings (Silhouette).
val evaluator = new ClusteringEvaluator()

val silhouette = evaluator.evaluate(predictions)
println(s"Silhouette with squared euclidean distance = $silhouette")

// Print one line per cluster center.
println("Cluster Centers: ")
for (center <- model.clusterCenters) println(center)

Silhouette with squared euclidean distance = 0.6248737134600261
Cluster Centers: 
[9.1,9.1,9.1]
[0.05,0.05,0.05]
[0.2,0.2,0.2]


import org.apache.spark.ml.clustering.KMeans
import org.apache.spark.ml.evaluation.ClusteringEvaluator
dataset: org.apache.spark.sql.DataFrame = [label: double, features: vector]
kmeans: org.apache.spark.ml.clustering.KMeans = kmeans_6942676c1f22
model: org.apache.spark.ml.clustering.KMeansModel = KMeansModel: uid=kmeans_6942676c1f22, k=3, distanceMeasure=euclidean, numFeatures=3
predictions: org.apache.spark.sql.DataFrame = [label: double, features: vector ... 1 more field]
evaluator: org.apache.spark.ml.evaluation.ClusteringEvaluator = ClusteringEvaluator: uid=cluEval_606b0f145708, metricName=silhouette, distanceMeasure=squaredEuclidean
silhouette: Double = 0.6248737134600261


In [None]:
# ML Demo 4: Collaborative Filtering (Movie Ratings)
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row

# Reuse the active Spark session when one already exists (e.g. the cell is re-run):
# constructing a second SparkContext in the same process raises ValueError.
# A new local session is created only when none is running.
spark = SparkSession.builder.master('local').getOrCreate()
sc = spark.sparkContext

# Each input line is split on "::" into userId, movieId, rating and timestamp.
lines = spark.read.text("mllib/als/sample_movielens_ratings.txt").rdd
parts = lines.map(lambda row: row.value.split("::"))
# int() replaces long(): Python 3 has no `long` builtin (its int is unbounded),
# and int() works on Python 2 as well.
ratingsRDD = parts.map(lambda p: Row(userId=int(p[0]), movieId=int(p[1]),
                                     rating=float(p[2]), timestamp=int(p[3])))
ratings = spark.createDataFrame(ratingsRDD)
# Random split: roughly 80% for training, 20% for testing.
(training, test) = ratings.randomSplit([0.8, 0.2])

# Fit an ALS recommender on the training split.
# coldStartStrategy="drop" keeps NaN out of the evaluation metric.
als_settings = dict(
    maxIter=5,
    regParam=0.01,
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    coldStartStrategy="drop",
)
als = ALS(**als_settings)
model = als.fit(training)

# Score the test split, then measure RMSE between "rating" and "prediction".
predictions = model.transform(test)
evaluator = RegressionEvaluator(
    predictionCol="prediction",
    labelCol="rating",
    metricName="rmse",
)
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

# Top 10 movies for every user, and top 10 users for every movie.
userRecs = model.recommendForAllUsers(10)
movieRecs = model.recommendForAllItems(10)

# Top 10 movies for three distinct users taken from the ratings.
user_column = als.getUserCol()
users = ratings.select(user_column).distinct().limit(3)
userSubsetRecs = model.recommendForUserSubset(users, 10)

# Top 10 users for three distinct movies taken from the ratings.
item_column = als.getItemCol()
movies = ratings.select(item_column).distinct().limit(3)
movieSubSetRecs = model.recommendForItemSubset(movies, 10)

In [8]:
// ML Demo 5: Frequent Pattern Mining (Basket Analysis)
import org.apache.spark.ml.fpm.FPGrowth

// Five baskets, each written as a space-separated list of items and split
// into an array stored in the "items" column.
val dataset = spark.createDataset(Seq(
  "Milk Bread",
  "Bread Diaper Beer Eggs",
  "Milk Diaper Beer Coke",
  "Bread Milk Diaper Beer",
  "Bread Milk Diaper Coke"
)).map(basket => basket.split(" ")).toDF("items")

// A single basket holding only Milk, in the same "items" layout.
val testDataset = spark.createDataset(Seq("Milk")).map(basket => basket.split(" ")).toDF("items")

// FP-Growth over the "items" column, with minimum support and minimum
// confidence both set to 0.6.
val fpgrowth = {
  val miner = new FPGrowth()
  miner.setItemsCol("items")
  miner.setMinSupport(0.6)
  miner.setMinConfidence(0.6)
}
val model = fpgrowth.fit(dataset)

// Frequent itemsets found in the training baskets.
model.freqItemsets.show()
// Association rules generated from those itemsets.
model.associationRules.show()
// transform checks the input items against all association rules and
// summarizes the consequents as the prediction.
model.transform(testDataset).show()

+---------------+----+
|          items|freq|
+---------------+----+
|        [Bread]|   4|
|       [Diaper]|   4|
|[Diaper, Bread]|   3|
|         [Milk]|   4|
| [Milk, Diaper]|   3|
|  [Milk, Bread]|   3|
|         [Beer]|   3|
| [Beer, Diaper]|   3|
+---------------+----+

+----------+----------+----------+------+
|antecedent|consequent|confidence|  lift|
+----------+----------+----------+------+
|    [Beer]|  [Diaper]|       1.0|  1.25|
|  [Diaper]|   [Bread]|      0.75|0.9375|
|  [Diaper]|    [Milk]|      0.75|0.9375|
|  [Diaper]|    [Beer]|      0.75|  1.25|
|   [Bread]|  [Diaper]|      0.75|0.9375|
|   [Bread]|    [Milk]|      0.75|0.9375|
|    [Milk]|  [Diaper]|      0.75|0.9375|
|    [Milk]|   [Bread]|      0.75|0.9375|
+----------+----------+----------+------+

+------+---------------+
| items|     prediction|
+------+---------------+
|[Milk]|[Diaper, Bread]|
+------+---------------+



import org.apache.spark.ml.fpm.FPGrowth
dataset: org.apache.spark.sql.DataFrame = [items: array<string>]
testDataset: org.apache.spark.sql.DataFrame = [items: array<string>]
fpgrowth: org.apache.spark.ml.fpm.FPGrowth = fpgrowth_47e7d3095f88
model: org.apache.spark.ml.fpm.FPGrowthModel = FPGrowthModel: uid=fpgrowth_47e7d3095f88, numTrainingRecords=5
