In [1]:
// Load the covtype data file as an RDD of text lines (one line per record).
// `sc` is not defined in this notebook -- presumably the SparkContext supplied by the kernel.
val rawdata = {
    val path = "file:///docker/datasets/covtype.data"
    sc.textFile(path)
}

In [2]:
import org.apache.spark.mllib.linalg._
import org.apache.spark.mllib.regression._

// Parse every comma-separated line into a LabeledPoint:
//   - all columns except the last become the dense feature vector;
//   - the last column, shifted down by one, becomes the label
//     (presumably the raw labels start at 1, so this makes them zero-based).
val data = rawdata.map { line =>
    val columns = line.split(",").map(_.toDouble)
    LabeledPoint(columns.last - 1, Vectors.dense(columns.init))
}

In [3]:
// Randomly split the parsed data with weights 0.8 / 0.1 / 0.1 into the training,
// cross-validation and test sets. Each split is marked for caching as it is bound;
// cache() returns the RDD it was called on, so the three names refer to the cached RDDs.
val Array(trData, cvData, teData) =
    data.randomSplit(Array(0.8, 0.1, 0.1)).map(_.cache())

MapPartitionsRDD[6] at randomSplit at <console>:24

In [None]:
import org.apache.spark.mllib.tree._

// Baseline decision tree trained on the training split: 7 classes, no features
// declared categorical, Gini impurity, depth limit 4, at most 100 bins.
val treeModel = DecisionTree.trainClassifier(
    input = trData,
    numClasses = 7,
    categoricalFeaturesInfo = Map[Int, Int](),
    impurity = "gini",
    maxDepth = 4,
    maxBins = 100)

In [None]:
import org.apache.spark.mllib.evaluation._
import org.apache.spark.mllib.tree._
import org.apache.spark.mllib.tree.model._

// Pair the tree's prediction with the true label for every example of the test split,
// then hand the (prediction, label) pairs to MulticlassMetrics for evaluation.
val predectionsAndLabels = teData.map { example =>
    (treeModel.predict(example.features), example.label)
}
val metric = new MulticlassMetrics(predectionsAndLabels)

In [None]:
// Show the confusion matrix of the decision tree's predictions on the test split.
metric.confusionMatrix

In [None]:
// Overall precision across all classes (the overload that takes no label argument).
// NOTE(review): newer Spark releases deprecate this in favour of `accuracy` --
// confirm against the Spark version this notebook runs on.
metric.precision

In [None]:
// Print one (precision, recall) pair per class.
// Labels are shifted down by one when the data is parsed and the tree is trained with
// 7 classes, so the class ids are 0..6: the range must start at 0, otherwise the first
// class is silently left out of the report.
(0 until 7).map(cat => (metric.precision(cat), metric.recall(cat))).foreach(println)

# Calculate random guessing threshold

In [None]:
import org.apache.spark.rdd.RDD

/**
 * Computes the empirical class distribution of a labelled data set.
 *
 * @param data labelled examples whose `label` field identifies the class
 * @return the fraction of examples belonging to each class, ordered by ascending
 *         class label, so that position i refers to the same class in every data
 *         set that contains the same set of labels. Classes that do not occur in
 *         `data` have no entry; the result is empty for an empty data set.
 */
def classProbabilities(data: RDD[LabeledPoint]): Array[Double] = {
    // One Spark job: number of examples per class label.
    val countsByClass = data.map(_.label).countByValue()
    // Order by class label (not by count) so that zipping the results of two data
    // sets pairs up probabilities of the same class.
    val counts = countsByClass.toArray.sortBy(_._1).map(_._2)
    // Hoisted so the total is computed once rather than once per class.
    val total = counts.sum.toDouble
    counts.map(_ / total)
}

In [None]:
// Expected accuracy of guessing at random: pair up the class probabilities of the
// training and cross-validation splits position by position, multiply each pair,
// and add the products.
(for {
    (trainProb, cvProb) <- classProbabilities(trData) zip classProbabilities(cvData)
} yield trainProb * cvProb).sum

# Grid Search

In [None]:
import org.apache.spark.mllib.evaluation._
import org.apache.spark.mllib.tree._
import org.apache.spark.mllib.tree.model._

// Grid search: train one decision tree per (impurity, depth, bins) combination on the
// training split and score it on the cross-validation split. The result is an array of
// ((impurity, depth, bins), score) pairs, one per combination.
val evaluations = for {
    impurity <- Array("gini", "entropy")
    depth <- Array(1, 20)
    bins <- Array(10, 300)
} yield {
    val candidate = DecisionTree.trainClassifier(
        trData, 7, Map[Int, Int](), impurity, depth, bins)
    // (prediction, true label) for every cross-validation example
    val scored = cvData.map { example =>
        (candidate.predict(example.features), example.label)
    }
    (impurity, depth, bins) -> new MulticlassMetrics(scored).precision
}

In [None]:
// Show the ((impurity, depth, bins), score) pairs produced by the grid search.
evaluations

# Random Forest

In [None]:
// Random forest of 20 trees trained on the training split: 7 classes, automatic
// feature-subset strategy, entropy impurity, depth limit 30, at most 300 bins.
//
// categoricalFeaturesInfo is left empty, as for the decision trees in this notebook.
// trData is built directly from the raw columns of the data file, with no step that
// collapses indicator columns into single categorical features. Declaring feature 10
// as 4-valued and feature 11 as 40-valued, as this call originally did, therefore does
// not describe the data that is actually passed in.
// NOTE(review): if categorical features are wanted, re-encode the data first and train
// (and predict) on the re-encoded vectors, restoring Map(10 -> 4, 11 -> 40) only then.
val forest = RandomForest.trainClassifier(
    trData,
    7,               // number of classes
    Map[Int, Int](), // no categorical features in the raw encoding
    20,              // number of trees
    "auto",          // feature subset strategy
    "entropy",       // impurity
    30,              // maximum depth
    300)             // maximum bins