In [1]:
# Echo the SparkContext so the cell output confirms one is available.
# NOTE(review): `sc` is not created in any cell visible here; presumably the
# PySpark kernel/shell injects it at startup -- confirm for your environment.
sc

<pyspark.context.SparkContext at 0x7fbf710bf790>

In [2]:
from pyspark.sql import SQLContext
# Wrap the existing SparkContext in a SQLContext.
# NOTE(review): `sqlc` is not referenced by any cell visible in this section.
sqlc = SQLContext(sc)

## Random Forests

### Classification

In [6]:
from pyspark.mllib.tree import RandomForest, RandomForestModel
from pyspark.mllib.util import MLUtils

# Load the LibSVM-formatted sample file; each record exposes `.label` and
# `.features`, which are pulled apart below.
data = MLUtils.loadLibSVMFile(sc, 'sample_libsvm_data.txt')

# Hold out roughly 30% of the records for evaluation. The fixed seed makes the
# split -- and therefore every metric reported further down -- reproducible
# across kernel restarts.
trainingData, testData = data.randomSplit([0.7, 0.3], seed=42)

# Keep the true labels and the feature vectors of the test set as two parallel
# RDDs so that predictions can later be zipped back onto the labels.
labels = testData.map(lambda point: point.label)
features = testData.map(lambda point: point.features)

In [7]:
# Train a small random forest for binary classification.
# Random forests are stochastic, so the seed is pinned to make the trees
# printed below reproducible.
# An empty categoricalFeaturesInfo means every feature is treated as continuous.
model = RandomForest.trainClassifier(trainingData,
                                     numClasses=2,
                                     categoricalFeaturesInfo={},
                                     numTrees=3,
                                     featureSubsetStrategy="auto",
                                     impurity='gini',
                                     maxDepth=4,
                                     maxBins=32,
                                     seed=42)

# toDebugString() returns a multi-line description of every tree; print() is
# used so the newlines render instead of showing the escaped repr.
print(model.toDebugString())

TreeEnsembleModel classifier with 3 trees

  Tree 0:
    If (feature 412 <= 0.0)
     If (feature 378 <= 0.0)
      Predict: 0.0
     Else (feature 378 > 0.0)
      Predict: 1.0
    Else (feature 412 > 0.0)
     Predict: 0.0
  Tree 1:
    If (feature 490 <= 31.0)
     If (feature 549 <= 253.0)
      Predict: 0.0
     Else (feature 549 > 253.0)
      Predict: 1.0
    Else (feature 490 > 31.0)
     Predict: 1.0
  Tree 2:
    If (feature 462 <= 0.0)
     If (feature 549 <= 253.0)
      Predict: 0.0
     Else (feature 549 > 253.0)
      Predict: 1.0
    Else (feature 462 > 0.0)
     Predict: 1.0



In [8]:
# Score the held-out feature vectors and pair each prediction with its label.
predictions = model.predict(features)

labelsAndPredictions = labels.zip(predictions)

# Fraction of test records whose predicted class differs from the true label.
# The lambda indexes the (label, prediction) pair instead of using
# tuple-parameter unpacking, which is a SyntaxError on Python 3 (PEP 3113).
testErr = labelsAndPredictions.filter(lambda lp: lp[0] != lp[1]).count() / float(testData.count())
print('Test Error = ' + str(testErr))

Test Error = 0.0


### Regression

In [9]:
# Train a small random forest regressor on the same training split.
# NOTE: this rebinds `model`, replacing the classifier trained earlier; the
# evaluation cell that follows relies on that name.
# The seed is pinned so the trees printed below are reproducible.
model = RandomForest.trainRegressor(trainingData,
                                    categoricalFeaturesInfo={},
                                    numTrees=3,
                                    featureSubsetStrategy="auto",
                                    impurity='variance',
                                    maxDepth=4,
                                    maxBins=32,
                                    seed=42)

# toDebugString() returns a multi-line description of every tree.
print(model.toDebugString())

TreeEnsembleModel regressor with 3 trees

  Tree 0:
    If (feature 406 <= 72.0)
     If (feature 293 <= 253.0)
      Predict: 0.0
     Else (feature 293 > 253.0)
      Predict: 1.0
    Else (feature 406 > 72.0)
     Predict: 1.0
  Tree 1:
    If (feature 540 <= 41.0)
     Predict: 1.0
    Else (feature 540 > 41.0)
     Predict: 0.0
  Tree 2:
    If (feature 540 <= 65.0)
     Predict: 1.0
    Else (feature 540 > 65.0)
     Predict: 0.0



In [11]:
# Score the held-out feature vectors with the regressor and pair each
# prediction with its true label.
predictions = model.predict(features)

labelsAndPredictions = labels.zip(predictions)

# Mean squared error over the test set. The lambda indexes the
# (label, prediction) pair instead of using tuple-parameter unpacking, which
# is a SyntaxError on Python 3 (PEP 3113).
testMSE = labelsAndPredictions.map(lambda lp: (lp[0] - lp[1]) * (lp[0] - lp[1])).sum() / float(testData.count())
print('Test Mean Squared Error = ' + str(testMSE))

Test Mean Squared Error = 0.0292397660819
