In [1]:
# Display the SparkContext as a sanity check. `sc` is not created in any visible
# cell -- presumably the PySpark kernel/shell injects it; verify before running
# this notebook on a plain Python kernel.
sc

<pyspark.context.SparkContext at 0x7f5016d0a790>

In [2]:
from pyspark.sql import SQLContext
# Wrap the existing SparkContext in a SQLContext.
# NOTE(review): none of the visible cells use `sqlc` -- confirm it is needed.
sqlc = SQLContext(sc)

## Gradient Boosted Trees

### Classification

In [6]:
from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel
from pyspark.mllib.util import MLUtils

# Load the LIBSVM-formatted sample file into an RDD; the records expose
# `.label` and `.features`, which are used below.
data = MLUtils.loadLibSVMFile(sc, "sample_libsvm_data.txt")

# 70/30 train/test split. The seed is fixed so that the split -- and therefore
# the trained trees and the reported error metrics -- are reproducible across
# kernel restarts.
trainingData, testData = data.randomSplit([0.7, 0.3], seed=42)

# Keep the held-out labels and feature vectors as separate RDDs so the
# features can be scored by the model and the labels compared afterwards.
labels = testData.map(lambda x: x.label)
features = testData.map(lambda x: x.features)

In [8]:
# Fit a gradient-boosted tree classifier on the training split using three
# boosting iterations.
model = GradientBoostedTrees.trainClassifier(
    trainingData,
    categoricalFeaturesInfo={},  # empty mapping: no feature is declared categorical
    numIterations=3,
)

# Print the learned ensemble (one decision tree per boosting iteration).
print(model.toDebugString())

TreeEnsembleModel classifier with 3 trees

  Tree 0:
    If (feature 406 <= 20.0)
     Predict: -1.0
    Else (feature 406 > 20.0)
     Predict: 1.0
  Tree 1:
    If (feature 434 <= 0.0)
     Predict: -0.4768116880884702
    Else (feature 434 > 0.0)
     Predict: 0.47681168808847024
  Tree 2:
    If (feature 434 <= 0.0)
     If (feature 215 <= 201.0)
      Predict: -0.43819358104272055
     Else (feature 215 > 201.0)
      Predict: -0.4381935810427207
    Else (feature 434 > 0.0)
     If (feature 517 <= 159.0)
      If (feature 208 <= 0.0)
       Predict: 0.4381935810427206
      Else (feature 208 > 0.0)
       Predict: 0.43819358104272066
     Else (feature 517 > 159.0)
      Predict: 0.4381935810427207



In [9]:
# Score the held-out feature vectors with the trained classifier.
predictions = model.predict(features)

# Pair each true label with its prediction. `zip` matches elements
# positionally, so this relies on `predictions` keeping the same ordering and
# partitioning as `features`.
labelsAndPredictions = labels.zip(predictions)

# Misclassification rate on the test split. The (label, prediction) pair is
# indexed explicitly rather than unpacked in the lambda signature: tuple
# parameters (`lambda (v, p): ...`) are a SyntaxError on Python 3 (PEP 3113),
# while indexing works on both Python 2 and 3.
testErr = labelsAndPredictions.filter(lambda lp: lp[0] != lp[1]).count() / float(testData.count())

print('Test Error = ' + str(testErr))

Test Error = 0.0833333333333


### Regression

In [10]:
from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel

# Fit a gradient-boosted tree regressor on the same training split, again with
# three boosting iterations. This rebinds `model`, replacing the classifier
# trained earlier in the notebook.
model = GradientBoostedTrees.trainRegressor(
    trainingData,
    categoricalFeaturesInfo={},  # empty mapping: no feature is declared categorical
    numIterations=3,
)

# Print the learned regression ensemble.
print(model.toDebugString())

TreeEnsembleModel regressor with 3 trees

  Tree 0:
    If (feature 406 <= 20.0)
     Predict: 0.0
    Else (feature 406 > 20.0)
     Predict: 1.0
  Tree 1:
    Predict: 0.0
  Tree 2:
    Predict: 0.0



In [11]:
# Score the held-out feature vectors with the trained regressor.
predictions = model.predict(features)

# Pair each true label with its prediction. `zip` matches elements
# positionally, so this relies on `predictions` keeping the same ordering and
# partitioning as `features`.
labelsAndPredictions = labels.zip(predictions)

# Mean squared error on the test split. The (label, prediction) pair is
# indexed explicitly rather than unpacked in the lambda signature: tuple
# parameters (`lambda (v, p): ...`) are a SyntaxError on Python 3 (PEP 3113),
# while indexing works on both Python 2 and 3.
testMSE = labelsAndPredictions.map(lambda lp: (lp[0] - lp[1]) * (lp[0] - lp[1])).sum() / float(testData.count())
print('Test Mean Squared Error = ' + str(testMSE))

Test Mean Squared Error = 0.0833333333333
