In [None]:
import pyspark

# Reuse the active SparkContext when one already exists, so re-executing this
# cell does not fail (only one SparkContext may be active per process).
# Otherwise start a new context with master 'local[*]'.
# Note: when a context is reused, the configuration passed here is ignored.
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setMaster('local[*]'))

In [None]:
from pyspark.sql import SQLContext

# Build a SQLContext on top of the SparkContext created in the first cell.
# NOTE(review): `sqlc` is not referenced by any later cell visible here --
# confirm whether it is still required.
sqlc = SQLContext(sparkContext=sc)

## Gradient Boosted Trees

### Classification

In [None]:
from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel
from pyspark.mllib.util import MLUtils

# Load the sample file through MLlib's LIBSVM reader; the path is relative to
# the notebook's working directory.
data = MLUtils.loadLibSVMFile(sc, "sample_libsvm_data.txt")

# Split with weights 0.7 / 0.3 into training and test sets. The seed is fixed
# so that the split -- and therefore the trained models and the metrics
# reported below -- is the same on every Restart & Run All.
trainingData, testData = data.randomSplit([0.7, 0.3], seed=42)

# Separate the test rows into two RDDs derived from the same parent: the
# labels (ground truth) and the feature vectors (model input). The evaluation
# cells zip the labels with predictions made from the features.
labels = testData.map(lambda x: x.label)
features = testData.map(lambda x: x.features)

In [None]:
# Train a gradient-boosted trees classifier on the training split.
# An empty categoricalFeaturesInfo map declares no categorical features;
# numIterations is set to 3.
model = GradientBoostedTrees.trainClassifier(
    trainingData,
    categoricalFeaturesInfo={},
    numIterations=3,
)

# Print the model's debug-string representation.
print(model.toDebugString())

In [None]:
# Predict on the test features, then pair every true label with its prediction.
predictions = model.predict(features)
labelsAndPredictions = labels.zip(predictions)

# Test error = (number of pairs where label != prediction) / (number of test rows).
testErr = (
    labelsAndPredictions.filter(lambda pair: pair[0] != pair[1]).count()
    / float(testData.count())
)

print('Test Error = ' + str(testErr))

### Regression

In [None]:
from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel

# Train a gradient-boosted trees regressor on the same training split, with
# no categorical features declared and numIterations set to 3.
# Note: this rebinds `model`, replacing the classifier trained above.
model = GradientBoostedTrees.trainRegressor(
    trainingData,
    categoricalFeaturesInfo={},
    numIterations=3,
)

# Print the model's debug-string representation.
print(model.toDebugString())

In [None]:
# Predict on the test features with the regressor and pair each true label
# with its prediction.
predictions = model.predict(features)
labelsAndPredictions = labels.zip(predictions)

# Mean squared error = sum of squared (label - prediction) / number of test rows.
testMSE = (
    labelsAndPredictions.map(lambda pair: (pair[0] - pair[1])**2).sum()
    / float(testData.count())
)

print('Test Mean Squared Error = ' + str(testMSE))

In [None]:
# Shut down the SparkContext created in the first cell now that the notebook is done.
sc.stop()