In [1]:
# Echo the SparkContext to confirm it is live.
# NOTE(review): `sc` is not created anywhere in this notebook; presumably it is
# injected by the PySpark kernel/shell — confirm for your environment.
sc

<pyspark.context.SparkContext at 0x7ffedec83790>

In [3]:
# Remove any *.lck files under metastore_db/ before creating the SQL context.
# NOTE(review): presumably these are stale lock files left behind by an earlier
# session — confirm.
!rm -rf metastore_db/*.lck

from pyspark.sql import SQLContext
# NOTE(review): `sqlc` is not referenced again in the visible cells; creating it
# may still be what makes `RDD.toDF()` available in the next cell — confirm
# before removing.
sqlc = SQLContext(sc)

In [4]:
from pyspark.mllib.util import MLUtils

# Read the LibSVM-formatted sample file, turn the resulting RDD into a
# DataFrame, and convert its vector columns to the pyspark.ml vector type
# that the pipeline stages below consume.
data = MLUtils.convertVectorColumnsToML(
    MLUtils.loadLibSVMFile(sc, "sample_libsvm_data.txt").toDF()
)

In [5]:
# Preview the first five rows of the loaded DataFrame.
data.show(5)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|(692,[127,128,129...|  0.0|
|(692,[158,159,160...|  1.0|
|(692,[124,125,126...|  1.0|
|(692,[152,153,154...|  1.0|
|(692,[151,152,153...|  1.0|
+--------------------+-----+
only showing top 5 rows



## Gradient Boosted Trees

### Classification

In [11]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, IndexToString, VectorIndexer
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.classification import GBTClassificationModel

# Index the raw "label" column into "indexedLabel".
# Fitted on the full `data`, i.e. before the train/test split below.
labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(data)

# Maps the numeric "prediction" column back to the original label values
# using the labels learned by labelIndexer.
labelConverter = IndexToString(
    inputCol="prediction",
    outputCol="predictedLabel",
    labels=labelIndexer.labels,
)

# Index the feature vectors into "indexedFeatures" with maxCategories=4.
# Also fitted on the full `data`.
featureIndexer = VectorIndexer(
    inputCol="features",
    outputCol="indexedFeatures",
    maxCategories=4,
).fit(data)

# Gradient-boosted tree classifier trained on the indexed columns.
gbtC = GBTClassifier(
    labelCol="indexedLabel",
    featuresCol="indexedFeatures",
    maxIter=10,
)

# Stage order: index label -> index features -> classify -> decode prediction.
pipelineGBTC = Pipeline(stages=[labelIndexer, featureIndexer, gbtC, labelConverter])

# Fixed seed so the 70/30 split selects the same rows on every
# Restart & Run All (given the same input data and partitioning); without
# it each run trains and evaluates on a different random subset.
SPLIT_SEED = 42
trainingData, testData = data.randomSplit([0.7, 0.3], seed=SPLIT_SEED)

In [12]:
# Fit every stage of the classification pipeline on the training split.
modelGBTC = pipelineGBTC.fit(trainingData)

# Apply the fitted pipeline to the test split.
predictionsGBTC = modelGBTC.transform(testData)

# Preview the decoded prediction next to the original label and features.
predictionsGBTC.select("predictedLabel", "label", "features").show(3)

+--------------+-----+--------------------+
|predictedLabel|label|            features|
+--------------+-----+--------------------+
|           0.0|  0.0|(692,[122,123,148...|
|           1.0|  1.0|(692,[123,124,125...|
|           0.0|  0.0|(692,[124,125,126...|
+--------------+-----+--------------------+
only showing top 3 rows



In [13]:
# The pipeline was built as [labelIndexer, featureIndexer, gbtC, labelConverter],
# so index 2 of the fitted model's stages is the trained GBT classifier.
gbtModelC = modelGBTC.stages[2]

# Call-style print with a single argument prints the same text on Python 2
# and Python 3; the bare `print x` statement is a SyntaxError on Python 3.
print(gbtModelC.toDebugString)

GBTClassificationModel (uid=GBTClassifier_452db09d245c6ad64f92) with 10 trees
  Tree 0 (weight 1.0):
    If (feature 406 <= 72.0)
     If (feature 99 in {2.0})
      Predict: -1.0
     Else (feature 99 not in {2.0})
      Predict: 1.0
    Else (feature 406 > 72.0)
     Predict: -1.0
  Tree 1 (weight 0.1):
    If (feature 406 <= 72.0)
     If (feature 435 <= 0.0)
      If (feature 577 <= 231.0)
       If (feature 123 <= 66.0)
        If (feature 153 <= 3.0)
         Predict: 0.4768116880884702
        Else (feature 153 > 3.0)
         Predict: 0.4768116880884703
       Else (feature 123 > 66.0)
        Predict: 0.4768116880884703
      Else (feature 577 > 231.0)
       Predict: 0.47681168808847035
     Else (feature 435 > 0.0)
      Predict: -0.4768116880884694
    Else (feature 406 > 72.0)
     If (feature 207 <= 140.0)
      Predict: -0.47681168808847024
     Else (feature 207 > 140.0)
      Predict: -0.4768116880884712
  Tree 2 (weight 0.1):
    If (feature 406 <= 72.0)
     If (feat

### Regression

In [15]:
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.regression import GBTRegressionModel

# Gradient-boosted tree regressor trained directly on the raw "label" column
# (no label-indexing stage in this pipeline).
gbtR = GBTRegressor(
    labelCol="label",
    featuresCol="indexedFeatures",
    maxIter=10,
)

# Reuses the featureIndexer already fitted in the classification section.
pipelineGBTR = Pipeline(stages=[featureIndexer, gbtR])

# Train on the same training split used for the classifier.
modelGBTR = pipelineGBTR.fit(trainingData)

In [16]:
# Score the test split with the fitted regression pipeline and preview five rows.
predictionsGBTR = modelGBTR.transform(testData)
predictionsGBTR.show(5)

+--------------------+-----+--------------------+----------+
|            features|label|     indexedFeatures|prediction|
+--------------------+-----+--------------------+----------+
|(692,[122,123,148...|  0.0|(692,[122,123,148...|       0.0|
|(692,[123,124,125...|  1.0|(692,[123,124,125...|       1.0|
|(692,[124,125,126...|  0.0|(692,[124,125,126...|       0.0|
|(692,[124,125,126...|  0.0|(692,[124,125,126...|       0.0|
|(692,[124,125,126...|  1.0|(692,[124,125,126...|       1.0|
+--------------------+-----+--------------------+----------+
only showing top 5 rows

