In [1]:
# Echo the active SparkContext to confirm the kernel is connected to Spark.
# NOTE(review): `sc` is never created in this notebook, so it is presumably
# injected by the PySpark kernel/shell at startup -- confirm.
sc

<pyspark.context.SparkContext at 0x7f7c9758d790>

In [2]:
# Delete any *.lck files under metastore_db/ before creating the SQL context.
# NOTE(review): these look like leftover metastore lock files from an earlier
# session that would otherwise block start-up -- confirm.
!rm -rf metastore_db/*.lck

from pyspark.sql import SQLContext
# Build a SQLContext on top of the existing SparkContext.
# NOTE(review): `sqlc` is not referenced again in the visible cells; it is
# presumably created so that RDD.toDF() is available below -- confirm.
sqlc = SQLContext(sc)

In [3]:
from pyspark.mllib.util import MLUtils

# Load the LIBSVM-format sample file, turn it into a DataFrame, and convert
# its vector columns with MLUtils.convertVectorColumnsToML so the pyspark.ml
# estimators used later can consume the "features" column.
libsvmRows = MLUtils.loadLibSVMFile(sc, "sample_libsvm_data.txt")
data = MLUtils.convertVectorColumnsToML(libsvmRows.toDF())

In [4]:
# Preview the first five rows of the loaded DataFrame (features + label).
data.show(5)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|(692,[127,128,129...|  0.0|
|(692,[158,159,160...|  1.0|
|(692,[124,125,126...|  1.0|
|(692,[152,153,154...|  1.0|
|(692,[151,152,153...|  1.0|
+--------------------+-----+
only showing top 5 rows



## Random Forests

### Classification

In [6]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, IndexToString, VectorIndexer
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import RandomForestClassificationModel

# Index the raw "label" column into "indexedLabel". The indexer is fitted on
# the full dataset (before the split), so every label value present in `data`
# is included in the index.
labelIndexer = (
    StringIndexer()
    .setInputCol("label")
    .setOutputCol("indexedLabel")
    .fit(data)
)

# Map the classifier's numeric "prediction" back to the original label
# strings, using the label ordering learned by labelIndexer.
labelConverter = (
    IndexToString()
    .setInputCol("prediction")
    .setOutputCol("predictedLabel")
    .setLabels(labelIndexer.labels)
)

# Index the feature vectors into "indexedFeatures"; maxCategories=4 is the
# threshold VectorIndexer uses to decide which features are categorical.
featureIndexer = (
    VectorIndexer()
    .setInputCol("features")
    .setOutputCol("indexedFeatures")
    .setMaxCategories(4)
    .fit(data)
)

# Small 3-tree forest trained on the indexed label / indexed feature columns.
rfC = (
    RandomForestClassifier()
    .setLabelCol("indexedLabel")
    .setFeaturesCol("indexedFeatures")
    .setNumTrees(3)
)

# Fixed seed: without it randomSplit draws a fresh random split on every run,
# so the fitted trees and the predictions shown below would not be
# reproducible under Restart & Run All.
SPLIT_SEED = 42
trainingData, testData = data.randomSplit([0.7, 0.3], seed=SPLIT_SEED)

In [7]:
# Stage order matters: index labels and features, train the forest, then map
# the numeric predictions back to label strings.
classificationStages = [labelIndexer, featureIndexer, rfC, labelConverter]
pipelineRFC = Pipeline(stages=classificationStages)

# Fit on the training split only, then score the held-out split.
modelRFC = pipelineRFC.fit(trainingData)
predictionsRFC = modelRFC.transform(testData)

In [8]:
# Show the predicted label next to the true label for the first five test rows.
predictionsRFC.select("predictedLabel", "label", "features").show(5)

+--------------+-----+--------------------+
|predictedLabel|label|            features|
+--------------+-----+--------------------+
|           1.0|  1.0|(692,[97,98,99,12...|
|           0.0|  0.0|(692,[122,123,148...|
|           1.0|  1.0|(692,[123,124,125...|
|           1.0|  1.0|(692,[123,124,125...|
|           0.0|  0.0|(692,[124,125,126...|
+--------------+-----+--------------------+
only showing top 5 rows



In [9]:
# The fitted pipeline exposes its stages in the order they were given to the
# Pipeline, so position 2 is the slot rfC occupied, i.e. the trained forest.
fittedStagesRFC = modelRFC.stages
rfModelC = fittedStagesRFC[2]

# Per-feature importance scores of the trained forest.
rfModelC.featureImportances

SparseVector(692, {183: 0.0326, 455: 0.2952, 463: 0.2636, 490: 0.3333, 517: 0.0371, 540: 0.0381})

In [10]:
# Dump the learned trees as text. The call form of print is used so the cell
# runs on both Python 2 and Python 3 (the bare print statement is a
# SyntaxError on Python 3); with a single argument the output is identical.
print(rfModelC.toDebugString)

RandomForestClassificationModel (uid=rfc_d5d6b33eed79) with 3 trees
  Tree 0 (weight 1.0):
    If (feature 463 <= 0.0)
     If (feature 183 <= 0.0)
      Predict: 0.0
     Else (feature 183 > 0.0)
      If (feature 517 <= 116.0)
       Predict: 1.0
      Else (feature 517 > 116.0)
       Predict: 0.0
    Else (feature 463 > 0.0)
     Predict: 0.0
  Tree 1 (weight 1.0):
    If (feature 490 <= 31.0)
     Predict: 1.0
    Else (feature 490 > 31.0)
     Predict: 0.0
  Tree 2 (weight 1.0):
    If (feature 455 <= 23.0)
     If (feature 540 <= 65.0)
      Predict: 0.0
     Else (feature 540 > 65.0)
      Predict: 1.0
    Else (feature 455 > 23.0)
     Predict: 1.0



### Regression

In [11]:
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.regression import RandomForestRegressionModel

# Regress directly on the numeric "label" column, reusing the feature indexer
# fitted in the classification section.
rfR = (
    RandomForestRegressor()
    .setLabelCol("label")
    .setFeaturesCol("indexedFeatures")
)

regressionStages = [featureIndexer, rfR]
pipelineRFR = Pipeline(stages=regressionStages)

# Train on the same training split and score the same held-out split as the
# classifier above.
modelRFR = pipelineRFR.fit(trainingData)
predictionsRFR = modelRFR.transform(testData)

In [12]:
# predictionsRFR is already computed by modelRFR.transform(testData) in the
# preceding cell; repeating the identical transform here only rebinds the same
# name, so this cell just displays the result.
predictionsRFR.select("prediction", "label", "features").show(5)

+----------+-----+--------------------+
|prediction|label|            features|
+----------+-----+--------------------+
|       0.9|  1.0|(692,[97,98,99,12...|
|      0.05|  0.0|(692,[122,123,148...|
|       1.0|  1.0|(692,[123,124,125...|
|      0.95|  1.0|(692,[123,124,125...|
|       0.0|  0.0|(692,[124,125,126...|
+----------+-----+--------------------+
only showing top 5 rows

