In [1]:
import pyspark
# Reuse the running SparkContext when this cell is executed again: calling the
# SparkContext constructor a second time in the same kernel raises
# "Cannot run multiple SparkContexts at once". On a fresh kernel this creates
# a context with master 'local[*]', exactly as before; if a context already
# exists it is returned as-is and the master setting here is ignored.
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setMaster('local[*]'))

In [2]:
# Delete any *.lck files under metastore_db/ before the SQL context is created.
# NOTE(review): presumably these are stale metastore lock files left behind by
# an earlier session — confirm.
!rm -rf metastore_db/*.lck

from pyspark.sql import SQLContext
# NOTE(review): sqlc is not referenced again in the visible cells; presumably it
# is created so that RDD.toDF() (used when the data is loaded) is available —
# confirm before removing.
sqlc = SQLContext(sc)

In [3]:
from pyspark.mllib.util import MLUtils

# Read the LIBSVM-format sample file with the MLlib loader, turn the result
# into a DataFrame, and convert its vector column(s) to the pyspark.ml vector
# type used by the pipeline stages further down.
data = MLUtils.convertVectorColumnsToML(
    MLUtils.loadLibSVMFile(sc, "sample_libsvm_data.txt").toDF()
)

In [4]:
# Preview the first five rows of the loaded DataFrame.
data.show(n=5)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|(692,[127,128,129...|  0.0|
|(692,[158,159,160...|  1.0|
|(692,[124,125,126...|  1.0|
|(692,[152,153,154...|  1.0|
|(692,[151,152,153...|  1.0|
+--------------------+-----+
only showing top 5 rows



## Random Forests

### Classification

In [5]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, IndexToString, VectorIndexer
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import RandomForestClassificationModel

# One seed shared by the train/test split and the forest, so that
# Restart & Run All reproduces the same rows, trees and predictions.
SEED = 42

# Map the raw "label" values to indices. Fitted on the full dataset (before
# the split below) so that every label value present in the data gets an index.
labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(data)

# Translate the indexed predictions back to the original label values.
labelConverter = IndexToString(
    inputCol="prediction",
    outputCol="predictedLabel",
    labels=labelIndexer.labels,
)

# Features with at most 4 distinct values are treated as categorical and
# re-indexed; all other features are left as continuous. Also fitted on the
# full dataset.
featureIndexer = VectorIndexer(
    inputCol="features",
    outputCol="indexedFeatures",
    maxCategories=4,
).fit(data)

# Three-tree forest trained on the indexed label and indexed feature columns.
rfC = RandomForestClassifier(
    labelCol="indexedLabel",
    featuresCol="indexedFeatures",
    numTrees=3,
    seed=SEED,
)

# 70% / 30% train/test split, seeded for reproducibility.
trainingData, testData = data.randomSplit([0.7, 0.3], seed=SEED)

In [6]:
# Chain label indexing, feature indexing, the forest and the label
# back-conversion into a single estimator.
pipelineRFC = Pipeline(
    stages=[labelIndexer, featureIndexer, rfC, labelConverter]
)

# Fit on the training split only, then score the held-out split.
modelRFC = pipelineRFC.fit(trainingData)
predictionsRFC = modelRFC.transform(testData)

In [7]:
# Compare the predicted label with the true label for the first five test rows.
(
    predictionsRFC
    .select("predictedLabel", "label", "features")
    .show(5)
)

+--------------+-----+--------------------+
|predictedLabel|label|            features|
+--------------+-----+--------------------+
|           1.0|  1.0|(692,[97,98,99,12...|
|           0.0|  0.0|(692,[122,123,148...|
|           1.0|  1.0|(692,[123,124,125...|
|           1.0|  1.0|(692,[123,124,125...|
|           1.0|  1.0|(692,[125,126,127...|
+--------------+-----+--------------------+
only showing top 5 rows



In [8]:
# Pick the fitted forest out of the pipeline model by type rather than by a
# hard-coded position, so this keeps working if stages are added to or
# reordered in the pipeline. Raises StopIteration if no such stage exists.
rfModelC = next(
    stage for stage in modelRFC.stages
    if isinstance(stage, RandomForestClassificationModel)
)

# Per-feature importance estimated by the forest (displayed as the cell output).
rfModelC.featureImportances

SparseVector(692, {99: 0.0311, 356: 0.2317, 461: 0.0423, 462: 0.3023, 538: 0.1016, 566: 0.0294, 568: 0.2616})

In [9]:
# toDebugString is a multi-line description of every tree in the forest;
# print() renders the line breaks, whereas a bare expression would display
# the string's escaped repr.
print(rfModelC.toDebugString)

RandomForestClassificationModel (uid=RandomForestClassifier_4681a8764bc7abe922f5) with 3 trees
  Tree 0 (weight 1.0):
    If (feature 568 <= 0.0)
     If (feature 566 <= 0.0)
      Predict: 0.0
     Else (feature 566 > 0.0)
      Predict: 1.0
    Else (feature 568 > 0.0)
     If (feature 461 <= 0.0)
      Predict: 1.0
     Else (feature 461 > 0.0)
      Predict: 0.0
  Tree 1 (weight 1.0):
    If (feature 462 <= 0.0)
     If (feature 99 in {2.0})
      Predict: 0.0
     Else (feature 99 not in {2.0})
      Predict: 1.0
    Else (feature 462 > 0.0)
     Predict: 0.0
  Tree 2 (weight 1.0):
    If (feature 356 <= 0.0)
     If (feature 538 <= 0.0)
      Predict: 0.0
     Else (feature 538 > 0.0)
      Predict: 1.0
    Else (feature 356 > 0.0)
     Predict: 1.0



### Regression

In [10]:
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.regression import RandomForestRegressionModel

# Regress directly on the numeric "label" column. The seed is fixed so that
# the trained forest, and therefore the predictions shown below, are
# reproducible across Restart & Run All.
rfR = RandomForestRegressor(
    labelCol="label",
    featuresCol="indexedFeatures",
    seed=42,
)

# featureIndexer is the already-fitted VectorIndexer model from the
# classification section; it supplies the "indexedFeatures" column.
pipelineRFR = Pipeline(stages=[featureIndexer, rfR])

# Fit on the training split only, then score the held-out split.
modelRFR = pipelineRFR.fit(trainingData)
predictionsRFR = modelRFR.transform(testData)

In [11]:
# predictionsRFR was already produced by modelRFR.transform(testData) in the
# previous cell, so it is reused here instead of being recomputed.
predictionsRFR.select("prediction", "label", "features").show(5)

+----------+-----+--------------------+
|prediction|label|            features|
+----------+-----+--------------------+
|       1.0|  1.0|(692,[97,98,99,12...|
|       0.0|  0.0|(692,[122,123,148...|
|       1.0|  1.0|(692,[123,124,125...|
|       1.0|  1.0|(692,[123,124,125...|
|       1.0|  1.0|(692,[125,126,127...|
+----------+-----+--------------------+
only showing top 5 rows



In [None]:
# Shut down the SparkContext created at the top of the notebook.
sc.stop()