In [1]:
import pyspark
# Reuse the active SparkContext if this cell is re-run in a live kernel;
# constructing a second SparkContext directly raises
# "Cannot run multiple SparkContexts at once". A new local context using all
# cores ('local[*]') is created only when none is active.
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setMaster('local[*]'))

In [2]:
# Delete any *.lck files under metastore_db/ before creating the SQL context.
# NOTE(review): presumably these are stale metastore lock files left by a
# previous session that would otherwise block start-up — confirm.
!rm -rf metastore_db/*.lck

from pyspark.sql import SQLContext
# SQL entry point bound to the SparkContext created in the first cell.
sqlc = SQLContext(sc)

In [3]:
from pyspark.mllib.util import MLUtils

!wget https://raw.githubusercontent.com/apache/spark/master/data/mllib/sample_libsvm_data.txt

data = MLUtils.loadLibSVMFile(sc, "sample_libsvm_data.txt").toDF()
data = MLUtils.convertVectorColumnsToML(data)

--2017-10-15 11:13:05--  https://raw.githubusercontent.com/apache/spark/master/data/mllib/sample_libsvm_data.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.112.133
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.112.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 104736 (102K) [text/plain]
Saving to: ‘sample_libsvm_data.txt’


2017-10-15 11:13:06 (995 KB/s) - ‘sample_libsvm_data.txt’ saved [104736/104736]



In [4]:
# Preview the first five rows (columns: features, label).
data.show(5)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|(692,[127,128,129...|  0.0|
|(692,[158,159,160...|  1.0|
|(692,[124,125,126...|  1.0|
|(692,[152,153,154...|  1.0|
|(692,[151,152,153...|  1.0|
+--------------------+-----+
only showing top 5 rows



## Decision Trees

### Classification

In [5]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, IndexToString, VectorIndexer
from pyspark.ml.classification import DecisionTreeClassifier

# Map the raw "label" values to indices in "indexedLabel". The indexer is
# fitted on the full dataset (not just the training split) so every label
# value is present in the index.
labelIndexer = StringIndexer().setInputCol("label").setOutputCol("indexedLabel").fit(data)

# Translate the numeric "prediction" column back to the original label values,
# using the label ordering learned by labelIndexer.
labelConverter = IndexToString().setInputCol("prediction").setOutputCol("predictedLabel").setLabels(labelIndexer.labels)

# Index the feature vector; maxCategories=4 means features with at most four
# distinct values are treated as categorical, the rest as continuous.
featureIndexer = VectorIndexer().setInputCol("features").setOutputCol("indexedFeatures").setMaxCategories(4).fit(data)

dtC = DecisionTreeClassifier().setLabelCol("indexedLabel").setFeaturesCol("indexedFeatures")

# Stage order matters: index label -> index features -> tree -> decode prediction.
pipelineClass = Pipeline().setStages([labelIndexer, featureIndexer, dtC, labelConverter])

# 70/30 train/test split. The fixed seed makes the split (and therefore the
# fitted trees and every prediction table below) reproducible across runs.
trainingData, testData = data.randomSplit([0.7, 0.3], seed=42)

In [6]:
from pyspark.ml.classification import DecisionTreeClassifier, DecisionTreeClassificationModel

# Fit every pipeline stage on the training split.
modelClassifier = pipelineClass.fit(trainingData)

# Score the held-out rows with the fitted pipeline.
predictionsClass = modelClassifier.transform(testData)

# Pick the fitted decision tree out of the pipeline stages by its type.
treeModel = next(
    stage for stage in modelClassifier.stages
    if isinstance(stage, DecisionTreeClassificationModel)
)

In [7]:
# List the fitted stages of the classification pipeline, in execution order.
modelClassifier.stages

[StringIndexer_49f9a8d891d161af66ce,
 VectorIndexer_48d1985dc13633f32bdc,
 DecisionTreeClassificationModel (uid=DecisionTreeClassifier_4a98af7269bad4e2a197) of depth 1 with 3 nodes,
 IndexToString_4a9bae54ba6d74b95602]

In [8]:
# Print the learned tree as text: one If/Else rule per split, with the
# predicted value at each leaf.
print(treeModel.toDebugString)

DecisionTreeClassificationModel (uid=DecisionTreeClassifier_4a98af7269bad4e2a197) of depth 1 with 3 nodes
  If (feature 351 <= 15.0)
   Predict: 1.0
  Else (feature 351 > 15.0)
   Predict: 0.0



In [9]:
# Limit on the Spark side so only five rows are collected to the driver,
# instead of converting the entire prediction set to pandas and then slicing.
predictionsClass.limit(5).toPandas()

Unnamed: 0,features,label,indexedLabel,indexedFeatures,rawPrediction,probability,prediction,predictedLabel
0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0,0.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 23.0]","[0.0, 1.0]",1.0,0.0
1,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,1.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 23.0]","[0.0, 1.0]",1.0,0.0
2,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,1.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 23.0]","[0.0, 1.0]",1.0,0.0
3,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,1.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 23.0]","[0.0, 1.0]",1.0,0.0
4,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0,0.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[40.0, 0.0]","[1.0, 0.0]",0.0,1.0


### Regression

In [10]:
from pyspark.ml.regression import DecisionTreeRegressor, DecisionTreeRegressionModel

# The regressor is trained on the raw numeric "label" column, so this
# pipeline has no label-indexing or label-decoding stages.
dtR = DecisionTreeRegressor(labelCol="label", featuresCol="indexedFeatures")

# Reuse the feature indexer that was already fitted in the classification
# section as the first stage, followed by the regression tree.
pipelineReg = Pipeline(stages=[featureIndexer, dtR])

In [11]:
# Fit the regression pipeline on the same training split as the classifier.
modelRegressor = pipelineReg.fit(trainingData)

# Stage 1 is the fitted regression tree (stage 0 is the feature indexer).
# It gets its own name so the classification tree bound to `treeModel` in the
# classification section is not overwritten; otherwise re-running the earlier
# debug-print cell would silently show the regression tree instead.
treeModelReg = modelRegressor.stages[1]

print(treeModelReg.toDebugString)

DecisionTreeRegressionModel (uid=DecisionTreeRegressor_4315b85b3db4c726b94e) of depth 1 with 3 nodes
  If (feature 351 <= 15.0)
   Predict: 0.0
  Else (feature 351 > 15.0)
   Predict: 1.0



In [12]:
# Score the held-out split with the fitted regression pipeline; the result
# gains an "indexedFeatures" and a "prediction" column.
predictionsReg = modelRegressor.transform(testData)

In [13]:
# Limit on the Spark side so only five rows are collected to the driver,
# instead of converting the entire prediction set to pandas and then slicing.
predictionsReg.limit(5).toPandas()

Unnamed: 0,features,label,indexedFeatures,prediction
0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0
1,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0
2,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0
3,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0
4,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0


In [14]:
# Shut down the SparkContext created in the first cell now that the notebook is done.
sc.stop()