In [None]:
import pyspark
# Reuse the active SparkContext when this cell is re-run in a live kernel;
# calling the SparkContext constructor a second time raises
# "Cannot run multiple SparkContexts at once".
# 'local[*]' is the master URL passed to Spark, as in the original call.
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setMaster('local[*]'))

In [None]:
!rm -rf metastore_db/*.lck
from pyspark.sql import SQLContext
sqlc = SQLContext(sc)

## Decision Trees

### Classification

In [None]:
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from pyspark.mllib.util import MLUtils

!wget https://raw.githubusercontent.com/apache/spark/master/data/mllib/sample_libsvm_data.txt

In [None]:
# Load the downloaded LIBSVM file as an RDD; each record exposes `.label`
# and `.features` (used below).
data = MLUtils.loadLibSVMFile(sc, 'sample_libsvm_data.txt')

# Random ~70/30 train/test split. The fixed seed makes the split -- and every
# metric computed from it further down -- reproducible across kernel restarts.
trainingData, testData = data.randomSplit([0.7, 0.3], seed=42)

# Peek at a single training record to show the data layout.
print(trainingData.take(1))

# Separate the held-out set into true labels and feature vectors; both are
# derived from `testData` with a plain map, so they stay element-aligned.
labels = testData.map(lambda x: x.label)
features = testData.map(lambda x: x.features)

In [None]:
# Fit a two-class decision tree on the training split.
model = DecisionTree.trainClassifier(
    data=trainingData,
    numClasses=2,
    categoricalFeaturesInfo=dict(),  # empty mapping: no feature is declared categorical
    impurity='gini',
    maxDepth=5,
    maxBins=32,
)

# Dump the learned tree structure as text.
print(model.toDebugString())

In [None]:
# Score the held-out feature vectors with the trained model.
predictions = model.predict(features)

# Pair each true label with its prediction. NOTE(review): RDD.zip requires
# both RDDs to have matching partitions and per-partition element counts;
# this relies on `labels` and `features` both being plain maps of `testData`.
labelsAndPredictions = labels.zip(predictions)

# collect() pulls every (label, prediction) pair to the driver for printing.
print(labelsAndPredictions.collect())

In [None]:
# Test error = share of held-out examples whose prediction differs from the label.
misclassified = labelsAndPredictions.filter(lambda pair: pair[0] != pair[1])
testErr = misclassified.count() / float(testData.count())

print('Test Error = ' + str(testErr))

In [None]:
!rm -rf myDecisionTreeClassificationModel.parquet
model.save(sc, "myDecisionTreeClassificationModel.parquet")
sameModel = DecisionTreeModel.load(sc, "myDecisionTreeClassificationModel.parquet")

In [None]:
# List the files under the saved model's data/ directory.
!ls -l myDecisionTreeClassificationModel.parquet/data/

In [None]:
# Print the metadata file written next to the saved model.
!cat myDecisionTreeClassificationModel.parquet/metadata/part-00000

### Regression

In [None]:
# Fit a regression tree on the same training split.
# Note: this rebinds `model`, replacing the classifier trained earlier.
model = DecisionTree.trainRegressor(
    data=trainingData,
    categoricalFeaturesInfo=dict(),  # empty mapping: no feature is declared categorical
    impurity='variance',
    maxDepth=5,
    maxBins=32,
)

# Dump the learned tree structure as text.
print(model.toDebugString())

In [None]:
def squared_error(pair):
    """Return the squared difference between a (label, prediction) pair."""
    return (pair[0] - pair[1])**2


# Score the held-out features with the regression tree and pair each
# true label with its prediction.
predictions = model.predict(features)
labelsAndPredictions = labels.zip(predictions)

# Mean squared error: sum of squared residuals over the test-set size.
squaredErrors = labelsAndPredictions.map(squared_error)
testMSE = squaredErrors.sum() / float(testData.count())

print('Test Mean Squared Error = ' + str(testMSE))

In [None]:
# Shut down the SparkContext created at the top of the notebook.
sc.stop()