In [1]:
# Echo the SparkContext handle. `sc` is not created anywhere in this notebook,
# so it is presumably injected by the PySpark kernel/launcher — TODO confirm.
sc

<pyspark.context.SparkContext at 0x7f8403ee07d0>

In [2]:
# Delete leftover *.lck files under metastore_db/ before creating the SQLContext
# (presumably stale metastore locks from an earlier session — TODO confirm).
!rm -rf metastore_db/*.lck
from pyspark.sql import SQLContext
# SQLContext bound to the existing SparkContext.
sqlc = SQLContext(sc)

## Decision Trees

### Classification

In [3]:
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from pyspark.mllib.util import MLUtils

!wget https://raw.githubusercontent.com/apache/spark/master/data/mllib/sample_libsvm_data.txt

--2017-01-24 11:47:12--  https://raw.githubusercontent.com/apache/spark/master/data/mllib/sample_libsvm_data.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.112.133
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.112.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 104736 (102K) [text/plain]
Saving to: ‘sample_libsvm_data.txt.1’


2017-01-24 11:47:12 (1,02 MB/s) - ‘sample_libsvm_data.txt.1’ saved [104736/104736]



In [4]:
# Load the LIBSVM-formatted sample file into an RDD of LabeledPoint records.
data = MLUtils.loadLibSVMFile(sc, 'sample_libsvm_data.txt')

# 70/30 train/test split.
# NOTE(review): no seed is passed, so the split (and every metric derived from it)
# can differ from run to run.
trainingData, testData = data.randomSplit([0.7, 0.3])

# Peek at one training record. Written as a function call so it matches the
# print(...) style used in the rest of the notebook and parses on Python 2 and 3.
print(trainingData.take(1))

# Keep test labels and test feature vectors as separate RDDs; later cells feed
# `features` to model.predict() and zip the result back onto `labels`.
labels = testData.map(lambda x: x.label)
features = testData.map(lambda x: x.features)

[LabeledPoint(0.0, (692,[127,128,129,130,131,154,155,156,157,158,159,181,182,183,184,185,186,187,188,189,207,208,209,210,211,212,213,214,215,216,217,235,236,237,238,239,240,241,242,243,244,245,262,263,264,265,266,267,268,269,270,271,272,273,289,290,291,292,293,294,295,296,297,300,301,302,316,317,318,319,320,321,328,329,330,343,344,345,346,347,348,349,356,357,358,371,372,373,374,384,385,386,399,400,401,412,413,414,426,427,428,429,440,441,442,454,455,456,457,466,467,468,469,470,482,483,484,493,494,495,496,497,510,511,512,520,521,522,523,538,539,540,547,548,549,550,566,567,568,569,570,571,572,573,574,575,576,577,578,594,595,596,597,598,599,600,601,602,603,604,622,623,624,625,626,627,628,629,630,651,652,653,654,655,656,657],[51.0,159.0,253.0,159.0,50.0,48.0,238.0,252.0,252.0,252.0,237.0,54.0,227.0,253.0,252.0,239.0,233.0,252.0,57.0,6.0,10.0,60.0,224.0,252.0,253.0,252.0,202.0,84.0,252.0,253.0,122.0,163.0,252.0,252.0,252.0,253.0,252.0,252.0,96.0,189.0,253.0,167.0,51.0,238.0,253.0,253.0,190.0

In [5]:
# Fit a two-class decision tree on the training split and dump its structure.
model = DecisionTree.trainClassifier(
    trainingData,
    numClasses=2,
    categoricalFeaturesInfo={},  # empty dict: no categorical features are declared
    impurity='gini',
    maxDepth=5,
    maxBins=32,
)

print(model.toDebugString())

DecisionTreeModel classifier of depth 1 with 3 nodes
  If (feature 406 <= 0.0)
   Predict: 0.0
  Else (feature 406 > 0.0)
   Predict: 1.0



In [6]:
# Score the held-out feature vectors with the current model.
predictions = model.predict(features)

# Pair every true label with its prediction: (label, prediction).
labelsAndPredictions = labels.zip(predictions)

# Function-call form of print: same output on Python 2, also valid on Python 3,
# and consistent with the print(...) calls used elsewhere in the notebook.
print(labelsAndPredictions.collect())

[(1.0, 1.0), (0.0, 0.0), (1.0, 1.0), (0.0, 1.0), (1.0, 1.0), (0.0, 1.0), (1.0, 1.0), (1.0, 1.0), (0.0, 0.0), (0.0, 0.0), (1.0, 1.0), (1.0, 1.0), (1.0, 1.0), (1.0, 1.0), (0.0, 0.0), (1.0, 1.0), (1.0, 1.0), (1.0, 1.0), (0.0, 0.0), (0.0, 0.0), (1.0, 1.0), (1.0, 1.0), (0.0, 0.0), (1.0, 1.0), (0.0, 0.0), (1.0, 1.0), (1.0, 1.0), (1.0, 1.0), (1.0, 1.0), (1.0, 1.0)]


In [7]:
# Fraction of test records whose prediction differs from the label.
# Each element is a (label, prediction) pair; it is indexed instead of using a
# tuple-parameter lambda, which is Python 2-only syntax.
testErr = labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count() / float(testData.count())

print('Test Error = ' + str(testErr))

Test Error = 0.0666666666667


In [8]:
# Remove any previous copy of the saved model directory before saving again
# (presumably because save() will not overwrite an existing path — TODO confirm).
!rm -rf myDecisionTreeClassificationModel.parquet
model.save(sc, "myDecisionTreeClassificationModel.parquet")
# Reload the model from disk; `sameModel` is not used again in the visible cells.
sameModel = DecisionTreeModel.load(sc, "myDecisionTreeClassificationModel.parquet")

In [9]:
# List the files written under the saved model's data/ directory.
!ls -l myDecisionTreeClassificationModel.parquet/data/

total 4
-rw-r--r-- 1 ubuntu ubuntu 3005 Jan 24 11:47 part-00000-0d2a71c9-2ef5-408a-a10b-5e0300e28a40.snappy.parquet
-rw-r--r-- 1 ubuntu ubuntu    0 Jan 24 11:47 _SUCCESS


In [10]:
# Print the metadata file stored next to the saved model.
!cat myDecisionTreeClassificationModel.parquet/metadata/part-00000

{"class":"org.apache.spark.mllib.tree.DecisionTreeModel","version":"1.0","algo":"Classification","numNodes":3}


### Regression

In [11]:
# Fit a regression tree on the same training split (this rebinds `model`)
# and print the learned structure.
model = DecisionTree.trainRegressor(
    trainingData,
    categoricalFeaturesInfo={},  # empty dict: no categorical features are declared
    impurity='variance',
    maxDepth=5,
    maxBins=32,
)

print(model.toDebugString())

DecisionTreeModel regressor of depth 1 with 3 nodes
  If (feature 406 <= 0.0)
   Predict: 0.0
  Else (feature 406 > 0.0)
   Predict: 1.0



In [12]:
# Score the held-out feature vectors with the regression tree.
predictions = model.predict(features)

# (label, prediction) pairs.
labelsAndPredictions = labels.zip(predictions)

# Mean squared error over the test split. The pair is indexed rather than
# unpacked in the lambda signature, which is Python 2-only syntax.
testMSE = labelsAndPredictions.map(lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])).sum() / float(testData.count())

print('Test Mean Squared Error = ' + str(testMSE))

Test Mean Squared Error = 0.0666666666667


In [13]:
# Echo the SparkContext handle (same object as shown at the top of the notebook,
# judging by the identical address in the recorded output).
sc

<pyspark.context.SparkContext at 0x7f8403ee07d0>

In [14]:
from pyspark.sql import SQLContext
# Rebuild the SQLContext on the existing SparkContext; this repeats an earlier
# cell, and `sqlc` is not referenced in the visible cells that follow.
sqlc = SQLContext(sc)

## Random Forests

### Classification

In [15]:
from pyspark.mllib.tree import RandomForest, RandomForestModel
from pyspark.mllib.util import MLUtils

# Reload the LIBSVM sample and draw a fresh 70/30 split for the random-forest
# section (no seed is passed to randomSplit).
data = MLUtils.loadLibSVMFile(sc, 'sample_libsvm_data.txt')
trainingData, testData = data.randomSplit([0.7, 0.3])

# Test labels and test feature vectors as separate RDDs.
labels = testData.map(lambda point: point.label)
features = testData.map(lambda point: point.features)

In [16]:
# Fit a 3-tree, two-class random forest and print every tree.
model = RandomForest.trainClassifier(
    trainingData,
    numClasses=2,
    categoricalFeaturesInfo={},  # empty dict: no categorical features are declared
    numTrees=3,
    featureSubsetStrategy="auto",
    impurity='gini',
    maxDepth=4,
    maxBins=32,
)

print(model.toDebugString())

TreeEnsembleModel classifier with 3 trees

  Tree 0:
    If (feature 511 <= 0.0)
     If (feature 469 <= 0.0)
      Predict: 1.0
     Else (feature 469 > 0.0)
      Predict: 0.0
    Else (feature 511 > 0.0)
     Predict: 0.0
  Tree 1:
    If (feature 511 <= 0.0)
     If (feature 323 <= 0.0)
      If (feature 600 <= 112.0)
       Predict: 1.0
      Else (feature 600 > 112.0)
       Predict: 0.0
     Else (feature 323 > 0.0)
      Predict: 1.0
    Else (feature 511 > 0.0)
     Predict: 0.0
  Tree 2:
    If (feature 345 <= 0.0)
     If (feature 483 <= 0.0)
      Predict: 1.0
     Else (feature 483 > 0.0)
      Predict: 0.0
    Else (feature 345 > 0.0)
     If (feature 464 <= 0.0)
      Predict: 0.0
     Else (feature 464 > 0.0)
      Predict: 1.0



In [17]:
# Score the held-out feature vectors with the forest.
predictions = model.predict(features)

# (label, prediction) pairs.
labelsAndPredictions = labels.zip(predictions)

# Misclassification rate on the test split. The pair is indexed instead of
# using a tuple-parameter lambda, which is Python 2-only syntax.
testErr = labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count() / float(testData.count())
print('Test Error = ' + str(testErr))

Test Error = 0.0


### Regression

In [18]:
# Fit a 3-tree regression forest on the same training split (rebinds `model`)
# and print every tree.
model = RandomForest.trainRegressor(
    trainingData,
    categoricalFeaturesInfo={},  # empty dict: no categorical features are declared
    numTrees=3,
    featureSubsetStrategy="auto",
    impurity='variance',
    maxDepth=4,
    maxBins=32,
)

print(model.toDebugString())

TreeEnsembleModel regressor with 3 trees

  Tree 0:
    If (feature 434 <= 0.0)
     If (feature 293 <= 253.0)
      Predict: 0.0
     Else (feature 293 > 253.0)
      Predict: 1.0
    Else (feature 434 > 0.0)
     Predict: 1.0
  Tree 1:
    If (feature 434 <= 0.0)
     Predict: 0.0
    Else (feature 434 > 0.0)
     Predict: 1.0
  Tree 2:
    If (feature 407 <= 0.0)
     Predict: 0.0
    Else (feature 407 > 0.0)
     Predict: 1.0



In [19]:
# Score the held-out feature vectors with the regression forest.
predictions = model.predict(features)

# (label, prediction) pairs.
labelsAndPredictions = labels.zip(predictions)

# Mean squared error on the test split. The pair is indexed instead of being
# unpacked in the lambda signature, which is Python 2-only syntax.
testMSE = labelsAndPredictions.map(lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])).sum() / float(testData.count())
print('Test Mean Squared Error = ' + str(testMSE))

Test Mean Squared Error = 0.0


In [20]:
# Echo the SparkContext handle before the gradient-boosted-trees section.
sc

<pyspark.context.SparkContext at 0x7f8403ee07d0>

In [21]:
from pyspark.sql import SQLContext
# Rebuild the SQLContext on the existing SparkContext; this repeats an earlier
# cell, and `sqlc` is not referenced in the visible cells that follow.
sqlc = SQLContext(sc)

## Gradient Boosted Trees

### Classification

In [22]:
from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel
from pyspark.mllib.util import MLUtils

# Reload the LIBSVM sample and draw a fresh 70/30 split for the
# gradient-boosted-trees section (no seed is passed to randomSplit).
data = MLUtils.loadLibSVMFile(sc, "sample_libsvm_data.txt")
trainingData, testData = data.randomSplit([0.7, 0.3])

# Test labels and test feature vectors as separate RDDs.
labels = testData.map(lambda point: point.label)
features = testData.map(lambda point: point.features)

In [23]:
# Fit a gradient-boosted classifier with 3 boosting iterations and print
# every tree of the ensemble.
model = GradientBoostedTrees.trainClassifier(
    trainingData,
    categoricalFeaturesInfo={},  # empty dict: no categorical features are declared
    numIterations=3,
)

print(model.toDebugString())

TreeEnsembleModel classifier with 3 trees

  Tree 0:
    If (feature 434 <= 0.0)
     Predict: -1.0
    Else (feature 434 > 0.0)
     Predict: 1.0
  Tree 1:
    If (feature 462 <= 0.0)
     If (feature 183 <= 15.0)
      Predict: -0.4768116880884702
     Else (feature 183 > 15.0)
      Predict: -0.47681168808847035
    Else (feature 462 > 0.0)
     If (feature 376 <= 168.0)
      If (feature 377 <= 41.0)
       Predict: 0.47681168808847013
      Else (feature 377 > 41.0)
       Predict: 0.4768116880884703
     Else (feature 376 > 168.0)
      Predict: 0.4768116880884712
  Tree 2:
    If (feature 434 <= 0.0)
     If (feature 183 <= 236.0)
      If (feature 126 <= 28.0)
       Predict: -0.4381935810427206
      Else (feature 126 > 28.0)
       Predict: -0.43819358104272055
     Else (feature 183 > 236.0)
      Predict: -0.43819358104272066
    Else (feature 434 > 0.0)
     If (feature 433 <= 251.0)
      If (feature 324 <= 0.0)
       Predict: 0.4381935810427206
      Else (feature 324 >

In [24]:
# Score the held-out feature vectors with the boosted classifier.
predictions = model.predict(features)

# (label, prediction) pairs.
labelsAndPredictions = labels.zip(predictions)

# Misclassification rate on the test split. The pair is indexed instead of
# using a tuple-parameter lambda, which is Python 2-only syntax.
testErr = labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count() / float(testData.count())

print('Test Error = ' + str(testErr))

Test Error = 0.0454545454545


### Regression

In [25]:
from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel

# Fit a gradient-boosted regressor with 3 boosting iterations on the same
# training split (rebinds `model`) and print every tree of the ensemble.
model = GradientBoostedTrees.trainRegressor(
    trainingData,
    categoricalFeaturesInfo={},  # empty dict: no categorical features are declared
    numIterations=3,
)

print(model.toDebugString())

TreeEnsembleModel regressor with 3 trees

  Tree 0:
    If (feature 434 <= 0.0)
     Predict: 0.0
    Else (feature 434 > 0.0)
     Predict: 1.0
  Tree 1:
    Predict: 0.0
  Tree 2:
    Predict: 0.0



In [26]:
# Score the held-out feature vectors with the boosted regressor.
predictions = model.predict(features)

# (label, prediction) pairs.
labelsAndPredictions = labels.zip(predictions)

# Mean squared error on the test split. The pair is indexed instead of being
# unpacked in the lambda signature, which is Python 2-only syntax.
testMSE = labelsAndPredictions.map(lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])).sum() / float(testData.count())
print('Test Mean Squared Error = ' + str(testMSE))

Test Mean Squared Error = 0.0454545454545


In [27]:
# Echo the SparkContext handle before the linear-methods section.
sc

<pyspark.context.SparkContext at 0x7f8403ee07d0>

In [28]:
from pyspark.sql import SQLContext
# Rebuild the SQLContext on the existing SparkContext; this repeats an earlier
# cell, and `sqlc` is not referenced in the visible cells that follow.
sqlc = SQLContext(sc)

## Linear Methods

### Logistic Regression

In [29]:
from pyspark.mllib.classification import LogisticRegressionWithLBFGS, LogisticRegressionModel
from pyspark.mllib.regression import LabeledPoint

!wget https://raw.githubusercontent.com/apache/spark/master/data/mllib/sample_svm_data.txt

--2017-01-24 11:47:40--  https://raw.githubusercontent.com/apache/spark/master/data/mllib/sample_svm_data.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.112.133
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.112.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 39474 (39K) [text/plain]
Saving to: ‘sample_svm_data.txt’


2017-01-24 11:47:41 (713 KB/s) - ‘sample_svm_data.txt’ saved [39474/39474]



In [30]:
def parsePoint(line):
    """Parse one whitespace-separated record into a LabeledPoint.

    The first number on the line is used as the label and all remaining
    numbers as the feature values.

    Splitting is done with ``str.split()`` (no argument) so that repeated or
    leading/trailing blanks do not yield empty tokens, which would make
    ``float('')`` raise ValueError.
    """
    values = [float(token) for token in line.split()]
    return LabeledPoint(values[0], values[1:])

# Read the downloaded file as an RDD of text lines.
data = sc.textFile("sample_svm_data.txt")

# Convert every line into a LabeledPoint using parsePoint as defined at this point.
parsedData = data.map(parsePoint)

In [31]:
# Show the first parsed record as a sanity check of the parsing step.
parsedData.take(1)

[LabeledPoint(1.0, [0.0,2.52078447202,0.0,0.0,0.0,2.00468443649,2.00034729927,0.0,2.22838704274,2.22838704274,0.0,0.0,0.0,0.0,0.0,0.0])]

In [32]:
# Fit a logistic-regression model on the full parsed data set.
model = LogisticRegressionWithLBFGS.train(parsedData)

# (label, prediction) for every record of the same data the model was trained on.
labelsAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))

# Training error = share of records whose prediction differs from the label.
# The pair is indexed instead of using a tuple-parameter lambda, which is
# Python 2-only syntax.
trainErr = labelsAndPreds.filter(lambda vp: vp[0] != vp[1]).count() / float(parsedData.count())
print("Training Error = " + str(trainErr))

Training Error = 0.366459627329


### Linear Regression

In [33]:
from pyspark.mllib.regression import LinearRegressionWithSGD, LinearRegressionModel

!wget https://raw.githubusercontent.com/apache/spark/master/data/mllib/ridge-data/lpsa.data

--2017-01-24 11:47:44--  https://raw.githubusercontent.com/apache/spark/master/data/mllib/ridge-data/lpsa.data
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.112.133
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.112.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10395 (10K) [text/plain]
Saving to: ‘lpsa.data’


2017-01-24 11:47:44 (4,55 MB/s) - ‘lpsa.data’ saved [10395/10395]



In [34]:
def parsePoint(line):
    """Parse one record whose fields are separated by commas and/or blanks.

    Commas are first turned into blanks, then the line is split on whitespace.
    The first number is used as the label and all remaining numbers as the
    feature values.

    ``str.split()`` (no argument) is used so that repeated or trailing
    separators do not yield empty tokens, which would make ``float('')``
    raise ValueError.
    """
    values = [float(token) for token in line.replace(',', ' ').split()]
    return LabeledPoint(values[0], values[1:])

# Read the downloaded file as an RDD of text lines and parse each line into a
# LabeledPoint (this rebinds `data` and `parsedData` from the previous section).
data = sc.textFile("lpsa.data")
parsedData = data.map(parsePoint)

In [35]:
# Show the first parsed record as a sanity check of the parsing step.
parsedData.take(1)

[LabeledPoint(-0.4307829, [-1.63735562648,-2.00621178481,-1.86242597251,-1.02470580167,-0.522940888712,-0.863171185426,-1.04215728919,-0.864466507337])]

In [36]:
# Fit a linear model with SGD: 100 iterations at a step size of 1e-8.
model = LinearRegressionWithSGD.train(parsedData, iterations=100, step=0.00000001)

# (label, prediction) for every record of the same data the model was trained on.
valuesAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))

# Mean squared error: sum of squared residuals divided by the record count.
# The pair is indexed instead of being unpacked in the lambda signature, which
# is Python 2-only syntax.
MSE = valuesAndPreds.map(lambda vp: (vp[0] - vp[1]) ** 2).reduce(lambda x, y: x + y) / valuesAndPreds.count()

print("Mean Squared Error = " + str(MSE))



Mean Squared Error = 7.4510328101


In [37]:
# Echo the SparkContext handle before the evaluation-metrics section.
sc

<pyspark.context.SparkContext at 0x7f8403ee07d0>

In [38]:
from pyspark.sql import SQLContext
# Rebuild the SQLContext on the existing SparkContext; this repeats an earlier
# cell, and `sqlc` is not referenced in the visible cells that follow.
sqlc = SQLContext(sc)

## Evaluation Metrics

### Binary Classification

In [39]:
from pyspark.mllib.classification import LogisticRegressionWithLBFGS
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.util import MLUtils

!wget https://raw.githubusercontent.com/apache/spark/master/data/mllib/sample_binary_classification_data.txt

--2017-01-24 11:47:47--  https://raw.githubusercontent.com/apache/spark/master/data/mllib/sample_binary_classification_data.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.112.133
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.112.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 104736 (102K) [text/plain]
Saving to: ‘sample_binary_classification_data.txt’


2017-01-24 11:47:47 (922 KB/s) - ‘sample_binary_classification_data.txt’ saved [104736/104736]



In [40]:
# Load the LIBSVM-formatted binary-classification sample.
data = MLUtils.loadLibSVMFile(sc, "sample_binary_classification_data.txt")

# Seeded 60/40 train/test split. The seed is written as a plain int: the `11L`
# long-literal suffix is Python 2-only syntax and denotes the same value.
training, test = data.randomSplit([0.6, 0.4], seed=11)

In [41]:
# Show the first loaded record as a sanity check.
data.take(1)

[LabeledPoint(0.0, (692,[127,128,129,130,131,154,155,156,157,158,159,181,182,183,184,185,186,187,188,189,207,208,209,210,211,212,213,214,215,216,217,235,236,237,238,239,240,241,242,243,244,245,262,263,264,265,266,267,268,269,270,271,272,273,289,290,291,292,293,294,295,296,297,300,301,302,316,317,318,319,320,321,328,329,330,343,344,345,346,347,348,349,356,357,358,371,372,373,374,384,385,386,399,400,401,412,413,414,426,427,428,429,440,441,442,454,455,456,457,466,467,468,469,470,482,483,484,493,494,495,496,497,510,511,512,520,521,522,523,538,539,540,547,548,549,550,566,567,568,569,570,571,572,573,574,575,576,577,578,594,595,596,597,598,599,600,601,602,603,604,622,623,624,625,626,627,628,629,630,651,652,653,654,655,656,657],[51.0,159.0,253.0,159.0,50.0,48.0,238.0,252.0,252.0,252.0,237.0,54.0,227.0,253.0,252.0,239.0,233.0,252.0,57.0,6.0,10.0,60.0,224.0,252.0,253.0,252.0,202.0,84.0,252.0,253.0,122.0,163.0,252.0,252.0,252.0,253.0,252.0,252.0,96.0,189.0,253.0,167.0,51.0,238.0,253.0,253.0,190.0

In [42]:
!rm -rf metastore_db/*.lck

model = LogisticRegressionWithLBFGS.train(training)

predictionAndLabels = test.map(lambda lp: (float(model.predict(lp.features)), lp.label))

metrics = BinaryClassificationMetrics(predictionAndLabels)

print("Area under PR = %s" % metrics.areaUnderPR)

print("Area under ROC = %s" % metrics.areaUnderROC)

Area under PR = 0.992324561404
Area under ROC = 0.979166666667


In [43]:
# Show one element of the RDD handed to BinaryClassificationMetrics.
# Function-call form of print: same output on Python 2, also valid on Python 3,
# and consistent with the print(...) calls used elsewhere in the notebook.
print(predictionAndLabels.take(1))

[(1.0, 1.0)]


### Multiclass Classification

In [44]:
from pyspark.mllib.evaluation import MulticlassMetrics

!wget https://raw.githubusercontent.com/apache/spark/master/data/mllib/sample_multiclass_classification_data.txt

--2017-01-24 11:47:50--  https://raw.githubusercontent.com/apache/spark/master/data/mllib/sample_multiclass_classification_data.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.112.133
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.112.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6953 (6,8K) [text/plain]
Saving to: ‘sample_multiclass_classification_data.txt’


2017-01-24 11:47:50 (5,35 MB/s) - ‘sample_multiclass_classification_data.txt’ saved [6953/6953]



In [45]:
# Load the LIBSVM-formatted multiclass sample.
data = MLUtils.loadLibSVMFile(sc, "sample_multiclass_classification_data.txt")

# Seeded 60/40 train/test split. The seed is written as a plain int: the `11L`
# long-literal suffix is Python 2-only syntax and denotes the same value.
training, test = data.randomSplit([0.6, 0.4], seed=11)

# Show the first loaded record as a sanity check.
data.take(1)

[LabeledPoint(1.0, (4,[0,1,2,3],[-0.222222,0.5,-0.762712,-0.833333]))]

In [46]:
# Fit a 3-class logistic-regression model on the training split.
model = LogisticRegressionWithLBFGS.train(training, numClasses=3)

# (prediction, label) for every test record, then wrap the pairs in
# MulticlassMetrics for the summary cells that follow.
predictionAndLabels = test.map(
    lambda point: (float(model.predict(point.features)), point.label)
)
metrics = MulticlassMetrics(predictionAndLabels)

In [47]:
# Overall (label-free) precision, recall and F-measure.
# NOTE(review): the no-argument forms are, as far as I can tell, deprecated in
# Spark 2.x in favour of `accuracy` — confirm against the Spark version in use.
precision, recall, f1Score = metrics.precision(), metrics.recall(), metrics.fMeasure()

print("Summary Stats")
print("Precision = %s" % precision)
print("Recall = %s" % recall)
print("F1 Score = %s" % f1Score)



Summary Stats
Precision = 0.879310344828
Recall = 0.879310344828
F1 Score = 0.879310344828




In [48]:
# Collect the distinct label values to the driver. This rebinds `labels`,
# which earlier sections used for an RDD of test labels.
labels = data.map(lambda point: point.label).distinct().collect()

# Per-class precision / recall / F1, reported in ascending label order.
for label in sorted(labels):
    print("Class %s precision = %s" % (label, metrics.precision(label)))
    print("Class %s recall = %s" % (label, metrics.recall(label)))
    print("Class %s F1 Measure = %s" % (label, metrics.fMeasure(label, beta=1.0)))

Class 0.0 precision = 0.818181818182
Class 0.0 recall = 0.857142857143
Class 0.0 F1 Measure = 0.837209302326
Class 1.0 precision = 1.0
Class 1.0 recall = 1.0
Class 1.0 F1 Measure = 1.0
Class 2.0 precision = 0.833333333333
Class 2.0 recall = 0.789473684211
Class 2.0 F1 Measure = 0.810810810811


In [49]:
# Weighted summary statistics from MulticlassMetrics. Recall, precision and
# false-positive rate are read as attributes; the F-measure is a method that
# takes an optional beta.
print("Weighted recall = %s" % metrics.weightedRecall)
print("Weighted precision = %s" % metrics.weightedPrecision)
print("Weighted F(1) Score = %s" % metrics.weightedFMeasure())
print("Weighted F(0.5) Score = %s" % metrics.weightedFMeasure(beta=0.5))
print("Weighted false positive rate = %s" % metrics.weightedFalsePositiveRate)

Weighted recall = 0.879310344828
Weighted precision = 0.879571577847
Weighted F(1) Score = 0.879082771625
Weighted F(0.5) Score = 0.879289486218
Weighted false positive rate = 0.0643415298588


### Regression

In [50]:
from pyspark.mllib.regression import LinearRegressionWithSGD
from pyspark.mllib.evaluation import RegressionMetrics
from pyspark.mllib.linalg import DenseVector

!wget https://raw.githubusercontent.com/apache/spark/master/data/mllib/sample_linear_regression_data.txt

--2017-01-24 11:47:55--  https://raw.githubusercontent.com/apache/spark/master/data/mllib/sample_linear_regression_data.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.112.133
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.112.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 119069 (116K) [text/plain]
Saving to: ‘sample_linear_regression_data.txt’


2017-01-24 11:47:55 (1,14 MB/s) - ‘sample_linear_regression_data.txt’ saved [119069/119069]



In [51]:
def parsePoint(line):
    """Parse a ``label idx:value idx:value ...`` record into a LabeledPoint.

    The first whitespace-separated field is the label. For every later field
    only the part after the first ':' is kept; the index before it is ignored
    and the values are stored densely in the order they appear.
    """
    fields = line.split()
    label = float(fields[0])
    featureValues = [float(field.split(':')[1]) for field in fields[1:]]
    return LabeledPoint(label, DenseVector(featureValues))

# Read the downloaded file as an RDD of text lines and parse each line into a
# LabeledPoint (this rebinds `data` and `parsedData`).
data = sc.textFile("sample_linear_regression_data.txt")
parsedData = data.map(parsePoint)

# Show the first parsed record as a sanity check of the parsing step.
parsedData.take(1)

[LabeledPoint(-9.49000987882, [0.455127360066,0.36644694352,-0.382561089335,-0.445843019852,0.331097903589,0.806744529344,-0.262434173177,-0.448503861117,-0.0726928483817,0.56580355758])]

In [52]:
# Fit a linear model with SGD using the trainer's default settings.
model = LinearRegressionWithSGD.train(parsedData)

# NOTE: despite the variable name, each tuple is (prediction, label) — the
# prediction comes first. The pairs are built from the same data the model was
# trained on.
valuesAndPreds = parsedData.map(lambda p: (float(model.predict(p.features)), p.label))

# Wrap the pairs for the metric printout in the next cell.
metrics = RegressionMetrics(valuesAndPreds)

In [53]:
# Squared-error metrics.
print("MSE = %s" % metrics.meanSquaredError)
print("RMSE = %s" % metrics.rootMeanSquaredError)

# Coefficient of determination.
print("R-squared = %s" % metrics.r2)

# Mean absolute error.
print("MAE = %s" % metrics.meanAbsoluteError)

# NOTE(review): the recorded value (about 2.89) is not a 0..1 ratio, so this is
# presumably an unnormalised variance figure — confirm against the
# RegressionMetrics documentation.
print("Explained variance = %s" % metrics.explainedVariance)

MSE = 103.309686818
RMSE = 10.1641372884
R-squared = 0.0276391109678
MAE = 8.14869190795
Explained variance = 2.88839520172
