In [1]:
# `sc` is not created anywhere in this notebook; it is expected to be
# pre-defined by the PySpark session. Evaluating it confirms it is available.
sc

<pyspark.context.SparkContext at 0x7f3e5d740710>

In [2]:
# Remove any *.lck files under metastore_db/ before creating the SQL context
# (presumably stale locks left by an earlier session -- TODO confirm).
!rm -rf metastore_db/*.lck

from pyspark.sql import SQLContext
# SQL entry point bound to the existing SparkContext.
sqlc = SQLContext(sc)

In [3]:
from pyspark.mllib.util import MLUtils

!wget https://raw.githubusercontent.com/apache/spark/master/data/mllib/sample_libsvm_data.txt

data = MLUtils.loadLibSVMFile(sc, "sample_libsvm_data.txt").toDF()
data = MLUtils.convertVectorColumnsToML(data)

--2017-02-07 14:42:52--  https://raw.githubusercontent.com/apache/spark/master/data/mllib/sample_libsvm_data.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.112.133
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.112.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 104736 (102K) [text/plain]
Saving to: ‘sample_libsvm_data.txt.1’


2017-02-07 14:42:52 (1,04 MB/s) - ‘sample_libsvm_data.txt.1’ saved [104736/104736]



In [4]:
# Preview the first five rows of the loaded DataFrame (features, label).
data.show(5)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|(692,[127,128,129...|  0.0|
|(692,[158,159,160...|  1.0|
|(692,[124,125,126...|  1.0|
|(692,[152,153,154...|  1.0|
|(692,[151,152,153...|  1.0|
+--------------------+-----+
only showing top 5 rows



## Decision Trees

### Classification

In [5]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import StringIndexer, IndexToString, VectorIndexer

# Map the raw "label" values to indices; fitted on the full dataset.
labelIndexer = StringIndexer().setInputCol("label") \
                                .setOutputCol("indexedLabel").fit(data)

# Translate predicted indices back to the original label values.
labelConverter = IndexToString().setInputCol("prediction") \
                                .setOutputCol("predictedLabel") \
                                .setLabels(labelIndexer.labels)

# Index the feature vector; maxCategories=4 bounds which features are
# treated as categorical.
featureIndexer = VectorIndexer().setInputCol("features") \
                                .setOutputCol("indexedFeatures") \
                                .setMaxCategories(4).fit(data)

dtC = DecisionTreeClassifier().setLabelCol("indexedLabel") \
                                .setFeaturesCol("indexedFeatures")

# Stage order matters: later cells pick the fitted tree out by index 2.
pipelineClass = Pipeline().setStages([labelIndexer, featureIndexer, dtC, labelConverter])

# Fixed seed so the 70/30 split -- and every model trained on it -- is
# reproducible across kernel restarts.
trainingData, testData = data.randomSplit([0.7, 0.3], seed=42)

In [6]:
from pyspark.ml.classification import DecisionTreeClassifier, DecisionTreeClassificationModel

# Fit every pipeline stage on the training split.
modelClassifier = pipelineClass.fit(trainingData)

# Index 2 is the fitted decision tree (pipeline order: labelIndexer,
# featureIndexer, dtC, labelConverter).
treeModel = modelClassifier.stages[2]

# Score the held-out split with the fitted pipeline.
predictionsClass = modelClassifier.transform(testData)

In [7]:
# List the fitted pipeline stages in order.
modelClassifier.stages

[StringIndexer_4060b8fbdaa01e6e664f,
 VectorIndexer_49cc9b0e69d1b82d4dad,
 DecisionTreeClassificationModel (uid=DecisionTreeClassifier_4e88a70d392f75c207fe) of depth 2 with 5 nodes,
 IndexToString_41c69abf49da405a7ef1]

In [8]:
# Print the learned tree as text. The call form of print works on both
# Python 2 and Python 3, unlike the Python 2-only print statement.
print(treeModel.toDebugString)

DecisionTreeClassificationModel (uid=DecisionTreeClassifier_4e88a70d392f75c207fe) of depth 2 with 5 nodes
  If (feature 406 <= 72.0)
   If (feature 99 in {2.0})
    Predict: 0.0
   Else (feature 99 not in {2.0})
    Predict: 1.0
  Else (feature 406 > 72.0)
   Predict: 0.0



In [9]:
# Limit on the Spark side before converting: toPandas() collects every row
# it is given to the driver, so only the five displayed rows are fetched.
predictionsClass.limit(5).toPandas()

Unnamed: 0,features,label,indexedLabel,indexedFeatures,rawPrediction,probability,prediction,predictedLabel
0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0,0.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[35.0, 0.0]","[1.0, 0.0]",0.0,1.0
1,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,1.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 29.0]","[0.0, 1.0]",1.0,0.0
2,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0,0.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[35.0, 0.0]","[1.0, 0.0]",0.0,1.0
3,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0,0.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[35.0, 0.0]","[1.0, 0.0]",0.0,1.0
4,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,1.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 29.0]","[0.0, 1.0]",1.0,0.0


### Regression

In [10]:
from pyspark.ml.regression import DecisionTreeRegressor, DecisionTreeRegressionModel

# Regress directly on the numeric "label" column using the indexed features.
dtR = DecisionTreeRegressor(featuresCol="indexedFeatures", labelCol="label")

# No label indexing is needed for regression, so the pipeline has two stages.
pipelineReg = Pipeline(stages=[featureIndexer, dtR])

In [11]:
# Fit the two-stage regression pipeline on the training split.
modelRegressor = pipelineReg.fit(trainingData)

# Index 1 is the fitted regression tree (pipeline order: featureIndexer, dtR).
# Note: this rebinds `treeModel`, which previously held the classifier tree.
treeModel = modelRegressor.stages[1]

# Call form of print: valid on both Python 2 and Python 3.
print(treeModel.toDebugString)

DecisionTreeRegressionModel (uid=DecisionTreeRegressor_4220915f1fe28972014b) of depth 2 with 5 nodes
  If (feature 406 <= 72.0)
   If (feature 99 in {0.0,3.0})
    Predict: 0.0
   Else (feature 99 not in {0.0,3.0})
    Predict: 1.0
  Else (feature 406 > 72.0)
   Predict: 1.0



In [12]:
# Score the held-out split with the fitted regression pipeline.
predictionsReg = modelRegressor.transform(testData)

In [13]:
# Limit on the Spark side before converting: toPandas() collects every row
# it is given to the driver, so only the five displayed rows are fetched.
predictionsReg.limit(5).toPandas()

Unnamed: 0,features,label,indexedFeatures,prediction
0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0
1,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0
2,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0
3,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0
4,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0


In [14]:
# `sc` is expected to be pre-defined by the PySpark session; evaluating it
# confirms the context is still available at the start of this section.
sc

<pyspark.context.SparkContext at 0x7f3e5d740710>

In [15]:
# Remove any *.lck files under metastore_db/ before creating the SQL context
# (presumably stale locks left by an earlier session -- TODO confirm).
!rm -rf metastore_db/*.lck

from pyspark.sql import SQLContext
# SQL entry point bound to the existing SparkContext.
sqlc = SQLContext(sc)

In [16]:
from pyspark.mllib.util import MLUtils

# Read the LibSVM file, convert it to a DataFrame and switch its vector
# columns to the pyspark.ml vector type, all in one expression.
data = MLUtils.convertVectorColumnsToML(
    MLUtils.loadLibSVMFile(sc, "sample_libsvm_data.txt").toDF()
)

In [17]:
# Preview the first five rows of the reloaded DataFrame.
data.show(5)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|(692,[127,128,129...|  0.0|
|(692,[158,159,160...|  1.0|
|(692,[124,125,126...|  1.0|
|(692,[152,153,154...|  1.0|
|(692,[151,152,153...|  1.0|
+--------------------+-----+
only showing top 5 rows



## Random Forests

### Classification

In [18]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, IndexToString, VectorIndexer
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import RandomForestClassificationModel

# Map the raw "label" values to indices; fitted on the full dataset.
labelIndexer = StringIndexer().setInputCol("label") \
                                .setOutputCol("indexedLabel").fit(data)

# Translate predicted indices back to the original label values.
labelConverter = IndexToString().setInputCol("prediction") \
                                .setOutputCol("predictedLabel") \
                                .setLabels(labelIndexer.labels)

# Index the feature vector; maxCategories=4 bounds which features are
# treated as categorical.
featureIndexer = VectorIndexer().setInputCol("features") \
                                .setOutputCol("indexedFeatures") \
                                .setMaxCategories(4).fit(data)

# A small forest of three trees keeps the debug output readable.
rfC = RandomForestClassifier().setLabelCol("indexedLabel") \
                                .setFeaturesCol("indexedFeatures") \
                                .setNumTrees(3)

# Fixed seed so the 70/30 split -- and every model trained on it -- is
# reproducible across kernel restarts.
trainingData, testData = data.randomSplit([0.7, 0.3], seed=42)

In [19]:
# Assemble the four stages; the fitted forest ends up at index 2.
pipelineRFC = Pipeline(stages=[labelIndexer, featureIndexer, rfC, labelConverter])

# Train on the training split, then score the held-out split.
modelRFC = pipelineRFC.fit(trainingData)
predictionsRFC = modelRFC.transform(testData)

In [20]:
# Compare the decoded prediction with the true label for the first five rows.
predictionsRFC.select("predictedLabel", "label", "features").show(5)

+--------------+-----+--------------------+
|predictedLabel|label|            features|
+--------------+-----+--------------------+
|           1.0|  1.0|(692,[97,98,99,12...|
|           0.0|  0.0|(692,[98,99,100,1...|
|           1.0|  1.0|(692,[99,100,101,...|
|           0.0|  0.0|(692,[122,123,148...|
|           0.0|  0.0|(692,[126,127,128...|
+--------------+-----+--------------------+
only showing top 5 rows



In [21]:
# Index 2 is the fitted random forest (pipeline order: labelIndexer,
# featureIndexer, rfC, labelConverter).
rfModelC = modelRFC.stages[2]
# Per-feature importance scores of the fitted forest.
rfModelC.featureImportances

SparseVector(692, {329: 0.0373, 455: 0.2992, 457: 0.0175, 490: 0.3333, 538: 0.2961, 540: 0.0166})

In [22]:
# Print every tree of the forest as text. The call form of print works on
# both Python 2 and Python 3, unlike the Python 2-only print statement.
print(rfModelC.toDebugString)

RandomForestClassificationModel (uid=rfc_1d2da6331491) with 3 trees
  Tree 0 (weight 1.0):
    If (feature 455 <= 0.0)
     If (feature 540 <= 65.0)
      If (feature 457 <= 0.0)
       Predict: 0.0
      Else (feature 457 > 0.0)
       Predict: 1.0
     Else (feature 540 > 65.0)
      Predict: 1.0
    Else (feature 455 > 0.0)
     Predict: 1.0
  Tree 1 (weight 1.0):
    If (feature 490 <= 0.0)
     Predict: 1.0
    Else (feature 490 > 0.0)
     Predict: 0.0
  Tree 2 (weight 1.0):
    If (feature 538 <= 0.0)
     If (feature 329 <= 0.0)
      Predict: 0.0
     Else (feature 329 > 0.0)
      Predict: 1.0
    Else (feature 538 > 0.0)
     Predict: 1.0



### Regression

In [23]:
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.regression import RandomForestRegressionModel

# Regress directly on the numeric "label" column using the indexed features.
rfR = RandomForestRegressor(featuresCol="indexedFeatures", labelCol="label")

# Two-stage pipeline: index the features, then fit the forest.
pipelineRFR = Pipeline(stages=[featureIndexer, rfR])

# Train on the training split, then score the held-out split.
modelRFR = pipelineRFR.fit(trainingData)
predictionsRFR = modelRFR.transform(testData)

In [24]:
# Re-applies the fitted pipeline to the test split (the same assignment also
# ends the previous cell).
predictionsRFR = modelRFR.transform(testData)

# Compare the numeric prediction with the true label for the first five rows.
predictionsRFR.select("prediction", "label", "features").show(5)

+----------+-----+--------------------+
|prediction|label|            features|
+----------+-----+--------------------+
|       1.0|  1.0|(692,[97,98,99,12...|
|       0.0|  0.0|(692,[98,99,100,1...|
|       0.1|  1.0|(692,[99,100,101,...|
|       0.0|  0.0|(692,[122,123,148...|
|       0.0|  0.0|(692,[126,127,128...|
+----------+-----+--------------------+
only showing top 5 rows



In [25]:
# `sc` is expected to be pre-defined by the PySpark session; evaluating it
# confirms the context is still available at the start of this section.
sc

<pyspark.context.SparkContext at 0x7f3e5d740710>

In [26]:
# Remove any *.lck files under metastore_db/ before creating the SQL context
# (presumably stale locks left by an earlier session -- TODO confirm).
!rm -rf metastore_db/*.lck

from pyspark.sql import SQLContext
# SQL entry point bound to the existing SparkContext.
sqlc = SQLContext(sc)

In [27]:
from pyspark.mllib.util import MLUtils

# Read the LibSVM file, convert it to a DataFrame and switch its vector
# columns to the pyspark.ml vector type, all in one expression.
data = MLUtils.convertVectorColumnsToML(
    MLUtils.loadLibSVMFile(sc, "sample_libsvm_data.txt").toDF()
)

In [28]:
# Preview the first five rows of the reloaded DataFrame.
data.show(5)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|(692,[127,128,129...|  0.0|
|(692,[158,159,160...|  1.0|
|(692,[124,125,126...|  1.0|
|(692,[152,153,154...|  1.0|
|(692,[151,152,153...|  1.0|
+--------------------+-----+
only showing top 5 rows



## Gradient Boosted Trees

### Classification

In [29]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, IndexToString, VectorIndexer
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.classification import GBTClassificationModel

# Map the raw "label" values to indices; fitted on the full dataset.
labelIndexer = StringIndexer().setInputCol("label") \
                                .setOutputCol("indexedLabel").fit(data)

# Translate predicted indices back to the original label values.
labelConverter = IndexToString().setInputCol("prediction") \
                                .setOutputCol("predictedLabel") \
                                .setLabels(labelIndexer.labels)

# Index the feature vector; maxCategories=4 bounds which features are
# treated as categorical.
featureIndexer = VectorIndexer().setInputCol("features") \
                                .setOutputCol("indexedFeatures") \
                                .setMaxCategories(4).fit(data)

# Ten boosting iterations.
gbtC = GBTClassifier().setLabelCol("indexedLabel") \
                                .setFeaturesCol("indexedFeatures") \
                                .setMaxIter(10)

# Stage order matters: later cells pick the fitted model out by index 2.
pipelineGBTC = Pipeline().setStages([labelIndexer, featureIndexer, gbtC, labelConverter])

# Fixed seed so the 70/30 split -- and every model trained on it -- is
# reproducible across kernel restarts.
trainingData, testData = data.randomSplit([0.7, 0.3], seed=42)

In [30]:
# Fit every pipeline stage on the training split.
modelGBTC = pipelineGBTC.fit(trainingData)

# Score the held-out split with the fitted pipeline.
predictionsGBTC = modelGBTC.transform(testData)

# Compare the decoded prediction with the true label for the first three rows.
predictionsGBTC.select("predictedLabel", "label", "features").show(3)

+--------------+-----+--------------------+
|predictedLabel|label|            features|
+--------------+-----+--------------------+
|           0.0|  1.0|(692,[99,100,101,...|
|           1.0|  1.0|(692,[119,120,121...|
|           1.0|  1.0|(692,[123,124,125...|
+--------------+-----+--------------------+
only showing top 3 rows



In [31]:
# Index 2 is the fitted GBT model (pipeline order: labelIndexer,
# featureIndexer, gbtC, labelConverter).
gbtModelC = modelGBTC.stages[2]

# Call form of print: valid on both Python 2 and Python 3.
print(gbtModelC.toDebugString)

GBTClassificationModel (uid=GBTClassifier_4fb78cb5d8561d32efe0) with 10 trees
  Tree 0 (weight 1.0):
    If (feature 434 <= 0.0)
     Predict: 1.0
    Else (feature 434 > 0.0)
     Predict: -1.0
  Tree 1 (weight 0.1):
    If (feature 462 <= 0.0)
     Predict: 0.47681168808847024
    Else (feature 462 > 0.0)
     If (feature 409 <= 25.0)
      Predict: -0.47681168808847024
     Else (feature 409 > 25.0)
      Predict: -0.4768116880884712
  Tree 2 (weight 0.1):
    If (feature 434 <= 0.0)
     If (feature 186 <= 15.0)
      Predict: 0.4381935810427206
     Else (feature 186 > 15.0)
      Predict: 0.43819358104272066
    Else (feature 434 > 0.0)
     If (feature 437 <= 15.0)
      Predict: -0.4381935810427206
     Else (feature 437 > 15.0)
      Predict: -0.43819358104272155
  Tree 3 (weight 0.1):
    If (feature 490 <= 0.0)
     Predict: 0.40514968028459825
    Else (feature 490 > 0.0)
     If (feature 459 <= 189.0)
      If (feature 459 <= 64.0)
       If (feature 266 <= 0.0)
        If

### Regression

In [32]:
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.regression import GBTRegressionModel

# Regress directly on the numeric "label" column with ten boosting iterations.
gbtR = GBTRegressor(featuresCol="indexedFeatures", labelCol="label", maxIter=10)

# Two-stage pipeline: index the features, then fit the boosted trees.
pipelineGBTR = Pipeline(stages=[featureIndexer, gbtR])

modelGBTR = pipelineGBTR.fit(trainingData)

In [33]:
# Score the held-out split and show the first five rows with all columns.
predictionsGBTR = modelGBTR.transform(testData)
predictionsGBTR.show(5)

+--------------------+-----+--------------------+----------+
|            features|label|     indexedFeatures|prediction|
+--------------------+-----+--------------------+----------+
|(692,[99,100,101,...|  1.0|(692,[99,100,101,...|       0.0|
|(692,[119,120,121...|  1.0|(692,[119,120,121...|       1.0|
|(692,[123,124,125...|  1.0|(692,[123,124,125...|       1.0|
|(692,[125,126,127...|  1.0|(692,[125,126,127...|       1.0|
|(692,[125,126,153...|  1.0|(692,[125,126,153...|       1.0|
+--------------------+-----+--------------------+----------+
only showing top 5 rows



In [34]:
# `sc` is expected to be pre-defined by the PySpark session; evaluating it
# confirms the context is still available at the start of this section.
sc

<pyspark.context.SparkContext at 0x7f3e5d740710>

In [35]:
from pyspark.sql import SQLContext
# SQL entry point bound to the existing SparkContext.
sqlc = SQLContext(sc)

In [36]:
!rm -rf metastore_db/ 
from pyspark.mllib.util import MLUtils

data = MLUtils.loadLibSVMFile(sc, "sample_libsvm_data.txt").toDF()
data = MLUtils.convertVectorColumnsToML(data)

In [37]:
# Preview the first five rows of the reloaded DataFrame.
data.show(5)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|(692,[127,128,129...|  0.0|
|(692,[158,159,160...|  1.0|
|(692,[124,125,126...|  1.0|
|(692,[152,153,154...|  1.0|
|(692,[151,152,153...|  1.0|
+--------------------+-----+
only showing top 5 rows



## Linear Methods

### Logistic Regression

In [38]:
from pyspark.ml.classification import LogisticRegression

# Fixed seed so the 70/30 split -- and every model trained on it -- is
# reproducible across kernel restarts.
trainingData, testData = data.randomSplit([0.7, 0.3], seed=42)

# At most 10 iterations, regularisation strength 0.3, elastic-net mixing 0.8.
logr = LogisticRegression().setMaxIter(10).setRegParam(0.3).setElasticNetParam(0.8)

logrModel = logr.fit(trainingData)

# Call form of print: valid on both Python 2 and Python 3.
print("Weights: %s Intercept: %s" % (logrModel.coefficients, logrModel.intercept))

Weights: (692,[235,263,272,300,356,378,405,406,407,433,434,461,462,484,489,490,496,511,512,539,540,568],[-0.000184340020759,-0.00052020775993,-6.538669629e-05,-5.18477807096e-05,-4.8157070928e-05,0.000419623480238,0.000184843519456,0.000928670263834,0.00019572960232,0.000611997139151,0.00091992892924,0.000396707675263,0.000457937869717,-6.05102100784e-06,0.000194054328354,4.72616986833e-05,-5.26929702748e-06,-0.000303136407921,-0.000260957969962,-0.000280680238859,-0.00124563037765,-0.00129370037932]) Intercept: 0.0484695213565


In [39]:
# Training summary attached to the fitted logistic regression model.
trainingSummaryLR = logrModel.summary

In [40]:
# ROC curve points (FPR, TPR) from the training summary; show() prints 20 rows.
trainingSummaryLR.roc.show()

+---+-------------------+
|FPR|                TPR|
+---+-------------------+
|0.0|                0.0|
|0.0|0.02564102564102564|
|0.0|0.05128205128205128|
|0.0|0.07692307692307693|
|0.0|0.10256410256410256|
|0.0| 0.1282051282051282|
|0.0|0.15384615384615385|
|0.0| 0.1794871794871795|
|0.0|0.20512820512820512|
|0.0|0.23076923076923078|
|0.0| 0.2564102564102564|
|0.0|0.28205128205128205|
|0.0| 0.3076923076923077|
|0.0| 0.3333333333333333|
|0.0|  0.358974358974359|
|0.0|0.38461538461538464|
|0.0|0.41025641025641024|
|0.0| 0.4358974358974359|
|0.0|0.46153846153846156|
|0.0|0.48717948717948717|
+---+-------------------+
only showing top 20 rows



In [41]:
# Area under the ROC curve from the training summary.
trainingSummaryLR.areaUnderROC

1.0

### Linear Regression

In [42]:
from pyspark.ml.regression import LinearRegression

# At most 10 iterations, regularisation strength 0.3, elastic-net mixing 0.8.
lr = LinearRegression().setMaxIter(10).setRegParam(0.3).setElasticNetParam(0.8)

# Reuses the training split produced for the logistic regression above.
lrModel = lr.fit(trainingData)

# Call form of print: valid on both Python 2 and Python 3.
print("Weights: %s Intercept: %s" % (lrModel.coefficients, lrModel.intercept))

Weights: [0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.

In [43]:
# Training summary attached to the fitted linear regression model.
trainingSummaryLLS = lrModel.summary

# Call form of print: valid on both Python 2 and Python 3.
print("numIterations: %s" % trainingSummaryLLS.totalIterations)

numIterations: 11


In [44]:
# First five residuals from the linear regression training summary.
trainingSummaryLLS.residuals.show(5)

+-------------------+
|          residuals|
+-------------------+
|0.34509718096855513|
|-0.2524915778203864|
|0.19613366690992318|
| 0.2149702950954704|
| 0.3148710175072822|
+-------------------+
only showing top 5 rows



In [45]:
# `sc` is expected to be pre-defined by the PySpark session; evaluating it
# confirms the context is still available at the start of this section.
sc

<pyspark.context.SparkContext at 0x7f3e5d740710>

In [46]:
# Remove any *.lck files under metastore_db/ before creating the SQL context
# (presumably stale locks left by an earlier session -- TODO confirm).
!rm -rf metastore_db/*.lck

from pyspark.sql import SQLContext
# SQL entry point bound to the existing SparkContext.
sqlc = SQLContext(sc)

In [47]:
from pyspark.mllib.util import MLUtils

# Read the LibSVM file, convert it to a DataFrame and switch its vector
# columns to the pyspark.ml vector type, all in one expression.
data = MLUtils.convertVectorColumnsToML(
    MLUtils.loadLibSVMFile(sc, "sample_libsvm_data.txt").toDF()
)

In [48]:
# Preview the first five rows of the reloaded DataFrame.
data.show(5)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|(692,[127,128,129...|  0.0|
|(692,[158,159,160...|  1.0|
|(692,[124,125,126...|  1.0|
|(692,[152,153,154...|  1.0|
|(692,[151,152,153...|  1.0|
+--------------------+-----+
only showing top 5 rows



## Evaluation

### Binary Classification

In [49]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import BinaryLogisticRegressionSummary

# Fixed seed so the 70/30 split -- and every metric computed from it in this
# section -- is reproducible across kernel restarts.
trainingData, testData = data.randomSplit([0.7, 0.3], seed=42)

# At most 10 iterations, regularisation strength 0.3, elastic-net mixing 0.8.
logr = LogisticRegression().setMaxIter(10).setRegParam(0.3).setElasticNetParam(0.8)

logrModel = logr.fit(trainingData)

# Call form of print: valid on both Python 2 and Python 3.
print("Weights: %s Intercept: %s" % (logrModel.coefficients, logrModel.intercept))

Weights: (692,[243,244,262,263,272,290,300,323,350,351,377,378,379,400,405,406,407,428,433,434,455,456,461,462,483,484,489,490,496,511,512,517,539,540,545,568,605,606,632],[-0.000339050678432,-5.5366680824e-06,-9.62816266919e-06,-0.00027433697171,-0.000362645776348,-9.36657429677e-06,-0.000327002184676,0.000104553610923,0.000154237573607,0.000151196195048,9.45942057644e-05,0.000223749607342,0.00019929149022,-1.10459264548e-05,0.000161416795613,0.00114844242452,0.000231993831837,-5.18597860749e-05,0.000236426801222,0.000778332744722,-8.44411190964e-05,-8.60709153075e-05,0.00022403449151,0.000229668781053,-0.000406434046326,-6.72101135132e-05,0.000221462851066,0.000139115480739,-5.80851145937e-05,-0.000823552541309,-0.000350876710739,0.000233110528072,-0.000636265842505,-0.00127800472718,5.93584906113e-05,-0.000382629540403,-7.02092445983e-05,-7.82607115707e-05,-3.99466577477e-05]) Intercept: 0.128768478595


In [50]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Score the held-out split with the fitted logistic regression model.
predictionsLogR = logrModel.transform(testData)

# Area under the ROC curve, computed from the raw prediction column.
evaluator = BinaryClassificationEvaluator().setLabelCol("label") \
                            .setRawPredictionCol("rawPrediction") \
                            .setMetricName("areaUnderROC")

roc = evaluator.evaluate(predictionsLogR)
# Call form of print: valid on both Python 2 and Python 3.
print(roc)

1.0


### Multiclass Classification

In [51]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, IndexToString, VectorIndexer
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import RandomForestClassificationModel

# Map the raw "label" values to indices; fitted on the full dataset.
labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(data)

# Translate predicted indices back to the original label values.
labelConverter = IndexToString(
    inputCol="prediction",
    outputCol="predictedLabel",
    labels=labelIndexer.labels,
)

# Index the feature vector; maxCategories=4 bounds which features are
# treated as categorical.
featureIndexer = VectorIndexer(
    inputCol="features",
    outputCol="indexedFeatures",
    maxCategories=4,
).fit(data)

# A small forest of three trees.
rfC = RandomForestClassifier(
    labelCol="indexedLabel",
    featuresCol="indexedFeatures",
    numTrees=3,
)

pipelineRFC = Pipeline(stages=[labelIndexer, featureIndexer, rfC, labelConverter])

# Train on the training split, then score the held-out split.
modelRFC = pipelineRFC.fit(trainingData)
predictionsRFC = modelRFC.transform(testData)

In [52]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Accuracy is measured on the indexed label, which is the column the
# classifier was trained against.
evaluator = MulticlassClassificationEvaluator().setLabelCol("indexedLabel") \
                                        .setPredictionCol("prediction") \
                                        .setMetricName("accuracy")

accuracy = evaluator.evaluate(predictionsRFC)

# Call form of print: valid on both Python 2 and Python 3.
print("Test Error = %s" % (1.0 - accuracy))

Test Error = 0.0


### Regression

In [53]:
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.regression import RandomForestRegressionModel

# Regress directly on the numeric "label" column using the indexed features.
rfR = RandomForestRegressor(featuresCol="indexedFeatures", labelCol="label")

# Two-stage pipeline: index the features, then fit the forest.
pipelineRFR = Pipeline(stages=[featureIndexer, rfR])

# Train on the training split, then score the held-out split.
modelRFR = pipelineRFR.fit(trainingData)
predictionsRFR = modelRFR.transform(testData)

In [54]:
from pyspark.ml.evaluation import RegressionEvaluator

# Root mean squared error between "label" and "prediction".
evaluator = RegressionEvaluator().setLabelCol("label") \
                                .setPredictionCol("prediction") \
                                .setMetricName("rmse")

rmse = evaluator.evaluate(predictionsRFR)

# Call form of print: valid on both Python 2 and Python 3.
print("Root Mean Squared Error (RMSE) = %s" % rmse)

Root Mean Squared Error (RMSE) = 0.0869718492623


### Logistic Regression

In [55]:
from pyspark.ml.classification import LogisticRegression

# Same estimator settings and training split as the Binary Classification
# cell earlier in this section; refitted here so the summary cells that
# follow have a model to read from.
logr = LogisticRegression().setMaxIter(10).setRegParam(0.3).setElasticNetParam(0.8)

logrModel = logr.fit(trainingData)

# Call form of print: valid on both Python 2 and Python 3.
print("Weights: %s Intercept: %s" % (logrModel.coefficients, logrModel.intercept))

Weights: (692,[243,244,262,263,272,290,300,323,350,351,377,378,379,400,405,406,407,428,433,434,455,456,461,462,483,484,489,490,496,511,512,517,539,540,545,568,605,606,632],[-0.000339050678432,-5.5366680824e-06,-9.62816266919e-06,-0.00027433697171,-0.000362645776348,-9.36657429677e-06,-0.000327002184676,0.000104553610923,0.000154237573607,0.000151196195048,9.45942057644e-05,0.000223749607342,0.00019929149022,-1.10459264548e-05,0.000161416795613,0.00114844242452,0.000231993831837,-5.18597860749e-05,0.000236426801222,0.000778332744722,-8.44411190964e-05,-8.60709153075e-05,0.00022403449151,0.000229668781053,-0.000406434046326,-6.72101135132e-05,0.000221462851066,0.000139115480739,-5.80851145937e-05,-0.000823552541309,-0.000350876710739,0.000233110528072,-0.000636265842505,-0.00127800472718,5.93584906113e-05,-0.000382629540403,-7.02092445983e-05,-7.82607115707e-05,-3.99466577477e-05]) Intercept: 0.128768478595


In [56]:
# Training summary of the fitted logistic regression model, followed by its
# area under the ROC curve.
trainingSummaryLR = logrModel.summary
trainingSummaryLR.areaUnderROC

1.0

In [57]:
# DataFrame of (threshold, F-Measure) pairs from the training summary.
fMeasure = trainingSummaryLR.fMeasureByThreshold

fMeasure.show(3)

+------------------+-------------------+
|         threshold|          F-Measure|
+------------------+-------------------+
| 0.783800770148522|0.05882352941176471|
|0.7830613181129249| 0.1142857142857143|
|0.7812208460751318|0.16666666666666669|
+------------------+-------------------+
only showing top 3 rows



In [58]:
from pyspark.sql import functions as F

# The maximum F-Measure is computed twice to show two equivalent aggregation
# styles: a {column: function} dict, then column expressions.
maxFMeasure = fMeasure.agg({"F-Measure": "max"}).head()[0]
print(maxFMeasure)
maxFMeasure = fMeasure.agg(F.max(F.col("F-Measure"))).head()[0]
print(maxFMeasure)

# Threshold of the first row whose F-Measure equals that maximum.
bestThreshold = fMeasure.where(F.col("F-Measure") == maxFMeasure).select("threshold").head()[0]
# Call form of print throughout: valid on both Python 2 and Python 3.
print(bestThreshold)

1.0
1.0
0.550683014429


In [59]:
# Precision-recall curve points, then precision as a function of threshold.
trainingSummaryLR.pr.show(3)
trainingSummaryLR.precisionByThreshold.show(3)

+--------------------+---------+
|              recall|precision|
+--------------------+---------+
|                 0.0|      1.0|
|0.030303030303030304|      1.0|
| 0.06060606060606061|      1.0|
+--------------------+---------+
only showing top 3 rows

+------------------+---------+
|         threshold|precision|
+------------------+---------+
| 0.783800770148522|      1.0|
|0.7830613181129249|      1.0|
|0.7812208460751318|      1.0|
+------------------+---------+
only showing top 3 rows



In [60]:
# Recall as a function of threshold, then the first ROC curve points.
trainingSummaryLR.recallByThreshold.show(3)
trainingSummaryLR.roc.show(3)

+------------------+--------------------+
|         threshold|              recall|
+------------------+--------------------+
| 0.783800770148522|0.030303030303030304|
|0.7830613181129249| 0.06060606060606061|
|0.7812208460751318| 0.09090909090909091|
+------------------+--------------------+
only showing top 3 rows

+---+--------------------+
|FPR|                 TPR|
+---+--------------------+
|0.0|                 0.0|
|0.0|0.030303030303030304|
|0.0| 0.06060606060606061|
+---+--------------------+
only showing top 3 rows



### Linear Regression

In [61]:
from pyspark.ml.regression import LinearRegression

# At most 10 iterations, regularisation strength 0.3, elastic-net mixing 0.8.
lr = LinearRegression().setMaxIter(10).setRegParam(0.3).setElasticNetParam(0.8)

lrModel = lr.fit(trainingData)

# Call form of print: valid on both Python 2 and Python 3.
print("Weights: %s Intercept: %s" % (lrModel.coefficients, lrModel.intercept))

Weights: [0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-6.24179431308e-06

In [62]:
# Training summary attached to the fitted linear regression model.
trainingSummaryLLS = lrModel.summary

# Metrics are printed in this order: explained variance, mean absolute
# error, mean squared error, R^2. The call form of print is valid on both
# Python 2 and Python 3.
print(trainingSummaryLLS.explainedVariance)

print(trainingSummaryLLS.meanAbsoluteError)

print(trainingSummaryLLS.meanSquaredError)

print(trainingSummaryLLS.r2)

0.0563067869876
0.26645263604
0.0744390790341
0.700229639518


In [63]:
# First three residuals, followed by the root mean squared error.
trainingSummaryLLS.residuals.show(3)

# Call form of print: valid on both Python 2 and Python 3.
print(trainingSummaryLLS.rootMeanSquaredError)

+-------------------+
|          residuals|
+-------------------+
|  0.252507432655962|
| 0.5787310312766889|
|-0.2733425522661087|
+-------------------+
only showing top 3 rows

0.272835259881


In [64]:
# `sc` is expected to be pre-defined by the PySpark session; evaluating it
# confirms the context is still available at the start of this section.
sc

<pyspark.context.SparkContext at 0x7f3e5d740710>

In [65]:
# Remove any *.lck files under metastore_db/ before creating the SQL context
# (presumably stale locks left by an earlier session -- TODO confirm).
!rm -rf metastore_db/*.lck

from pyspark.sql import SQLContext
# SQL entry point bound to the existing SparkContext; used below to build
# the toy DataFrame.
sqlc = SQLContext(sc)

## Saving and Loading Models

In [66]:
from pyspark.ml.linalg import Vectors

# Four-row toy dataset: two positive and two negative examples, each with a
# dense three-element feature vector. Column names are supplied as the schema.
df = sqlc.createDataFrame(
    [
        (1.0, Vectors.dense(1.0, 2.0, 3.0)),
        (1.0, Vectors.dense(2.0, 3.0, 4.0)),
        (0.0, Vectors.dense(-1.0, 1.0, 2.0)),
        (0.0, Vectors.dense(-2.0, 3.0, 5.0)),
    ],
    schema=["label", "features"],
)

In [67]:
from pyspark.ml.classification import LogisticRegression

# Estimator with default parameters; no setters are applied.
# Note: `lr` and `lrModel` previously held the linear regression objects.
lr = LogisticRegression()

# Fit on the toy DataFrame and show predictions for the same four rows.
lrModel = lr.fit(df)
lrModel.transform(df).show()

+-----+--------------+--------------------+--------------------+----------+
|label|      features|       rawPrediction|         probability|prediction|
+-----+--------------+--------------------+--------------------+----------+
|  1.0| [1.0,2.0,3.0]|[-18.070405604444...|[1.41945802370898...|       1.0|
|  1.0| [2.0,3.0,4.0]|[-38.987081234650...|[1.16983808020820...|       1.0|
|  0.0|[-1.0,1.0,2.0]|[19.2085506510249...|[0.99999999545187...|       0.0|
|  0.0|[-2.0,3.0,5.0]|[29.1902958840810...|[0.99999999999978...|       0.0|
+-----+--------------+--------------------+--------------------+----------+



In [68]:
# Overwrite any model left at this path by a previous run; a plain save()
# raises once the path already exists, which breaks Restart & Run All.
# Despite the ".parquet" suffix the target is a directory: a later cell
# reads lrModel.parquet/metadata/part-00000 from it.
lrModel.write().overwrite().save("lrModel.parquet")

In [69]:
from pyspark.ml.classification import LogisticRegressionModel

# Reload the model from the path it was saved to and score the same rows, so
# the output can be compared with the predictions of the in-memory model.
sameModel = LogisticRegressionModel.load("lrModel.parquet")
sameModel.transform(df).show()

+-----+--------------+--------------------+--------------------+----------+
|label|      features|       rawPrediction|         probability|prediction|
+-----+--------------+--------------------+--------------------+----------+
|  1.0| [1.0,2.0,3.0]|[-18.070405604444...|[1.41945802370898...|       1.0|
|  1.0| [2.0,3.0,4.0]|[-38.987081234650...|[1.16983808020820...|       1.0|
|  0.0|[-1.0,1.0,2.0]|[19.2085506510249...|[0.99999999545187...|       0.0|
|  0.0|[-2.0,3.0,5.0]|[29.1902958840810...|[0.99999999999978...|       0.0|
+-----+--------------+--------------------+--------------------+----------+



In [70]:
# Inspect the JSON metadata written alongside the saved model (class name,
# Spark version, uid and parameter map).
!cat lrModel.parquet/metadata/part-00000 

{"class":"org.apache.spark.ml.classification.LogisticRegressionModel","timestamp":1486475028230,"sparkVersion":"2.1.0","uid":"LogisticRegression_4846b56f70bf3abd7e5a","paramMap":{"predictionCol":"prediction","labelCol":"label","tol":1.0E-6,"fitIntercept":true,"probabilityCol":"probability","maxIter":100,"regParam":0.0,"standardization":true,"family":"auto","featuresCol":"features","threshold":0.5,"elasticNetParam":0.0,"rawPredictionCol":"rawPrediction","aggregationDepth":2}}
