In [1]:
import pyspark

# Reuse the active SparkContext when this cell is re-run in a live kernel;
# calling the SparkContext constructor a second time raises
# "Cannot run multiple SparkContexts at once".
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setMaster('local[*]'))

In [2]:
# Shell escape: delete any *.lck files under metastore_db/ before creating the SQL context.
# NOTE(review): presumably these are stale lock files left by an earlier session -- confirm.
!rm -rf metastore_db/*.lck

from pyspark.sql import SQLContext
# SQL entry point bound to the SparkContext `sc`; `sqlc` is not referenced again in the cells shown here.
sqlc = SQLContext(sc)

In [3]:
from pyspark.mllib.util import MLUtils

# Load the LIBSVM-format sample file and turn the resulting RDD into a DataFrame.
libsvmDF = MLUtils.loadLibSVMFile(sc, "sample_libsvm_data.txt").toDF()

# Convert the vector columns to the pyspark.ml vector type used by the cells below.
data = MLUtils.convertVectorColumnsToML(libsvmDF)

In [4]:
# Peek at the first five rows (columns: features, label).
data.show(n=5)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|(692,[127,128,129...|  0.0|
|(692,[158,159,160...|  1.0|
|(692,[124,125,126...|  1.0|
|(692,[152,153,154...|  1.0|
|(692,[151,152,153...|  1.0|
+--------------------+-----+
only showing top 5 rows



## Evaluation

### Binary Classification

In [5]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import BinaryLogisticRegressionSummary

# Fixed seed so the 70/30 split -- and every metric computed from it in the
# cells below -- is identical on each Restart & Run All.
trainingData, testData = data.randomSplit([0.7, 0.3], seed=42)

# Logistic regression with elastic-net regularisation
# (regParam=0.3, elasticNetParam=0.8), capped at 10 iterations.
logr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

logrModel = logr.fit(trainingData)

print("Weights: %s Intercept: %s" % (logrModel.coefficients, logrModel.intercept))

Weights: (692,[235,243,244,271,272,273,300,301,323,328,350,351,356,372,373,378,379,385,400,405,406,407,413,427,428,433,434,435,455,456,461,462,483,484,489,490,496,511,512,517,539,540,568],[-9.92459585713e-05,-7.44271434606e-05,-0.00014946668662,-8.6967482907e-06,-0.000333320469687,-1.92548634575e-05,-0.000374657909184,-5.83975274944e-05,1.85147640777e-05,-0.000169305742381,4.08441117653e-05,9.64558496765e-05,-4.68086020898e-05,-8.783272557e-05,-3.61104292768e-05,0.000170455837293,0.000114090039273,-4.36014282826e-05,-8.45178184506e-05,8.85236144315e-05,0.000806253528489,0.000306768274123,-7.80793924807e-05,-5.83311217755e-06,-7.96741259275e-05,0.000293854701602,0.000921747131981,0.000120244897667,-0.000152916771459,-7.85612401905e-05,0.000168090224153,0.000879306698266,-0.000410431307045,-4.55008151983e-05,0.000159085881894,0.000189891435973,-9.17662279933e-05,-0.000490257348971,-0.00032795102547,0.000169783642593,-0.0007204108039,-0.000430967786898,-0.000331023310072]) Intercept: 0.19

In [6]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Score the held-out rows with the fitted logistic regression model.
predictionsLogR = logrModel.transform(testData)

# Area under the ROC curve, computed from the rawPrediction column against label.
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol="rawPrediction",
    labelCol="label",
    metricName="areaUnderROC",
)

roc = evaluator.evaluate(predictionsLogR)
print(roc)

1.0


### Multiclass Classification

In [7]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, IndexToString, VectorIndexer
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import RandomForestClassificationModel

# Map raw labels to indices. Fitted on the full `data` set rather than on
# trainingData. NOTE(review): presumably so every label gets an index even if
# it is missing from the training split -- confirm.
labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(data)

# Translate predicted indices back to the original label values.
labelConverter = IndexToString(inputCol="prediction",
                               outputCol="predictedLabel",
                               labels=labelIndexer.labels)

# Index features with at most 4 distinct values as categorical;
# also fitted on the full `data` set.
featureIndexer = VectorIndexer(inputCol="features",
                               outputCol="indexedFeatures",
                               maxCategories=4).fit(data)

# Random forests are stochastic, so pin the seed explicitly to make the
# fitted model and the reported test error repeatable between runs.
rfC = RandomForestClassifier(labelCol="indexedLabel",
                             featuresCol="indexedFeatures",
                             numTrees=3,
                             seed=42)

pipelineRFC = Pipeline(stages=[labelIndexer, featureIndexer, rfC, labelConverter])

modelRFC = pipelineRFC.fit(trainingData)

predictionsRFC = modelRFC.transform(testData)

In [8]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Accuracy of the random-forest pipeline's predictions against the indexed labels.
evaluator = MulticlassClassificationEvaluator(
    labelCol="indexedLabel",
    predictionCol="prediction",
    metricName="accuracy",
)

accuracy = evaluator.evaluate(predictionsRFC)

# Report the complement of accuracy as the test error.
print("Test Error = %s" % (1.0 - accuracy))

Test Error = 0.0


### Regression

In [9]:
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.regression import RandomForestRegressionModel

# Random forests are stochastic, so pin the seed explicitly to make the
# fitted model and the RMSE reported below repeatable between runs.
rfR = RandomForestRegressor(labelCol="label",
                            featuresCol="indexedFeatures",
                            seed=42)

# Reuses the `featureIndexer` fitted in the multiclass classification cell.
pipelineRFR = Pipeline(stages=[featureIndexer, rfR])

modelRFR = pipelineRFR.fit(trainingData)

predictionsRFR = modelRFR.transform(testData)

In [10]:
from pyspark.ml.evaluation import RegressionEvaluator

# Root mean squared error of the regression forest's predictions on the test split.
evaluator = RegressionEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="rmse",
)

rmse = evaluator.evaluate(predictionsRFR)

print("Root Mean Squared Error (RMSE) = %s" % rmse)

Root Mean Squared Error (RMSE) = 0.10648608225625411


### Logistic Regression

In [11]:
from pyspark.ml.classification import LogisticRegression

# Same estimator settings as the Binary Classification cell; the model is
# refitted here on trainingData so its training summary can be inspected below.
logr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

logrModel = logr.fit(trainingData)

print("Weights: %s Intercept: %s" % (logrModel.coefficients, logrModel.intercept))

Weights: (692,[235,243,244,271,272,273,300,301,323,328,350,351,356,372,373,378,379,385,400,405,406,407,413,427,428,433,434,435,455,456,461,462,483,484,489,490,496,511,512,517,539,540,568],[-9.92459585713e-05,-7.44271434606e-05,-0.00014946668662,-8.6967482907e-06,-0.000333320469687,-1.92548634575e-05,-0.000374657909184,-5.83975274944e-05,1.85147640777e-05,-0.000169305742381,4.08441117653e-05,9.64558496765e-05,-4.68086020898e-05,-8.783272557e-05,-3.61104292768e-05,0.000170455837293,0.000114090039273,-4.36014282826e-05,-8.45178184506e-05,8.85236144315e-05,0.000806253528489,0.000306768274123,-7.80793924807e-05,-5.83311217755e-06,-7.96741259275e-05,0.000293854701602,0.000921747131981,0.000120244897667,-0.000152916771459,-7.85612401905e-05,0.000168090224153,0.000879306698266,-0.000410431307045,-4.55008151983e-05,0.000159085881894,0.000189891435973,-9.17662279933e-05,-0.000490257348971,-0.00032795102547,0.000169783642593,-0.0007204108039,-0.000430967786898,-0.000331023310072]) Intercept: 0.19

In [12]:
# Summary object exposed by the fitted logistic regression model.
trainingSummaryLR = logrModel.summary
# Bare last expression, so the notebook displays the area under the ROC curve.
trainingSummaryLR.areaUnderROC

1.0

In [13]:
# DataFrame with one row per threshold and its F-Measure; `fMeasure` is reused in the next cell.
fMeasure = trainingSummaryLR.fMeasureByThreshold

fMeasure.show(n=3)

+------------------+-------------------+
|         threshold|          F-Measure|
+------------------+-------------------+
|0.7923937273050735|0.04878048780487806|
|0.7916716001125358|0.09523809523809523|
|0.7906437230222552|0.13953488372093023|
+------------------+-------------------+
only showing top 3 rows



In [14]:
from pyspark.sql import functions as F

# Largest F-Measure, computed two equivalent ways: dict-style agg, then F.max.
maxFMeasure = fMeasure.agg({"F-Measure": "max"}).first()[0]
print(maxFMeasure)
maxFMeasure = fMeasure.agg(F.max(F.col("F-Measure"))).first()[0]
print(maxFMeasure)

# Threshold of the first row whose F-Measure equals that maximum.
bestRows = fMeasure.filter(F.col("F-Measure") == maxFMeasure).select("threshold")
bestThreshold = bestRows.first()[0]
print(bestThreshold)

1.0
1.0
0.5653094538250638


In [15]:
# First three points of the precision-recall curve, then precision per threshold.
trainingSummaryLR.pr.show(n=3)
trainingSummaryLR.precisionByThreshold.show(n=3)

+------+---------+
|recall|precision|
+------+---------+
|   0.0|      1.0|
| 0.025|      1.0|
|  0.05|      1.0|
+------+---------+
only showing top 3 rows

+------------------+---------+
|         threshold|precision|
+------------------+---------+
|0.7923937273050735|      1.0|
|0.7916716001125358|      1.0|
|0.7906437230222552|      1.0|
+------------------+---------+
only showing top 3 rows



In [16]:
# First three rows of recall per threshold, then of the ROC curve (FPR, TPR).
trainingSummaryLR.recallByThreshold.show(n=3)
trainingSummaryLR.roc.show(n=3)

+------------------+------+
|         threshold|recall|
+------------------+------+
|0.7923937273050735| 0.025|
|0.7916716001125358|  0.05|
|0.7906437230222552| 0.075|
+------------------+------+
only showing top 3 rows

+---+-----+
|FPR|  TPR|
+---+-----+
|0.0|  0.0|
|0.0|0.025|
|0.0| 0.05|
+---+-----+
only showing top 3 rows



### Linear Regression

In [17]:
from pyspark.ml.regression import LinearRegression

# Linear regression with elastic-net regularisation
# (regParam=0.3, elasticNetParam=0.8), capped at 10 iterations.
lr = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

lrModel = lr.fit(trainingData)

print("Weights: %s Intercept: %s" % (lrModel.coefficients, lrModel.intercept))

Weights: [0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-4.0919617776e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-4.4

In [18]:
trainingSummaryLLS = lrModel.summary

# Print each summary metric on its own line, in this fixed order.
for metricName in ("explainedVariance", "meanAbsoluteError", "meanSquaredError", "r2"):
    print(getattr(trainingSummaryLLS, metricName))

0.05511378820975742
0.2669094074506451
0.07510961541348907
0.6958060575753693


In [19]:
# First three residuals from the linear regression summary, then its RMSE.
trainingSummaryLLS.residuals.show(n=3)

print(trainingSummaryLLS.rootMeanSquaredError)

+--------------------+
|           residuals|
+--------------------+
|-0.27172696247114375|
|    0.50242170147003|
| 0.29858531755853046|
+--------------------+
only showing top 3 rows

0.2740613351304577


In [20]:
# Shut down the SparkContext created in the first cell.
sc.stop()