In [None]:
import pyspark
# Reuse the active SparkContext when this cell is re-run. Constructing a second
# context in the same process raises
# "ValueError: Cannot run multiple SparkContexts at once".
# A fresh context is still created with master 'local[*]' when none is active.
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setMaster('local[*]'))

In [None]:
from pyspark.sql import SQLContext
# SQL entry point bound to the notebook's SparkContext.
sqlc = SQLContext(sparkContext=sc)

## Evaluation Metrics

### Binary Classification

In [None]:
from pyspark.mllib.classification import LogisticRegressionWithLBFGS
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.util import MLUtils

!wget https://raw.githubusercontent.com/apache/spark/master/data/mllib/sample_binary_classification_data.txt

In [None]:
# Load the downloaded LIBSVM-format file as an RDD.
data = MLUtils.loadLibSVMFile(sc, path="sample_binary_classification_data.txt")

# 60% / 40% random split; the fixed seed keeps the split repeatable.
(training, test) = data.randomSplit(weights=[0.6, 0.4], seed=11)

In [None]:
# Show the first loaded record as a sanity check.
data.take(num=1)

In [None]:
!rm -rf metastore_db/*.lck

model = LogisticRegressionWithLBFGS.train(training)

predictionAndLabels = test.map(lambda lp: (float(model.predict(lp.features)), lp.label))

metrics = BinaryClassificationMetrics(predictionAndLabels)

print("Area under PR = %s" % metrics.areaUnderPR)

print("Area under ROC = %s" % metrics.areaUnderROC)

In [None]:
# Print one (model output, true label) pair from the test set.
print(predictionAndLabels.take(num=1))

### Regression

In [None]:
from pyspark.mllib.regression import LinearRegressionWithSGD
from pyspark.mllib.evaluation import RegressionMetrics
from pyspark.mllib.linalg import DenseVector

!wget https://raw.githubusercontent.com/apache/spark/master/data/mllib/sample_linear_regression_data.txt

In [None]:
def parsePoint(line):
    """Convert one whitespace-separated text record into a ``LabeledPoint``.

    The first token is parsed as the float label. Every remaining token is
    split on ``':'`` and the piece after the first colon is parsed as a float
    feature value. The piece before the colon is ignored, so features are
    stored densely in the order they appear on the line.

    Args:
        line: One text line of the form ``"<label> <k>:<value> <k>:<value> ..."``.

    Returns:
        A ``LabeledPoint`` whose features are a ``DenseVector``.
    """
    tokens = line.split()
    label = float(tokens[0])
    feature_values = [float(token.split(':')[1]) for token in tokens[1:]]
    return LabeledPoint(label, DenseVector(feature_values))

# Read the downloaded file as an RDD of text lines, then parse each line.
data = sc.textFile(name="sample_linear_regression_data.txt")
parsedData = data.map(parsePoint)

# Show the first parsed record as a sanity check.
parsedData.take(num=1)

In [None]:
# Fit a linear model with SGD; no hyper-parameters are passed, so the
# library defaults apply.
model = LinearRegressionWithSGD.train(parsedData)

# Despite the variable name, each tuple is ordered (prediction, observed label).
# The model is evaluated on the same data it was trained on.
valuesAndPreds = parsedData.map(
    lambda point: (float(model.predict(point.features)), point.label)
)

metrics = RegressionMetrics(valuesAndPreds)

In [None]:
# (format string, RegressionMetrics attribute) pairs, in reporting order.
report = (
    ("MSE = %s", "meanSquaredError"),
    ("RMSE = %s", "rootMeanSquaredError"),
    ("R-squared = %s", "r2"),
    ("MAE = %s", "meanAbsoluteError"),
    ("Explained variance = %s", "explainedVariance"),
)

# Each attribute is looked up right before its line is printed, so the
# evaluation order matches one print statement per metric.
for template, attribute in report:
    print(template % getattr(metrics, attribute))

In [None]:
# Shut down the SparkContext created at the top of the notebook.
sc.stop()