**Model Evaluation with PySpark**

In [1]:
#Import needed libraries
from pyspark.sql import SQLContext
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics

In [2]:
# Wrap the SparkContext `sc` in a SQLContext, then read the predictions CSV
# into a DataFrame. `sc` is not created in this notebook; presumably the
# PySpark kernel injects it -- verify before running on a fresh kernel.
# NOTE(review): the load path is an absolute, machine-specific location.
sqlContext = SQLContext(sc)
predictions = (sqlContext.read
               .format('com.databricks.spark.csv')
               .options(header='true', inferSchema='true')
               .load('file:///home/cloudera/Downloads/big-data-4/predictions.csv'))

In [3]:
# Build the evaluator that compares the 'prediction' column with the 'label'
# column of a predictions DataFrame.
# NOTE(review): the value of the 'precision' metric is what the next cell
# reports as the accuracy. Newer Spark releases (2.0+) appear to expose this
# metric as 'accuracy' instead -- confirm against the Spark version in use
# before changing the metric name.
evaluator = MulticlassClassificationEvaluator(
    predictionCol='prediction',
    labelCol='label',
    metricName='precision'
)

In [4]:
# Score the loaded predictions with the evaluator, then report the resulting
# rate together with its complement (the error rate).
accuracy = evaluator.evaluate(predictions)
print('Accuracy Rate = %g ' % (accuracy,))
print('Error Rate = %g ' % (1.0 - accuracy,))

Accuracy Rate = 0.754491 
Error Rate = 0.245509 


In [5]:
# Peek at the first five records of the DataFrame's underlying RDD
# (Resilient Distributed Dataset); each record is a Row with `prediction`
# and `label` fields.
predictions.rdd.take(num=5)

[Row(prediction=1.0, label=1.0),
 Row(prediction=1.0, label=1.0),
 Row(prediction=1.0, label=1.0),
 Row(prediction=1.0, label=1.0),
 Row(prediction=1.0, label=1.0)]

In [6]:
# Convert each Row into a plain tuple of its column values (here the
# prediction followed by the true label) and preview the first five.
predictions.rdd.map(lambda row: tuple(row)).take(5)

[(1.0, 1.0), (1.0, 1.0), (1.0, 1.0), (1.0, 1.0), (1.0, 1.0)]

In [7]:
# Initialize the multiclass metrics object.
# MulticlassMetrics consumes an RDD of (prediction, label) pairs, so the two
# columns are selected explicitly and in that order. Mapping the whole Row to
# a tuple would silently depend on the column order of the CSV file and would
# break (or swap prediction/label) if the file layout ever changed.
metrics = MulticlassMetrics(
    predictions.select('prediction', 'label').rdd.map(tuple)
)

In [8]:
# Materialize the confusion matrix as a NumPy array and transpose it.
# NOTE(review): Spark documents confusionMatrix() with predicted classes in
# the columns; the transpose flips that orientation -- confirm which axis is
# meant to hold the true labels when reading the result.
confusion_m = metrics.confusionMatrix().toArray().T

In [9]:
# Display the (transposed) confusion matrix computed in the previous cell;
# leaving the bare name as the last expression makes the notebook render it.
confusion_m

array([[ 134.,   53.],
       [  29.,  118.]])