In [1]:
# Locate the Spark installation before importing pyspark. findspark.init()
# raises ValueError when it cannot find Spark (SPARK_HOME unset and no
# installation in one of the locations it knows about).
import findspark
findspark.init()

from pyspark import SparkConf
from pyspark import SparkContext

# Connection settings for the standalone cluster used by this notebook.
# NOTE(review): the master URL is a hard-coded private address; change it to
# match the cluster you are running against.
conf = (
    SparkConf()
    .setMaster('spark://192.168.17.1:7077')
    .setAppName('mnist_logistic_regression')
)

# getOrCreate() rather than SparkContext(conf=conf): re-running this cell in a
# live kernel reuses the active context instead of raising
# "Cannot run multiple SparkContexts at once". On a fresh kernel the context
# is built from `conf` exactly as before; an already-active context keeps the
# configuration it was created with.
sc = SparkContext.getOrCreate(conf=conf)
print("SparkContext created.")
print(sc)

ValueError: Couldn't find Spark, make sure SPARK_HOME env is set or Spark is in an expected location (e.g. from homebrew installation).

In [3]:
import random as ran
import time

# Wall-clock start for this cell; the elapsed time is printed on its last line.
start_time = time.time()

def sample(p):
    """Draw one random point and report whether it falls inside the unit circle.

    ``p`` is ignored; the parameter exists only so the function can be handed
    to ``map``, which calls it once per element.

    Two values are drawn with ``ran.random()`` (x first, then y). Returns 1
    when ``x*x + y*y < 1`` (strictly inside the circle), otherwise 0.
    """
    x = ran.random()
    y = ran.random()
    inside_circle = x * x + y * y < 1
    return int(inside_circle)

# Number of random points to draw for the Monte Carlo estimate.
NUM_SAMPLES = 10*1000*1000

# Python 3 has no xrange; fall back to range there. On Python 2 this is a
# no-op, so the lazy xrange is still what gets parallelized.
try:
    xrange
except NameError:
    xrange = range

# map() is lazy: this line only defines the RDD, no sampling happens yet.
print("Mapping...")
mappedOutput = sc.parallelize(xrange(0, NUM_SAMPLES)).map(sample)

# reduce() is the action that actually runs the job; it adds up the 0/1 hits.
print("Reducing...")
count = mappedOutput.reduce(lambda a, b: a + b)

# hits / samples approximates pi/4 (quarter circle inside the unit square).
print("Pi is roughly %f" % (4.0 * count / NUM_SAMPLES))
print("--- %s seconds ---" % (time.time() - start_time))

Mapping...
Reducing...
Pi is roughly 3.142818
--- 16.6654689312 seconds ---


In [3]:
from pyspark.mllib.classification import LogisticRegressionWithLBFGS
from pyspark.mllib.util import MLUtils
from pyspark.mllib.evaluation import MulticlassMetrics

# Load the full labelled dataset in LIBSVM format; it is split into training
# and test subsets below.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.

data = MLUtils.loadLibSVMFile(sc, "/home/farmer/scripts/mnist_test.libsvm")

# Split data into training (60%) and test (40%).
# The seed is a plain int: the previous `11L` long literal is a SyntaxError on
# Python 3 and denotes the same value on Python 2.

training, test = data.randomSplit([0.6, 0.4], seed=11)
training.cache()

# Run training algorithm to build the 10-class model

model = LogisticRegressionWithLBFGS.train(training, numClasses=10)

# Pair the model's prediction for each test point with its true label:
# (prediction, label), both as floats.

predictionAndLabels = test.map(lambda lp: (float(model.predict(lp.features)), lp.label))

# Instantiate metrics object

metrics = MulticlassMetrics(predictionAndLabels)

# Overall statistics (no label argument: computed over all classes)

precision = metrics.precision()
recall = metrics.recall()
f1Score = metrics.fMeasure()
print("Summary Stats")
print("Precision = %s" % precision)
print("Recall = %s" % recall)
print("F1 Score = %s" % f1Score)

# Statistics by class. The label set is taken from the whole dataset, not just
# the test split, and reported in ascending label order.

labels = data.map(lambda lp: lp.label).distinct().collect()
for label in sorted(labels):
    print("Class %s precision = %s" % (label, metrics.precision(label)))
    print("Class %s recall = %s" % (label, metrics.recall(label)))
    print("Class %s F1 Measure = %s" % (label, metrics.fMeasure(label, beta=1.0)))

# Weighted stats. weightedRecall, weightedPrecision and
# weightedFalsePositiveRate are read as attributes; weightedFMeasure is a
# method that takes an optional beta.

print("Weighted recall = %s" % metrics.weightedRecall)
print("Weighted precision = %s" % metrics.weightedPrecision)
print("Weighted F(1) Score = %s" % metrics.weightedFMeasure())
print("Weighted F(0.5) Score = %s" % metrics.weightedFMeasure(beta=0.5))
print("Weighted false positive rate = %s" % metrics.weightedFalsePositiveRate)

Summary Stats
Precision = 0.901585703499
Recall = 0.901585703499
F1 Score = 0.901585703499
Class 0.0 precision = 0.927223719677
Class 0.0 recall = 0.939890710383
Class 0.0 F1 Measure = 0.933514246947
Class 1.0 precision = 0.952277657267
Class 1.0 recall = 0.960612691466
Class 1.0 F1 Measure = 0.956427015251
Class 2.0 precision = 0.928205128205
Class 2.0 recall = 0.868105515588
Class 2.0 F1 Measure = 0.897149938042
Class 3.0 precision = 0.880893300248
Class 3.0 recall = 0.898734177215
Class 3.0 F1 Measure = 0.889724310777
Class 4.0 precision = 0.899244332494
Class 4.0 recall = 0.934554973822
Class 4.0 F1 Measure = 0.916559691913
Class 5.0 precision = 0.874635568513
Class 5.0 recall = 0.817438692098
Class 5.0 F1 Measure = 0.845070422535
Class 6.0 precision = 0.927461139896
Class 6.0 recall = 0.934725848564
Class 6.0 F1 Measure = 0.931079323797
Class 7.0 precision = 0.901265822785
Class 7.0 recall = 0.901265822785
Class 7.0 F1 Measure = 0.901265822785
Class 8.0 precision = 0.829326923077
