In [152]:
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Load training data
data = spark.read.format("libsvm")\
    .load("data/sample_multiclass_classification_data.txt")

# Split the data into train and test
splits = data.randomSplit([0.6, 0.4], 1234)
train = splits[0]
test = splits[1]

# specify layers for the neural network:
# input layer of size 4 (features), two intermediate of size 5 and 4
# and output of size 3 (classes)
layers = [4, 3, 2, 3]

# create the trainer and set its parameters
trainer = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=128, seed=1234)

# train the model
model = trainer.fit(train)

# compute accuracy on the test set
result = model.transform(test)
predictionAndLabels = result.select("prediction", "label")
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
print("Test set accuracy = " + str(evaluator.evaluate(predictionAndLabels)))

Test set accuracy = 0.9682539682539683


In [153]:
data.show(200)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  1.0|(4,[0,1,2,3],[-0....|
|  1.0|(4,[0,1,2,3],[-0....|
|  1.0|(4,[0,1,2,3],[-0....|
|  1.0|(4,[0,1,2,3],[-0....|
|  0.0|(4,[0,1,2,3],[0.1...|
|  1.0|(4,[0,2,3],[-0.83...|
|  2.0|(4,[0,1,2,3],[-1....|
|  2.0|(4,[0,1,2,3],[-1....|
|  1.0|(4,[0,1,2,3],[-0....|
|  0.0|(4,[0,2,3],[0.611...|
|  0.0|(4,[0,1,2,3],[0.2...|
|  1.0|(4,[0,1,2,3],[-0....|
|  1.0|(4,[0,1,2,3],[-0....|
|  2.0|(4,[0,1,2,3],[-0....|
|  2.0|(4,[0,1,2,3],[-0....|
|  2.0|(4,[0,1,2,3],[-0....|
|  1.0|(4,[0,2,3],[-0.94...|
|  2.0|(4,[0,1,2,3],[-0....|
|  0.0|(4,[0,1,2,3],[0.1...|
|  2.0|(4,[0,1,2,3],[-0....|
|  0.0|(4,[0,1,2,3],[0.1...|
|  1.0|(4,[0,1,2,3],[-0....|
|  2.0|(4,[0,1,2,3],[-0....|
|  0.0|(4,[0,1,2,3],[0.5...|
|  2.0|(4,[0,2,3],[0.166...|
|  2.0|(4,[0,1,2,3],[0.1...|
|  2.0|(4,[0,1,2,3],[0.1...|
|  0.0|(4,[0,1,2,3],[-0....|
|  0.0|(4,[0,1,2,3],[-0....|
|  2.0|(4,[0,1,2,3],[-0....|
|  2.0|(4,[0,2,3],[-0.11...|
|  2.0|(4,[0,1

In [154]:
import random
import numpy as np
layers_grid = []
for layers in np.arange(1,7):
    for i in np.arange(0,10):
        lg = [4]
        for j in np.arange(0,layers):
            lg.append(random.randrange(2, 9))
        lg.append(3)    
        layers_grid.append(lg)    
layers_grid.append([4, 3, 2, 3])

In [155]:
layers_grid

[[4, 5, 3],
 [4, 6, 3],
 [4, 2, 3],
 [4, 7, 3],
 [4, 4, 3],
 [4, 8, 3],
 [4, 7, 3],
 [4, 5, 3],
 [4, 5, 3],
 [4, 6, 3],
 [4, 8, 2, 3],
 [4, 7, 3, 3],
 [4, 8, 7, 3],
 [4, 7, 2, 3],
 [4, 6, 8, 3],
 [4, 3, 7, 3],
 [4, 5, 6, 3],
 [4, 8, 4, 3],
 [4, 6, 8, 3],
 [4, 2, 6, 3],
 [4, 7, 4, 5, 3],
 [4, 4, 5, 3, 3],
 [4, 4, 6, 6, 3],
 [4, 5, 4, 4, 3],
 [4, 8, 6, 7, 3],
 [4, 3, 2, 5, 3],
 [4, 4, 7, 7, 3],
 [4, 6, 3, 7, 3],
 [4, 5, 5, 6, 3],
 [4, 7, 5, 3, 3],
 [4, 7, 4, 7, 8, 3],
 [4, 2, 2, 6, 6, 3],
 [4, 4, 4, 3, 5, 3],
 [4, 2, 6, 7, 7, 3],
 [4, 7, 4, 7, 2, 3],
 [4, 5, 4, 6, 4, 3],
 [4, 3, 4, 7, 7, 3],
 [4, 2, 8, 5, 6, 3],
 [4, 4, 7, 2, 2, 3],
 [4, 7, 7, 8, 5, 3],
 [4, 5, 2, 7, 3, 7, 3],
 [4, 3, 8, 5, 8, 7, 3],
 [4, 6, 8, 3, 2, 3, 3],
 [4, 6, 8, 5, 3, 3, 3],
 [4, 2, 6, 7, 6, 5, 3],
 [4, 7, 4, 3, 2, 4, 3],
 [4, 2, 7, 5, 8, 4, 3],
 [4, 3, 6, 3, 5, 4, 3],
 [4, 8, 7, 2, 7, 5, 3],
 [4, 3, 2, 5, 6, 4, 3],
 [4, 7, 8, 3, 2, 7, 6, 3],
 [4, 4, 2, 6, 6, 3, 7, 3],
 [4, 5, 8, 6, 5, 6, 4, 3],
 [4, 7, 5, 8, 5, 6,

In [156]:
from pyspark.ml import Pipeline
trainer = MultilayerPerceptronClassifier(maxIter=100,  blockSize=128, seed=1234)
stages = [trainer]
pipeline = Pipeline().setStages(stages)

In [157]:
from pyspark.ml.tuning import ParamGridBuilder
params = ParamGridBuilder()\
            .addGrid(trainer.layers, layers_grid)\
            .build()

In [158]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator()\
    .setMetricName("areaUnderROC")\
    .setRawPredictionCol("prediction")\
    .setLabelCol("label")

In [159]:
from pyspark.ml.tuning import TrainValidationSplit
train, test = data.randomSplit([0.7, 0.3])
tvs = TrainValidationSplit()\
        .setTrainRatio(0.75)\
        .setEstimatorParamMaps(params)\
        .setEstimator(pipeline)\
        .setEvaluator(evaluator)
tvs_fitted = tvs.fit(train)

In [70]:
tvs_fitted

TrainValidationSplitModel_79ba3ba04e4c

In [160]:
evaluator.evaluate(tvs_fitted.transform(test)) 

0.8636363636363636

In [151]:
tvs_fitted.save("nnet")