In [None]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import seaborn as sns
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Set up the environment for using pyspark.
# This cell must run before the cell that imports from `pyspark`.
import findspark
# Alternative: pass an explicit Spark installation path instead of relying on
# the default lookup.
#findspark.init("/usr/local/spark")
findspark.init()

In [None]:
from pyspark.sql import SparkSession
from pyspark import SparkConf, SparkContext
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer

In [None]:
# Start (or reuse) the Spark session for this notebook, expose its
# SparkContext, and set the context's log level.
spark = (
    SparkSession.builder
    .appName("Iris Dataset MLP")
    .getOrCreate()
)
sc = spark.sparkContext
sc.setLogLevel("Error")

## Load Data

In [None]:
# Read the CSV without treating the first row as a header, letting Spark infer
# the column types.
sdf = spark.read.csv('../datasets/iris.csv', header=False, inferSchema=True)

In [None]:
# Print the schema (column names and inferred types) of the loaded DataFrame
sdf.printSchema()

## Rename Columns

In [None]:
# Names to assign to the CSV columns, in file order; the last one ('class')
# is the target column.
cols = [
    'Sepal_Len',
    'Sepal_Width',
    'Petal_Len',
    'Petal_Width',
    'class',
]

In [None]:
# Replace the existing column names with the names in `cols`, positionally
sdf = sdf.toDF(*cols)

In [None]:
# Preview the first 3 rows with the new column names
sdf.show(3)

## Data Pre-processing

In [None]:
# Distinct values of the `class` column, i.e. the target categories
classes = sdf.select('class').distinct()
classes.show()

In [None]:
# Pull the distinct class names into a plain Python list.
# `classes` only has the `class` column, so no extra select is needed.
cllist = classes.toPandas()['class'].tolist()
cllist

In [None]:
# Build the indexer that maps each string in `class` to a numeric `label`,
# then fit it on the full dataset.
indexer = StringIndexer(inputCol="class", outputCol="label")
labelIndexer = indexer.fit(sdf)

In [None]:
# Apply the fitted indexer, adding the numeric `label` column
df = labelIndexer.transform(sdf)

In [None]:
# Preview the first 3 rows, now including `label`
df.show(3)

In [None]:
# Feature columns: every name in `cols` except the last one ('class')
fcols = cols[:-1]
fcols

In [None]:
# Pack the feature columns into a single `features` vector column and keep
# only the two columns used downstream: `label` and `features`.
from pyspark.ml.feature import VectorAssembler

vassemb = VectorAssembler(inputCols=fcols, outputCol='features')
ndf = vassemb.transform(df).select('label', 'features')

ndf.printSchema()

## Create training and test datasets

In [None]:
# 70/30 random split with a fixed seed, then report the size of each part
trainingData, testData = ndf.randomSplit([0.7, 0.3], seed=100)
print(f"Training Dataset Count: {trainingData.count()}")
print(f"Test Dataset Count: {testData.count()}")

## Use the Multilayer Perceptron Classifier

In [None]:
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [None]:
# Network topology for the MLP:
#   - input layer: one unit per feature column in `fcols`
#   - hidden layer: 4 units
#   - output layer: one unit per class known to the fitted label indexer
# With the four measurement columns and three classes used in this notebook
# this evaluates to [4, 4, 3].
layers = [len(fcols), 4, len(labelIndexer.labels)]

In [None]:
# Configure the MLP estimator; the fixed seed is passed through to the classifier
trainer = MultilayerPerceptronClassifier(
    featuresCol='features',
    labelCol='label',
    layers=layers,
    maxIter=100,
    blockSize=3,
    seed=100,
)

In [None]:
# Fit the MLP on the training split
model = trainer.fit(trainingData)

In [None]:
# Run the fitted model on the test split; the `prediction` and `label`
# columns of `result` are used by the evaluation cells below
result = model.transform(testData)

In [None]:
# Keep only the predicted and true label columns for evaluation
predictionAndLabels = result.select("prediction", "label")

In [None]:
# Accuracy evaluator; the label/prediction column names are spelled out
# (they match the evaluator's defaults).
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)

In [None]:
# Score the test-set predictions with the evaluator's configured metric
accuracy = evaluator.evaluate(predictionAndLabels)

In [None]:
# Report the test-set accuracy
print(f"Test set accuracy = {accuracy}")

## Confusion Matrix

In [None]:
# True labels of the test rows as a one-column pandas DataFrame
y_true = result.select('label').toPandas()

In [None]:
# Predicted labels of the test rows as a one-column pandas DataFrame
y_p = result.select('prediction').toPandas()

In [None]:
from sklearn.metrics import confusion_matrix

# Pin the row/column order to the indexer's label indices (0.0 .. n-1) so the
# matrix is always n x n, even when a class is missing from the test split or
# is never predicted. Without `labels=` the matrix would shrink in that case
# and no longer line up with the class names used for the heatmap axes.
label_ids = np.arange(len(labelIndexer.labels), dtype=float)

# `y_true` / `y_p` are one-column DataFrames; flatten them to 1-D arrays.
cm = confusion_matrix(np.ravel(y_true), np.ravel(y_p), labels=label_ids)
cm

In [None]:
# Rows/columns of `cm` are ordered by numeric label index, and index i maps to
# labelIndexer.labels[i]. `cllist` comes from a distinct() query whose row
# order is arbitrary, so using it here could attach the wrong class names to
# the axes; take the names from the fitted indexer instead.
class_names = list(labelIndexer.labels)
cm_df = pd.DataFrame(cm, index=class_names, columns=class_names)

In [None]:
# Draw the confusion matrix as an annotated heatmap, with the test accuracy
# in the title.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm_df, annot=True, cmap='Blues', annot_kws={"size": 16}, ax=ax)
ax.set_title('MLP \nAccuracy:{0:.3f}'.format(accuracy))
ax.set_ylabel('True label')
ax.set_xlabel('Predicted label')
plt.show()