# Neural Networks - MultiLayerPerceptron

# Wine Dataset
## Wines are categorized into 3 customer segments based on the features listed below:
### Features are:

<ol>
    <li>Alcohol</li>
    <li>Malic acid</li>
    <li>Ash</li>
    <li>Alcalinity of ash</li>
    <li>Magnesium</li>
    <li>Total phenols</li>
    <li>Flavanoids</li>
    <li>Nonflavanoid phenols</li>
    <li>Proanthocyanins</li>
    <li>Color intensity</li>
    <li>Hue</li>
    <li>OD280/OD315 of diluted wines</li>
    <li>Proline </li>
</ol>


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import seaborn as sns
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Set up the environment for using pyspark
import findspark
findspark.init()

In [None]:
from pyspark.sql import SparkSession
from pyspark import SparkConf, SparkContext
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import StringIndexer, VectorIndexer

In [None]:
# Start (or reuse) the Spark session for this notebook and keep a handle on
# its SparkContext so the log level can be set.
session_builder = SparkSession.builder.appName("Wine Dataset MLP")
spark = session_builder.getOrCreate()
sc = spark.sparkContext
sc.setLogLevel("Error")

In [None]:
# Load the wine CSV: the header row supplies the column names and the column
# types are inferred from the data.
csv_reader = spark.read.format('csv').options(header='true', inferSchema='true')
sdf = csv_reader.load('../datasets/Wine.csv')

In [None]:
# Print the column names and inferred types of the loaded DataFrame.
sdf.printSchema()

In [None]:
# Preview the first 3 rows of the raw data.
sdf.show(3)

<font color = 'tomato'>
<h3>Data preparation</h3>
1. Convert the Customer_Segment (string) to label (number) <br>
2. Create features using Vector Assembler<br>
3. Create training and test set <br>
</font>

In [None]:
# Fit an indexer that maps each distinct Customer_Segment value to a numeric
# class index, written to a new "label" column.
# NOTE(review): StringIndexer's default ordering is by descending frequency,
# so index 0 is the most common segment, not necessarily segment 1 — check
# labelIndexer.labels for the actual mapping.
segment_indexer = StringIndexer(inputCol="Customer_Segment", outputCol="label")
labelIndexer = segment_indexer.fit(sdf)

In [None]:
# Apply the fitted indexer to the raw data, adding the numeric "label" column.
df = labelIndexer.transform(sdf)

In [None]:
# Preview the first 3 rows, now including the "label" column.
df.show(3)

In [None]:
# Feature columns = every column except the raw target (Customer_Segment) and
# its indexed form (label). Excluding them by name instead of slicing off the
# last two positions keeps this correct even if the target is not the final
# column of the CSV.
non_feature_cols = {"Customer_Segment", "label"}
fcols = [col_name for col_name in df.columns if col_name not in non_feature_cols]
fcols

In [None]:
# Pack the individual feature columns into a single vector column named
# 'features', then keep only the two columns the classifier needs.
from pyspark.ml.feature import VectorAssembler
vassemb = VectorAssembler(inputCols=fcols, outputCol='features')
ndf = vassemb.transform(df).select(['label', 'features'])

# Preview a few assembled rows without truncating the vectors.
ndf.show(3, truncate=False)

### Apply StandardScaler

In [None]:
from pyspark.ml.feature import StandardScaler
# Standardize every feature to zero mean AND unit variance.
# withMean=True alone only centres the data and leaves each feature on its
# original scale; the multilayer perceptron trained later is sensitive to
# feature scale, so unit-variance scaling is enabled as well.
# NOTE(review): assumes the wine features span very different ranges —
# confirm against the data.
scaler = StandardScaler(inputCol="features", outputCol="stdFeatures",
                        withStd=True, withMean=True)

In [None]:
# Fit the scaler on the assembled data.
# NOTE(review): the scaler is fitted on the full dataset, before the
# train/test split performed later, so test rows influence the scaling
# statistics — consider fitting on the training split only.
scalerModel = scaler.fit(ndf)

In [None]:
# Apply the fitted scaler, adding the "stdFeatures" column.
scaledData = scalerModel.transform(ndf)

In [None]:
# scaledData.select("stdFeatures").show(3, truncate = False)

In [None]:
# Split the rows with 0.7 / 0.3 weights into training and test sets; the
# fixed seed makes the split repeatable.
trainingData, testData = scaledData.randomSplit([0.7, 0.3], seed=100)

# Report how many rows landed in each split.
n_train = trainingData.count()
print("Training Dataset Count: " + str(n_train))
n_test = testData.count()
print("Test Dataset Count: " + str(n_test))

<font color = 'tomato'>
    <h3>Use MultiLayerPerceptron Classifier </h3>
    <ol>
        <li>Create MultiLayerPerceptronClassifier instance</li>
        <li>Make sure to define layers</li>
        <li>Fit the classifier on the scaled training features</li>
        <li>Transform the test data with the fitted model</li>
        <li>Evaluate the model using multi-class classification evaluator</li>
        <li>Print Test Data accuracy</li>
        <li>Create Confusion Matrix</li>
    </ol>
</font>

## MultiLayerPerceptron Classifier

In [None]:
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [None]:
# Network topology: 13 input features, two hidden layers (9 and 7 units),
# and 3 outputs, one per customer segment.
n_features, n_classes = 13, 3
hidden_layers = [9, 7]
layers = [n_features, *hidden_layers, n_classes]

In [None]:
# Configure the multilayer perceptron: it reads the scaled feature vectors,
# uses the layer sizes defined earlier, and is seeded for repeatable runs.
mlp_params = dict(
    featuresCol='stdFeatures',
    labelCol='label',
    maxIter=100,
    layers=layers,
    blockSize=3,
    seed=100,
)
trainer = MultilayerPerceptronClassifier(**mlp_params)

In [None]:
# Train the network on the training split.
model = trainer.fit(trainingData)

In [None]:
# Run the trained model on the held-out test split to obtain predictions.
result = model.transform(testData)

In [None]:
# Keep only the predicted and true class columns for evaluation.
predictionAndLabels = result.select("prediction", "label")

In [None]:
# Accuracy evaluator; the label/prediction column names are spelled out
# explicitly (they match the evaluator's defaults).
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)

In [None]:
# Compute accuracy on the test predictions and report it.
test_accuracy = evaluator.evaluate(predictionAndLabels)
print("Test set accuracy = " + str(test_accuracy))

In [None]:
# Display true label, predicted label, class probabilities and the unscaled
# features for up to 52 test rows.
# NOTE(review): 52 is presumably the size of the test split — confirm against
# the count printed after the split.
result.select('label', 'prediction', 'probability', 'features').show(52)

In [None]:
# Pull the true labels of the test rows into a pandas DataFrame.
y_true = result.select('label').toPandas()

In [None]:
# Pull the predicted labels of the test rows into a pandas DataFrame.
y_p = result.select('prediction').toPandas()

In [None]:
# Build the confusion matrix (rows = true class, columns = predicted class)
# and display it.
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_true=y_true, y_pred=y_p)
cm

In [None]:
# Name the confusion-matrix rows/columns after the actual customer segments.
# Label index i corresponds to labelIndexer.labels[i]; StringIndexer orders
# labels by descending frequency by default, so index 0 is not necessarily
# segment 1. Deriving the names from the fitted indexer keeps the axes
# correctly labeled instead of assuming "Seg-1", "Seg-2", "Seg-3" order.
cseg = ["Seg-" + str(segment) for segment in labelIndexer.labels]
cm_df = pd.DataFrame(cm, index=cseg, columns=cseg)

In [None]:
# Render the confusion matrix as an annotated heatmap.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm_df, annot=True, cmap='Blues', annot_kws={"size": 16}, ax=ax)
ax.set_title('Wine Customers Confusion Matrix\n')
ax.set_ylabel('True label')
ax.set_xlabel('Predicted label')
plt.show()