# <u>Image classification using Spark's MLlib and the MNIST dataset</u>

##### Setup

In [1]:
# Import libraries.
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import pyspark
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.sql.functions import *
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import MultilayerPerceptronClassifier
from time import time

# Create (or reuse) the Spark session.
# getOrCreate() hands back the already-running session when this cell is
# executed again in the same kernel; constructing SparkContext() a second time
# would instead fail with "Cannot run multiple SparkContexts at once".
spark = SparkSession.builder.getOrCreate()
# Keep the `sc` handle available for the rest of the notebook.
sc = spark.sparkContext
print("UI Web URL:",sc.uiWebUrl)

UI Web URL: http://DESKTOP-JI84C36:4040


##### Prepare the Training Set

In [2]:
# Load the training CSV into a PySpark DataFrame.
#   header=True      -> the first row of the file holds the column names
#   inferSchema=True -> let Spark infer column types from the data
df_training = (spark
               .read
               .options(header = True, inferSchema = True)
               .csv("NormalizedDataset/mnist_training.csv"))

# Remove the index column ('_c0'). Dropping by *name* is a no-op when the
# column does not exist, so this line is safe to keep even if the CSV was
# written without an index column.
df_training = df_training.drop("_c0")

# Feature columns are every remaining column except 'label'.
# Filtering by name (instead of popping the last entry) does not depend on
# 'label' being the final column, and `columns` returns a fresh list, so the
# DataFrame's own schema metadata is never modified.
feature_columns = [name for name in df_training.columns if name != "label"]

# Assembler that packs the feature columns of each row into one 'features' vector.
vectorizer = VectorAssembler(inputCols=feature_columns, outputCol="features")

# Keep only the label and its assembled feature vector; cache the result
# because it is reused during training.
# (select() already yields the columns 'label' and 'features', so no renaming
# step is needed afterwards.)
training = (vectorizer
            .transform(df_training)
            .select("label", "features")
            .cache())

# For visualizing
#training.show()

##### Prepare the Testing Set

In [3]:
# Load the testing CSV into a PySpark DataFrame.
#   header=True      -> the first row of the file holds the column names
#   inferSchema=True -> let Spark infer column types from the data
df_testing = (spark
              .read
              .options(header = True, inferSchema = True)
              .csv("NormalizedDataset/mnist_testing.csv"))

# Remove the index column ('_c0'). Dropping by *name* is a no-op when the
# column does not exist, so this line is safe even without an index column.
df_testing = df_testing.drop("_c0")

# Reuse the `vectorizer` built in the training-set cell so both sets are
# assembled from the same feature columns, then keep only the label and its
# feature vector. select() already yields the columns 'label' and 'features',
# so no renaming step is needed afterwards.
testing = (vectorizer
           .transform(df_testing)
           .select("label", "features")
           .cache())

# For visualizing
#testing.show()

##### Train and Evaluate

In [4]:
# Network topology: 784 input nodes (one for each pixel in the image),
# 100 nodes in the first hidden layer, 20 in the second, and 10 in the output
# layer (one for each digit the image can represent: 0-9).
layers = [784, 100, 20, 10]

# Initialize the classifier.
perceptron = MultilayerPerceptronClassifier(
    layers=layers,
    maxIter=1000,
    blockSize=128,
    seed=1234,
)

# Initialize the evaluator: accuracy of the 'prediction' column against 'label'.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="label",
    predictionCol="prediction",
)

# Train the model, taking a wall-clock timestamp on either side of the fit.
start_time = time()
perceptron_model = perceptron.fit(training)
stop_time = time()

# Generate predictions for the testing set and report the results.
test_pred = perceptron_model.transform(testing)
accuracy = evaluator.evaluate(test_pred)
elapsed_seconds = stop_time - start_time  # time() values are in seconds
print("Accuracy:", accuracy)
print("Training Time: %d" % elapsed_seconds)

Accuracy: 0.8676
Training Time: 2251
