# MNIST Dataset

## Init Spark 

In [None]:
import findspark
findspark.init()

from pyspark.sql import SparkSession

# Reuse the active Spark session if there is one; otherwise create one named "MNIST".
spark = SparkSession.builder.appName("MNIST").getOrCreate()

## Credits
- [abulbasar](https://github.com/abulbasar/pyspark-examples)
- [MNIST](http://yann.lecun.com/exdb/mnist/)

## Dataset

In [None]:
!wget http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
!wget http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
!wget http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
!wget http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz

In [None]:
!gzip -d *.gz

In [None]:
!mkdir mnist-data

In [None]:
!mv *-ubyte mnist-data

## Convert Dataset

- The conversion of the raw MNIST files to CSV follows [MNIST in CSV](https://pjreddie.com/projects/mnist-in-csv/).

In [None]:
def convert(imgf, labelf, outf, n):
    """Convert raw MNIST image/label files into a single CSV file.

    Each output row holds the label followed by the 28*28 pixel values of
    one image, all written as comma-separated decimal integers.

    Args:
        imgf: Path of the binary image file; its first 16 bytes are skipped.
        labelf: Path of the binary label file; its first 8 bytes are skipped.
        outf: Path of the CSV file to create (overwritten if it exists).
        n: Number of image/label pairs to convert.

    Raises:
        ValueError: If either input ends before ``n`` records were read.
            Rows written before the short read remain in ``outf``.
    """
    image_header_size = 16
    label_header_size = 8
    pixels_per_image = 28 * 28

    # Context managers guarantee all three files are closed even on error.
    with open(imgf, "rb") as image_file, \
            open(labelf, "rb") as label_file, \
            open(outf, "w") as out_file:
        image_file.read(image_header_size)
        label_file.read(label_header_size)

        for index in range(n):
            # One buffered read per record instead of one read per byte.
            label = label_file.read(1)
            pixels = image_file.read(pixels_per_image)
            if len(label) != 1 or len(pixels) != pixels_per_image:
                raise ValueError(
                    f"input ended after {index} of {n} records "
                    f"({imgf!r}, {labelf!r})"
                )
            # Iterating a bytes object yields ints, so no ord() is needed.
            # Rows are streamed out rather than accumulated in memory.
            out_file.write(",".join(map(str, (label[0], *pixels))) + "\n")

In [None]:
# Training split: 60000 image/label pairs -> mnist_train.csv
convert(
    imgf="./mnist-data/train-images-idx3-ubyte",
    labelf="./mnist-data/train-labels-idx1-ubyte",
    outf="./mnist-data/mnist_train.csv",
    n=60000,
)
# Test split: 10000 image/label pairs -> mnist_test.csv
convert(
    imgf="./mnist-data/t10k-images-idx3-ubyte",
    labelf="./mnist-data/t10k-labels-idx1-ubyte",
    outf="./mnist-data/mnist_test.csv",
    n=10000,
)

## Path Setup

In [None]:
import os
# Current working directory; the CSV-loading cells embed it in file:// URIs
# pointing at the mnist-data folder.
path = os.getcwd()
print(path)

## Read Data as DataFrames

In [None]:
# Load the header-less training CSV from the local filesystem, letting Spark
# infer the column types.
df_training = spark.read.csv(
    f"file://{path}/mnist-data/mnist_train.csv",
    header=False,
    inferSchema=True,
)

In [None]:
df_training.count()

## Transforming Features to Vectors

In [None]:
print("No of columns: ", len(df_training.columns), df_training.columns)

In [None]:
# Names of the 784 pixel columns, _c1 through _c784 (_c0 is not included).
feature_culumns = [f"_c{index}" for index in range(1, 785)]
print(feature_culumns)

In [None]:
from pyspark.ml.feature import VectorAssembler
# Assembler that packs the pixel columns into one "features" vector column.
vectorizer = VectorAssembler(outputCol="features", inputCols=feature_culumns)

# Keep only _c0 (renamed to "label") and the assembled vector, spread the rows
# over 15 partitions, and cache the result since later cells reuse it.
training = (
    vectorizer.transform(df_training)
    .select("_c0", "features")
    .toDF("label", "features")
    .repartition(15)
    .cache()
)
training.show()

## Visualize the Data

In [None]:
a = training.first().features.toArray()
type(a)

In [None]:
import matplotlib.pyplot as plt

plt.imshow(a.reshape(28, 28), cmap="Greys")

In [None]:
# Sample about 1% of the rows (without replacement, seed 1); keep at most 25.
images = training.sample(False, 0.01, 1).take(25)
fig, _ = plt.subplots(5, 5, figsize=(10, 10))
# zip pairs each panel with a row and stops at the shorter sequence, so a
# sample of fewer than 25 rows leaves trailing panels empty instead of
# raising IndexError.
for ax, r in zip(fig.axes, images):
    label = r.label
    features = r.features
    ax.imshow(features.toArray().reshape(28, 28), cmap="Greys")
    ax.set_title("True: " + str(label))

plt.tight_layout()

## Distribution of Digits

In [None]:
counts_df = training.groupBy("label").count().orderBy("label")

In [None]:
counts_df.show(10)

In [None]:
# Collect the per-label counts to the driver as a list of plain dicts.
counts = [
    {"label": row["label"], "count": row["count"]}
    for row in counts_df.collect()
]

In [None]:
counts

In [None]:
import pandas as pd
pd.DataFrame(counts).set_index("label").sort_index().plot.bar()

## Loading Test Data

In [None]:
# Load the header-less test CSV from the local filesystem, letting Spark
# infer the column types.
df_testing = spark.read.csv(
    f"file://{path}/mnist-data/mnist_test.csv",
    header=False,
    inferSchema=True,
)

# Reuse the training assembler so the test rows get the same "features"
# layout; keep only the label and the vector, and cache for later cells.
testing = (
    vectorizer.transform(df_testing)
    .select("_c0", "features")
    .toDF("label", "features")
    .cache()
)

In [None]:
testing.show(2)

## Logistic Regression

In [None]:
from pyspark.ml.classification import LogisticRegression
# Logistic-regression estimator that reads the "features" vector column and
# the "label" column produced by the vectorizing cells.
lr = LogisticRegression(featuresCol="features", 
                        labelCol="label", 
                        regParam=0.1,  # regularization strength
                        elasticNetParam=0.1,  # elastic-net mix: 0 = pure L2, 1 = pure L1
                        maxIter=10000)  # upper bound on optimizer iterations

In [None]:
lr_model = lr.fit(training)

## Predict 

In [None]:
from pyspark.sql.functions import expr

In [None]:
# Score the test set and add a boolean "matched" column that is true when the
# predicted class equals the label.
test_pred = (
    lr_model
    .transform(testing)
    .withColumn("matched", expr("label == prediction"))
)
test_pred.show()

## Evaluate 

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")

In [None]:
evaluator.evaluate(test_pred)

In [None]:
from pyspark.sql.functions import avg
# Per-digit accuracy: turn the boolean flag into 0/1 and average it per label.
(
    test_pred
    .withColumn("matched", expr("cast(matched as int)"))
    .groupby("label")
    .agg(avg("matched"))
    .orderBy("label")
    .show()
)

In [None]:
wrong_df = test_pred.filter("matched = false")

In [None]:
wrong_df.count()

In [None]:
images = wrong_df.take(36)

In [None]:
len(images)

In [None]:
# 6x6 grid of misclassified digits with their true and predicted labels.
fig, _ = plt.subplots(6, 6, figsize=(20, 20))
# zip stops at the shorter sequence, so fewer than 36 misclassified rows
# leaves the remaining panels empty instead of raising IndexError.
for ax, r in zip(fig.axes, images):
    label = r.label
    prediction = int(r.prediction)  # prediction is a float class index
    features = r.features
    ax.imshow(features.toArray().reshape(28, 28), cmap="Greys")
    ax.set_title(f"True: {label} / Pred: {prediction}")

plt.show()

## Stop Spark 

In [None]:
spark.stop()