In [1]:
import findspark
findspark.init()

import pyspark
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

import os
from PIL import Image
import pandas as pd
import numpy as np

## Part B: Structured Streaming

### Pre-Processing

In [23]:
#https://stackoverflow.com/questions/1120707/using-python-to-execute-a-command-on-every-file-in-a-folder
#https://stackoverflow.com/questions/7762948/how-to-convert-an-rgb-image-to-numpy-array

for filename in os.listdir('lfw'):
    name = 'lfw/'+filename
    img = Image.open( name )
    img.load()
    #data = np.asarray(img)
    n = 'lfw_np/'+filename[:-4]+'.csv'
    #person = np.asarray(filename[:-9])
    #data2 = data.tolist().append(person)
    #np.savetxt(n, data2,delimiter=',')
    #df = pd.DataFrame({'person': [filename[:-9]], 'rgb':[np.asarray(img)]})
    df = pd.DataFrame({'person': [filename[:-9]]})
    df.to_csv(n, index=False)

In [24]:
import random as rand

# seed for reproducability
rand.seed(100)

for filename in os.listdir('lfw_np'):
    r = randint(1, 2)
    current = 'lfw_np/'+filename
    if (r==1):
        new = 'lfw_batch/'+filename
    else:
        new = 'lfw_stream/'+filename
    # https://stackoverflow.com/questions/8858008/how-to-move-a-file-in-python
    os.rename(current, new)

### Batch Processing

In [25]:
# From https://docs.databricks.com/_static/notebooks/structured-streaming-python.html
from pyspark.sql.types import *
from pyspark.sql import SQLContext

inputPath = "lfw_batch/"
#sc.stop()
sc = pyspark.SparkContext(appName="stream")

sqlContext = SQLContext(sc)

# Static DataFrame representing data in the csv files
staticInputDF = (
  sqlContext
    .read
    .format("csv")
    .option("header", "true") 
    .load(inputPath)
)

display(staticInputDF)

DataFrame[person: string]

In [26]:
staticInputDF.show(5)

+--------------------+
|              person|
+--------------------+
|Sergei_Alexandrov...|
|Sabah_Al-Ahmad_Al...|
|Enrique_Haroldo_G...|
|Maria_Soledad_Alv...|
|Maria_Soledad_Alv...|
+--------------------+
only showing top 5 rows



In [27]:
from pyspark.sql.functions import *      # for window() function

staticCountsDF = (
  staticInputDF
    .groupBy(
       staticInputDF.person)    
    .count()
)
staticCountsDF.cache()

# Register the DataFrame as table 'static_counts'
staticCountsDF.createOrReplaceTempView("static_counts")



In [66]:
sqlContext.sql("select person,count from static_counts where count >50 order by count desc").show()

+-----------------+-----+
|           person|count|
+-----------------+-----+
|    George_W_Bush|  253|
|     Colin_Powell|  117|
|  Donald_Rumsfeld|   64|
|       Tony_Blair|   62|
|Gerhard_Schroeder|   53|
+-----------------+-----+



### Stream Processing

In [42]:
# From https://docs.databricks.com/_static/notebooks/structured-streaming-python.html
inputPath = "lfw_stream/"
schema = StructType([StructField("person",StringType())])
# Similar to definition of staticInputDF above, just using `readStream` instead of `read`
streamingInputDF = (
  sqlContext
    .readStream                       
    .format("csv")              # Set the schema of the JSON data
    .schema(schema)
    .option("maxFilesPerTrigger", 1)  # Treat a sequence of files as a stream by picking one file at a time
    .option("header", "true") 
    .load(inputPath)
)

# Same query as staticInputDF
streamingCountsDF = (                 
  streamingInputDF
    .groupBy(
      streamingInputDF.person)
    .count()
)

# Is this DF actually a streaming DF?
streamingCountsDF.isStreaming

True

In [49]:
sqlContext.setConf("spark.sql.shuffle.partitions", "2")  # keep the size of shuffles small

query = (
  streamingCountsDF
    .writeStream
    .format("memory")        # memory = store in-memory table (for testing only in Spark 2.0)
    .queryName("pcs")     # counts = name of the in-memory table
    .outputMode("complete")  # complete = all the counts should be in the table
    .start()
)

There are 6,664 csvs in the lfw_stream folder, so processing the stream will take a long time. First check that it is getting data at all:

In [51]:
sqlContext.sql("select * from pcs").show()

+---------------+-----+
|         person|count|
+---------------+-----+
|Aaron_Patterson|    1|
|      Abba_Eban|    1|
|  Aaron_Peirsol|    3|
|   Aaron_Tippin|    1|
|    Aaron_Guiel|    1|
+---------------+-----+



Check the data a few seconds later:

In [52]:
sqlContext.sql("select * from pcs").show()

+--------------------+-----+
|              person|count|
+--------------------+-----+
|     Aaron_Patterson|    1|
|           Abba_Eban|    1|
|       Aaron_Peirsol|    3|
|  Abdel_Madi_Shabneh|    1|
|        Aaron_Tippin|    1|
|         Aaron_Guiel|    1|
| Abdel_Nasser_Assidi|    1|
|Abdul_Majeed_Shob...|    1|
|        Abdul_Rahman|    1|
+--------------------+-----+



And again a few seconds after that:

In [53]:
sqlContext.sql("select * from pcs").show()

+--------------------+-----+
|              person|count|
+--------------------+-----+
|     Aaron_Patterson|    1|
|           Abba_Eban|    1|
|       Aaron_Peirsol|    3|
|  Abdel_Madi_Shabneh|    1|
|        Aaron_Tippin|    1|
|         Aaron_Guiel|    1|
| Abdel_Nasser_Assidi|    1|
|Abdul_Majeed_Shob...|    1|
|        Abdul_Rahman|    1|
|   Abdulaziz_Kamilov|    1|
|            Abdullah|    2|
+--------------------+-----+



There is now enough data to start querying based on count value:

In [54]:
sqlContext.sql("select * from pcs where count > 1").show()

+-------------+-----+
|       person|count|
+-------------+-----+
|Aaron_Peirsol|    3|
| Abdullah_Gul|    6|
|     Abdullah|    3|
+-------------+-----+



Check the counts again:

In [55]:
sqlContext.sql("select * from pcs where count > 1").show()

+-------------+-----+
|       person|count|
+-------------+-----+
| Abel_Pacheco|    3|
|Aaron_Peirsol|    3|
| Abdullah_Gul|    9|
|     Abdullah|    3|
+-------------+-----+



Check how many people have been evaluated so far:

In [56]:
sqlContext.sql("select count(person) from pcs").show()

+-------------+
|count(person)|
+-------------+
|           26|
+-------------+



After allowing the stream to run for 20 minutes...

In [57]:
sqlContext.sql("select count(person) from pcs").show()

+-------------+
|count(person)|
+-------------+
|          131|
+-------------+



In [58]:
sqlContext.sql("select sum(count) from pcs").show()

+----------+
|sum(count)|
+----------+
|       221|
+----------+



In ~25 minutes of streaming 221 files have been read, representing 131 people. Looking at the query filtering based on count: 

In [59]:
sqlContext.sql("select * from pcs where count > 1").show()

+--------------------+-----+
|              person|count|
+--------------------+-----+
|        Albert_Costa|    4|
|        Adrien_Brody|    7|
|       Aaron_Peirsol|    3|
|        Alice_Fisher|    2|
|           Ali_Naimi|    3|
|  Alexander_Losyukov|    3|
|           Alex_Sink|    3|
|        Alec_Baldwin|    2|
|      Akhmed_Zakayev|    2|
|    Alexander_Downer|    2|
|        Adam_Sandler|    3|
|       Ahmed_Chalabi|    2|
|         Ai_Sugiyama|    2|
|Alvaro_Silva_Cald...|    2|
|        Abel_Pacheco|    3|
|        Alvaro_Uribe|   21|
|        Alvaro_Noboa|    3|
|        Ali_Khamenei|    2|
|            Abdullah|    3|
|        Aldo_Paredes|    2|
+--------------------+-----+
only showing top 20 rows



Only one person has more than 10 pictures in the rows returned, but there are now more than 20 rows meeting the condition and not all the rows are displayed (e.g. Abdulla Gul is not showing)

In [60]:
sqlContext.sql("select * from pcs where person like 'Abdullah%'").show()

+--------------------+-----+
|              person|count|
+--------------------+-----+
|            Abdullah|    3|
|Abdullah_Ahmad_Ba...|    1|
|        Abdullah_Gul|    9|
+--------------------+-----+



Allowing this to run for several hours:

In [61]:
sqlContext.sql("select sum(count) from pcs").show()

+----------+
|sum(count)|
+----------+
|      6664|
+----------+



In [62]:
sqlContext.sql("select count(person) from pcs").show()

+-------------+
|count(person)|
+-------------+
|         3499|
+-------------+



The entire "stream" data set has been read, so it can be compared to the static one.

In [65]:
sqlContext.sql("select person,count from pcs where count >50 order by count desc").show()

+-----------------+-----+
|           person|count|
+-----------------+-----+
|    George_W_Bush|  277|
|     Colin_Powell|  119|
|       Tony_Blair|   82|
|  Donald_Rumsfeld|   57|
|Gerhard_Schroeder|   56|
+-----------------+-----+



Since the csvs were randomly assigned, the numbers per person differ between the two sets, but the fact that both queries return the same list of names makes sense.

In [67]:
sc.stop()

## Part C: SparkML
For this portion of the assignment, I am using the embeddings generated by the OpenFace trained CNN. In the OpenFace face recognition pipeline I am using in my project I train an SVM model on the embeddings. The embeddings with their labels are in train/embeddings.csv. 

In [119]:
from pyspark.sql.types import *
from pyspark.sql import SQLContext

sc.stop()
sc = pyspark.SparkContext(appName="ml")

sqlContext = SQLContext(sc)

In [120]:
inputPath = "train/embeddings.csv"
embeddings = sqlContext.read.options(header='true', inferSchema='true').csv(inputPath)
embeddings.show(5)

+-----+------------+------------+------------+------------+------------+------------+------------+------------+------------+-----------+------------+------------+------------+-----------+-----------+-----------+------------+------------+-----------+------------+------------+-----------+------------+------------+------------+-----------+-----------+------------+-----------+-----------+------------+------------+------------+------------+-----------+-----------+-----------+------------+-----------+-----------+------------+------------+------------+------------+-----------+-----------+------------+------------+------------+------------+-----------+------------+------------+-----------+------------+------------+------------+------------+------------+-----------+-----------+------------+------------+-----------+------------+------------+------------+-----------+-----------+------------+------------+------------+------------+------------+------------+-----------+------------+------------+----

In [121]:
# https://docs.databricks.com/spark/latest/mllib/binary-classification-mllib-pipelines.html#preprocess-data
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler

# Transform all features into a vector using VectorAssembler
assemblerInputs = ["e1","e2","e3","e4","e5","e6","e7","e8","e9","e10",
                   "e11","e12","e13","e14","e15","e16","e17","e18","e19","e20",
                  "e21","e22","e23","e24","e25","e26","e27","e28","e29","e30",
                  "e31","e32","e33","e34","e35","e36","e37","e38","e39","e40",
                  "e41","e42","e43","e44","e45","e46","e47","e48","e49","e50",
                  "e51","e52","e53","e54","e55","e56","e57","e58","e59","e60",
                  "e61","e62","e63","e64","e65","e66","e67","e68","e69","e70",
                  "e71","e72","e73","e74","e75","e76","e77","e78","e79","e80",
                  "e81","e82","e83","e84","e85","e86","e87","e88","e89","e90",
                  "e91","e92","e93","e94","e95","e96","e97","e98","e99","e100",
                  "e101","e102","e103","e104","e105","e106","e107","e108","e109","e110",
                  "e111","e112","e113","e114","e115","e116","e117","e118","e19","e120",
                  "e121","e122","e123","e124","e125","e126","e127","e128"]
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")

In [122]:
data = assembler.transform(embeddings)

In [123]:
stages=[assembler]
# Create a Pipeline.
pipeline = Pipeline(stages=stages)
# Run the feature transformations.
#  - fit() computes feature statistics as needed.
#  - transform() actually transforms the features.
pipelineModel = pipeline.fit(embeddings)
dataset = pipelineModel.transform(embeddings)

# Keep relevant columns
selectedcols = ["label", "features"] #+ cols
dataset = dataset.select(selectedcols)
display(dataset)

DataFrame[label: int, features: vector]

In [124]:
trainData, testData = dataset.randomSplit([0.6, 0.4])

The original data was sorted by label, so confirm that the labels are distributed between the train and test sets:

In [125]:
testData.groupBy("label").count().show()

+-----+-----+
|label|count|
+-----+-----+
|    1|   51|
|    3|   70|
|    5|  216|
|    4|   56|
|    2|   96|
+-----+-----+



In [126]:
trainData.groupBy("label").count().show()

+-----+-----+
|label|count|
+-----+-----+
|    1|   58|
|    3|   74|
|    5|  313|
|    4|   64|
|    2|  140|
+-----+-----+



### Decision Tree

In [133]:
from pyspark.ml.classification import DecisionTreeClassifier
import time

# Create initial Decision Tree Model
dt = DecisionTreeClassifier(maxDepth=11, labelCol="label")

# Train model with Training Data

start = time.time()
dtModel = dt.fit(trainData)
end = time.time()
print(end - start)

2.174104928970337


In [134]:
print("numNodes = ", dtModel.numNodes)
print("depth = ", dtModel.depth)

numNodes =  47
depth =  7


In [135]:
# Make predictions on test data using the Transformer.transform() method.
predictions = dtModel.transform(testData)

In [136]:
predictions.printSchema()

root
 |-- label: integer (nullable = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [137]:
predictions.select("label", "prediction", "probability").show()


+-----+----------+--------------------+
|label|prediction|         probability|
+-----+----------+--------------------+
|    1|       4.0|[0.0,0.0,0.0,0.0,...|
|    1|       1.0|[0.0,1.0,0.0,0.0,...|
|    1|       1.0|[0.0,1.0,0.0,0.0,...|
|    1|       1.0|[0.0,1.0,0.0,0.0,...|
|    1|       1.0|[0.0,1.0,0.0,0.0,...|
|    1|       1.0|[0.0,1.0,0.0,0.0,...|
|    1|       1.0|[0.0,1.0,0.0,0.0,...|
|    1|       1.0|[0.0,1.0,0.0,0.0,...|
|    1|       1.0|[0.0,1.0,0.0,0.0,...|
|    1|       1.0|[0.0,1.0,0.0,0.0,...|
|    1|       1.0|[0.0,1.0,0.0,0.0,...|
|    1|       1.0|[0.0,1.0,0.0,0.0,...|
|    1|       1.0|[0.0,1.0,0.0,0.0,...|
|    1|       1.0|[0.0,1.0,0.0,0.0,...|
|    1|       1.0|[0.0,1.0,0.0,0.0,...|
|    1|       1.0|[0.0,1.0,0.0,0.0,...|
|    1|       1.0|[0.0,1.0,0.0,0.0,...|
|    1|       3.0|[0.0,0.0,0.0,1.0,...|
|    1|       1.0|[0.0,1.0,0.0,0.0,...|
|    1|       1.0|[0.0,1.0,0.0,0.0,...|
+-----+----------+--------------------+
only showing top 20 rows



In [145]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Accuracy = %g " % (accuracy))

Accuracy = 0.903885 


In [146]:
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="weightedPrecision")
precision = evaluator.evaluate(predictions)
print("Precision = %g " % (precision))

Precision = 0.909685 


In [148]:
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="weightedRecall")
recall = evaluator.evaluate(predictions)
print("Recall = %g " % (recall))

Recall = 0.903885 


In [139]:
# Confusion Matrix
# https://mingchen0919.github.io/learning-apache-spark/decision-tree-classification.html
label_and_pred = dtModel.transform(testData).select('label', 'prediction')
label_and_pred.rdd.zipWithIndex().countByKey()


defaultdict(int,
            {Row(label=1, prediction=1.0): 43,
             Row(label=1, prediction=2.0): 3,
             Row(label=1, prediction=3.0): 2,
             Row(label=1, prediction=4.0): 2,
             Row(label=1, prediction=5.0): 1,
             Row(label=2, prediction=2.0): 89,
             Row(label=2, prediction=3.0): 6,
             Row(label=2, prediction=5.0): 1,
             Row(label=3, prediction=2.0): 5,
             Row(label=3, prediction=3.0): 65,
             Row(label=4, prediction=1.0): 4,
             Row(label=4, prediction=2.0): 1,
             Row(label=4, prediction=3.0): 2,
             Row(label=4, prediction=4.0): 49,
             Row(label=5, prediction=1.0): 7,
             Row(label=5, prediction=2.0): 1,
             Row(label=5, prediction=3.0): 4,
             Row(label=5, prediction=4.0): 8,
             Row(label=5, prediction=5.0): 196})

### Random Forest

In [167]:
from pyspark.ml.classification import RandomForestClassifier

# Create an initial RandomForest model.
rf = RandomForestClassifier(labelCol="label", featuresCol="features")

# Train model with Training Data
start = time.time()
rfModel = rf.fit(trainData)
end = time.time()
print(end - start)

2.4110472202301025


In [150]:
# Make predictions on test data using the Transformer.transform() method.
predictions = rfModel.transform(testData)

In [151]:
predictions.printSchema()

root
 |-- label: integer (nullable = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [152]:
predictions.select("label", "prediction", "probability").show()

+-----+----------+--------------------+
|label|prediction|         probability|
+-----+----------+--------------------+
|    1|       1.0|[0.0,0.3715238150...|
|    1|       1.0|[0.0,0.3812885647...|
|    1|       1.0|[0.0,0.8009500351...|
|    1|       5.0|[0.0,0.1575277101...|
|    1|       1.0|[0.0,0.4698117166...|
|    1|       1.0|[0.0,0.6830540723...|
|    1|       1.0|[0.0,0.6469463720...|
|    1|       1.0|[0.0,0.8129489232...|
|    1|       1.0|[0.0,0.8565055906...|
|    1|       1.0|[0.0,0.4453730202...|
|    1|       1.0|[0.0,0.7128711368...|
|    1|       5.0|[0.0,0.3670721367...|
|    1|       1.0|[0.0,0.5286222342...|
|    1|       1.0|[0.0,0.7311721611...|
|    1|       1.0|[0.0,0.6273129752...|
|    1|       1.0|[0.0,0.8291416040...|
|    1|       5.0|[0.0,0.1595484144...|
|    1|       1.0|[0.0,0.4987773061...|
|    1|       1.0|[0.0,0.8530333684...|
|    1|       1.0|[0.0,0.7426879232...|
+-----+----------+--------------------+
only showing top 20 rows



In [154]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Accuracy = %g " % (accuracy))

evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="weightedPrecision")
precision = evaluator.evaluate(predictions)
print("Precision = %g " % (precision))

evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="weightedRecall")
recall = evaluator.evaluate(predictions)
print("Recall = %g " % (recall))

Accuracy = 0.9591 
Precision = 0.95955 
Recall = 0.9591 


The Gradient Boost classifier currently only supports binary classification so it is not appropriate for this data set. SVM is not available. MLP Classifier throws ambiguous error.

In [155]:
# Confusion Matrix
label_and_pred = rfModel.transform(testData).select('label', 'prediction')
label_and_pred.rdd.zipWithIndex().countByKey()


defaultdict(int,
            {Row(label=1, prediction=1.0): 47,
             Row(label=1, prediction=3.0): 1,
             Row(label=1, prediction=5.0): 3,
             Row(label=2, prediction=2.0): 94,
             Row(label=2, prediction=3.0): 1,
             Row(label=2, prediction=5.0): 1,
             Row(label=3, prediction=2.0): 4,
             Row(label=3, prediction=3.0): 63,
             Row(label=3, prediction=4.0): 2,
             Row(label=3, prediction=5.0): 1,
             Row(label=4, prediction=1.0): 1,
             Row(label=4, prediction=4.0): 55,
             Row(label=5, prediction=2.0): 1,
             Row(label=5, prediction=3.0): 2,
             Row(label=5, prediction=4.0): 3,
             Row(label=5, prediction=5.0): 210})

### Multinomial Logistic Regression

In [189]:
# https://spark.apache.org/docs/latest/ml-classification-regression.html#multinomial-logistic-regression
from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)


In [190]:
# train the model
start = time.time()
lrmodel = lr.fit(trainData)
end = time.time()
print(end - start)

0.650860071182251


In [191]:
trainingSummary = lrmodel.summary

In [192]:
# for multiclass, we can inspect metrics on a per-label basis
print("False positive rate by label:")
for i, rate in enumerate(trainingSummary.falsePositiveRateByLabel):
    print("label %d: %s" % (i, rate))

print("True positive rate by label:")
for i, rate in enumerate(trainingSummary.truePositiveRateByLabel):
    print("label %d: %s" % (i, rate))

print("Precision by label:")
for i, prec in enumerate(trainingSummary.precisionByLabel):
    print("label %d: %s" % (i, prec))

print("Recall by label:")
for i, rec in enumerate(trainingSummary.recallByLabel):
    print("label %d: %s" % (i, rec))

print("F-measure by label:")
for i, f in enumerate(trainingSummary.fMeasureByLabel()):
    print("label %d: %s" % (i, f))

False positive rate by label:
label 0: 0.0
label 1: 0.03732809430255403
label 2: 0.0
label 3: 0.0
label 4: 0.5714285714285714
True positive rate by label:
label 0: 0.0
label 1: 0.8928571428571429
label 2: 0.0
label 3: 0.0
label 4: 1.0
Precision by label:
label 0: 0.0
label 1: 0.8680555555555556
label 2: 0.0
label 3: 0.0
label 4: 0.6198019801980198
Recall by label:
label 0: 0.0
label 1: 0.8928571428571429
label 2: 0.0
label 3: 0.0
label 4: 1.0
F-measure by label:
label 0: 0.0
label 1: 0.8802816901408451
label 2: 0.0
label 3: 0.0
label 4: 0.765281173594132


In [193]:
accuracy = trainingSummary.accuracy
falsePositiveRate = trainingSummary.weightedFalsePositiveRate
truePositiveRate = trainingSummary.weightedTruePositiveRate
fMeasure = trainingSummary.weightedFMeasure()
precision = trainingSummary.weightedPrecision
recall = trainingSummary.weightedRecall
print("Accuracy: %s\nFPR: %s\nTPR: %s\nF-measure: %s\nPrecision: %s\nRecall: %s"
      % (accuracy, falsePositiveRate, truePositiveRate, fMeasure, precision, recall))

Accuracy: 0.674884437596302
FPR: 0.2836411033274275
TPR: 0.674884437596302
F-measure: 0.558971408250665
Precision: 0.486172261293926
Recall: 0.674884437596302


In [194]:
# Make predictions on test data using the Transformer.transform() method.
predictions = lrmodel.transform(testData)

In [195]:
predictions.select("label", "prediction", "probability").show()

+-----+----------+--------------------+
|label|prediction|         probability|
+-----+----------+--------------------+
|    1|       5.0|[0.00169650794029...|
|    1|       5.0|[0.00151308069433...|
|    1|       5.0|[0.00166768256868...|
|    1|       5.0|[0.00149363998806...|
|    1|       5.0|[0.00171846804535...|
|    1|       5.0|[0.00153413562170...|
|    1|       5.0|[0.00162077523378...|
|    1|       5.0|[0.00157817344816...|
|    1|       5.0|[0.00176216081979...|
|    1|       5.0|[0.00157800338034...|
|    1|       5.0|[0.00165536312677...|
|    1|       5.0|[0.00164028223821...|
|    1|       5.0|[0.00155672362625...|
|    1|       5.0|[0.00142800467150...|
|    1|       5.0|[0.00153452235884...|
|    1|       5.0|[0.00153433540389...|
|    1|       5.0|[0.00126918593580...|
|    1|       5.0|[0.00191308443030...|
|    1|       5.0|[0.00163590734046...|
|    1|       5.0|[0.00177801012549...|
+-----+----------+--------------------+
only showing top 20 rows



In [196]:
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Accuracy = %g " % (accuracy))

evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="weightedPrecision")
precision = evaluator.evaluate(predictions)
print("Precision = %g " % (precision))

evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="weightedRecall")
recall = evaluator.evaluate(predictions)
print("Recall = %g " % (recall))


Accuracy = 0.593047 
Precision = 0.39937 
Recall = 0.593047 


In [197]:
# Confusion Matrix
label_and_pred = lrmodel.transform(testData).select('label', 'prediction')
label_and_pred.rdd.zipWithIndex().countByKey()

defaultdict(int,
            {Row(label=1, prediction=5.0): 51,
             Row(label=2, prediction=2.0): 74,
             Row(label=2, prediction=5.0): 22,
             Row(label=3, prediction=2.0): 16,
             Row(label=3, prediction=5.0): 54,
             Row(label=4, prediction=2.0): 1,
             Row(label=4, prediction=5.0): 55,
             Row(label=5, prediction=5.0): 216})

In [217]:
sc.stop()

### Comparison to TensorFlow

In [2]:
%run -i 'image_retraining/retrain.py' --image_dir 'train'

INFO:tensorflow:Looking for images in 'Colin_Powell'
INFO:tensorflow:Looking for images in 'Donald_Rumsfeld'
INFO:tensorflow:Looking for images in 'George_W_Bush'
INFO:tensorflow:Looking for images in 'Gerhard_Schroeder'
INFO:tensorflow:Looking for images in 'Tony_Blair'
>> Downloading inception-2015-12-05.tgz 100.0%
INFO:tensorflow:Successfully downloaded inception-2015-12-05.tgz 88931400 bytes.
Extracting file from  /tmp/imagenet/inception-2015-12-05.tgz
Model path:  /tmp/imagenet/classify_image_graph_def.pb
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/Colin_Powell/Colin_Powell_0001.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/Colin_Powell/Colin_Powell_0002.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/Colin_Powell/Colin_Powell_0003.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/Colin_Powell/Colin_Powell_0004.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/C

INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/Colin_Powell/Colin_Powell_0091.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/Colin_Powell/Colin_Powell_0093.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/Colin_Powell/Colin_Powell_0094.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/Colin_Powell/Colin_Powell_0095.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/Colin_Powell/Colin_Powell_0096.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/Colin_Powell/Colin_Powell_0098.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/Colin_Powell/Colin_Powell_0099.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/Colin_Powell/Colin_Powell_0100.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/Colin_Powell/Colin_Powell_0103.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck a

INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/Colin_Powell/Colin_Powell_0182.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/Colin_Powell/Colin_Powell_0183.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/Colin_Powell/Colin_Powell_0185.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/Colin_Powell/Colin_Powell_0187.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/Colin_Powell/Colin_Powell_0188.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/Colin_Powell/Colin_Powell_0189.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/Colin_Powell/Colin_Powell_0190.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/Colin_Powell/Colin_Powell_0191.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/Colin_Powell/Colin_Powell_0193.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck a

INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/Colin_Powell/Colin_Powell_0097.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/Colin_Powell/Colin_Powell_0109.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/Colin_Powell/Colin_Powell_0139.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/Colin_Powell/Colin_Powell_0145.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/Colin_Powell/Colin_Powell_0155.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/Colin_Powell/Colin_Powell_0171.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/Colin_Powell/Colin_Powell_0173.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/Colin_Powell/Colin_Powell_0192.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/Colin_Powell/Colin_Powell_0223.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck a

INFO:tensorflow:300 bottleneck files created.
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/Donald_Rumsfeld/Donald_Rumsfeld_0075.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/Donald_Rumsfeld/Donald_Rumsfeld_0078.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/Donald_Rumsfeld/Donald_Rumsfeld_0081.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/Donald_Rumsfeld/Donald_Rumsfeld_0082.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/Donald_Rumsfeld/Donald_Rumsfeld_0083.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/Donald_Rumsfeld/Donald_Rumsfeld_0084.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/Donald_Rumsfeld/Donald_Rumsfeld_0086.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/Donald_Rumsfeld/Donald_Rumsfeld_0088.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottle

INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/George_W_Bush/George_W_Bush_0025.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/George_W_Bush/George_W_Bush_0026.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/George_W_Bush/George_W_Bush_0027.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/George_W_Bush/George_W_Bush_0028.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/George_W_Bush/George_W_Bush_0029.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/George_W_Bush/George_W_Bush_0031.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/George_W_Bush/George_W_Bush_0032.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/George_W_Bush/George_W_Bush_0033.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/George_W_Bush/George_W_Bush_0034.jpg_inception_v3.txt
INFO:tensorflow:Cre

INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/George_W_Bush/George_W_Bush_0123.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/George_W_Bush/George_W_Bush_0124.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/George_W_Bush/George_W_Bush_0125.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/George_W_Bush/George_W_Bush_0126.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/George_W_Bush/George_W_Bush_0127.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/George_W_Bush/George_W_Bush_0128.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/George_W_Bush/George_W_Bush_0129.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/George_W_Bush/George_W_Bush_0130.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/George_W_Bush/George_W_Bush_0131.jpg_inception_v3.txt
INFO:tensorflow:Cre

INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/George_W_Bush/George_W_Bush_0216.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/George_W_Bush/George_W_Bush_0217.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/George_W_Bush/George_W_Bush_0218.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/George_W_Bush/George_W_Bush_0219.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/George_W_Bush/George_W_Bush_0220.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/George_W_Bush/George_W_Bush_0222.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/George_W_Bush/George_W_Bush_0223.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/George_W_Bush/George_W_Bush_0224.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/George_W_Bush/George_W_Bush_0225.jpg_inception_v3.txt
INFO:tensorflow:Cre

INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/George_W_Bush/George_W_Bush_0319.jpg_inception_v3.txt
INFO:tensorflow:600 bottleneck files created.
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/George_W_Bush/George_W_Bush_0320.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/George_W_Bush/George_W_Bush_0322.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/George_W_Bush/George_W_Bush_0323.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/George_W_Bush/George_W_Bush_0324.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/George_W_Bush/George_W_Bush_0325.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/George_W_Bush/George_W_Bush_0327.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/George_W_Bush/George_W_Bush_0328.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/George_W_Bush/George_W_Bush

INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/George_W_Bush/George_W_Bush_0411.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/George_W_Bush/George_W_Bush_0412.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/George_W_Bush/George_W_Bush_0414.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/George_W_Bush/George_W_Bush_0415.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/George_W_Bush/George_W_Bush_0416.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/George_W_Bush/George_W_Bush_0417.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/George_W_Bush/George_W_Bush_0418.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/George_W_Bush/George_W_Bush_0420.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/George_W_Bush/George_W_Bush_0421.jpg_inception_v3.txt
INFO:tensorflow:Cre

INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/George_W_Bush/George_W_Bush_0499.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/George_W_Bush/George_W_Bush_0500.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/George_W_Bush/George_W_Bush_0501.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/George_W_Bush/George_W_Bush_0502.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/George_W_Bush/George_W_Bush_0503.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/George_W_Bush/George_W_Bush_0504.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/George_W_Bush/George_W_Bush_0505.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/George_W_Bush/George_W_Bush_0506.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/George_W_Bush/George_W_Bush_0507.jpg_inception_v3.txt
INFO:tensorflow:Cre

INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/George_W_Bush/George_W_Bush_0413.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/George_W_Bush/George_W_Bush_0433.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/George_W_Bush/George_W_Bush_0439.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/George_W_Bush/George_W_Bush_0453.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/George_W_Bush/George_W_Bush_0463.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/George_W_Bush/George_W_Bush_0475.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/George_W_Bush/George_W_Bush_0491.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/George_W_Bush/George_W_Bush_0497.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/George_W_Bush/George_W_Bush_0509.jpg_inception_v3.txt
INFO:tensorflow:Cre

INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/Gerhard_Schroeder/Gerhard_Schroeder_0016.jpg_inception_v3.txt
INFO:tensorflow:900 bottleneck files created.
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/Gerhard_Schroeder/Gerhard_Schroeder_0018.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/Gerhard_Schroeder/Gerhard_Schroeder_0019.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/Gerhard_Schroeder/Gerhard_Schroeder_0020.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/Gerhard_Schroeder/Gerhard_Schroeder_0021.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/Gerhard_Schroeder/Gerhard_Schroeder_0022.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/Gerhard_Schroeder/Gerhard_Schroeder_0023.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/Gerhard_Schroeder/Gerhard_Schroeder_0024.jpg_inception_v3.txt
INFO:tensorflow:Cr

INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/Gerhard_Schroeder/Gerhard_Schroeder_0101.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/Gerhard_Schroeder/Gerhard_Schroeder_0102.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/Gerhard_Schroeder/Gerhard_Schroeder_0105.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/Gerhard_Schroeder/Gerhard_Schroeder_0106.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/Gerhard_Schroeder/Gerhard_Schroeder_0107.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/Gerhard_Schroeder/Gerhard_Schroeder_0108.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/Gerhard_Schroeder/Gerhard_Schroeder_0005.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/Gerhard_Schroeder/Gerhard_Schroeder_0025.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/Gerhard_S

INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/Tony_Blair/Tony_Blair_0063.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/Tony_Blair/Tony_Blair_0064.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/Tony_Blair/Tony_Blair_0065.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/Tony_Blair/Tony_Blair_0066.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/Tony_Blair/Tony_Blair_0067.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/Tony_Blair/Tony_Blair_0068.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/Tony_Blair/Tony_Blair_0070.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/Tony_Blair/Tony_Blair_0071.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/Tony_Blair/Tony_Blair_0072.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/Tony_Blair/Tony_Bl

INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/Tony_Blair/Tony_Blair_0134.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/Tony_Blair/Tony_Blair_0138.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/Tony_Blair/Tony_Blair_0014.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/Tony_Blair/Tony_Blair_0033.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/Tony_Blair/Tony_Blair_0053.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/Tony_Blair/Tony_Blair_0057.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/Tony_Blair/Tony_Blair_0059.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/Tony_Blair/Tony_Blair_0069.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/Tony_Blair/Tony_Blair_0080.jpg_inception_v3.txt
INFO:tensorflow:Creating bottleneck at /tmp/bottleneck/Tony_Blair/Tony_Bl

INFO:tensorflow:2018-03-22 18:36:54.509997: Step 270: Cross entropy = 1.033288
INFO:tensorflow:2018-03-22 18:36:54.615424: Step 270: Validation accuracy = 75.0% (N=100)
INFO:tensorflow:2018-03-22 18:36:55.553819: Step 280: Train accuracy = 68.0%
INFO:tensorflow:2018-03-22 18:36:55.555056: Step 280: Cross entropy = 1.013279
INFO:tensorflow:2018-03-22 18:36:55.654632: Step 280: Validation accuracy = 66.0% (N=100)
INFO:tensorflow:2018-03-22 18:36:56.568346: Step 290: Train accuracy = 71.0%
INFO:tensorflow:2018-03-22 18:36:56.569527: Step 290: Cross entropy = 1.013082
INFO:tensorflow:2018-03-22 18:36:56.660687: Step 290: Validation accuracy = 72.0% (N=100)
INFO:tensorflow:2018-03-22 18:36:57.641976: Step 300: Train accuracy = 73.0%
INFO:tensorflow:2018-03-22 18:36:57.643096: Step 300: Cross entropy = 1.019461
INFO:tensorflow:2018-03-22 18:36:57.749060: Step 300: Validation accuracy = 76.0% (N=100)
INFO:tensorflow:2018-03-22 18:36:58.721943: Step 310: Train accuracy = 72.0%
INFO:tensorflow:

INFO:tensorflow:2018-03-22 18:37:30.199865: Step 600: Validation accuracy = 74.0% (N=100)
INFO:tensorflow:2018-03-22 18:37:31.194024: Step 610: Train accuracy = 85.0%
INFO:tensorflow:2018-03-22 18:37:31.195351: Step 610: Cross entropy = 0.781089
INFO:tensorflow:2018-03-22 18:37:31.303268: Step 610: Validation accuracy = 73.0% (N=100)
INFO:tensorflow:2018-03-22 18:37:32.310478: Step 620: Train accuracy = 77.0%
INFO:tensorflow:2018-03-22 18:37:32.312070: Step 620: Cross entropy = 0.884044
INFO:tensorflow:2018-03-22 18:37:32.415002: Step 620: Validation accuracy = 72.0% (N=100)
INFO:tensorflow:2018-03-22 18:37:33.324898: Step 630: Train accuracy = 81.0%
INFO:tensorflow:2018-03-22 18:37:33.326517: Step 630: Cross entropy = 0.775146
INFO:tensorflow:2018-03-22 18:37:33.424484: Step 630: Validation accuracy = 67.0% (N=100)
INFO:tensorflow:2018-03-22 18:37:34.318641: Step 640: Train accuracy = 77.0%
INFO:tensorflow:2018-03-22 18:37:34.319918: Step 640: Cross entropy = 0.775182
INFO:tensorflow:

INFO:tensorflow:2018-03-22 18:38:05.366699: Step 940: Train accuracy = 84.0%
INFO:tensorflow:2018-03-22 18:38:05.368340: Step 940: Cross entropy = 0.676816
INFO:tensorflow:2018-03-22 18:38:05.462812: Step 940: Validation accuracy = 82.0% (N=100)
INFO:tensorflow:2018-03-22 18:38:06.409456: Step 950: Train accuracy = 93.0%
INFO:tensorflow:2018-03-22 18:38:06.411037: Step 950: Cross entropy = 0.571442
INFO:tensorflow:2018-03-22 18:38:06.513579: Step 950: Validation accuracy = 79.0% (N=100)
INFO:tensorflow:2018-03-22 18:38:07.445500: Step 960: Train accuracy = 92.0%
INFO:tensorflow:2018-03-22 18:38:07.446680: Step 960: Cross entropy = 0.542283
INFO:tensorflow:2018-03-22 18:38:07.541464: Step 960: Validation accuracy = 71.0% (N=100)
INFO:tensorflow:2018-03-22 18:38:08.468405: Step 970: Train accuracy = 83.0%
INFO:tensorflow:2018-03-22 18:38:08.469811: Step 970: Cross entropy = 0.691517
INFO:tensorflow:2018-03-22 18:38:08.569171: Step 970: Validation accuracy = 75.0% (N=100)
INFO:tensorflow:

INFO:tensorflow:2018-03-22 18:38:38.883986: Step 1270: Train accuracy = 87.0%
INFO:tensorflow:2018-03-22 18:38:38.885227: Step 1270: Cross entropy = 0.608695
INFO:tensorflow:2018-03-22 18:38:38.985019: Step 1270: Validation accuracy = 72.0% (N=100)
INFO:tensorflow:2018-03-22 18:38:39.896124: Step 1280: Train accuracy = 87.0%
INFO:tensorflow:2018-03-22 18:38:39.897335: Step 1280: Cross entropy = 0.624496
INFO:tensorflow:2018-03-22 18:38:39.996817: Step 1280: Validation accuracy = 80.0% (N=100)
INFO:tensorflow:2018-03-22 18:38:40.896563: Step 1290: Train accuracy = 79.0%
INFO:tensorflow:2018-03-22 18:38:40.897655: Step 1290: Cross entropy = 0.620115
INFO:tensorflow:2018-03-22 18:38:40.995919: Step 1290: Validation accuracy = 76.0% (N=100)
INFO:tensorflow:2018-03-22 18:38:41.893870: Step 1300: Train accuracy = 86.0%
INFO:tensorflow:2018-03-22 18:38:41.895620: Step 1300: Cross entropy = 0.614992
INFO:tensorflow:2018-03-22 18:38:41.996838: Step 1300: Validation accuracy = 79.0% (N=100)
INFO

INFO:tensorflow:2018-03-22 18:39:12.189251: Step 1600: Train accuracy = 94.0%
INFO:tensorflow:2018-03-22 18:39:12.190500: Step 1600: Cross entropy = 0.479279
INFO:tensorflow:2018-03-22 18:39:12.287242: Step 1600: Validation accuracy = 85.0% (N=100)
INFO:tensorflow:2018-03-22 18:39:13.190737: Step 1610: Train accuracy = 91.0%
INFO:tensorflow:2018-03-22 18:39:13.191993: Step 1610: Cross entropy = 0.529382
INFO:tensorflow:2018-03-22 18:39:13.282993: Step 1610: Validation accuracy = 78.0% (N=100)
INFO:tensorflow:2018-03-22 18:39:14.180979: Step 1620: Train accuracy = 88.0%
INFO:tensorflow:2018-03-22 18:39:14.182173: Step 1620: Cross entropy = 0.491252
INFO:tensorflow:2018-03-22 18:39:14.279836: Step 1620: Validation accuracy = 79.0% (N=100)
INFO:tensorflow:2018-03-22 18:39:15.197024: Step 1630: Train accuracy = 94.0%
INFO:tensorflow:2018-03-22 18:39:15.198517: Step 1630: Cross entropy = 0.482876
INFO:tensorflow:2018-03-22 18:39:15.292103: Step 1630: Validation accuracy = 84.0% (N=100)
INFO

INFO:tensorflow:2018-03-22 18:39:46.111388: Step 1930: Train accuracy = 92.0%
INFO:tensorflow:2018-03-22 18:39:46.112659: Step 1930: Cross entropy = 0.428218
INFO:tensorflow:2018-03-22 18:39:46.220724: Step 1930: Validation accuracy = 92.0% (N=100)
INFO:tensorflow:2018-03-22 18:39:47.151569: Step 1940: Train accuracy = 96.0%
INFO:tensorflow:2018-03-22 18:39:47.152763: Step 1940: Cross entropy = 0.440543
INFO:tensorflow:2018-03-22 18:39:47.243535: Step 1940: Validation accuracy = 79.0% (N=100)
INFO:tensorflow:2018-03-22 18:39:48.139123: Step 1950: Train accuracy = 88.0%
INFO:tensorflow:2018-03-22 18:39:48.140318: Step 1950: Cross entropy = 0.488224
INFO:tensorflow:2018-03-22 18:39:48.230373: Step 1950: Validation accuracy = 84.0% (N=100)
INFO:tensorflow:2018-03-22 18:39:49.111747: Step 1960: Train accuracy = 96.0%
INFO:tensorflow:2018-03-22 18:39:49.112845: Step 1960: Cross entropy = 0.429321
INFO:tensorflow:2018-03-22 18:39:49.207234: Step 1960: Validation accuracy = 90.0% (N=100)
INFO

INFO:tensorflow:2018-03-22 18:40:22.221634: Step 2260: Train accuracy = 94.0%
INFO:tensorflow:2018-03-22 18:40:22.223064: Step 2260: Cross entropy = 0.437133
INFO:tensorflow:2018-03-22 18:40:22.317575: Step 2260: Validation accuracy = 84.0% (N=100)
INFO:tensorflow:2018-03-22 18:40:23.336501: Step 2270: Train accuracy = 96.0%
INFO:tensorflow:2018-03-22 18:40:23.337881: Step 2270: Cross entropy = 0.406551
INFO:tensorflow:2018-03-22 18:40:23.447130: Step 2270: Validation accuracy = 80.0% (N=100)
INFO:tensorflow:2018-03-22 18:40:24.469506: Step 2280: Train accuracy = 96.0%
INFO:tensorflow:2018-03-22 18:40:24.470676: Step 2280: Cross entropy = 0.356825
INFO:tensorflow:2018-03-22 18:40:24.590335: Step 2280: Validation accuracy = 89.0% (N=100)
INFO:tensorflow:2018-03-22 18:40:25.591965: Step 2290: Train accuracy = 90.0%
INFO:tensorflow:2018-03-22 18:40:25.593241: Step 2290: Cross entropy = 0.463659
INFO:tensorflow:2018-03-22 18:40:25.691325: Step 2290: Validation accuracy = 83.0% (N=100)
INFO

INFO:tensorflow:2018-03-22 18:41:00.045726: Step 2590: Train accuracy = 94.0%
INFO:tensorflow:2018-03-22 18:41:00.047263: Step 2590: Cross entropy = 0.347984
INFO:tensorflow:2018-03-22 18:41:00.156551: Step 2590: Validation accuracy = 86.0% (N=100)
INFO:tensorflow:2018-03-22 18:41:01.104739: Step 2600: Train accuracy = 94.0%
INFO:tensorflow:2018-03-22 18:41:01.105964: Step 2600: Cross entropy = 0.382417
INFO:tensorflow:2018-03-22 18:41:01.205844: Step 2600: Validation accuracy = 89.0% (N=100)
INFO:tensorflow:2018-03-22 18:41:02.147334: Step 2610: Train accuracy = 95.0%
INFO:tensorflow:2018-03-22 18:41:02.148567: Step 2610: Cross entropy = 0.417857
INFO:tensorflow:2018-03-22 18:41:02.255997: Step 2610: Validation accuracy = 90.0% (N=100)
INFO:tensorflow:2018-03-22 18:41:03.237807: Step 2620: Train accuracy = 88.0%
INFO:tensorflow:2018-03-22 18:41:03.239013: Step 2620: Cross entropy = 0.422879
INFO:tensorflow:2018-03-22 18:41:03.346989: Step 2620: Validation accuracy = 84.0% (N=100)
INFO

INFO:tensorflow:2018-03-22 18:41:35.890694: Step 2920: Train accuracy = 94.0%
INFO:tensorflow:2018-03-22 18:41:35.891920: Step 2920: Cross entropy = 0.376147
INFO:tensorflow:2018-03-22 18:41:36.000378: Step 2920: Validation accuracy = 83.0% (N=100)
INFO:tensorflow:2018-03-22 18:41:36.963278: Step 2930: Train accuracy = 96.0%
INFO:tensorflow:2018-03-22 18:41:36.965461: Step 2930: Cross entropy = 0.306973
INFO:tensorflow:2018-03-22 18:41:37.073933: Step 2930: Validation accuracy = 84.0% (N=100)
INFO:tensorflow:2018-03-22 18:41:38.064746: Step 2940: Train accuracy = 92.0%
INFO:tensorflow:2018-03-22 18:41:38.066447: Step 2940: Cross entropy = 0.362412
INFO:tensorflow:2018-03-22 18:41:38.178973: Step 2940: Validation accuracy = 86.0% (N=100)
INFO:tensorflow:2018-03-22 18:41:39.147800: Step 2950: Train accuracy = 95.0%
INFO:tensorflow:2018-03-22 18:41:39.149163: Step 2950: Cross entropy = 0.342522
INFO:tensorflow:2018-03-22 18:41:39.250399: Step 2950: Validation accuracy = 82.0% (N=100)
INFO

INFO:tensorflow:2018-03-22 18:42:10.189410: Step 3250: Train accuracy = 97.0%
INFO:tensorflow:2018-03-22 18:42:10.190502: Step 3250: Cross entropy = 0.280166
INFO:tensorflow:2018-03-22 18:42:10.293647: Step 3250: Validation accuracy = 88.0% (N=100)
INFO:tensorflow:2018-03-22 18:42:11.217214: Step 3260: Train accuracy = 94.0%
INFO:tensorflow:2018-03-22 18:42:11.218588: Step 3260: Cross entropy = 0.343003
INFO:tensorflow:2018-03-22 18:42:11.312475: Step 3260: Validation accuracy = 89.0% (N=100)
INFO:tensorflow:2018-03-22 18:42:12.188840: Step 3270: Train accuracy = 97.0%
INFO:tensorflow:2018-03-22 18:42:12.190107: Step 3270: Cross entropy = 0.336765
INFO:tensorflow:2018-03-22 18:42:12.283995: Step 3270: Validation accuracy = 82.0% (N=100)
INFO:tensorflow:2018-03-22 18:42:13.150613: Step 3280: Train accuracy = 99.0%
INFO:tensorflow:2018-03-22 18:42:13.151772: Step 3280: Cross entropy = 0.269830
INFO:tensorflow:2018-03-22 18:42:13.243840: Step 3280: Validation accuracy = 88.0% (N=100)
INFO

INFO:tensorflow:2018-03-22 18:42:42.449978: Step 3580: Train accuracy = 92.0%
INFO:tensorflow:2018-03-22 18:42:42.451253: Step 3580: Cross entropy = 0.375878
INFO:tensorflow:2018-03-22 18:42:42.545957: Step 3580: Validation accuracy = 87.0% (N=100)
INFO:tensorflow:2018-03-22 18:42:43.427606: Step 3590: Train accuracy = 95.0%
INFO:tensorflow:2018-03-22 18:42:43.428840: Step 3590: Cross entropy = 0.326939
INFO:tensorflow:2018-03-22 18:42:43.526337: Step 3590: Validation accuracy = 88.0% (N=100)
INFO:tensorflow:2018-03-22 18:42:44.401339: Step 3600: Train accuracy = 96.0%
INFO:tensorflow:2018-03-22 18:42:44.402537: Step 3600: Cross entropy = 0.336386
INFO:tensorflow:2018-03-22 18:42:44.503047: Step 3600: Validation accuracy = 85.0% (N=100)
INFO:tensorflow:2018-03-22 18:42:45.385163: Step 3610: Train accuracy = 96.0%
INFO:tensorflow:2018-03-22 18:42:45.386394: Step 3610: Cross entropy = 0.310468
INFO:tensorflow:2018-03-22 18:42:45.482853: Step 3610: Validation accuracy = 82.0% (N=100)
INFO

INFO:tensorflow:2018-03-22 18:43:14.558731: Step 3910: Train accuracy = 95.0%
INFO:tensorflow:2018-03-22 18:43:14.559921: Step 3910: Cross entropy = 0.308832
INFO:tensorflow:2018-03-22 18:43:14.656937: Step 3910: Validation accuracy = 79.0% (N=100)
INFO:tensorflow:2018-03-22 18:43:15.516036: Step 3920: Train accuracy = 96.0%
INFO:tensorflow:2018-03-22 18:43:15.517190: Step 3920: Cross entropy = 0.311673
INFO:tensorflow:2018-03-22 18:43:15.614612: Step 3920: Validation accuracy = 84.0% (N=100)
INFO:tensorflow:2018-03-22 18:43:16.456932: Step 3930: Train accuracy = 96.0%
INFO:tensorflow:2018-03-22 18:43:16.458266: Step 3930: Cross entropy = 0.299962
INFO:tensorflow:2018-03-22 18:43:16.555865: Step 3930: Validation accuracy = 83.0% (N=100)
INFO:tensorflow:2018-03-22 18:43:17.408256: Step 3940: Train accuracy = 96.0%
INFO:tensorflow:2018-03-22 18:43:17.409395: Step 3940: Cross entropy = 0.337049
INFO:tensorflow:2018-03-22 18:43:17.504437: Step 3940: Validation accuracy = 84.0% (N=100)
INFO

Total run time: 17 minutes

In [5]:
%run -i 'label_image/label_image.py' --graph='/tmp/output_graph.pb' --labels='/tmp/output_labels.txt' --input_layer='Mul' --output_layer='final_result' --input_mean=128 --input_std=128 --image='test/Colin_Powell.jpg'

gerhard schroeder 0.547417
colin powell 0.35788
donald rumsfeld 0.0423357
tony blair 0.0319602
george w bush 0.0204071


In [4]:
%run -i 'label_image/label_image.py' --graph='/tmp/output_graph.pb' --labels='/tmp/output_labels.txt' --input_layer='Mul' --output_layer='final_result' --input_mean=128 --input_std=128 --image='test/George-W-Bush.jpg'

gerhard schroeder 0.56776
george w bush 0.289871
colin powell 0.111549
tony blair 0.0297545
donald rumsfeld 0.0010649


In [6]:
%run -i 'label_image/label_image.py' --graph='/tmp/output_graph.pb' --labels='/tmp/output_labels.txt' --input_layer='Mul' --output_layer='final_result' --input_mean=128 --input_std=128 --image='test/Tiger_Woods.jpg'

george w bush 0.555008
tony blair 0.234888
gerhard schroeder 0.13264
colin powell 0.0765645
donald rumsfeld 0.000900364


## Part D: GraphX and GraphFrames
### Pre-Processing

In [201]:
tables = []
for filename in os.listdir('lfw'):
    df = pd.DataFrame({'person': [filename[:-9]], 'image': [filename]})
    tables.append(df)

output = pd.DataFrame(columns=['person','image'])
for table in tables:
    output = output.append(table)
    
output.to_csv('graphdata/imagelabels.csv', index=False)

### GraphFrames
In analyzing my data set, I followed the Databricks tutorial for using [GraphFrames with Python](https://docs.databricks.com/spark/latest/graph-analysis/graphframes/user-guide-python.html)

In [235]:
from pyspark.sql.types import *
from pyspark.sql import SQLContext

#sc.stop()
sc = pyspark.SparkContext(appName="graph")
sqlContext = SQLContext(sc)


In [237]:
inputPath1 = "graphdata/people_images.csv"
vertices = sqlContext.read.options(header='true', inferSchema='true').csv(inputPath1)
vertices.show(5)

+---------------+------+
|             id|  type|
+---------------+------+
|  Aaron_Eckhart|person|
|    Aaron_Guiel|person|
|Aaron_Patterson|person|
|  Aaron_Peirsol|person|
|     Aaron_Pena|person|
+---------------+------+
only showing top 5 rows



In [238]:
inputPath2 = "graphdata/imagelabels.csv"
edges = sqlContext.read.options(header='true', inferSchema='true').csv(inputPath2)
edges.show(5)

+--------------------+---------------+------------+
|                 src|            dst|relationship|
+--------------------+---------------+------------+
|Aaron_Eckhart_000...|  Aaron_Eckhart|       label|
|Aaron_Guiel_0001.jpg|    Aaron_Guiel|       label|
|Aaron_Patterson_0...|Aaron_Patterson|       label|
|Aaron_Peirsol_000...|  Aaron_Peirsol|       label|
|Aaron_Peirsol_000...|  Aaron_Peirsol|       label|
+--------------------+---------------+------------+
only showing top 5 rows



In [232]:
vertices.printSchema()
edges.printSchema()


root
 |-- vertex: string (nullable = true)
 |-- type: string (nullable = true)

root
 |-- image: string (nullable = true)
 |-- person: string (nullable = true)
 |-- relationship: string (nullable = true)



In [239]:
# https://stackoverflow.com/questions/40894739/dataproc-jupyter-pyspark-notebook-unable-to-import-graphframes-package
import os
sc.addPyFile(os.path.expanduser('~/.ivy2/jars/graphframes_graphframes-0.5.0-spark2.1-s_2.11.jar'))
from graphframes import *

In [3]:
sc.stop()

Although the above import doesn't throw an error, there is an error when I try to create a graph. I get the same error (shown below) when trying to run the code as a python script. It is related to the jar for GraphFrames not being loaded, however adding this jar to my python path did not resolve the error. I thus had to run the code in a PySpark shell. The commands and screenshots of the output are below. A full copy of the shell inputs and outputs (complete with mistakes) are in pyspark_commands.rtf

In [4]:
%run -i 'graph.py'

Vertices:
+---------------+------+
|             id|  type|
+---------------+------+
|  Aaron_Eckhart|person|
|    Aaron_Guiel|person|
|Aaron_Patterson|person|
|  Aaron_Peirsol|person|
|     Aaron_Pena|person|
+---------------+------+
only showing top 5 rows

Edges:
+--------------------+---------------+------------+
|                 src|            dst|relationship|
+--------------------+---------------+------------+
|Aaron_Eckhart_000...|  Aaron_Eckhart|       label|
|Aaron_Guiel_0001.jpg|    Aaron_Guiel|       label|
|Aaron_Patterson_0...|Aaron_Patterson|       label|
|Aaron_Peirsol_000...|  Aaron_Peirsol|       label|
|Aaron_Peirsol_000...|  Aaron_Peirsol|       label|
+--------------------+---------------+------------+
only showing top 5 rows

Schemas:
root
 |-- id: string (nullable = true)
 |-- type: string (nullable = true)

root
 |-- src: string (nullable = true)
 |-- dst: string (nullable = true)
 |-- relationship: string (nullable = true)



Py4JJavaError: An error occurred while calling o97.loadClass.
: java.lang.ClassNotFoundException: org.graphframes.GraphFramePythonAPI
	at java.net.URLClassLoader.findClass(URLClassLoader.java:381)
	at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
	at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:745)


### To run the code

On the command line launch PySpark Shell with the command:
```
pyspark --packages graphframes:graphframes:0.5.0-spark2.1-s_2.11
```
Then load the csvs:
```
sqlContext = SQLContext(sc)
inputPath1 = "graphdata/people_images.csv"
vertices = sqlContext.read.options(header='true', inferSchema='true').csv(inputPath1)
vertices.show(5)  
inputPath2 = "graphdata/imagelabels.csv"
edges = sqlContext.read.options(header='true', inferSchema='true').csv(inputPath2)
edges.show(5)
vertices.printSchema()
edges.printSchema()
```
These output the same tables and schemas as shown above.

Next create the graph:
```
from graphframes import *
g = GraphFrame(vertices, edges)
g
```
![G](images/g_output.png)

Obtain basic metrics about the graph:
```
g.inDegrees.show()
g.edges.filter("relationship = 'label'").count()
g.edges.filter("relationship = 'errata'").count()
```
![metrics](images/metrics.png)

Look at the connected components:
```
sc.setCheckpointDir("checkpoints")
g.connectedComponents().show()
```
![connected components](images/connected.png)

And at the strongly connected components:
```
g.stronglyConnectedComponents(maxIter=10).show()
```
![strongly connected components](images/stronglyConnected.png)

Next test out the PageRank algorithm that comes with GraphFrames:
```
results = g.pageRank(resetProbability=0.15, tol=0.01)
results.vertices.show()
results.edges.show()
```
![pagerank](images/pagerank.png)

For context, Francisco Santos has 1 image in this data set, while Gray Davis has 26.

Finally perfom centrality analysis similar to what was done with the structured streaming. The person found to have the most images through structured streaming was George W. Bush.
```
g.edges.filter("dst = 'George_W_Bush'").count()
t = g.edges.groupby("dst").count().orderBy("count", ascending=False)
t.show()
```
![centrality](images/centrality.png)

Finally close the SQL Context and the PySpark shell:
```
sc.stop()
exit()
```
