In [None]:
from pyspark.sql import SparkSession

In [None]:
# Reuse the active SparkSession if one exists, otherwise start a new one.
# `SparkSession.builder` is the documented entry point; instantiating the
# `Builder` class by hand is not part of the public usage pattern.
session = SparkSession.builder.getOrCreate()

In [None]:
!wget https://github.com/zsellami/images_classification/raw/master/personalities.zip

In [None]:
!unzip personalities.zip

In [None]:
from pyspark.sql.functions import lit
from pyspark.ml.image import ImageSchema

# Directory holding the extracted images, one sub-folder per person.
# NOTE(review): absolute path; assumes the archive was unzipped from
# /home/jovyan -- confirm against the notebook's working directory.
img_dir = '/home/jovyan/personalities'

# Fixed seed so the train/test partition is the same on every run;
# without it the reported accuracy changes from run to run.
SPLIT_SEED = 42
SPLIT_WEIGHTS = [0.6, 0.4]  # 60% training, 40% test

# Read each person's images and tag them with a numeric class label:
# 1 = jobs, 0 = zuckerberg.
jobs_df = ImageSchema.readImages(img_dir + "/jobs").withColumn("label", lit(1))
zuckerberg_df = ImageSchema.readImages(img_dir + "/zuckerberg").withColumn("label", lit(0))

# Split per class so both classes appear in the training and the test set.
jobs_train, jobs_test = jobs_df.randomSplit(SPLIT_WEIGHTS, seed=SPLIT_SEED)
zuckerberg_train, zuckerberg_test = zuckerberg_df.randomSplit(SPLIT_WEIGHTS, seed=SPLIT_SEED)

# DataFrame for training the classification model.
# `union` replaces `unionAll`, which was deprecated in Spark 2.0.
train_df = jobs_train.union(zuckerberg_train)

# DataFrame for testing the classification model.
test_df = jobs_test.union(zuckerberg_test)

In [None]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from sparkdl import DeepImageFeaturizer

# Stage 1: turn each image into a feature vector with the InceptionV3 model.
featurizer = DeepImageFeaturizer(
    modelName="InceptionV3",
    inputCol="image",
    outputCol="features",
)

# Stage 2: logistic regression trained on the featurizer's output column.
lr = LogisticRegression(
    labelCol="label",
    maxIter=20,
    regParam=0.05,
    elasticNetParam=0.3,
)

# Chain both stages and fit them on the training images.
p = Pipeline(stages=[featurizer, lr])
p_model = p.fit(train_df)

In [None]:
# Score the held-out images with the fitted pipeline.
predictions = p_model.transform(test_df)

# Preview each image struct next to the class predicted for it.
predictions.select(["image", "prediction"]).show()

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# Apply the fitted pipeline to the held-out test set and preview the result.
df = p_model.transform(test_df)
df.show()

# Keep only the two columns the evaluator compares.
predictionAndLabels = df.select("prediction", "label")

# Accuracy = fraction of rows whose prediction equals the label.
# Column names are spelled out so the evaluator does not rely on defaults.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)

# The metric is computed on test_df, so it is reported as test accuracy
# (the original message mislabelled it as training accuracy).
print("Test set accuracy = " + str(evaluator.evaluate(predictionAndLabels)))