In [45]:
# tested with :
# bigdl 0.7.2
# analytics-zoo 0.4.0
# pyspark 2.1.0

import os

# NOTE: the order of the imports below is preserved on purpose -- several of
# them are wildcard imports, and later imports may shadow earlier names.
from bigdl.nn.layer import Model
from bigdl.nn.criterion import *
from bigdl.optim.optimizer import *
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.sql.functions import col, udf
from pyspark.sql.types import *
from pyspark.sql.types import DoubleType
from pyspark.sql.types import StringType, ArrayType

from zoo.common.nncontext import *
from zoo.feature.image import *
from zoo.models.image.imageclassification import *
from zoo.pipeline.nnframes import *
from zoo.pipeline.api.net import Net
from zoo.pipeline.api.keras.models import Sequential
from zoo.pipeline.api.keras.layers import *
from zoo.pipeline.api.keras.metrics import AUC
from zoo.pipeline.api.net import Net
from zoo.pipeline.nnframes import NNEstimator
from zoo.pipeline.api.keras.objectives import BinaryCrossEntropy

# Spark bootstrap: build the configuration, name the application, then
# create the context, session and SQLContext handles used by later cells.
sparkConf = create_spark_conf()
sparkConf.setAppName("ChestXray_Inference")  # mutates the conf in place
sc = init_nncontext(sparkConf)
session_builder = SparkSession.builder.config(conf=sparkConf)
spark = session_builder.getOrCreate()
sqlContext = SQLContext(sc)

In [53]:
# data location
# test_image_path: the single PNG image (on HDFS) that is read for inference.
# label_path: the CSV (on HDFS) providing the "Image Index" and
#             "Finding Labels" columns used to build the label vectors.
test_image_path = "hdfs:///datasets/xray/all_images/00000001_000.png"
label_path = "hdfs:///datasets/xray/Data_Entry_2017.csv"

In [54]:
# labels
# The 14 finding classes; a class's position in this list is its index in
# the multi-hot label vector produced by text_to_label().
label_texts = ["Atelectasis", "Cardiomegaly", "Effusion", "Infiltration", "Mass", "Nodule", "Pneumonia",
               "Pneumothorax", "Consolidation", "Edema", "Emphysema", "Fibrosis", "Pleural_Thickening", "Hernia"]
# class name -> index into the label vector
label_map = {name: index for index, name in enumerate(label_texts)}

def text_to_label(text):
    """
    Convert a "|"-separated findings string into a multi-hot label vector.

    Parameters
    ----------
    text : str or None
        Findings such as "Cardiomegaly|Effusion". The marker "No Finding",
        blank tokens (empty string, trailing separator) and None (a null
        cell) contribute nothing to the vector.

    Returns
    -------
    list of float
        A list of len(label_texts) values, 1.0 at the index of every
        finding present in `text` and 0.0 elsewhere.

    Raises
    ------
    KeyError
        If `text` contains a finding that is not listed in label_texts.
    """
    arr = [0.0] * len(label_texts)
    if text is None:
        # A null cell carries no findings; return the all-zero vector
        # instead of failing on None.split inside the UDF.
        return arr
    for token in text.split("|"):
        finding = token.strip()
        # Skip blanks so "" or "Mass|" do not trigger a KeyError on ''.
        if not finding or finding == "No Finding":
            continue
        arr[label_map[finding]] = 1.0
    return arr

label_length = len(label_texts)

In [55]:
# create inference dataframe
# UDFs: findings text -> multi-hot label vector, and image struct -> bare
# file name (row[0] is the struct's first field; the file name is the join
# key against the label CSV).
getLabel = udf(text_to_label, ArrayType(DoubleType()))
getName = udf(lambda row: os.path.basename(row[0]), StringType())

# Image side: read the image and derive its file name as "Image Index".
test_imageDF = (
    NNImageReader.readImages(test_image_path, sc, resizeH=256, resizeW=256, image_codec=1)
    .withColumn("Image Index", getName(col('image')))
)
imageDF = test_imageDF.withColumnRenamed('Image Index', 'Image_Index')

# Label side: keep only the file name and findings text, add the label vector.
labelDF = (
    sqlContext.read.option('timestampFormat', 'yyyy/MM/dd HH:mm:ss ZZ')
    .load(label_path, format="csv", sep=",", inferSchema="true", header="true")
    .select("Image Index", "Finding Labels")
    .withColumn("label", getLabel(col('Finding Labels')))
    .withColumnRenamed('Image Index', 'Image_Index')
)
# 'Image Index' is already renamed on labelDF (renaming it again was a no-op),
# so only 'Finding Labels' still needs an underscore name.
labelDF1 = labelDF.withColumnRenamed('Finding Labels', 'Finding_Labels')

# Despite its name, trainDF is the frame fed to inference in this notebook.
trainDF = imageDF.join(labelDF1, on="Image_Index", how="inner")

# printSchema() prints the schema itself and returns None; wrapping it in
# print() only added a stray "None" line to the output.
trainDF.printSchema()
trainDF.show()

root
 |-- Image_Index: string (nullable = true)
 |-- image: struct (nullable = true)
 |    |-- origin: string (nullable = true)
 |    |-- height: integer (nullable = false)
 |    |-- width: integer (nullable = false)
 |    |-- nChannels: integer (nullable = false)
 |    |-- mode: integer (nullable = false)
 |    |-- data: binary (nullable = false)
 |-- Finding_Labels: string (nullable = true)
 |-- label: array (nullable = true)
 |    |-- element: double (containsNull = true)

None
+----------------+--------------------+--------------+--------------------+
|     Image_Index|               image|Finding_Labels|               label|
+----------------+--------------------+--------------+--------------------+
|00000001_000.png|[hdfs://gnamenode...|  Cardiomegaly|[0.0, 1.0, 0.0, 0...|
+----------------+--------------------+--------------+--------------------+



In [56]:
# load resnet-50 analytics zoo keras model
# Two HDFS paths are passed to Net.load: the .bigdl file and its companion
# .bin file (presumably model definition and weights -- confirm).
resnet_zoo_model = Net.load(
    "hdfs:///user/leelau/xray/save_model/model.bigdl",
    "hdfs:///user/leelau/xray/save_model/model.bin"
)

In [57]:
# Sanity check: display the Python type of the object returned by Net.load.
type(resnet_zoo_model)

zoo.pipeline.api.keras.base.ZooKerasLayer

In [58]:
# inference using the resnet model as-is when it is used on trainDF
def predict(model, inputdf, image_feature_col="image", batchsize=4):
    """
    Run every row of `inputdf` through `model` and return the result.

    Parameters
    ----------
    model : the loaded network wrapped in an NNModel for inference
    inputdf : DataFrame holding the images to score
    image_feature_col : name of the column containing the image struct
    batchsize : batch size configured on the NNModel

    Returns
    -------
    The DataFrame produced by NNModel.transform(inputdf).
    """
    # Image preprocessing applied before the network: centre-crop to
    # 224x224, subtract per-channel means (presumably the ImageNet means --
    # confirm against the training pipeline), then convert to a tensor.
    preprocessing_steps = [
        RowToImageFeature(),
        ImageCenterCrop(224, 224),
        ImageChannelNormalize(123.68, 116.779, 103.939),
        ImageMatToTensor(),
        ImageFeatureToTensor(),
    ]
    preprocessing = ChainedPreprocessing(preprocessing_steps)

    nn_model = (
        NNModel(model, preprocessing)
        .setFeaturesCol(image_feature_col)
        .setBatchSize(batchsize)
    )
    return nn_model.transform(inputdf)

def show_prediction_output(predDF):
    """
    Display the first prediction row, then the length and contents of its
    prediction array.

    Parameters
    ----------
    predDF : DataFrame whose rows expose a `prediction` field.

    Returns
    -------
    None. Everything is written to stdout. If `predDF` has no rows, a short
    notice is printed instead of raising IndexError.
    """
    # show() prints the table itself and returns None, so it must not be
    # wrapped in print() (that emitted a stray "None").
    predDF.show(1)

    # Fetch only the first row, once. collect() materialises every row and
    # was previously executed twice just to read row 0.
    first_rows = predDF.take(1)
    if not first_rows:
        print("prediction DataFrame is empty")
        return

    prediction = first_rows[0].prediction
    # str.format keeps the output identical whether print is a statement
    # (Python 2) or a function (Python 3).
    print("length of prediction array : {}".format(len(prediction)))
    print("prediction : {}".format(prediction))

    
# check output of the resnet model as-is using the inference dataframe
# (predict() is called with its default feature column and batch size; the
# first predicted row is then printed by show_prediction_output()).
output = predict(resnet_zoo_model, trainDF)
show_prediction_output(output)

creating: createRowToImageFeature
creating: createImageCenterCrop
creating: createImageChannelNormalize
creating: createImageMatToTensor
creating: createImageFeatureToTensor
creating: createChainedPreprocessing
creating: createTensorToSample
creating: createChainedPreprocessing
creating: createNNModel
+----------------+--------------------+--------------+--------------------+--------------------+
|     Image_Index|               image|Finding_Labels|               label|          prediction|
+----------------+--------------------+--------------+--------------------+--------------------+
|00000001_000.png|[hdfs://gnamenode...|  Cardiomegaly|[0.0, 1.0, 0.0, 0...|[0.051492076, 0.9...|
+----------------+--------------------+--------------+--------------------+--------------------+

None
('length of prediction array : ', 14)
('prediction : ', [0.051492076367139816, 0.9890044331550598, 0.23523545265197754, 0.11708319187164307, 0.021687369793653488, 0.030919406563043594, 0.01281912624835968, 