## Single Image Inference using Chestxray Dataset

The following modules are required to run this notebook.

Tested with: BigDL-0.7.2, Analytics Zoo-0.4.0, Pyspark-2.1.0

In [1]:
from bigdl.nn.layer import Model
from bigdl.nn.criterion import *
from bigdl.optim.optimizer import *
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.sql.functions import col, udf
from pyspark.sql.types import *
from pyspark.sql.types import DoubleType
from pyspark.sql.types import StringType, ArrayType

from zoo.common.nncontext import *
from zoo.feature.image import *
from zoo.models.image.imageclassification import *
from zoo.pipeline.nnframes import *
from zoo.pipeline.api.net import Net
from zoo.pipeline.api.keras.models import Sequential
from zoo.pipeline.api.keras.layers import *
from zoo.pipeline.api.keras.metrics import AUC
from zoo.pipeline.nnframes import NNEstimator
from zoo.pipeline.api.keras.objectives import BinaryCrossEntropy

import os
import time

import pandas as pd
from PIL import Image

# Build the Spark configuration through Analytics Zoo's helper and name the application.
sparkConf = create_spark_conf().setAppName("ChestXray_Inference")
# Initialise the Analytics Zoo context; the returned object is used below as the
# SparkContext (it is passed to SQLContext here and to NNImageReader.readImages later).
sc = init_nncontext(sparkConf)
# Get (or create) a SparkSession configured with the same settings.
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()
# SQLContext used later in the notebook to read the label CSV.
sqlContext = SQLContext(sc)

  from . import _csparsetools
  from .tslib import iNaT, NaT, Timestamp, Timedelta, OutOfBoundsDatetime
  from pandas._libs import (hashtable as _hashtable,
  from pandas._libs import algos, lib
  from pandas._libs import hashing, tslib
  from pandas._libs import (lib, index as libindex, tslib as libts,
  import pandas._libs.tslibs.offsets as liboffsets
  from pandas._libs import algos as libalgos, ops as libops
  from pandas._libs.interval import (
  from pandas._libs import internals as libinternals
  import pandas._libs.sparse as splib
  import pandas._libs.window as _window
  from pandas._libs import (lib, reduction,
  from pandas._libs import algos as _algos, reshape as _reshape
  import pandas._libs.parsers as parsers
  from pandas._libs import algos, lib, writers as libwriters


## Loading data paths

Set the HDFS locations of the test image and of the label CSV file.

In [2]:
# HDFS location of the single chest X-ray image to run inference on.
test_image_path = "hdfs:///datasets/xray/all_images/00000006_000.png"
# HDFS location of the CSV holding the labels; the notebook reads its
# "Image Index" and "Finding Labels" columns.
label_path = "hdfs:///datasets/xray/Data_Entry_2017.csv"

Labels

In [3]:
%%time
# The 14 finding classes. The position of a name in this list is its class index,
# i.e. its slot in the multi-hot label vector built by text_to_label.
label_texts = ["Atelectasis", "Cardiomegaly", "Effusion", "Infiltration", "Mass", "Nodule", "Pneumonia",
               "Pneumothorax", "Consolidation", "Edema", "Emphysema", "Fibrosis", "Pleural_Thickening", "Hernia"]
# Reverse lookup: finding name -> class index.
label_map = {name: index for index, name in enumerate(label_texts)}


def text_to_label(text):
    """
    Convert a "|"-separated findings string into a multi-hot label vector.

    :param text: findings string such as "Cardiomegaly|Effusion" or "No Finding".
                 None or an empty string is treated as "no findings" (a null or
                 empty CSV cell would otherwise raise inside the Spark UDF).
    :return: list of len(label_texts) floats; 1.0 at the index of every finding
             named in `text`, 0.0 elsewhere.
    :raises KeyError: if `text` names a finding that is not in label_texts.
    """
    arr = [0.0] * len(label_texts)
    if not text:
        return arr
    for finding in text.split("|"):
        # Tolerate stray whitespace around a name and empty tokens ("A||B", "A|").
        finding = finding.strip()
        if finding and finding != "No Finding":
            arr[label_map[finding]] = 1.0
    return arr


label_length = len(label_texts)

CPU times: user 9 µs, sys: 0 ns, total: 9 µs
Wall time: 11.9 µs


Create Inference Dataframe

In [4]:
%%time
# UDF turning the "Finding Labels" string into a multi-hot array of doubles.
getLabel = udf(text_to_label, ArrayType(DoubleType()))
# UDF extracting the file name of an image from field 0 of the image struct
# (presumably the image's source URI - confirm against the NNImageReader schema).
getName = udf(lambda row: os.path.basename(row[0]), StringType())

# Read the test image resized to 256x256 and tag it with its file name.
# NOTE(review): image_codec=1 is passed through unchanged; confirm its meaning in the NNImageReader docs.
test_imageDF = NNImageReader.readImages(test_image_path, sc, resizeH=256, resizeW=256, image_codec=1)\
                .withColumn("Image Index", getName(col('image')))
imageDF = test_imageDF.withColumnRenamed('Image Index', 'Image_Index')

# Read the label CSV, keep the file-name and findings columns, and attach the multi-hot label.
labelDF = sqlContext.read.option('timestampFormat', 'yyyy/MM/dd HH:mm:ss ZZ')\
            .load(label_path, format="csv", sep=",", inferSchema="true", header="true")\
            .select("Image Index", "Finding Labels")\
            .withColumn("label", getLabel(col('Finding Labels')))\
            .withColumnRenamed('Image Index', 'Image_Index')
# labelDF already exposes 'Image_Index', so only 'Finding Labels' still needs renaming.
labelDF1 = labelDF.withColumnRenamed('Finding Labels', 'Finding_Labels')

# Inner join on the file name pairs the image with its label row.
# The result keeps the name trainDF even though it is used for inference only.
trainDF = imageDF.join(labelDF1, on="Image_Index", how="inner")

CPU times: user 21.9 ms, sys: 2.06 ms, total: 23.9 ms
Wall time: 4.04 s


Load ResNet-50 Analytics Zoo Keras Model

In [5]:
%%time
# Load the saved model from HDFS; the two arguments are the model definition
# file and the companion binary file stored next to it.
resnet_zoo_model = Net.load(
    "hdfs:///user/leelau/xray/save_model/model.bigdl",
    "hdfs:///user/leelau/xray/save_model/model.bin"
)
# Evaluate the concrete class of the loaded model as the cell's last expression.
type(resnet_zoo_model)

CPU times: user 4.2 ms, sys: 1.18 ms, total: 5.38 ms
Wall time: 19.9 s


Define helper functions that run inference on trainDF with the loaded ResNet model and display the result

In [6]:
%%time
def predict(model, inputdf, image_feature_col="image", batchsize=4):
    """
    Run `model` over `inputdf` and return the resulting DataFrame.

    Each image is preprocessed by a fixed chain (row -> image feature,
    224x224 centre crop, per-channel normalisation, conversion to tensor)
    before it is fed to the model through an NNModel transformer.

    :param model: the network to wrap in an NNModel.
    :param inputdf: DataFrame holding the images to score.
    :param image_feature_col: name of the column holding the image data.
    :param batchsize: batch size set on the NNModel.
    :return: whatever NNModel.transform returns for `inputdf`.
    """
    preprocessing_steps = [
        RowToImageFeature(),
        ImageCenterCrop(224, 224),
        ImageChannelNormalize(123.68, 116.779, 103.939),
        ImageMatToTensor(),
        ImageFeatureToTensor(),
    ]
    nn_model = NNModel(model, ChainedPreprocessing(preprocessing_steps))
    nn_model = nn_model.setFeaturesCol(image_feature_col)
    nn_model = nn_model.setBatchSize(batchsize)
    return nn_model.transform(inputdf)

def show_prediction_output(predDF, labels=None):
    """
    Display the first prediction row and a per-class prediction/label table.

    :param predDF: DataFrame whose rows expose `prediction` and `label`
                   sequences (as produced by predict() on the joined DataFrame).
    :param labels: class names, one per prediction slot; defaults to the
                   notebook-level `label_texts`.
    :return: None; everything is printed.
    """
    if labels is None:
        labels = label_texts

    # show() prints the table itself and returns None, so wrapping it in
    # print() would emit a stray "None" line.
    predDF.show(1)

    # Fetch only the first row, once. Calling collect() repeatedly would rerun
    # the whole job each time and pull every row to the driver.
    first_row = predDF.first()
    if first_row is None:
        print("Prediction DataFrame is empty - nothing to display")
        return

    predictions_list = first_row.prediction
    labelList = first_row.label

    # A single formatted string prints the same text on Python 2 and Python 3.
    print("length of prediction array : {}".format(len(predictions_list)))

    row_format = "{:<15} - {:<25} - {:<15}"
    print(row_format.format('Finding_Labels', 'Prediction', 'Label'))
    print(row_format.format('-' * len('Finding_Labels'), '-' * len('Prediction'), '-' * len('Label')))
    for name, predicted, actual in zip(labels, predictions_list, labelList):
        print(row_format.format(name, predicted, actual))

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 5.96 µs


Run the loaded ResNet model on the inference DataFrame and check its output

In [7]:
%%time
output = predict(resnet_zoo_model, trainDF)
show_prediction_output(output)

creating: createRowToImageFeature
creating: createImageCenterCrop
creating: createImageChannelNormalize
creating: createImageMatToTensor
creating: createImageFeatureToTensor
creating: createChainedPreprocessing
creating: createTensorToSample
creating: createChainedPreprocessing
creating: createNNModel
+----------------+--------------------+--------------+--------------------+--------------------+
|     Image_Index|               image|Finding_Labels|               label|          prediction|
+----------------+--------------------+--------------+--------------------+--------------------+
|00000006_000.png|[hdfs://gnamenode...|    No Finding|[0.0, 0.0, 0.0, 0...|[0.10832821, 0.02...|
+----------------+--------------------+--------------+--------------------+--------------------+

None
('length of prediction array : ', 14)
Finding_Labels  - Prediction                - Label          
--------------  - ----------                - -----          
Atelectasis     - 0.108328208327            