## Convert X-Ray images into SQL Spark Dataframe 

In this example, all the ChestXray images are converted into Spark SQL DataFrames and saved to HDFS.

In the following cell, import all the required packages and libraries.

In [1]:
import os

from bigdl.nn.criterion import *
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf
from pyspark.sql.types import *
from pyspark.sql.types import DoubleType
from pyspark.sql.types import StringType, ArrayType
from zoo.common.nncontext import *
from zoo.pipeline.nnframes import *

  from . import _csparsetools


## Generate one hot encoding 
This part of the code defines the class labels and a helper that expands the one-hot (multi-label) encoding into one column per class in a Spark SQL DataFrame, then saves the result as a CSV file.

In [2]:
# Class names; a name's position in this list is its index in the label vector.
label_texts = [
    "Atelectasis", "Cardiomegaly", "Effusion", "Infiltration",
    "Mass", "Nodule", "Pneumonia", "Pneumothorax",
    "Consolidation", "Edema", "Emphysema", "Fibrosis",
    "Pleural_Thickening", "Hernia",
]
# Number of classes (14), derived from the list so the two cannot drift apart.
label_length = len(label_texts)
# Reverse lookup: class name -> position in the label vector.
label_map = dict((name, position) for position, name in enumerate(label_texts))

def write_to_csv(df, label_col="label"):
    """Expand the label array into one column per class and save it as CSV.

    For every class index ``i`` in ``range(label_length)`` a column named
    ``"<i> th"`` is added whose value is ``label[i] * (i + 1)``. The widened
    DataFrame is displayed with ``show()``, the array column is dropped and
    the remainder is written to the relative path ``"label.csv"``.

    :param df: Spark DataFrame containing the array column ``label_col``.
    :param label_col: name of the column holding the label vector
        (assumed to be an array of doubles with at least ``label_length``
        entries -- confirm against the caller).
    :return: the DataFrame with the per-class columns and without
        ``label_col``.
    """
    def _kth_scaled(k):
        # Factory so every UDF closes over its own ``k`` instead of sharing
        # the loop variable, whose value changes on each iteration.
        return lambda a: a[k] * (k + 1)

    for i in range(label_length):
        get_Kth = udf(_kth_scaled(i), DoubleType())
        df = df.withColumn(str(i) + " th", get_Kth(col(label_col)))

    df.show()
    # Drop the column the caller named rather than a hard-coded "label":
    # the array column must be removed because the CSV writer cannot
    # serialise array types.
    df = df.drop(label_col)
    df.write.csv("label.csv")
    return df

## Set the image, label, and output paths 

In [None]:
# Location of the downloaded X-ray images (alternatively sys.argv[1]).
image_path = "hdfs:///path_to_your_downloaded_images"
# Location of Data_Entry_2017.csv, train_val_list.txt and test_list.txt (alternatively sys.argv[2]).
label_path = "hdfs:///path_to_your_saved_label_and_txt_files(Data_Entry_2017.csv, train_val_list.txt, test_list.txt)"
# Destination for the trainDF / testDF DataFrames (alternatively sys.argv[3]).
save_path = "hdfs:///path_to_save_your_DataFrames(trainDF, testDF)"

## Start the Spark session, read the images, and convert them into a Spark DataFrame 

In [78]:
# Spark configuration for this job, tagged with the application name.
sparkConf = create_spark_conf().setAppName("convertimgs_to_spdf")
# Context object handed to the image reader below.
sc = init_nncontext(sparkConf)
# SQL entry point built from the same configuration.
spark = (
    SparkSession.builder
    .config(conf=sparkConf)
    .getOrCreate()
)
# Show which master the job is attached to.
print(sc.master)

def text_to_label(text):
    """Convert a "|"-separated findings string into a multi-hot label vector.

    :param text: findings such as ``"Effusion|Mass"`` or ``"No Finding"``.
        ``None`` and the empty string are treated as "no findings" because
        the source column is nullable and a UDF receives nulls as ``None``.
    :return: list of ``len(label_texts)`` floats, ``1.0`` at the index of
        every finding present (per ``label_map``) and ``0.0`` elsewhere.
    :raises KeyError: if a finding name is not a key of ``label_map``.
    """
    arr = [0.0] * len(label_texts)
    if not text:
        # Null / empty cell: nothing to mark.
        return arr
    for l in text.split("|"):
        finding = l.strip()  # tolerate padding around the separator
        # Skip empty fragments (e.g. a trailing "|") and the explicit
        # "No Finding" marker, which has no slot in the vector.
        if finding and finding != "No Finding":
            arr[label_map[finding]] = 1.0
    return arr

# UDF turning a findings string into its label vector (array of doubles).
getLabel = udf(text_to_label, ArrayType(DoubleType()))
# UDF extracting the file name from the first field of the image struct
# (the recorded output shows that field holding the file's hdfs:// URI).
getName = udf(lambda row: os.path.basename(row[0]), StringType())
# Read every image resized to 256x256 and add its file name as "Image_Index",
# the key later used to join against the label table.
# NOTE(review): image_codec=1 is passed straight to the reader -- presumably
# the colour-read flag; confirm against the NNImageReader documentation.
imageDF = NNImageReader.readImages(image_path, sc, resizeH=256, resizeW=256, image_codec=1) \
    .withColumn("Image_Index", getName(col('image')))

yarn


## Reading label CSV file and generate spark dataframe 

In [82]:
# Load the label CSV, keep the image name and findings columns, attach the
# label vector computed from the findings text, and rename the key column to
# "Image_Index" so it matches the image DataFrame.
labelDF = (
    spark.read.load(
        label_path + "/Data_Entry_2017.csv",
        format="csv",
        sep=",",
        inferSchema="true",
        header="true",
    )
    .select("Image Index", "Finding Labels")
    .withColumn("label", getLabel(col('Finding Labels')))
    .withColumnRenamed('Image Index', 'Image_Index')
)
labelDF.printSchema()

root
 |-- Image_Index: string (nullable = true)
 |-- Finding Labels: string (nullable = true)
 |-- label: array (nullable = true)
 |    |-- element: double (containsNull = true)



## Join the images with their labels in a new  dataframe 

In [85]:
# Inner join on the file name: only images with a row in the label table remain.
# Despite its name, this frame still holds both the train and the test images.
train_df = imageDF.join(labelDF, "Image_Index", "inner")

## Read the train/validation and test image lists from the train_val_list.txt and test_list.txt files 

In [87]:
# Read both list files the same way: the text reader yields a single "value"
# column, renamed here to "Image_Index" so it can be used as the join key.
trainingList, testList = (
    spark.read.text(label_path + list_file).withColumnRenamed("value", "Image_Index")
    for list_file in ("/train_val_list.txt", "/test_list.txt")
)

## Construct train and test dataframe  

In [90]:
# Restrict the joined image/label frame to the names in each list
# (default join type, keyed on "Image_Index").
trainingDF, testDF = (
    train_df.join(image_list, on="Image_Index")
    for image_list in (trainingList, testList)
)

## Replace the space in the `Finding Labels` column name 

In [104]:
# Apply the same rename to both splits: "Finding Labels" -> "Finding_Labels".
trainingDF1, testDF1 = (
    frame.withColumnRenamed("Finding Labels", "Finding_Labels")
    for frame in (trainingDF, testDF)
)

## Save train and test dataframe in HDFS 

In [108]:
# Persist both splits under save_path using the writer's default format.
# NOTE(review): no save mode is set, so with Spark's default mode a re-run
# against an existing target fails -- confirm whether overwrite is wanted.
trainingDF1.write.save(save_path + "/trainDF")
testDF1.write.save(save_path + "/testDF")
# Build one string so the message reads the same on Python 2 and 3
# (print with two arguments shows a tuple on a Python 2 kernel).
print("data saved at " + save_path)

('data saved at ', 'hdfs:///datasets/xray_files/DataFrames')


## Load the dataframes and print the number of images in each of them 

In [110]:
# Read the saved splits back to verify the round trip.
loadedTrainingDF = spark.read.load(save_path + "/trainDF")
loadedTestDF = spark.read.load(save_path + "/testDF")
# Format into one string so the output is not a tuple on a Python 2 kernel.
print("trainingDF count: %d" % loadedTrainingDF.count())
print("testDF count: %d" % loadedTestDF.count())
# Preview the first rows of each split.
loadedTrainingDF.show()
loadedTestDF.show()

('trainingDF count: ', 86524)
('testDF count: ', 25596)
+----------------+--------------------+--------------------+--------------------+
|     Image_Index|               image|      Finding_Labels|               label|
+----------------+--------------------+--------------------+--------------------+
|00023287_000.png|[hdfs://pNameNode...|            Effusion|[0.0, 0.0, 1.0, 0...|
|00023313_008.png|[hdfs://pNameNode...|       Effusion|Mass|[0.0, 0.0, 1.0, 0...|
|00023450_000.png|[hdfs://pNameNode...|          No Finding|[0.0, 0.0, 0.0, 0...|
|00023477_000.png|[hdfs://pNameNode...|          No Finding|[0.0, 0.0, 0.0, 0...|
|00023543_000.png|[hdfs://pNameNode...|          No Finding|[0.0, 0.0, 0.0, 0...|
|00023650_000.png|[hdfs://pNameNode...|          No Finding|[0.0, 0.0, 0.0, 0...|
|00023708_000.png|[hdfs://pNameNode...|          No Finding|[0.0, 0.0, 0.0, 0...|
|00023731_000.png|[hdfs://pNameNode...|          No Finding|[0.0, 0.0, 0.0, 0...|
|00023775_000.png|[hdfs://pNameNode...|   