In [None]:
from bigdl.nn.criterion import *
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf
from pyspark.sql.types import DoubleType, StringType, ArrayType
from zoo.common.nncontext import *
from zoo.feature.image import *
from zoo.pipeline.api.keras.layers import Input, Flatten, Dense
from zoo.pipeline.api.keras.models import *
from zoo.pipeline.api.net import *
from zoo.pipeline.nnframes import *
from zoo.feature.image.imagePreprocessing import *
import random
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from bigdl.transform.vision.image import RandomTransformer, HFlip
from pyspark.sql.types import *
import numpy as np
from sklearn.metrics import roc_auc_score
from pyspark.ml import Pipeline
from bigdl.optim.optimizer import * 
from zoo.pipeline.api.keras.metrics import AUC
random.seed(1234)

### 1. Read image, model, and CSV paths
Comment out the path of any model/dataset you do not need, make sure you use the right path, and chop off the last layer of the pre-trained model in the following cells:
1. Inception model path 
2. Resnet50 model path 
3. test dataset path 
4. train dataset path 
5. validation dataset path 
6. CSV file with all image labels path 

All datasets are stored in HDFS, and the paths are returned as strings.

In [None]:
def Read_Pathes():
    """Return the HDFS locations used by this notebook.

    Returns:
        tuple: (model_path, image_test_path, image_path, label_path), each a
        plain string.
    """
    # Pre-trained model file. The ResNet-50 alternative is:
    #   "hdfs:///datasets/xray_files/xray/analytics-zoo_resnet-50_imagenet_0.1.0.model"
    pretrained_model = "hdfs:///datasets/xray_files/xray/bigdl_inception-v1_imagenet_0.4.0.model"

    # Image folders. Alternative training folders that were tried:
    #   "hdfs:///datasets/xray_files/xray/all_images"
    #   "hdfs:///datasets/RGBresizedto256"
    #   "hdfs:///datasets/xray_files/RGB_PIL_Imge"
    test_images = "hdfs:///datasets/xray_files/xray/test"
    train_images = "hdfs:///datasets/xray_files/xray/train"

    # CSV file holding the labels of all images.
    labels_csv = "hdfs:///datasets/xray_files/Data_Entry_2017.csv"

    return (pretrained_model, test_images, train_images, labels_csv)


In [None]:
# Unpack the four HDFS paths returned by Read_Pathes() into notebook-level names.
model_path,image_test_path,image_path,label_path=Read_Pathes()

### List of dataset and model paths
Print all paths.

In [None]:
# Show each configured path exactly once so a wrong location is spotted
# before any Spark job starts. print(...) with a single argument produces the
# same output on Python 2 and Python 3 and matches the print calls used in
# the rest of this notebook.
print(model_path)
print(image_test_path)
print(image_path)
print(label_path)

### 2. Spark Engine Creation 

In [None]:
# Spark configuration for this application, named "testNNClassifer".
sparkConf = create_spark_conf().setAppName("testNNClassifer")
# Context object created from that configuration; it is passed to the image
# reader further down.
sc = init_nncontext(sparkConf)
# SparkSession built from the same configuration; used to read the label CSV.
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()


### 3. Create a list of dataset labels and map it

In [None]:
# Class names in output order: the position of a name in this list is its
# class index.
label_texts = "Atelectasis, Consolidation, Infiltration, Pneumothorax, Edema, Emphysema, Fibrosis, Effusion, Pneumonia, Pleural_Thickening, Cardiomegaly, Nodule, Mass, Hernia, No Finding".split(", ")
# Reverse lookup: class name -> class index.
label_map = dict(zip(label_texts, range(len(label_texts))))

### 4. Label binarizer function
Create a multi-hot encoding of the labels of each image. There are 15 classes, and an image can carry multiple labels.

In [None]:
def text_to_label(text):
    """Encode a "|"-separated label string as a multi-hot list of floats.

    Args:
        text: label names joined by "|"; every name must be a key of
            label_map.

    Returns:
        list of float with len(label_texts) entries: 1.0 at the index of each
        named label, 0.0 everywhere else.

    Raises:
        KeyError: if a name is not present in label_map.
    """
    active = set(label_map[name] for name in text.split("|"))
    return [1.0 if position in active else 0.0
            for position in range(len(label_texts))]

### 5. Read all images as a DataFrame and merge them with their labels

In [None]:
# UDF that turns a "Finding Labels" string into the multi-hot array produced
# by text_to_label; declared to return array<double>.
getLabel = udf(lambda x: text_to_label(x), ArrayType(DoubleType()))
# UDF that takes the base name of the first field of the image struct
# (presumably the file path/origin — TODO confirm) so it can be matched with
# the CSV's "Image Index" column.
# NOTE(review): `os` is not imported explicitly in the import cell; it
# presumably arrives through one of the star imports — confirm.
getName = udf(lambda row: os.path.basename(row[0]), StringType())
# Read the images under image_path, resized to 256x256, and add the file name
# as a column.
imageDF = NNImageReader.readImages(image_path, sc, resizeH=256, resizeW=256, image_codec=1) \
    .withColumn("Image Index", getName(col('image')))
# Rename so the column name has no space.
imageDF=imageDF.withColumnRenamed('Image Index', 'Image_Index')
# Read the label CSV (header row, inferred schema), keep the file name and
# the label text, and add the encoded "label" column.
labelDF = spark.read.load(label_path, format="csv", sep=",", inferSchema="true", header="true") \
   .select("Image Index", "Finding Labels") \
   .withColumn("label", getLabel(col('Finding Labels')))


In [None]:
# Replace the spaces in the CSV column names with underscores, so that
# labelDF has an "Image_Index" column like imageDF.
labelDF = labelDF.withColumnRenamed('Image Index', 'Image_Index')\
    .withColumnRenamed('Finding Labels', 'Finding_Labels')

In [None]:
# Print the column names and types of the label frame as a quick sanity check.
labelDF.printSchema()

### 6. Split the DataFrame into training and validation sets

In [None]:
# Inner join on the file name: only rows whose "Image_Index" appears in both
# the image frame and the label frame are kept.
train_df = imageDF.join(labelDF, on="Image_Index", how="inner")
#(trainingDF1, validationDF1) = train_df.randomSplit([0.9, 0.1])

In [None]:
# 70/30 train/validation split. The seed is fixed (same value as the
# random.seed call in the import cell) so repeated runs of the notebook use
# the same sampling seed; Python's random.seed does not influence Spark.
(trainingDF, validationDF) = train_df.randomSplit([0.7, 0.3], seed=1234)

### 7. Drop the unused datasets for better memory utilization

In [None]:
# Release any cached blocks of the joined frame; the two split frames are what
# the rest of the notebook uses.
# trainingDF1 / validationDF1 are not unpersisted here because the statement
# that would create them (the 0.9/0.1 split) is commented out, so referring to
# them raises NameError on a fresh kernel.
train_df.unpersist()

### 8. Model definition and loading
There are two models: Inception and ResNet-50. Load one of them; layers can be frozen or unfrozen.

In [None]:

# Load the pre-trained network stored at model_path.
full_model = Net.load_bigdl(model_path)

# Uncomment to list the layer names of the loaded model:
#for layer in full_model.layers:
#    print (layer.name())
#this is for Resnet50 
#model = full_model.new_graph(["pool5"])
#model.freeze_up_to(["pool4/3x3_s2"])
# this for Inception v1
# Build a graph that ends at the "pool5/drop_7x7_s1" layer (presumably this
# drops the original classification head — TODO confirm against the model).
model = full_model.new_graph(["pool5/drop_7x7_s1"])  # this inception 
# The declared input is 3 x 224 x 224, so the preprocessing chain has to emit
# tensors of exactly this size.
inputNode = Input(name="input", shape=(3, 224, 224))
inception = model.to_keras()(inputNode)
flatten = Flatten()(inception)
# 15 sigmoid outputs, matching the 15 entries of label_texts. Despite the
# variable name, these are sigmoid activations rather than raw logits.
logits = Dense(15, activation="sigmoid")(flatten)
lrModel = Model(inputNode, logits)

### 9. Set the validation summary and save it in order to use it with TensorBoard
tensorboard --logdir=/home/mahmood/ChestXray/logDirectory/testNNClassifer/validation --port 8080

tensorboard --logdir=/home/mahmood/ChestXray/logDirectory/testNNClassifer/validation --port 8080

In [None]:
#logdir ='/logDirectory'
# The training and validation summaries share one log directory and one
# application name.
_summary_args = dict(log_dir="/home/mahmood/ChestXray/logDirectory", app_name="testNNClassifer")
train_summary = TrainSummary(**_summary_args)
val_summary = ValidationSummary(**_summary_args)



### 10. Transform images in order to preprocess them to fit the model input

In [None]:
# Register the same SeveralIteration(1) trigger for both extra summary tags.
for _tag in ("Parameters", "LearningRate"):
    train_summary.set_summary_trigger(_tag, SeveralIteration(1))

In [None]:
#tmp_dir='/home/cdsw/checkPoint'
# Preprocessing chain applied to each image row before it reaches the model.
# Note that a later cell assigns `transformer` again.
transformer = ChainedPreprocessing([
    RowToImageFeature(),
    # Crop to the 224 x 224 size declared on the model input.
    ImageCenterCrop(224, 224),
    # Horizontal flip wrapped in a RandomTransformer with parameter 0.5
    # (presumably the probability of applying it — TODO confirm).
    BigDLAdapter(RandomTransformer(HFlip(), 0.5)),
    # Per-channel means; no standard deviations are passed here.
    ImageChannelNormalize(123.68, 116.779, 103.939),
    ImageMatToTensor(),
    ImageFeatureToTensor(),
])


In [None]:
# Options that were considered for this chain:
#to_RGB=True  ,ImageResize(256, 256), ImageChannelNormalize(123.68, 116.779, 103.939, 58.395, 57.12, 57.375)

#tmp_dir='/home/cdsw/checkPoint'
# Preprocessing chain applied to each image row before it reaches the model.
transformer = ChainedPreprocessing([
    RowToImageFeature(),
    ImageResize(256, 256),
    # The model input is declared as (3, 224, 224); without this crop the
    # chain would hand 256 x 256 images to a network built for 224 x 224.
    ImageCenterCrop(224, 224),
    # Horizontal flip wrapped in a RandomTransformer with parameter 0.5
    # (presumably the probability of applying it — TODO confirm).
    BigDLAdapter(RandomTransformer(HFlip(), 0.5)),
    # Zero mean and std 255 on every channel, i.e. divide pixel values by 255.
    ImageChannelNormalize(mean_r=0.0, mean_g=0.0, mean_b=0.0, std_r=255.0, std_g=255.0, std_b=255.0),
    ImageMatToTensor(),
    ImageFeatureToTensor(),
])


#I would like to try this 
transformer = ChainedPreprocessing([RowToImageFeature(), ImageResize(256, 256),
                                    ImageCenterCrop(256, 256),
                                    ImageChannelNormalize(123.0, 117.0, 104.0),
                                    ImageMatToTensor(),
                                    ImageFeatureToTensor()])

 ImageChannelNormalize(mean_r=0.0, mean_g=0.0, mean_b=0.0, std_r=255.0, std_g=255.0, std_b=255.0 ), ImageMatToTensor(), ImageFeatureToTensor(),ImageSetToSample()])


### 11. Estimator definition and setting the parameters

In [None]:
# Estimator that trains lrModel on the "image" column:
#   - loss: MultiLabelSoftMarginCriterion
#   - feature preprocessing: `transformer`; label preprocessing: SeqToTensor([15])
#   - learning rate 0.005, batch size 64, at most 8 epochs
#   - validation on validationDF with the AUC metric, triggered by
#     EveryEpoch(), batch size 64
#   - summaries written through train_summary / val_summary
#   - checkpoints written to a local directory, triggered by EveryEpoch()
# NOTE(review): the effect of setCachingSample(False) and of the trailing
# False passed to setCheckpoint is not visible in this notebook — confirm
# against the NNEstimator API before changing them.
classifier = NNEstimator(lrModel, MultiLabelSoftMarginCriterion(), transformer, SeqToTensor([15])) \
     .setLearningRate(0.005).setBatchSize(64).setMaxEpoch(8).setFeaturesCol("image")\
     .setCachingSample(False)\
     .setValidation(EveryEpoch(), validationDF, [AUC()], 64)\
     .setTrainSummary(train_summary) \
     .setValidationSummary(val_summary) \
     .setCheckpoint("/home/mahmood/ChestXray/checkpoint", EveryEpoch(), False)


.setOptimMethod()

train_summary.set_summary_trigger("Parameters", SeveralIteration(1))
train_summary.set_summary_trigger("Parameters", SeveralIteration(1))

### 12. Training the model 

In [None]:
# Fit the estimator on the training split and keep the fitted model.
nnModel = classifier.fit(trainingDF)
print("Finished training")

In [None]:
# Smoke test: run the fitted model on the training split and show five rows.
nnModel.transform(trainingDF).show(5)

### 13.Evaluate the model

In [None]:
# Score the validation split and cache the result, because several later
# cells read from predictionDF.
predictionDF = nnModel.transform(validationDF).cache()
predictionDF.select("Image_Index","label","prediction").show(5)


In [None]:
# Same three columns, sorted by the label column in descending order.
predictionDF.select("Image_Index","label","prediction").sort("label", ascending=False).show(5)

In [None]:
#predictionDF=predictionDF.withColumn('label', predictionDF['label'].cast(ArrayType(FloatType())))

In [None]:
# Cast the label column to array<double>.
predictionDF=predictionDF.withColumn('label', predictionDF['label'].cast(ArrayType(DoubleType())))

## 14. AUC  CALCULATION 

In [None]:
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
from scipy import sparse
import seaborn as sns; sns.set_style('whitegrid')

### 15. Convert labels and predictions into 2D arrays for the ROC AUC computation

In [None]:
# Bring the "label" column to the driver.
Lab=predictionDF.select("label").collect()

# presumably shape (n_rows, 1, n_classes): one single-field row per image — TODO confirm
L=np.array(Lab)

# Keep axis 0 and axis 2 so that L is 2-D: (L.shape[0], L.shape[2]).
L=L.reshape(L.shape[0],L.shape[2])

In [None]:
# Bring the "prediction" column to the driver.
Pre=predictionDF.select("prediction").collect()


# presumably shape (n_rows, 1, n_classes): one single-field row per image — TODO confirm
Pre1=np.array(Pre)

# Keep axis 0 and axis 2 so that Pre1 is 2-D: (Pre1.shape[0], Pre1.shape[2]).
Pre1=Pre1.reshape(Pre1.shape[0],Pre1.shape[2])

def toArray(dfColum):
    """Convert nested row data into a 2-D NumPy array.

    Args:
        dfColum: array-like that np.array turns into an array with at least
            three axes; the reshape keeps the lengths of axis 0 and axis 2,
            so the element count must equal shape[0] * shape[2].

    Returns:
        numpy.ndarray of shape (shape[0], shape[2]).

    Raises:
        IndexError: if the converted array has fewer than three axes.
    """
    P=np.array(dfColum)
    P=P.reshape(P.shape[0],P.shape[2])
    # Return the reshaped array; without this the function yields None.
    return P


# Spark UDF wrapper around toArray, declared to return array<double>.
# NOTE(review): not referenced anywhere else in the visible notebook, and
# toArray indexes a third array axis that a single column value may not
# have — confirm this helper is still needed.
ConvertToArrayUdf = udf(lambda A: toArray(A), ArrayType(DoubleType()))

def array_fetch(P):
    """Copy a single-column frame of equal-length sequences into a 2-D array.

    A later definition with the same name in this notebook replaces this one
    once it has run.

    Args:
        P: object whose toPandas().values is indexable as values[row][0][col].

    Returns:
        numpy.ndarray of floats with one row per input row; the width is the
        length of the first row's sequence.
    """
    rows = P.toPandas().values
    width = len(rows[0][0][:])
    filled = np.zeros((len(rows), width))
    for r, row in enumerate(rows):
        cell = row[0]
        for c in range(width):
            filled[r, c] = cell[c]
    return filled

# Re-bind Lab and Pre to single-column DataFrames (earlier cells bound them
# to collected row lists) so they can be passed to array_fetch.
Lab=predictionDF.select("label")
Pre=predictionDF.select("prediction")
def array_fetch(P):
    """Collect a frame and return its contents as a 2-D NumPy array.

    Args:
        P: object whose collect() result np.array turns into an array with at
            least three axes.

    Returns:
        numpy.ndarray reshaped to (shape[0], shape[2]) of the collected array.
    """
    collected = np.array(P.collect())
    n_rows, n_cols = collected.shape[0], collected.shape[2]
    return collected.reshape(n_rows, n_cols)

# 2-D label and prediction arrays; the micro/macro ROC cells and the accuracy
# cell read these two names.
LabelArray=array_fetch(Lab)
PredArray=array_fetch(Pre)
 



In [None]:
# Overall ROC AUC over all label columns, using roc_auc_score's default
# averaging. L holds the ground-truth label array and Pre1 the predicted
# scores; `P` only ever exists as a local name inside helper functions, so
# passing it here raises NameError.
total_score = roc_auc_score(L, Pre1)
# Number of classes; later cells read this notebook-level name.
n_classes = 15

print('total roc_auc_score : = {0}'.format(total_score))

### 16. Get AUC values

In [None]:
def get_auc_values(LabelArray,PredArray):
    """Compute the ROC curve and its area for every class column.

    Args:
        LabelArray: 2-D array of ground-truth indicators, one column per class.
        PredArray: 2-D array of predicted scores with the same layout.

    Returns:
        tuple: (roc_auc, fpr, tpr) — three dicts keyed by class index that
        hold the area under the curve, the false-positive rates and the
        true-positive rates of each class.
    """
    # Take the class count from the data instead of hard-coding 15.
    n_classes = LabelArray.shape[1]
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(LabelArray[:, i], PredArray[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])
        # print(...) with one argument behaves the same on Python 2 and 3.
        print(roc_auc[i])
    return roc_auc, fpr, tpr

# Per-class AUC values and ROC curve points for the validation predictions.
roc_auc,fpr, tpr=get_auc_values(L,Pre1)

### 17. Plotting AUC

In [None]:
def ploting_AUC(fpr, tpr, label_texts):
    """Plot one ROC curve per class on a single figure.

    Args:
        fpr: dict mapping class index -> false-positive rates of that class.
        tpr: dict mapping class index -> true-positive rates of that class.
        label_texts: sequence of class names, indexed by class index.
    """
    lw = 1
    # One distinct colour per class; zip() below stops at the shorter of the
    # class list and this palette.
    colors = ['aqua', 'darkorange', 'cornflowerblue', 'red', 'blue', 'maroon',
              'coral', 'olive', 'teal', 'springgreen', 'fuchsia', 'navy',
              'plum', 'orchid', 'thistle']

    #%matplotlib inline
    plt.figure()
    for i, color in zip(sorted(fpr), colors):
        # The area is computed from the curve itself, so the function does not
        # rely on notebook-level `roc_auc` / `n_classes` variables.
        area = auc(fpr[i], tpr[i])
        plt.plot(fpr[i], tpr[i], color=color, lw=lw,
                 label='{0} (area = {1:0.2f})'.format(label_texts[i], area))

    # Chance diagonal, axes and legend are set up once, after all curves.
    plt.plot([0, 1], [0, 1], 'k--', lw=lw)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('AUC for multi-class')
    plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05),
               fancybox=True, shadow=True, ncol=5)
    plt.show()

#ploting_AUC(fpr, tpr, label_texts)

In [None]:
# Draw the per-class ROC curves computed by get_auc_values.
ploting_AUC(fpr, tpr, label_texts)

### 18. Compute micro-average ROC curve and ROC area  

In [None]:
# Micro-average: flatten labels and predictions so that every
# (sample, class) pair is one binary decision, then compute a single ROC
# curve and its area.
fpr_micro=dict()
tpr_micro=dict()
roc_auc_micro=dict()
fpr_micro["micro"], tpr_micro["micro"], _ = roc_curve(LabelArray.ravel(), PredArray.ravel())
roc_auc_micro["micro"] = auc(fpr_micro["micro"], tpr_micro["micro"])

# Macro-average: average the per-class ROC curves on a common FPR grid.

# First aggregate all false positive rates
all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))

# Then interpolate all ROC curves at these points.
# np.interp is used directly: `scipy.interp` was only a deprecated alias of
# it and is not available in recent SciPy releases.
mean_tpr = np.zeros_like(all_fpr)
for i in range(n_classes):
    mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])

# Finally average it and compute AUC
mean_tpr /= n_classes
fpr_macro = dict()
tpr_macro = dict()
roc_auc_macro= dict()
fpr_macro["macro"] = all_fpr
tpr_macro["macro"] = mean_tpr
roc_auc_macro["macro"] = auc(fpr_macro["macro"], tpr_macro["macro"])


In [None]:
# Area under the micro-averaged ROC curve.
print('micro average := {0}'.format(roc_auc_micro["micro"]))

In [None]:
# Area under the macro-averaged ROC curve.
print('macro average :=  {0}'.format(roc_auc_macro["macro"]))

### 19. Plot macro and micro  curves

In [None]:
%matplotlib inline
# Legend text for the micro-averaged curve, including its area.
_micro_label = 'micro-average ROC curve (area = {0:0.2f})'.format(roc_auc_micro["micro"])

plt.figure()
plt.title('micro -averge')
plt.plot(
    fpr_micro["micro"],
    tpr_micro["micro"],
    label=_micro_label,
    color='blue',
    linestyle=':',
    linewidth=2,
)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
# Legend placed below the axes.
plt.legend(
    loc='upper center',
    bbox_to_anchor=(0.5, -0.2),
    fancybox=True,
    shadow=True,
    ncol=5,
)
plt.show()

In [None]:
# Legend text for the macro-averaged curve, including its area.
_macro_label = 'macro-average ROC curve (area = {0:0.2f})'.format(roc_auc_macro["macro"])

plt.figure()
plt.title('macro -averge')
plt.plot(
    fpr_macro["macro"],
    tpr_macro["macro"],
    label=_macro_label,
    color='navy',
    linestyle=':',
    linewidth=2,
)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(
    loc='upper center',
    bbox_to_anchor=(0.5, -0.02),
    fancybox=True,
    shadow=True,
    ncol=5,
)
plt.show()

In [None]:
correct=dict()
accuracy=dict()
p1=PredArray
p2=p1>0.5
p= p2.astype(int)
y_scores=p

from __future__ import division

for i in range(n_classes):
    
    correct[i] =np.sum(LabelArray[:, i]==y_scores[:, i])
    accuracy[i]=correct[i]/(len(LabelArray[:, i]))
  

for i in accuracy.iteritems():
    avgDict = sum(i)/ float(len(accuracy))

In [None]:
# Report the averaged accuracy value stored in avgDict.
print('Total accuracy= {0}'.format(avgDict))