In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import pydicom
from sklearn.utils import resample
import keras
import dask
from keras.models import Sequential
from keras.models import save_model
from keras.layers import Dense
from keras.layers import Activation
from keras.layers import Flatten
from keras.layers import Input
from keras.layers import Concatenate
from keras.applications.densenet import *
from numbers import Number
from keras.utils import to_categorical
import gc
import psutil
from cachetools import TTLCache
from sklearn.model_selection import StratifiedKFold
from sklearn.utils import shuffle
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix
import tensorflow as tf




# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory


        
# Root folder of the competition data; the other input paths are derived from it
# so the long prefix is written only once.
input_filepath = "../input/rsna-intracranial-hemorrhage-detection/rsna-intracranial-hemorrhage-detection/"
# Folder containing the training `.dcm` files.
train_image_filepath = input_filepath + "stage_2_train/"
# Any results you write to the current directory are saved as output.

In [None]:
import matplotlib.pyplot as plt
import pydicom
from pydicom.data import get_testdata_files

# Print the docstring of the running module/session.
print(__doc__)

# Read one training file; `dataset` and `filename` are reused by later cells.
filename = train_image_filepath + "ID_00019828f.dcm"
dataset = pydicom.dcmread(filename)

# --- Header summary -------------------------------------------------------
print()
print("Filename.........:", filename)
print()

print("Modality.........:", dataset.Modality)

# Size information is only reported when the file actually carries pixel data.
image_size_template = "Image size.......: {rows:d} x {cols:d}, {size:d} bytes"
if 'PixelData' in dataset:
    rows, cols = int(dataset.Rows), int(dataset.Columns)
    print(image_size_template.format(rows=rows, cols=cols, size=len(dataset.PixelData)))
    if 'PixelSpacing' in dataset:
        print("Pixel spacing....:", dataset.PixelSpacing)

# `.get()` supplies a placeholder when the element is absent from the file.
slice_location = dataset.get('SliceLocation', "(missing)")
print("Slice location...:", slice_location)

# --- Raw (un-windowed) image ----------------------------------------------
plt.imshow(dataset.pixel_array, cmap=plt.cm.bone)
plt.title('Before Windowing', y=-0.17)
plt.savefig('before-windowing.png')

plt.show()


In [None]:
# Exploratory inspection: list every attribute/method name available on the
# DICOM dataset loaded in the previous cell.
dir(dataset)

In [None]:
def brain_window(img):
    """Apply the file's own intensity window and scale the result to [0, 1].

    Parameters
    ----------
    img : object
        A pydicom-style dataset exposing ``WindowCenter``, ``WindowWidth``,
        ``RescaleSlope``, ``RescaleIntercept`` and ``pixel_array``. The two
        window attributes may be a single number or a sequence, in which
        case the first entry is used.

    Returns
    -------
    numpy.ndarray
        Float array with the shape of ``img.pixel_array``: pixels are
        rescaled with slope/intercept, clipped to
        ``[center - width // 2, center + width // 2]`` and mapped linearly
        onto [0, 1].
    """
    window_center = img.WindowCenter if isinstance(img.WindowCenter, Number) else img.WindowCenter[0]
    window_width = img.WindowWidth if isinstance(img.WindowWidth, Number) else img.WindowWidth[0]
    slope, intercept = float(img.RescaleSlope), float(img.RescaleIntercept)

    img_min = window_center - window_width // 2
    img_max = window_center + window_width // 2

    # Rescale with THIS image's slope/intercept. The previous version read the
    # slope from the notebook-global `dataset`, so every image was scaled with
    # the slope of whichever file happened to be loaded there.
    # Converting to float first avoids integer overflow/truncation in the
    # stored pixel dtype.
    pixels = img.pixel_array.astype(np.float64) * slope + intercept

    # Clamp to the window, then normalise.
    pixels = np.clip(pixels, img_min, img_max)
    return (pixels - img_min) / (img_max - img_min)
    

In [None]:
# Show the sample image again after `brain_window` has been applied, and save
# the figure next to the 'before' picture for comparison.
plt.imshow(brain_window(dataset), cmap=plt.cm.bone)
plt.title('After Windowing', y=-0.17)
plt.savefig('after-windowing.png')




In [None]:
def balanced_subsample(x,y,subsample_size=1.0):
    """Down-sample so that every class contributes the same number of items.

    Parameters
    ----------
    x : sequence
        Samples, aligned element-by-element with ``y``.
    y : sequence
        Class label of each sample (anything ``np.unique`` can handle).
    subsample_size : float, default 1.0
        When below 1, each class contributes
        ``int(smallest_class_size * subsample_size)`` items; otherwise each
        class contributes as many items as the smallest class has.

    Returns
    -------
    (list, list)
        ``xs`` and ``ys``, grouped by class in the order returned by
        ``np.unique(y)``. Labels keep their own type (they are no longer
        forced through a float array, so string labels work too).
        Empty input yields ``([], [])``.
    """
    class_xs = []
    min_elems = None

    for yi in np.unique(y):
        elems = [a for a, b in zip(x, y) if b == yi]
        class_xs.append((yi, elems))
        if min_elems is None or len(elems) < min_elems:
            min_elems = len(elems)

    # No classes at all: nothing to balance (previously a TypeError when
    # subsample_size < 1).
    if min_elems is None:
        return [], []

    use_elems = min_elems
    if subsample_size < 1:
        use_elems = int(min_elems * subsample_size)

    xs = []
    ys = []

    for ci, this_xs in class_xs:
        # Only classes that must drop items are shuffled, so the kept subset
        # is random; `this_xs` is a local list, the caller's data is untouched.
        if len(this_xs) > use_elems:
            np.random.shuffle(this_xs)

        xs.extend(this_xs[:use_elems])
        ys.extend([ci] * use_elems)

    # Quick sanity check of the first few selected pairs.
    print (xs[:10], ys[:10])
    return xs,ys

In [None]:
#########making y with all labels ################
#####multilabel###############
# Re-read the sample training file and print the shape of its pixel array.
# Note: `filename` and `file_dcm` are notebook-level names.
filename = train_image_filepath + "ID_00019828f.dcm"
file_dcm = pydicom.dcmread(filename)
print(file_dcm.pixel_array.shape)
def get_two_class_labels(csv_file_path, stratify_percentage=1):
    """Read the label CSV and pair each image id with its row of labels.

    The id is the first two underscore-separated parts of the ``ID`` column.
    The labels are the row's values after the columns at positions 0, 1
    and 7 have been dropped, e.g. ``[('ID_00019828f', [0, 1, 0, 0, 1])]``.

    ``stratify_percentage`` only influences the "Num Samples" line that is
    printed; every row of the CSV is always returned.
    """
    frame = pd.read_csv(csv_file_path)

    raw_ids = list(frame['ID'])
    # Positional drop: columns 0, 1 and 7 are not part of the label vector.
    label_rows = frame.drop(frame.columns[[0, 1, 7]], axis=1).values.tolist()

    print("Num Samples :", int(stratify_percentage * len(raw_ids)))

    return [
        ("_".join(raw_id.split('_')[:2]), labels)
        for raw_id, labels in zip(raw_ids, label_rows)
    ]
        
    

def get_images(image_folder_root, image_label_list):
    """Load the raw pixel arrays for a list of ``(file id, label)`` pairs.

    Parameters
    ----------
    image_folder_root : str
        Folder prefix; ``file id + '.dcm'`` is appended to it.
    image_label_list : iterable of (str, label)
        Pairs as produced by ``get_two_class_labels``.

    Returns
    -------
    (list, list)
        ``X`` holds the un-windowed ``pixel_array`` of every file whose shape
        is exactly (512, 512); ``y`` holds the matching labels. Files with a
        different shape, or whose reading raises ``ValueError``, are skipped.

    Each file is now parsed once (it used to be read twice), and the windowed
    copy that was built for every image but never returned is no longer
    computed or kept in memory.
    """
    # NOTE(review): the returned arrays are raw pixels, as before. If windowed
    # images were intended here, apply `brain_window` explicitly -- confirm.
    X = []
    y = []
    for file_name, label in image_label_list:
        try:
            pixel_array = pydicom.dcmread(image_folder_root + file_name + '.dcm').pixel_array
        except ValueError:
            continue
        if pixel_array.shape != (512, 512):
            continue
        X.append(pixel_array)
        y.append(label)
    return X,y

In [None]:
###using multilabel dataset
#csv_file_path = "../input/brain-ai/hem_positive_train_set.csv"
# Down-sampled label file used for the multilabel run.
csv_file_path = "../input/brain-ai-equalsample/down_sampled_positive_data.csv"
image_folder_root = train_image_filepath

# (image id, label row) pairs for every row of the CSV.
files_with_ids = get_two_class_labels(csv_file_path, stratify_percentage=1)

# Split the pairs into parallel lists: X = image ids, y = label rows.
X = [image_id for image_id, _labels in files_with_ids]
y = [labels for _image_id, labels in files_with_ids]

# Sanity check: number of samples, then the first label row and image id.
print(len(files_with_ids))
print(y[0])
print(X[0])

In [None]:
class Model():
    """Minimal interface shared by the notebook's models.

    Subclasses are expected to override both methods; calling them on the
    base class raises ``NotImplementedError``.
    """

    def fit(self,X,y):
        """Train on samples ``X`` with labels ``y``."""
        raise NotImplementedError()

    def predict(self, X):
        """Takes test data and returns the label probabilities """
        raise NotImplementedError()

class Basic(Model):
    """Fully connected network on flattened 512x512 images, trained with MSE.

    ``input_dimension`` and ``output_dimension`` are only stored on the
    instance; the network itself is fixed to a (512, 512) input and a
    5-unit sigmoid output.

    Note: a later cell of this notebook defines another class named
    ``Basic``; once that cell has run, the name refers to that class.
    """
    def __init__(self, input_dimension, output_dimension):
        self.input_dimension = input_dimension
        self.output_dimension = output_dimension
        self.model = Sequential()
        # The input shape is declared on the first layer. It used to be passed
        # to the first Dense layer, which is not the first layer of the model,
        # so the shape declaration appears to have had no effect there.
        self.model.add(Flatten(input_shape=(512,512)))
        self.model.add(Dense(400))
        self.model.add(Activation('relu'))
        self.model.add(Dense(200))
        self.model.add(Activation('relu'))
        self.model.add(Dense(50))
        self.model.add(Activation('relu'))
        self.model.add(Dense(25))
        self.model.add(Activation('relu'))
        self.model.add(Dense(10))
        self.model.add(Activation('relu'))
        self.model.add(Dense(5))
        self.model.add(Activation('sigmoid'))
        self.model.compile(optimizer='adam', loss='mean_squared_error',metrics=['accuracy'])

    def fit(self, X,y):
        """Run one epoch over ``X``/``y`` with mini-batches of 8."""
        self.model.fit(x=X,y=y,epochs=1,batch_size=8)

    def predict(self, X):
        """Return the network's output for ``X`` (previously the result was discarded)."""
        return self.model.predict(X)

    def save(self,filename):
        """Save the underlying Keras model to ``filename``."""
        self.model.save(filename)



In [None]:
############multilabel classifier, basic model, with 5 neurons in the output layer, sigmoid activation and binary cross-entropy loss#########
class Basic(Model):
    """Fully connected network on flattened 512x512 images, trained with binary cross-entropy.

    The output layer has 5 sigmoid units. ``input_dimension`` and
    ``output_dimension`` are only stored on the instance; the network itself
    is fixed to a (512, 512) input and 5 outputs.

    Note: this definition replaces the class of the same name defined in an
    earlier cell of the notebook.
    """
    def __init__(self, input_dimension, output_dimension):
        self.input_dimension = input_dimension
        self.output_dimension = output_dimension
        self.model = Sequential()
        # The input shape is declared on the first layer. It used to be passed
        # to the first Dense layer, which is not the first layer of the model,
        # so the shape declaration appears to have had no effect there.
        self.model.add(Flatten(input_shape=(512,512)))
        self.model.add(Dense(400))
        self.model.add(Activation('relu'))
        self.model.add(Dense(30))
        self.model.add(Activation('relu'))
        self.model.add(Dense(15))
        self.model.add(Activation('relu'))
        self.model.add(Dense(10))
        self.model.add(Activation('relu'))
        self.model.add(Dense(5))
        self.model.add(Activation('sigmoid'))
        self.model.compile(optimizer='adam', loss='binary_crossentropy',metrics=['accuracy'])

    def fit(self, X,y):
        """Run one epoch over ``X``/``y`` with mini-batches of 8."""
        self.model.fit(x=X,y=y,epochs=1,batch_size=8)

    def predict(self, X):
        """Return the network's output for ``X`` (previously the result was discarded)."""
        return self.model.predict(X)

    def save(self,filename):
        """Save the underlying Keras model to ``filename`` (mirrors the earlier ``Basic``)."""
        self.model.save(filename)
        
    #y=to_categorical(y)

In [None]:
#Has an image loader for providing the image for a given id in the dataset 
#Written separately so that we can add any preprocessing steps here while the image is being loaded into memory
class DataLoader:
    """Loads windowed images by id, keeping recent ones in a TTL cache.

    Parameters
    ----------
    base_file_path : str
        Folder prefix; ``image_id + '.dcm'`` is appended to it.
    cache_size : int, default 500
        Passed to ``TTLCache`` as ``maxsize``.
    ttl_seconds : int, default 20
        Passed to ``TTLCache`` as ``ttl``.
    """
    def __init__(self,base_file_path, cache_size=500, ttl_seconds=20):
        self.base_file_path = base_file_path
        self.cache = TTLCache(maxsize=cache_size,ttl=ttl_seconds)

    ##will apply only brain windowing while loading the image for now. Need to change this to apply all windowing functions.
    def load_image(self, image_id):
        """Return the brain-windowed pixel array for ``image_id``.

        A cached array is returned when present; otherwise the file is read
        from ``self.base_file_path``, windowed with ``brain_window``, stored
        in the cache and returned.
        """
        # Single lookup instead of "in" followed by indexing: an entry can
        # expire between the two steps of the old check-then-read pattern.
        cached = self.cache.get(image_id)
        if cached is not None:
            return cached

        # Read from the folder this loader was configured with. The previous
        # code used the notebook-global `image_folder_root`, silently ignoring
        # the constructor argument.
        current_file = pydicom.dcmread(self.base_file_path + image_id + '.dcm')
        pixel_array = brain_window(current_file)
        self.cache[image_id] = pixel_array
        return pixel_array

    def trigger_expire(self):
        """Ask the cache to drop entries whose time-to-live has passed."""
        self.cache.expire()

In [None]:
class  ModelTrainer(object):
    """Trains a model on images that are loaded chunk by chunk.

    Parameters
    ----------
    dataloader : object
        Must provide ``load_image(image_id)`` and ``trigger_expire()``.
    training_batch_size : int, default 8
        Stored on the instance; the trainer itself does not use it.
    split_size : int, default 400
        Number of ids loaded into memory and passed to ``model.fit`` at once.
    """

    def __init__(self, dataloader, training_batch_size=8, split_size = 400):
        self.dataloader = dataloader
        self.training_batch_size = training_batch_size
        self.split_size = split_size

    def fit(self, X,y,model, epochs=10, training_batch_size=8 ):
        """Takes X and y as the file names and labels and trains ``model`` on them.

        For every epoch the data is shuffled, walked in chunks of
        ``self.split_size`` ids, and each chunk's images are loaded through
        the dataloader and handed to ``model.fit``. Images whose shape is not
        (512, 512) are skipped. After each epoch ``model.model`` is saved.

        ``training_batch_size`` is accepted for compatibility but not used.
        Returns the same ``model`` object that was passed in.
        """
        # The unused StratifiedKFold splitter was removed: constructing it
        # raised ValueError whenever len(y) < split_size (n_splits == 1).
        total = len(y)
        count = 1
        while epochs > 0:
            X,y = shuffle(X,y)
            print("Starting epoch ",count)
            ##TODO: add a better split and shuffle mechanism
            for processed in range(0, total, self.split_size):
                batch_imgs = []
                batch_labels = []

                current_x = X[processed:processed + self.split_size]
                current_y = y[processed:processed + self.split_size]
                for img,label in zip(current_x,current_y):
                    image = self.dataloader.load_image(img)
                    ##Figure out how many images are getting ignored because of this assumption
                    ##check if all reshape operations can happen in the dataloader
                    if image.shape != (512,512):
                        continue
                    batch_imgs.append(image)
                    batch_labels.append(label)
                print("using  batch with size", len(batch_imgs), len(batch_labels), "Processed ", processed, "Total ", len(y))
                # A chunk can end up empty when every image was skipped;
                # fitting on empty arrays would fail.
                if batch_imgs:
                    model.fit(np.array(batch_imgs),np.array(batch_labels))
                self.dataloader.trigger_expire()
                # Release the chunk before loading the next one.
                del batch_imgs
                del batch_labels

                gc.collect()
            print("Ending Epoch", count)
            print("Saving Model")
            # NOTE(review): the file name uses the number of epochs still
            # remaining (counts down), not the epoch just finished -- confirm
            # this naming is intended.
            save_model(model.model,str(epochs)+"-epoch-model-three.hdf5")
            epochs-=1
            count+=1
        return model
        

In [None]:
#m = keras.models.load_model("model.hdf5")

In [None]:
#####MULTILABEL RUN#######
# Build the loader, the model and the trainer, then train on the ids/labels
# prepared earlier. Images are loaded in chunks of 1000 ids.
dataloader = DataLoader(train_image_filepath)
# Both constructor arguments are only stored by `Basic`; the layer sizes are fixed.
model = Basic(5,5)
trainer = ModelTrainer(dataloader,split_size=1000)
# `fit` returns the same model object it was given.
model = trainer.fit(X,y,model)


In [None]:
# Print the layer-by-layer summary of the underlying Keras model.
model.model.summary()

In [None]:
####calculating precision, recall through keras itself
# The metrics have to compare the true labels with the model's predictions.
# The previous version passed `y` as both arguments, which always reports 1.0.
# NOTE(review): the samples scored here come from the training data, so the
# numbers are optimistic; use a held-out set for a real evaluation.
EVAL_SAMPLE_SIZE = 200  # how many (id, label) pairs to score

eval_images = []
eval_labels = []
for image_id, label in zip(X[:EVAL_SAMPLE_SIZE], y[:EVAL_SAMPLE_SIZE]):
    image = dataloader.load_image(image_id)
    # Same shape filter as in training.
    if image.shape != (512, 512):
        continue
    eval_images.append(image)
    eval_labels.append(label)

y_true = np.array(eval_labels)
# Call the underlying Keras model directly so the predictions are returned.
y_pred = model.model.predict(np.array(eval_images))

precision = tf.keras.metrics.Precision()
precision.update_state(y_true, y_pred)
print('Final result: ', precision.result().numpy())

recall = tf.keras.metrics.Recall()
recall.update_state(y_true, y_pred)
print('Final result: ', recall.result().numpy())

