In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively print the full path of every file under the Kaggle input directory.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pydicom
import pandas as pd
from glob import glob
import os
from matplotlib.patches import Rectangle
import seaborn as sns
import os
from os import listdir
from os.path import isfile, join
import pylab

In [None]:
# All challenge files live under one dataset folder; build each path from that root.
_rsna_root = '../input/rsna-pneumonia-detection-challenge/'

detail_class_path = _rsna_root + 'stage_2_detailed_class_info.csv'
bounding_box_path = _rsna_root + 'stage_2_train_labels.csv'
# Note: the train directory ends with a slash, the test directory does not.
train_directory = _rsna_root + 'stage_2_train_images/'
test_directory = _rsna_root + 'stage_2_test_images'

In [None]:
# Load the class annotations, report row and distinct-patient counts,
# plot the number of rows per class, and preview the first rows.
detail_class_df = pd.read_csv(detail_class_path)
print(len(detail_class_df), 'class infos loaded')
print(detail_class_df['patientId'].nunique(), 'patient cases')
rows_per_class = detail_class_df.groupby('class').size()
rows_per_class.plot.bar()
detail_class_df.head(5)

#### Three classes are present: Lung Opacity, No Lung Opacity / Not Normal, and Normal.

### Bounding Boxes

In [None]:
# Load the training labels (one row per box), report row and
# distinct-patient counts, and preview the first rows.
bounding_box_df = pd.read_csv(bounding_box_path)
print(len(bounding_box_df), 'boxes loaded')
print(bounding_box_df['patientId'].nunique(), 'patient cases')
bounding_box_df.head(10)

### Checking for Missing Values

In [None]:
def missing_data(data):
    """Summarise missing values per column.

    Returns a DataFrame with two rows, 'Total' (number of nulls) and
    'Percent' (nulls as a percentage of the row count), and one column per
    column of ``data``, ordered from most to fewest nulls.
    """
    null_mask = data.isnull()
    null_totals = null_mask.sum().sort_values(ascending=False)
    null_percent = (null_mask.sum() / null_mask.count() * 100).sort_values(ascending=False)
    summary = pd.concat([null_totals, null_percent], axis=1, keys=['Total', 'Percent'])
    # Transpose so the metrics become rows and the original columns stay columns.
    return summary.T
# Null counts and percentages for every column of the bounding-box labels.
missing_data(bounding_box_df)

#### About 68% of the rows have no bounding box.

In [None]:
# Same null summary for the class annotations.
missing_data(detail_class_df)

#### No missing classes

### Number of images in each class

In [None]:
def get_feature_distribution(data, feature):
    """Print the count and percentage of each distinct value of a column.

    Args:
        data: DataFrame holding the column.
        feature: name of the column to summarise.

    Values are listed in ``value_counts`` order (most frequent first).
    Percentages are relative to ``len(data)`` and truncated, not rounded,
    to two decimals. Nothing is returned; the summary is printed.
    """
    # Get the count for each label
    label_counts = data[feature].value_counts()

    # Get total number of samples
    total_samples = len(data)

    print("Feature: {}".format(feature))
    for label, count in label_counts.items():
        percent = int((count / total_samples) * 10000) / 100
        # str() is required because the '<30s' format spec raises ValueError
        # for non-string labels (e.g. an integer column such as 'Target').
        print("{:<30s}:   {} or {}%".format(str(label), count, percent))

# Print how the rows of the class table are split across the 'class' values.
get_feature_distribution(detail_class_df, 'class')

#### The largest share of cases falls into the "No Lung Opacity / Not Normal" class.

### Combining Bounding Box Data and Classes

In [None]:
# Put the box labels and the class labels side by side.
# NOTE(review): concat(axis=1) aligns on the row index, so this assumes both
# CSVs list the same patients in the same row order — confirm.
# `columns=` / `axis=` are passed by keyword: the positional forms
# (`drop('patientId', 1)`, `pd.concat(objs, 1)`) raise TypeError on pandas 2.x.
comb_bounding_box_df = pd.concat(
    [bounding_box_df, detail_class_df.drop(columns='patientId')],
    axis=1,
)
print(comb_bounding_box_df.shape[0], 'combined cases')
comb_bounding_box_df.head(5)

### Number of Bounding boxes

In [None]:
# Number of label rows per patient. This counts rows, not non-null boxes
# (presumably a patient without any opacity still contributes one row — verify).
box_df = (
    comb_bounding_box_df
    .groupby('patientId')
    .size()
    .reset_index(name='boxes')
)
# Attach the per-patient row count to every combined row.
comb_box_df = pd.merge(comb_bounding_box_df, box_df, on='patientId')
# How many patients have 1, 2, 3, ... rows.
(
    box_df
    .groupby('boxes')
    .size()
    .reset_index(name='patients')
)

### Class and Target

In [None]:
# Cross-tabulate class against Target. The 'Patient Count' column holds the
# number of rows in each (class, Target) group.
comb_bounding_box_df.groupby(['class', 'Target']).size().reset_index(name='Patient Count')

In [None]:
# Grouped bar chart: number of rows per class, split by Target value.
fig, ax = plt.subplots(nrows=1, figsize=(12, 6))
tmp = comb_bounding_box_df.groupby('Target')['class'].value_counts()
# Flatten the (Target, class) index into columns next to the counts.
df = tmp.reset_index(name='Exams')
sns.barplot(data=df, x='Target', y='Exams', hue='class', palette='Set2', ax=ax)
ax.set_title("Chest exams class and Target")
plt.show()

#### There appear to be more cases with Target 0 than with Target 1, so the target variable is imbalanced.

### Plotting the dimensions of bounding boxes

In [None]:
# Distribution of box position (x, y) and size (width, height) for the rows
# that carry a box (Target == 1), one panel per quantity.
target1 = comb_bounding_box_df[comb_bounding_box_df['Target']==1]
sns.set_style('whitegrid')
# A single subplots() call creates the figure; a preceding plt.figure() would
# only emit an extra empty figure.
fig, ax = plt.subplots(2,2,figsize=(12,12))
panels = [('x', 'deeppink'), ('y', 'navy'), ('width', 'darkgreen'), ('height', 'maroon')]
for axis, (column, color) in zip(ax.flat, panels):
    # NOTE(review): sns.distplot is deprecated in recent seaborn releases;
    # consider histplot(..., kde=True) once the installed version is confirmed.
    sns.distplot(target1[column], kde=True, bins=50, color=color, ax=axis)
    # Apply the tick label size to every panel, not only the last one drawn.
    axis.tick_params(axis='both', which='major', labelsize=12)
plt.show()

## Images

In [None]:
# File names (not full paths) of the regular files in each image directory.
# os.listdir returns entries in arbitrary order, so sort them to make the
# index-based sampling of images further down reproducible.
train_images = sorted(f for f in listdir(train_directory) if isfile(join(train_directory, f)))
test_images = sorted(f for f in listdir(test_directory) if isfile(join(test_directory, f)))

In [None]:
# Report how many files were found in the train and test image directories.
print('Number of training images:', len(train_images))
print('Number of test images:', len(test_images))

### Printing images without imposing bounding boxes

In [None]:
# Show the first rows*columns training images in a grid, without boxes.
plt.style.use('default')
fig=plt.figure(figsize=(20, 10))
columns = 4; rows = 2
# Subplot positions are 1-based while the image list is 0-based; enumerate
# over a slice so the first image is not skipped and a short list cannot
# raise IndexError.
for i, image_name in enumerate(train_images[:columns * rows], start=1):
    ds = pydicom.dcmread(os.path.join(train_directory, image_name))
    panel = fig.add_subplot(rows, columns, i)
    panel.imshow(ds.pixel_array, cmap=plt.cm.bone)

### Exploring Dicom meta_data

In [None]:
# Read the DICOM file of the first patient in the label table and print its
# metadata.
patientId = bounding_box_df['patientId'].iloc[0]
# Build the path from train_directory instead of repeating the literal.
dcm_file = os.path.join(train_directory, '%s.dcm' % patientId)
# dcmread is the reader already used for the image grid in this notebook;
# read_file is its deprecated alias.
dcm_data = pydicom.dcmread(dcm_file)

print(dcm_data)

#### The file contains patient details such as age, gender, body part examined, etc.

In [None]:
# Inspect the decoded image: container type, element dtype and array shape.
im = dcm_data.pixel_array #pixels of the image
print(type(im))
print(im.dtype)
print(im.shape)

In [None]:
# Display the pixel array in grayscale with the axes hidden.
pylab.imshow(im, cmap=pylab.cm.gist_gray)
pylab.axis('off')

### Parsing each image, with its bounding boxes and label, into a separate dict

In [None]:
def parse_data(df, image_dir='../input/rsna-pneumonia-detection-challenge/stage_2_train_images'):
    """Group label rows by patient.

    Args:
        df: DataFrame with the columns 'patientId', 'x', 'y', 'width',
            'height' and 'Target' (one row per box).
        image_dir: directory holding the '<patientId>.dcm' files. Defaults to
            the training image directory of the challenge dataset.

    Returns:
        dict mapping each patientId to a dict with
            'dicom': path to the patient's DICOM file,
            'label': the 'Target' value of the patient's first row,
            'boxes': list of [y, x, height, width] lists, one per row of the
                     patient; only filled when 'label' equals 1.
    """
    image_dir = image_dir.rstrip('/')

    parsed = {}
    # Iterate over the needed columns directly; this avoids building a Series
    # per row as iterrows() does.
    rows = zip(df['patientId'], df['x'], df['y'], df['width'], df['height'], df['Target'])
    for pid, x, y, width, height, target in rows:
        #Initialize patient entry into parsed
        if pid not in parsed:
            parsed[pid] = {
                'dicom': '%s/%s.dcm' % (image_dir, pid),
                'label': target,
                'boxes': []}

        #Add box if opacity is present (decided by the patient's first row)
        if parsed[pid]['label'] == 1:
            parsed[pid]['boxes'].append([y, x, height, width])

    return parsed

In [None]:
# Build the patientId -> {'dicom', 'label', 'boxes'} lookup from the label table.
parsed = parse_data(bounding_box_df)

In [None]:
# Show the parsed entry of one patient as an example.
print(parsed['00436515-870c-4b36-a041-de91049b9ab4'])

### Overlaying bounding boxes on each image

In [None]:
def draw(data):
    """
    Method to draw single patient with bounding box(es) if present

    Args:
        data: dict with a 'dicom' entry (path of the DICOM file to read) and
            a 'boxes' entry (iterable of boxes, each passed to overlay_box).

    The image is shown on the current pylab axes with the axis hidden.
    Each box gets a randomly chosen colour, so colours differ between runs.
    """
    #Open DICOM file (dcmread is the reader already used for the image grid
    #in this notebook; read_file is its deprecated alias)
    d = pydicom.dcmread(data['dicom'])
    im = d.pixel_array

    #Convert from single-channel grayscale to 3-channel RGB
    im = np.stack([im] * 3, axis=2)

    #Add boxes with random color if present
    for box in data['boxes']:
        rgb = np.floor(np.random.rand(3) * 256).astype('int')
        im = overlay_box(im=im, box=box, rgb=rgb, stroke=6)

    pylab.imshow(im, cmap=pylab.cm.gist_gray)
    pylab.axis('off')

def overlay_box(im, box, rgb, stroke=1):
    """
    Method to overlay single box on image

    Args:
        im: image array indexed as im[row, column]; it is modified in place.
        box: sequence [y, x, height, width]; values are truncated to int.
        rgb: colour value assigned to the outline pixels.
        stroke: thickness of the outline in pixels.

    Returns:
        The same array object ``im`` with the outline painted on it.
    """
    #Convert coordinates to integers
    box = [int(b) for b in box]

    #Extract coordinates
    y1, x1, height, width = box
    y2 = y1 + height
    x2 = x1 + width

    im[y1:y1 + stroke, x1:x2] = rgb
    #The bottom edge runs to x2 + stroke so that it meets the right edge;
    #stopping at x2 leaves the bottom-right corner of the outline unpainted.
    im[y2:y2 + stroke, x1:x2 + stroke] = rgb
    im[y1:y2, x1:x1 + stroke] = rgb
    im[y1:y2, x2:x2 + stroke] = rgb

    return im

### Example

In [None]:
# Draw one example patient with any boxes it has.
draw(parsed['00436515-870c-4b36-a041-de91049b9ab4'])

In [None]:
# Draw the first rows*columns distinct patients of the label table, with boxes.
plt.style.use('default')
fig=plt.figure(figsize=(20, 10))
columns = 4; rows = 2
# Compute the distinct ids once instead of on every iteration.
patient_ids = bounding_box_df['patientId'].unique()
# Subplot positions are 1-based, the id array is 0-based: enumerate over a
# slice so the first patient is not skipped.
for i, pid in enumerate(patient_ids[:columns * rows], start=1):
    fig.add_subplot(rows, columns, i)
    draw(parsed[pid])

### Plotting the images for each class

In [None]:
# One frame per class value. reset_index() renumbers the rows from 0 and
# keeps the previous row labels in an 'index' column.
opacity = (
    detail_class_df[detail_class_df['class'] == 'Lung Opacity']
    .reset_index()
)
not_normal = (
    detail_class_df[detail_class_df['class'] == 'No Lung Opacity / Not Normal']
    .reset_index()
)
normal = (
    detail_class_df[detail_class_df['class'] == 'Normal']
    .reset_index()
)

In [None]:
# Draw the first rows*columns distinct 'Lung Opacity' patients.
plt.style.use('default')
fig=plt.figure(figsize=(20, 10))
columns = 4; rows = 2
# Compute the distinct ids once, and enumerate over a slice so the first
# patient (position 0) is not skipped by the 1-based subplot counter.
opacity_ids = opacity['patientId'].unique()
for i, pid in enumerate(opacity_ids[:columns * rows], start=1):
    fig.add_subplot(rows, columns, i)
    draw(parsed[pid])

In [None]:
# Draw the first rows*columns rows of the 'No Lung Opacity / Not Normal' frame.
plt.style.use('default')
fig=plt.figure(figsize=(20, 10))
columns = 4; rows = 2
# Select by position and enumerate from 1 for the subplot slot, so the first
# row (position 0) is not skipped.
not_normal_ids = not_normal['patientId'].iloc[:columns * rows]
for i, pid in enumerate(not_normal_ids, start=1):
    fig.add_subplot(rows, columns, i)
    draw(parsed[pid])

In [None]:
# Draw the first rows*columns rows of the 'Normal' frame.
plt.style.use('default')
fig=plt.figure(figsize=(20, 10))
columns = 4; rows = 2
# Select by position and enumerate from 1 for the subplot slot, so the first
# row (position 0) is not skipped.
normal_ids = normal['patientId'].iloc[:columns * rows]
for i, pid in enumerate(normal_ids, start=1):
    fig.add_subplot(rows, columns, i)
    draw(parsed[pid])

### Comparing image from each class

In [None]:
# One titled panel per class, each showing the first patient of that class.
fig = plt.figure(figsize=(20, 10))
columns = 3; rows = 1
class_panels = [("Normal", normal), ("Not Normal", not_normal), ("Opacity", opacity)]
for position, (title, class_df) in enumerate(class_panels, start=1):
    fig.add_subplot(rows, columns, position).set_title(title, fontsize=30)
    draw(parsed[class_df['patientId'].unique()[0]])

### Plotting images with 3 and 4 bounding boxes.

In [None]:
# The 20 patients with the most label rows, used to pick the examples below.
box_df.sort_values('boxes', ascending=False).head(20)

In [None]:
# Draw one hand-picked patient with its boxes under a figure-level title.
# NOTE(review): presumably one of the multi-box patients from the table above — verify.
fig=plt.figure(figsize=(20, 10))
plt.suptitle('"Lung Opacity" Example', fontsize=16)
draw(parsed['1c44e0a4-4612-438f-9a83-8d5bf919cb67'])

In [None]:
# Draw a second hand-picked patient with its boxes under the same title.
# NOTE(review): presumably another multi-box patient from the table above — verify.
fig=plt.figure(figsize=(20, 10))
plt.suptitle('"Lung Opacity" Example', fontsize=16)
draw(parsed['b19a9422-a790-4a43-b59f-65a3bf0f16be'])