In [1]:
#import click
import utils
from models import models
import warnings
import datetime
import os
import numpy as np
import pandas as pd

Using TensorFlow backend.


In [9]:
def create_test_generator(testdf, categories, batch_size):

    test_generator = utils.Three_Channel_Generator(
        testdf[['filename']],
        ycols=categories,
        desired_size=512,
        batch_size=batch_size,
        subset='test',
        random_transform=False,
        rgb=True
    )
    return test_generator


def build_submission(testdf, y_pred, dataloc):

    # build output dataframe
    df_output = testdf.copy()

    categories = [
        'any',
        'epidural',
        'intraparenchymal',
        'intraventricular',
        'subarachnoid',
        'subdural'
    ]

    if len(y_pred) < len(df_output):
        mismatch = len(df_output) - len(y_pred)
        # this is necessary because the number of test images isn't evenly divisible by the batch size.
        # The predict generator stops early. I should return to this and fix it!!!
        warnings.warn(
            'y_pred is {} entries too short. Filling with zeros'.format(mismatch))
        for _ in range(mismatch):
            y_pred = np.vstack((y_pred, np.zeros_like(categories)))

    # populate columns of df_output with predictions
    for ii, cat in enumerate(categories):
        df_output[cat] = y_pred[:len(df_output), ii]


    # using the sample submission as the prototype, iterate through and fill with actual predictions
    df_output.set_index('ID', inplace=True)
    sample_submission = pd.read_csv(os.path.join(
        dataloc, 'stage_2_sample_submission.csv'))
    submission = sample_submission.copy()

    for idx, row in submission.iterrows():
        img_id = 'ID_'+row['ID'].split('_')[1]
        if img_id == 'ID_ffffcbff8':
            continue
        hem_type = row['ID'].split('_')[2]
        submission.at[idx, 'Label'] = df_output.at[img_id, hem_type]

    datestamp = str(datetime.datetime.now()).replace(':','_').replace(' ','T')
    submission_filename = os.path.join(dataloc,'submission_{}.csv'.format(datestamp))
    submission.to_csv(submission_filename, index=False)
    return submission_filename


def upload_submission(path_to_submission, message='none'):
    from urllib.request import urlretrieve
    import yaml
    credentials_url = 'https://www.dropbox.com/s/tjs6z6pna1get6g/kaggle_credentials.yml?dl=1'
    urlretrieve(credentials_url, 'credentials.yml')
    with open('credentials.yml', 'r') as f:
        credentials = yaml.load(f)
    # remove credentials so they don't end up being accidentally committed
    os.remove('credentials.yml')

    os.environ["KAGGLE_USERNAME"] = credentials["KAGGLE_USERNAME"]
    os.environ["KAGGLE_KEY"] = credentials["KAGGLE_KEY"]

    os.system('export -p | grep KAGGLE_')

    os.system('kaggle competitions submit -c rsna-intracranial-hemorrhage-detection -f "{}" -m "{}"'.format(
        path_to_submission,
        message
    ))

    os.system(
        'kaggle competitions submissions -c rsna-intracranial-hemorrhage-detection')

In [None]:
dataloc = '/ssd1'

path_to_weights = os.path.join(dataloc,'model_weights_vgg19_2019-11-09T12_02_24.854862.h5')
model = 'vgg19'
rgb=True
batch_size = 8

# load test data
test_df = utils.load_test_data(dataloc, stage=2)

#     categories = utils.define_categories(test_df)
categories = [
    'any',
    'epidural',
    'intraparenchymal',
    'intraventricular',
    'subarachnoid',
    'subdural'
]
print("CATEGORIES = {}".format(categories))

# load model
model = models(model, input_image_size=512, number_of_output_categories=len(categories))

# load weights
model.load_weights(path_to_weights)

# instantiate generator

test_generator = create_test_generator(test_df, categories, batch_size)

test_generator.__reset__() # make sure the generator is starting at index 0!!!
# predict
y_pred = model.predict_generator(
    test_generator,
    steps=len(test_df)//batch_size+1,
    verbose=1
)

In [8]:
test_df.head()

Unnamed: 0,filename,ID
0,/ssd1/stage_2_test_images/ID_d89585863.dcm,ID_d89585863
1,/ssd1/stage_2_test_images/ID_095f60af7.dcm,ID_095f60af7
2,/ssd1/stage_2_test_images/ID_1f8bf6c51.dcm,ID_1f8bf6c51
3,/ssd1/stage_2_test_images/ID_5303b8a95.dcm,ID_5303b8a95
4,/ssd1/stage_2_test_images/ID_5752f287d.dcm,ID_5752f287d


In [10]:
# build submission
submission_filename = build_submission(test_df, y_pred, dataloc)

In [11]:
submission_filename

'/ssd1/submission_2019-11-13T15_28_55.205119.csv'

In [5]:
testdf.head()

NameError: name 'testdf' is not defined

In [26]:
testdf = test_df

# build output dataframe
df_output = testdf.copy()

categories = [
    'any',
    'epidural',
    'intraparenchymal',
    'intraventricular',
    'subarachnoid',
    'subdural'
]

if len(y_pred) < len(df_output):
    mismatch = len(df_output) - len(y_pred)
    # this is necessary because the number of test images isn't evenly divisible by the batch size.
    # The predict generator stops early. I should return to this and fix it!!!
    warnings.warn(
        'y_pred is {} entries too short. Filling with zeros'.format(mismatch))
    for _ in range(mismatch):
        y_pred = np.vstack((y_pred, np.zeros_like(categories)))

# populate columns of df_output with predictions
for ii, cat in enumerate(categories):
#     df_output[cat] = y_pred[:len(df_output), ii]
    if cat == 'any':
        df_output[cat] = 1.0
    else:
        df_output[cat] = 0.0
    
df_output.head()

Unnamed: 0,filename,ID,any,epidural,intraparenchymal,intraventricular,subarachnoid,subdural
0,/ssd1/stage_2_test_images/ID_d89585863.dcm,ID_d89585863,1.0,0.0,0.0,0.0,0.0,0.0
1,/ssd1/stage_2_test_images/ID_095f60af7.dcm,ID_095f60af7,1.0,0.0,0.0,0.0,0.0,0.0
2,/ssd1/stage_2_test_images/ID_1f8bf6c51.dcm,ID_1f8bf6c51,1.0,0.0,0.0,0.0,0.0,0.0
3,/ssd1/stage_2_test_images/ID_5303b8a95.dcm,ID_5303b8a95,1.0,0.0,0.0,0.0,0.0,0.0
4,/ssd1/stage_2_test_images/ID_5752f287d.dcm,ID_5752f287d,1.0,0.0,0.0,0.0,0.0,0.0


In [28]:
# using the sample submission as the prototype, iterate through and fill with actual predictions
df_output.set_index('ID', inplace=True)
sample_submission = pd.read_csv(os.path.join(
    dataloc, 'stage_2_sample_submission.csv'))
submission = sample_submission.copy()

In [29]:
for idx, row in submission.iterrows():
    img_id = 'ID_'+row['ID'].split('_')[1]
    if img_id == 'ID_ffffcbff8':
        continue
    hem_type = row['ID'].split('_')[2]
    submission.at[idx, 'Label'] = df_output.at[img_id, hem_type]
    print(idx,end='\r')

727391

In [30]:
submission.tail(6)

Unnamed: 0,ID,Label
727386,ID_6b455e140_epidural,0.0
727387,ID_6b455e140_intraparenchymal,0.0
727388,ID_6b455e140_intraventricular,0.0
727389,ID_6b455e140_subarachnoid,0.0
727390,ID_6b455e140_subdural,0.0
727391,ID_6b455e140_any,1.0


In [31]:
df_output.loc['ID_6b455e140']

filename            /ssd1/stage_2_test_images/ID_6b455e140.dcm
any                                                          1
epidural                                                     0
intraparenchymal                                             0
intraventricular                                             0
subarachnoid                                                 0
subdural                                                     0
Name: ID_6b455e140, dtype: object

In [32]:
datestamp = str(datetime.datetime.now()).replace(':','_').replace(' ','T')
submission_filename = os.path.join(dataloc,'submission_binary_{}.csv'.format(datestamp))
submission.to_csv(submission_filename, index=False)
submission_filename

'/ssd1/submission_binary_2019-11-09T11_45_26.110098.csv'

In [13]:
df_output

Unnamed: 0_level_0,filename
ID,Unnamed: 1_level_1
1.517623e-03,/ssd1/stage_1_test_images/ID_ce4ffc33c.dcm
1.803156e-08,/ssd1/stage_1_test_images/ID_abff5116f.dcm
4.312943e-01,/ssd1/stage_1_test_images/ID_846566139.dcm
4.409184e-04,/ssd1/stage_1_test_images/ID_54183cf4f.dcm
8.622026e-07,/ssd1/stage_1_test_images/ID_7582c206d.dcm
1.313751e-03,/ssd1/stage_1_test_images/ID_1415bbbc9.dcm
2.442597e-09,/ssd1/stage_1_test_images/ID_022e35f8b.dcm
6.387699e-01,/ssd1/stage_1_test_images/ID_1cab3b0f3.dcm
7.867962e-06,/ssd1/stage_1_test_images/ID_5625ac30a.dcm
3.334422e-02,/ssd1/stage_1_test_images/ID_48634c499.dcm


In [14]:
temp = utils.load_test_data(dataloc)

In [15]:
temp

Unnamed: 0,filename,ID
0,/ssd1/stage_1_test_images/ID_ce4ffc33c.dcm,ID_ce4ffc33c
1,/ssd1/stage_1_test_images/ID_abff5116f.dcm,ID_abff5116f
2,/ssd1/stage_1_test_images/ID_846566139.dcm,ID_846566139
3,/ssd1/stage_1_test_images/ID_54183cf4f.dcm,ID_54183cf4f
4,/ssd1/stage_1_test_images/ID_7582c206d.dcm,ID_7582c206d
5,/ssd1/stage_1_test_images/ID_1415bbbc9.dcm,ID_1415bbbc9
6,/ssd1/stage_1_test_images/ID_022e35f8b.dcm,ID_022e35f8b
7,/ssd1/stage_1_test_images/ID_1cab3b0f3.dcm,ID_1cab3b0f3
8,/ssd1/stage_1_test_images/ID_5625ac30a.dcm,ID_5625ac30a
9,/ssd1/stage_1_test_images/ID_48634c499.dcm,ID_48634c499


In [21]:
testdf[['filename']+categories]

KeyError: "['any' 'epidural' 'intraparenchymal' 'intraventricular' 'subarachnoid'\n 'subdural'] not in index"

In [17]:
categories

['ID']