## Dependencies

In [1]:
!pip install --quiet efficientnet

In [2]:
import glob, json, re
from melanoma_utility_scripts import *
from kaggle_datasets import KaggleDatasets
from tensorflow.keras import Model
import tensorflow.keras.layers as L
import tensorflow.keras.backend as K
import efficientnet.tfkeras as efn

# Seed value handed to the seeding helper for this run.
# NOTE(review): seed_everything is not defined in this notebook; presumably it comes from the
# `melanoma_utility_scripts` star import — confirm which libraries it actually seeds.
SEED = 0
seed_everything(SEED)

## TPU configuration

In [3]:
# Obtain the distribution strategy (and TPU handle); the strategy scope is used when building the models.
# NOTE(review): set_up_strategy and `tf` are not defined in this notebook; presumably they come from
# the `melanoma_utility_scripts` star import — confirm.
strategy, tpu = set_up_strategy()
print("REPLICAS: ", strategy.num_replicas_in_sync)
# Passed as `buffer_size` to the tf.data helpers so TensorFlow picks parallelism/prefetch sizes itself.
AUTO = tf.data.experimental.AUTOTUNE

REPLICAS:  1


# Model parameters

In [4]:
# Directory holding the trained fold checkpoints and the config.json saved with them.
input_base_path = '/kaggle/input/2-melanoma-5fold-efficientnetb3/'
# Kaggle dataset name that is resolved to a GCS path when the TFRecords are listed.
dataset_path = 'melanoma-256x256'

# Load the configuration stored next to the model weights.
with open(input_base_path + 'config.json') as json_file:
    config = json.load(json_file)

# Replace whatever batch size the file holds with the one used for prediction.
config['BATCH_SIZE'] = 512
config

{'HEIGHT': 256,
 'WIDTH': 256,
 'CHANNELS': 3,
 'BATCH_SIZE': 512,
 'EPOCHS': 40,
 'LEARNING_RATE': 0.0003,
 'ES_PATIENCE': 10,
 'N_FOLDS': 5,
 'BASE_MODEL_PATH': '/kaggle/input/efficientnet/efficientnet-b3_weights_tf_dim_ordering_tf_kernels_autoaugment_notop.h5',
 'DATASET_PATH': 'melanoma-256x256'}

# Load data

In [5]:
# Read the test-set metadata CSV from the competition data directory.
database_base_path = '/kaggle/input/siim-isic-melanoma-classification/'
test = pd.read_csv(database_base_path + 'test.csv')

print(f'Test samples: {len(test)}')
display(test.head())

# Resolve the GCS location of the TFRecord dataset and list its test shards (test*.tfrec).
GCS_PATH = KaggleDatasets().get_gcs_path(dataset_path)
TEST_FILENAMES = tf.io.gfile.glob(GCS_PATH + '/test*.tfrec')

Test samples: 10982


Unnamed: 0,image_name,patient_id,sex,age_approx,anatom_site_general_challenge
0,ISIC_0052060,IP_3579794,male,70.0,
1,ISIC_0052349,IP_7782715,male,40.0,lower extremity
2,ISIC_0058510,IP_7960270,female,55.0,torso
3,ISIC_0073313,IP_6375035,female,50.0,torso
4,ISIC_0073502,IP_0589375,female,45.0,lower extremity


In [6]:
# Collect the saved fold checkpoints (*.h5) in sorted order.
model_path_list = sorted(glob.glob(input_base_path + '*.h5'))
n_models = len(model_path_list)

# Fail fast: with no checkpoints the prediction loop would never run and the
# zero-initialised predictions would be written to the submission unnoticed.
if n_models == 0:
    raise FileNotFoundError(f'No .h5 model files found in {input_base_path}')

print(f'{n_models} Models to predict:')
print(*model_path_list, sep='\n')

5 Models to predict:
/kaggle/input/2-melanoma-5fold-efficientnetb3/model_fold_1.h5
/kaggle/input/2-melanoma-5fold-efficientnetb3/model_fold_2.h5
/kaggle/input/2-melanoma-5fold-efficientnetb3/model_fold_3.h5
/kaggle/input/2-melanoma-5fold-efficientnetb3/model_fold_4.h5
/kaggle/input/2-melanoma-5fold-efficientnetb3/model_fold_5.h5


## Auxiliary functions

In [7]:
# Datasets utility functions

# Parsing schema for the unlabeled TFRecords. Every field is a single element
# (shape []): the image and its name are byte strings, the meta features are int64.
UNLABELED_TFREC_FORMAT = {
    **{
        field: tf.io.FixedLenFeature([], tf.string)  # tf.string means bytestring
        for field in ("image", "image_name")
    },
    # meta features
    **{
        field: tf.io.FixedLenFeature([], tf.int64)
        for field in ("patient_id", "sex", "age_approx", "anatom_site_general_challenge")
    },
}

def decode_image(image_data, height, width, channels):
    """Decode JPEG bytes into a float32 tensor reshaped to [height, width, channels].

    The decoded pixels are cast to float32 and divided by 255.0 before the
    reshape to the requested dimensions.
    """
    pixels = tf.image.decode_jpeg(image_data, channels=channels)
    scaled = tf.cast(pixels, tf.float32) / 255.0
    return tf.reshape(scaled, [height, width, channels])

def read_unlabeled_tfrecord(example, height=config['HEIGHT'], width=config['WIDTH'], channels=config['CHANNELS']):
    """Parse one serialized unlabeled example into model inputs plus its image name.

    Returns a tuple ``(features, image_name)`` where ``features`` is
    ``{'input_image': <decoded image>, 'input_tabular': <dict of meta features>}``.
    The default height/width/channels are read from ``config`` once, when this
    function is defined.
    """
    parsed = tf.io.parse_single_example(example, UNLABELED_TFREC_FORMAT)

    # meta features: the plain integer fields are cast to int32 ...
    tabular = {
        key: tf.cast(parsed[key], tf.int32)
        for key in ('patient_id', 'sex', 'age_approx')
    }
    # ... while the anatomical-site code is one-hot encoded with depth 7 first.
    site_one_hot = tf.one_hot(parsed['anatom_site_general_challenge'], 7)
    tabular['anatom_site_general_challenge'] = tf.cast(site_one_hot, tf.int32)

    features = {
        'input_image': decode_image(parsed['image'], height, width, channels),
        'input_tabular': tabular,
    }
    return features, parsed['image_name']

def load_dataset_test(filenames, buffer_size=-1):
    """Build an unbatched dataset of parsed test examples from TFRecord files.

    ``buffer_size`` is forwarded both as ``num_parallel_reads`` for the record
    reader and as ``num_parallel_calls`` for the parsing map. Each element is
    whatever ``read_unlabeled_tfrecord`` returns for one record.
    """
    records = tf.data.TFRecordDataset(filenames, num_parallel_reads=buffer_size)
    return records.map(read_unlabeled_tfrecord, num_parallel_calls=buffer_size)

def get_test_dataset(filenames, batch_size=32, buffer_size=-1):
    """Return the batched, prefetched test dataset for the given TFRecord files.

    The final partial batch is kept (``drop_remainder=False``) so every test
    example is yielded. ``buffer_size`` is used for loading and for prefetching.
    """
    return (
        load_dataset_test(filenames, buffer_size=buffer_size)
        .batch(batch_size, drop_remainder=False)
        .prefetch(buffer_size)
    )

# Model

In [8]:
def model_fn(input_shape):
    """Build the EfficientNetB3 classifier with a single sigmoid output.

    The backbone is created without weights and without its top; its output is
    global-average-pooled and fed to a one-unit sigmoid dense layer. The single
    model input is named 'input_image'.
    """
    image_in = L.Input(shape=input_shape, name='input_image')
    backbone = efn.EfficientNetB3(weights=None, include_top=False)

    feature_maps = backbone(image_in)
    pooled = L.GlobalAveragePooling2D()(feature_maps)
    probability = L.Dense(1, activation='sigmoid')(pooled)

    return Model(inputs=image_in, outputs=probability)

# Make predictions

In [9]:
# Build the batched test pipeline once; it is iterated again for every fold model.
test_dataset = get_test_dataset(TEST_FILENAMES, batch_size=config['BATCH_SIZE'], buffer_size=AUTO)
NUM_TEST_IMAGES = len(test)
test_preds = np.zeros((NUM_TEST_IMAGES, 1))


# Ensemble by averaging: each fold contributes its prediction divided by the number of models.
for model_path in model_path_list:
    print(model_path)
    with strategy.scope():
        model = model_fn((config['HEIGHT'], config['WIDTH'], config['CHANNELS']))
        model.load_weights(model_path)

    test_preds += model.predict(test_dataset) / n_models


# Predictions follow the order of the TFRecords, so read the image names from the
# same dataset and join them to the CSV rows by name rather than by position.
image_names = next(iter(test_dataset.unbatch().map(lambda data, image_name: image_name).batch(NUM_TEST_IMAGES))).numpy().astype('U')
name_preds = dict(zip(image_names.tolist(), test_preds.reshape(len(test_preds))))

# Keep the fail-fast behaviour of a plain dict lookup: every CSV row must have a prediction.
unmatched_names = set(test['image_name']) - set(name_preds)
if unmatched_names:
    raise KeyError(f'{len(unmatched_names)} test images have no prediction, e.g. {sorted(unmatched_names)[:5]}')

# Vectorized lookup instead of a row-wise DataFrame.apply.
test['target'] = test['image_name'].map(name_preds)

/kaggle/input/2-melanoma-5fold-efficientnetb3/model_fold_1.h5
/kaggle/input/2-melanoma-5fold-efficientnetb3/model_fold_2.h5
/kaggle/input/2-melanoma-5fold-efficientnetb3/model_fold_3.h5
/kaggle/input/2-melanoma-5fold-efficientnetb3/model_fold_4.h5
/kaggle/input/2-melanoma-5fold-efficientnetb3/model_fold_5.h5


# Visualize predictions

In [10]:
print('Top 10 samples')
# Metadata and ensemble target, followed by any per-fold prediction columns present in `test`.
display_columns = ['image_name', 'sex', 'age_approx', 'anatom_site_general_challenge', 'target']
display_columns += [col for col in test.columns if col.startswith('pred_fold')]
display(test[display_columns].head(10))

print('Top 10 positive samples')
# Same column selection, restricted to rows whose averaged prediction is at least 0.5.
positive_rows = test[display_columns].query('target >= .5')
display(positive_rows.head(10))

Top 10 samples


Unnamed: 0,image_name,sex,age_approx,anatom_site_general_challenge,target
0,ISIC_0052060,male,70.0,,0.000458
1,ISIC_0052349,male,40.0,lower extremity,0.000147
2,ISIC_0058510,female,55.0,torso,0.003709
3,ISIC_0073313,female,50.0,torso,0.000344
4,ISIC_0073502,female,45.0,lower extremity,0.002317
5,ISIC_0074618,male,50.0,lower extremity,0.007215
6,ISIC_0076801,male,45.0,upper extremity,0.004527
7,ISIC_0077586,male,50.0,lower extremity,0.00389
8,ISIC_0082004,female,45.0,torso,0.002052
9,ISIC_0082785,male,65.0,lower extremity,0.002775


Top 10 positive samples


Unnamed: 0,image_name,sex,age_approx,anatom_site_general_challenge,target
1408,ISIC_1364884,female,35.0,upper extremity,0.663325
1677,ISIC_1593714,female,50.0,lower extremity,0.621903
3046,ISIC_2840019,male,25.0,lower extremity,0.506212
7200,ISIC_6519116,female,55.0,torso,0.50186
8061,ISIC_7301931,female,65.0,upper extremity,0.642449
8070,ISIC_7308210,male,30.0,upper extremity,0.514586
10448,ISIC_9495534,female,35.0,upper extremity,0.616305


# Test set predictions

In [11]:
# Start from the sample submission so the output has exactly the expected rows and columns.
submission = pd.read_csv(database_base_path + 'sample_submission.csv')

# Join on image_name instead of relying on both CSVs sharing the same row order.
target_by_name = dict(zip(test['image_name'], test['target']))
submission['target'] = submission['image_name'].map(target_by_name)

# Refuse to write a submission with holes (unknown image names or NaN predictions).
if submission['target'].isna().any():
    n_bad = int(submission['target'].isna().sum())
    raise ValueError(f'{n_bad} submission rows have a missing or NaN target')

submission.to_csv('submission.csv', index=False)
submission.head(10)

Unnamed: 0,image_name,target
0,ISIC_0052060,0.000458
1,ISIC_0052349,0.000147
2,ISIC_0058510,0.003709
3,ISIC_0073313,0.000344
4,ISIC_0073502,0.002317
5,ISIC_0074618,0.007215
6,ISIC_0076801,0.004527
7,ISIC_0077586,0.00389
8,ISIC_0082004,0.002052
9,ISIC_0082785,0.002775
