In [10]:
import tensorflow as tf
from tensorflow import keras
import pandas as pd
from pathlib import Path
from tensorflow.keras.models import load_model
import os
from tensorflow.keras.applications import ResNet50V2
from sklearn.model_selection import StratifiedKFold
import numpy as np
tf.__version__

'2.4.1'

In [2]:
PATH = '/kaggle/input/'

os.listdir(PATH)

['melanoma224', 'dataset-melanoma224']

In [3]:
train = pd.read_csv(f'{PATH}/dataset-melanoma224/subset.csv')
val = pd.read_csv(f'{PATH}/dataset-melanoma224/val_split.csv')
train.shape, val.shape


((2220, 8), (10932, 8))

In [4]:
BATCH_SIZE = 64
AUTO = tf.data.experimental.AUTOTUNE # Permite obtener el numero de cpu 

def decode(name, label):
    img = tf.io.read_file(name)
    img = tf.image.decode_jpeg(img,channels=3)
    img = tf.cast(img, tf.float32)
    return img,label


def load_ds(df):
    # Le indicamos a tensorflow que da igual el orden de lectura de las imagenes, lo que aumentará la velocidad
    options = tf.data.Options()
    options.experimental_deterministic = False
    #---------------
    imgs , labels = df["image_name"].values, df["target"].values
    imgs = [f'{PATH}/melanoma224/jpeg224/train/{name}.jpg' for name in imgs]
    ds = tf.data.Dataset.from_tensor_slices((imgs,labels))
    ds = ds.with_options(options) # Aplicamos las opciones que hemos puesto
    ds = ds.map(decode, num_parallel_calls=AUTO) # num_parallel_calls permitira a la funcion map procesar en paralelo tantas imagenes como le pongamos
                                                 # maximo el numero de cpu que tenga el ordenador por eso usamo la variable AUTO
    df = ds.cache() # Cada epochs se leen las imagenes y es preferible cargar nuestro datasets en cache
    ds = ds.shuffle(2048)
    ds = ds.batch(BATCH_SIZE)
    ds = ds.prefetch(buffer_size = AUTO) # Mientras la gpu esta calculando la cpu vaya leyendo imagenes
    return ds


In [5]:
FOLDS = 3 # El numero de folds comun es 5 y en datasets pequeños 10
aucs = []
skf = StratifiedKFold(n_splits=FOLDS, random_state=42, shuffle=True) # Nor permitirá crear subgrupos stratificados

for f, (train_index,val_index) in enumerate(skf.split(X=np.zeros(len(train)), y=train["target"])): # Como solo nos interesa el stratified por los indices del targets podemos ponerle
                                                                                                   # a la X secuencia de ceros de la longitud de dataframe
    print("Fold: ",f+1)
    
    train_fold = train.iloc[train_index]
    val_fold = train.iloc[val_index]
    
    # Ahora deberemos meter todo aqui
    train_ds = load_ds(train_fold)
    val_ds = load_ds(val_fold)
    
    IMAGE_SIZE = (224,224,3)

    encoder = ResNet50V2(
        include_top=False,
        input_shape=IMAGE_SIZE,
        weights='imagenet'
    )
    encoder.trainable=False
    inputs = keras.Input(shape=IMAGE_SIZE)
    x = keras.layers.experimental.preprocessing.Rescaling(1./255)(inputs)
    x = encoder(x, training = False)
    x = keras.layers.GlobalAveragePooling2D()(x)
    ouputs = keras.layers.Dense(1, activation="sigmoid")(x)
    model = keras.Model(inputs, ouputs)
    #model.summary()
    
    #Compilamos el modelo
    model.compile(
    optimizer=keras.optimizers.Adam(),
    loss=keras.losses.BinaryCrossentropy(),
    metrics=[keras.metrics.AUC(name="auc")]
    )
    # Creamos nuestra callbacks
    filepath = f"kaggle/working/checkpoints/checkpoint"
    cb = tf.keras.callbacks.ModelCheckpoint(
    filepath=filepath,
    monitor="val_auc",
    verbose=1,
    save_best_only=True,
    save_weights_only=True, # Guardar todo el modelo o solo los pesos
    mode="max", # Guardara el valor maximo del parametro que monitorizamos, en caso de loss poner "min"
    )
    
    #Entrenamos nuestro modelo
    model.fit(train_ds,
          epochs=10,
          validation_data=val_ds,
          validation_steps=10,
          callbacks=[cb])
    # Al final de cada for cargamos los mejores pesos
    model.load_weights(filepath)
    model.save(f"/kaggle/working/model{f+1}.h5")# Deberemos indicar nombres diferentes para guardar los diferentes modelos que generemos
    
    # Cargamos las metricas
    _, auc = model.evaluate(val_ds) # Evaluate nos devolverá la "loss" y la metrica "auc"
    aucs.append(auc)
    
    


Fold:  1
Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50v2_weights_tf_dim_ordering_tf_kernels_notop.h5
Epoch 1/10

Epoch 00001: val_auc improved from -inf to 0.25040, saving model to kaggle/working/checkpoints/checkpoint
Epoch 2/10

Epoch 00002: val_auc improved from 0.25040 to 0.38402, saving model to kaggle/working/checkpoints/checkpoint
Epoch 3/10

Epoch 00003: val_auc improved from 0.38402 to 0.48321, saving model to kaggle/working/checkpoints/checkpoint
Epoch 4/10

Epoch 00004: val_auc improved from 0.48321 to 0.52037, saving model to kaggle/working/checkpoints/checkpoint
Epoch 5/10

Epoch 00005: val_auc improved from 0.52037 to 0.61476, saving model to kaggle/working/checkpoints/checkpoint
Epoch 6/10

Epoch 00006: val_auc improved from 0.61476 to 0.62104, saving model to kaggle/working/checkpoints/checkpoint
Epoch 7/10

Epoch 00007: val_auc improved from 0.62104 to 0.74996, saving model to kaggle/working/checkpoints/checkpoint
Ep

In [6]:
aucs

[0.629880428314209, 0.7843614816665649, 0.639297366142273]

In [7]:
np.mean(aucs),np.std(aucs)

(0.6845130920410156, 0.07070806348465987)

In [8]:

test = pd.read_csv(f'{PATH}/melanoma224/test.csv')
BATCH_SIZE = 64

def decode_test(name):
    img = tf.io.read_file(name)
    img = tf.image.decode_jpeg(img,channels=3)
    img = tf.cast(img, tf.float32)
    return img


def load_test_ds(df):
    imgs  = df["image_name"].values
    imgs = [f'{PATH}/melanoma224/jpeg224/test/{name}.jpg' for name in imgs]
    ds = tf.data.Dataset.from_tensor_slices(imgs)
    ds = ds.map(decode_test, num_parallel_calls=AUTO)
    ds = ds.batch(BATCH_SIZE)
    return ds

In [9]:
test_ds = load_test_ds(test)


In [11]:
# Como tenemos tres modelos lo que haremos sera calcular las predicciones con los tres
preds = []
for f in range(1,FOLDS+1):
    print("Folds:", f)
    model_fold = load_model(f"/kaggle/working/model{f}.h5")
    probas = model_fold.predict(test_ds)
    preds.append(probas)

Folds: 1
Folds: 2
Folds: 3


In [12]:
# Ahora haremos una media de las tres predicciones 
preds_mean = np.mean(preds, axis=0)
preds_mean

array([[0.01373591],
       [0.00668238],
       [0.01764069],
       ...,
       [0.01693267],
       [0.00294129],
       [0.03003343]], dtype=float32)

In [17]:
submission = pd.DataFrame({
    'image_name':test['image_name'].values,
    'target':preds_mean.ravel()
})

In [18]:
submission.to_csv("/kaggle/working/submission.csv", index=False)
