In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# function to filter for only desired categories
def get_categories(df, col, categories):
    return df[df[col].isin(categories)]

# function to change to binary label is cancer/not cancer
def cancer_labeling(dx):
    if dx == "akiec" or dx == 'mel' or dx == 'bcc':
        return 'cancer'
    else:
        return 'not cancer'
    
def melanoma_labeling(dx):
    if dx == "mel":
        return 'melanoma'
    else:
        return 'not melanoma'

  from ._conv import register_converters as _register_converters


In [3]:
meta = pd.read_csv('skin-cancer-mnist-ham10000/HAM10000_metadata.csv')

# filter images dataset for desired categories
categories = ['bkl', 'nv', 'mel']
meta_filtered = get_categories(meta, 'dx', categories)

# append jpg to image id
meta_filtered['image_id'] = meta_filtered['image_id'].transform(lambda x: x + '.jpg')

meta_filtered.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization
0,HAM_0000118,ISIC_0027419.jpg,bkl,histo,80.0,male,scalp
1,HAM_0000118,ISIC_0025030.jpg,bkl,histo,80.0,male,scalp
2,HAM_0002730,ISIC_0026769.jpg,bkl,histo,80.0,male,scalp
3,HAM_0002730,ISIC_0025661.jpg,bkl,histo,80.0,male,scalp
4,HAM_0001466,ISIC_0031633.jpg,bkl,histo,75.0,male,ear


In [4]:
# split data into train, test, validate
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    meta_filtered['image_id'], 
    meta_filtered['dx'], 
    test_size=0.2, 
    random_state=1
)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=1)

train = pd.DataFrame(X_train).join(y_train)
test = pd.DataFrame(X_test).join(y_test)
val = pd.DataFrame(X_val).join(y_val)

# change labels to cancer/not cancer
train['melanoma'] = train['dx'].apply(lambda x: melanoma_labeling(x))
test['melanoma'] = test['dx'].apply(lambda x: melanoma_labeling(x))
val['melanoma'] = val['dx'].apply(lambda x: melanoma_labeling(x))

train = train.drop(['dx'], axis=1)
test = test.drop(['dx'], axis=1)
val = val.drop(['dx'], axis=1)

train.info()
test.info()
val.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5706 entries, 8014 to 9106
Data columns (total 2 columns):
image_id    5706 non-null object
melanoma    5706 non-null object
dtypes: object(2)
memory usage: 293.7+ KB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1784 entries, 5990 to 200
Data columns (total 2 columns):
image_id    1784 non-null object
melanoma    1784 non-null object
dtypes: object(2)
memory usage: 121.8+ KB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1427 entries, 9477 to 8472
Data columns (total 2 columns):
image_id    1427 non-null object
melanoma    1427 non-null object
dtypes: object(2)
memory usage: 73.4+ KB


In [7]:
# Create image generator classes
train_datagen = ImageDataGenerator(
    rescale=1./255,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True
)
test_datagen = ImageDataGenerator(rescale=1./255)

# Import image data
img_dir = 'HAM10000'
img_resize = (50, 50)

train_generator = train_datagen.flow_from_dataframe(
    train, 
    directory=img_dir, 
    x_col='image_id', 
    y_col='melanoma', 
    target_size=img_resize,
    class_mode='binary',
    batch_size=200
)

val_generator = train_datagen.flow_from_dataframe(
    val, 
    directory=img_dir, 
    x_col='image_id', 
    y_col='melanoma', 
    target_size=img_resize,
    class_mode='binary',
    batch_size=200
)

test_generator = test_datagen.flow_from_dataframe(
    test, 
    directory=img_dir, 
    x_col='image_id', 
    y_col='melanoma', 
    target_size=img_resize,
    class_mode='binary',
    batch_size=50
)

print(train_generator.class_indices)
print(test_generator.class_indices)
print(val_generator.class_indices)



Found 5706 images belonging to 2 classes.
Found 1427 images belonging to 2 classes.
Found 1784 images belonging to 2 classes.
{'melanoma': 0, 'not melanoma': 1}
{'melanoma': 0, 'not melanoma': 1}
{'melanoma': 0, 'not melanoma': 1}


In [None]:
model = tf.keras.Sequential()

# Add CNN layer: 2D matrix, 32 nodes, kernel=3x3 filter matrix
# 1st layer must have shape of each input image
model.add(tf.keras.layers.Conv2D(
    32, 
    kernel_size=3, 
    activation='relu', 
    input_shape=(50,50,3), 
    padding='same'
))
model.add(tf.keras.layers.Conv2D(32, (3, 3), activation='relu', padding='same'))
model.add(tf.keras.layers.MaxPooling2D(pool_size=(3, 3)))
model.add(tf.keras.layers.Dropout(0.25))

model.add(tf.keras.layers.Conv2D(64, (3, 3), activation='relu', padding='same'))
model.add(tf.keras.layers.Conv2D(64, (3, 3), activation='relu', padding='same'))
model.add(tf.keras.layers.MaxPooling2D(pool_size=(2, 2)))
model.add(tf.keras.layers.Dropout(0.5))

model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(356, activation='relu'))
model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.Dense(128, activation='relu'))
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['binary_accuracy'])

history = model.fit_generator(
        train_generator,
        steps_per_epoch=train_generator.batch_size,
        epochs=5
)

Epoch 1/5
  2/200 [..............................] - ETA: 33:03 - loss: 0.5306 - binary_accuracy: 0.6575

KeyboardInterrupt: 