#Instruções

O dataset com as fotos dos ferimentos/pintas está disponível no link: https://www.dropbox.com/s/8o4ysiccspp46ov/skin-cancer-mnist-ham10000.zip?e=1&dl=0

Siga as instruções abaixo:

1 - Extraia o dataset no seu Google Drive e substitua o valor da variável raw_files_folder pelo diretório onde ele foi extraído.

2 - Crie as pastas Imagens_treino, Imagens_teste e Imagens_valid dentro desse diretório.

3 - As variáveis train_folder_id, test_folder_id e validation_folder_id contêm os nomes das pastas criadas para receber as imagens distribuídas de forma pseudoaleatória; mantenha as strings dessas variáveis.

#Importar Libs

In [None]:
import os
import random
from shutil import copyfile
import numpy as np
from PIL import Image
from tensorflow.keras.utils import to_categorical
import pandas as pd
import matplotlib.pyplot as plt

from tensorflow import keras
from keras.models import Sequential, load_model
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import Flatten
from keras.layers import Conv2D
from keras.layers import MaxPooling2D

#Conectar ao Google Drive e definir as variáveis iniciais

In [None]:
from google.colab import drive

# Make Google Drive reachable under /content/drive
drive.mount('/content/drive')

# ---- Global settings ----

# Dataset root and the two raw image folders inside it
raw_files_folder = '/content/drive/My Drive/dataset_deep_learning/skin_cancer/'
local_folder_path_1 = f'{raw_files_folder}HAM10000_images_part_1/'
local_folder_path_2 = f'{raw_files_folder}HAM10000_images_part_2/'

# Names of the per-split folders (resolved relative to raw_files_folder)
train_folder_id = 'Imagens_treino'
test_folder_id = 'Imagens_teste'
validation_folder_id = 'Imagens_valid'

# Sample budget per split (tune to what your environment can handle)
FILES_TREINO = 4001
FILES_TESTE = 1001
FILES_VALID = 1001

# Half of each budget: the number of files taken from each raw image folder
FILES_FILTRAR_TREINO = FILES_TREINO // 2
FILES_FILTRAR_TESTE = FILES_TESTE // 2
FILES_FILTRAR_VALID = FILES_VALID // 2


Mounted at /content/drive


#Distribuindo arquivos entre as pastas, criar 3 pastas (treino, teste e validação) e subir as novas imagens

In [None]:
# Collect the file names found in each of the two raw image folders
files_1, files_2 = (
    os.listdir(source_dir)
    for source_dir in (local_folder_path_1, local_folder_path_2)
)

In [None]:
# Training split: the leading entries of each listing
train_files_part_1, train_files_part_2 = (
    listing[:FILES_FILTRAR_TREINO] for listing in (files_1, files_2)
)

# Validation split: the trailing entries of each listing
validation_files_part1, validation_files_part2 = (
    listing[-FILES_FILTRAR_VALID:] for listing in (files_1, files_2)
)

# Test split: a window centred on the midpoint of files_1; the very same
# index window is applied to both listings
middle_index = len(files_1) // 2
start_index = middle_index - FILES_FILTRAR_TESTE // 2
end_index = middle_index + FILES_FILTRAR_TESTE // 2
test_files_part_1, test_files_part_2 = (
    listing[start_index:end_index][:FILES_TESTE] for listing in (files_1, files_2)
)


In [None]:
def upload_files_to_drive(files, folder_id, folder_path):
    """Copy the given files from a source folder into a split folder.

    Args:
        files: Iterable of file names (not full paths) located in
            ``folder_path``.
        folder_id: Name of the destination sub-folder; it is resolved
            relative to the module-level ``raw_files_folder``.
        folder_path: Directory that currently holds the files.
    """
    # The destination is the same for every file, so resolve it only once.
    dest_folder_path = os.path.join(raw_files_folder, folder_id)

    # Create the destination on demand so the copy does not fail when the
    # folder was not created by hand beforehand.
    os.makedirs(dest_folder_path, exist_ok=True)

    for filename in files:
        copyfile(
            os.path.join(folder_path, filename),
            os.path.join(dest_folder_path, filename),
        )


In [None]:
# Copy the training images from both raw folders into the training folder
upload_files_to_drive(
    files=train_files_part_1,
    folder_id=train_folder_id,
    folder_path=local_folder_path_1,
)
upload_files_to_drive(
    files=train_files_part_2,
    folder_id=train_folder_id,
    folder_path=local_folder_path_2,
)

In [None]:
# Copy the test images from both raw folders into the test folder
upload_files_to_drive(
    files=test_files_part_1,
    folder_id=test_folder_id,
    folder_path=local_folder_path_1,
)
upload_files_to_drive(
    files=test_files_part_2,
    folder_id=test_folder_id,
    folder_path=local_folder_path_2,
)

In [None]:
# Copy the validation images from both raw folders into the validation folder
upload_files_to_drive(
    files=validation_files_part1,
    folder_id=validation_folder_id,
    folder_path=local_folder_path_1,
)
upload_files_to_drive(
    files=validation_files_part2,
    folder_id=validation_folder_id,
    folder_path=local_folder_path_2,
)

#Processamento das imagens

##Listagem das imagens

In [None]:
def process_images_from_folder(folder_path, files):
    """Load the requested images from a folder as NumPy arrays.

    The images are loaded in the order given by ``files`` so that row ``i``
    of the result always belongs to ``files[i]``, independently of the order
    in which the file system lists the folder.

    Args:
        folder_path: Directory containing the image files.
        files: Image names to load, either full file names or names without
            extension. When several files share the same extension-less
            name, the first one listed by the file system is used.

    Returns:
        ``np.ndarray`` holding one image array per entry of ``files``.

    Raises:
        FileNotFoundError: If an entry of ``files`` matches no file in
            ``folder_path``.
    """
    # Index the folder both by extension-less name and by full file name.
    available = {}
    listing = os.listdir(folder_path)
    for filename in listing:
        available.setdefault(os.path.splitext(filename)[0], filename)
    for filename in listing:
        available[filename] = filename

    images = []
    for name in files:
        if name not in available:
            raise FileNotFoundError(
                f"No image named {name!r} found in {folder_path!r}"
            )
        image_path = os.path.join(folder_path, available[name])
        # The context manager releases the file handle as soon as the pixel
        # data has been copied into the array.
        with Image.open(image_path) as image:
            images.append(np.array(image))
    return np.array(images)

In [None]:
# Drive folders that hold the images of each split
train_folder = f'{raw_files_folder}{train_folder_id}'
test_folder = f'{raw_files_folder}{test_folder_id}'
validation_folder = f'{raw_files_folder}{validation_folder_id}'

# Image ids per split: file names with double quotes and the extension
# removed, capped at the sample budget of the split
train_files = [
    os.path.splitext(entry.replace('"', ''))[0]
    for entry in os.listdir(train_folder)[:FILES_TREINO]
]
test_files = [
    os.path.splitext(entry.replace('"', ''))[0]
    for entry in os.listdir(test_folder)[:FILES_TESTE]
]
valid_files = [
    os.path.splitext(entry.replace('"', ''))[0]
    for entry in os.listdir(validation_folder)[:FILES_VALID]
]


##Processamento inicial das features de treino, validação e teste

In [None]:
# Load the images of every split folder as arrays
x_train = process_images_from_folder(folder_path=train_folder, files=train_files)
x_validation = process_images_from_folder(folder_path=validation_folder, files=valid_files)
x_test = process_images_from_folder(folder_path=test_folder, files=test_files)


###Imagem colorida contém mais dimensões; convertendo para apenas 1D (features) // redimensionar para 50x50 devido ao tamanho da imagem

In [None]:
def resize_images(images, size=(50, 50)):
    """Resize every image of a batch to a common size.

    Args:
        images: Iterable of image arrays accepted by ``Image.fromarray``.
        size: Target ``(width, height)`` in pixels, forwarded to
            ``Image.resize``. Defaults to ``(50, 50)``.

    Returns:
        ``np.ndarray`` stacking the resized images in input order.
    """
    return np.array(
        [np.array(Image.fromarray(img).resize(size)) for img in images]
    )

# Bring the training, validation and test images to the common size
x_train_resized = resize_images(images=x_train)
x_validation_resized = resize_images(images=x_validation)
x_test_resized = resize_images(images=x_test)


In [None]:
x_train_resized.shape

In [None]:
# Flatten every image into a single row (one row per image)
x_train_flat = x_train_resized.reshape(len(x_train_resized), -1)
x_validation_flat = x_validation_resized.reshape(len(x_validation_resized), -1)
x_test_flat = x_test_resized.reshape(len(x_test_resized), -1)

In [None]:
# Wrap the flattened arrays in DataFrames with one 'pixel_<i>' column per value
df_train_pixels = pd.DataFrame(
    x_train_flat,
    columns=list(map('pixel_{}'.format, range(x_train_flat.shape[1]))),
)
df_validation_pixels = pd.DataFrame(
    x_validation_flat,
    columns=list(map('pixel_{}'.format, range(x_validation_flat.shape[1]))),
)
df_test_pixels = pd.DataFrame(
    x_test_flat,
    columns=list(map('pixel_{}'.format, range(x_test_flat.shape[1]))),
)


In [None]:
# Sanity check: print the (rows, columns) shape of each pixel DataFrame
print("Shape do DataFrame de treino:", df_train_pixels.shape)
print("Shape do DataFrame de validação:", df_validation_pixels.shape)
print("Shape do DataFrame de teste:", df_test_pixels.shape)

In [None]:
# Attach the image id of every row so the pixels can later be joined with
# the metadata labels.
# NOTE(review): assumes the rows of each pixel DataFrame follow the same
# order as the corresponding id list — confirm against how the images
# were loaded.
df_train_pixels['image_id']= train_files
df_validation_pixels['image_id']= valid_files
df_test_pixels['image_id']= test_files

##Processamento das Labels [é câncer, não é câncer]

In [None]:
# Load the dataset metadata and keep only the image id and diagnosis code
dataset_path = raw_files_folder + 'HAM10000_metadata.csv'
metadados = pd.read_csv(dataset_path)
metadados_dx = metadados[['image_id', 'dx']].copy()

# Diagnosis codes labelled "não é câncer":
#   'akiec': actinic keratosis / intraepithelial carcinoma
#   'nv':    melanocytic nevus
#   'bkl':   benign keratosis-like lesions
#   'df':    dermatofibroma (benign)
#   'vasc':  vascular lesions (benign)
# Every other code (in HAM10000: 'mel' and 'bcc') is labelled "é câncer".
# 'df' and 'vasc' were missing from this list and were therefore being
# labelled as cancer.
# NOTE(review): 'akiec' includes intraepithelial carcinoma and is often
# treated as malignant/pre-malignant; it is kept here as originally
# classified — confirm the intended grouping.
nao_cancer = ['akiec', 'nv', 'bkl', 'df', 'vasc']

# Binary target column used by the network
metadados_dx['dx_rede'] = np.where(
    metadados_dx['dx'].isin(nao_cancer), 'não é câncer', 'é câncer'
)

df_target = metadados_dx[['image_id', 'dx_rede']].copy()

###Criação das Labels a partir dos arquivos das pastas de treino, teste e validação

In [None]:
# Keep only the metadata rows of the images present in each split.
# Explicit copies make the results independent frames, so assigning to
# their columns later does not act on a slice of df_target.
df_train_filtered = df_target[df_target['image_id'].isin(train_files)].copy()
df_test_filtered = df_target[df_target['image_id'].isin(test_files)].copy()
df_valid_filtered = df_target[df_target['image_id'].isin(valid_files)].copy()

In [None]:
# Make sure the image ids use the same format (no file extension) as the
# ids stored with the pixel data. ``assign`` returns a new DataFrame, which
# avoids writing into a filtered slice of the metadata frame.
df_train_filtered = df_train_filtered.assign(
    image_id=df_train_filtered['image_id'].apply(lambda x: os.path.splitext(x)[0])
)
df_valid_filtered = df_valid_filtered.assign(
    image_id=df_valid_filtered['image_id'].apply(lambda x: os.path.splitext(x)[0])
)
df_test_filtered = df_test_filtered.assign(
    image_id=df_test_filtered['image_id'].apply(lambda x: os.path.splitext(x)[0])
)


###Criação do dataset processado com as imagens e labels, definindo para cada imagem o que é cancerígeno e o que não é

In [None]:
# Join pixels and labels of every split on the shared 'image_id' column
df_train_merged = df_train_pixels.merge(df_train_filtered, on='image_id')
df_validation_merged = df_validation_pixels.merge(df_valid_filtered, on='image_id')
df_test_merged = df_test_pixels.merge(df_test_filtered, on='image_id')

# The id is only needed for the join; drop it from the modelling frames
df_train_true = df_train_merged.drop(columns='image_id')
df_test_true = df_test_merged.drop(columns='image_id')
df_validation_true = df_validation_merged.drop(columns='image_id')

In [None]:
# Rename the target column from "dx_rede" to "label" in every split
df_train_true = df_train_true.rename(columns={'dx_rede': 'label'})
df_test_true = df_test_true.rename(columns={'dx_rede': 'label'})
df_validation_true = df_validation_true.rename(columns={'dx_rede': 'label'})

In [None]:
df_train_true.head()

In [None]:
df_test_true.head()

In [None]:
df_validation_true.head()

In [None]:
df_train_true.shape

In [None]:
df_test_true.shape

In [None]:
df_validation_true.shape

In [None]:
len(df_train_true['label'].unique())

In [None]:
(df_train_true['label'].unique())

# Normalização do dataset e mapeamento do target (num_class)

In [None]:
# Numeric encoding of the textual labels
label_dict = {'é câncer':1,'não é câncer':0}

# Build a labelled copy of each split with the 'label' column encoded
df_train_true_label = df_train_true.assign(
    label=df_train_true['label'].map(label_dict)
)
df_test_true_label = df_test_true.assign(
    label=df_test_true['label'].map(label_dict)
)
df_validation_true_label = df_validation_true.assign(
    label=df_validation_true['label'].map(label_dict)
)


In [None]:
 set(df_train_true_label['label'])

In [None]:
# Distinct class ids present in the training labels
classes = set(df_train_true_label['label'])

plt.figure(0, figsize=(20, 10))

# Show the first training image of every class on a 4x3 subplot grid
for c in classes:
    idx = np.where(df_train_true_label['label'] == c)
    # The three-digit subplot shorthand only supports positions 1 to 9
    if c == 9:
        break
    plt.subplot(431 + c)
    plt.imshow(
        df_train_true_label.drop(columns='label')
        .iloc[idx[0][0]]
        .to_numpy()
        .reshape(50, 50, 3)
    )
    plt.text(0, 0, f'Class: {c}', color='white', backgroundcolor='black', fontsize=8)

plt.show()


# Normalização do dataset e mapeamento do target (num_class)

In [None]:

# Split every labelled frame into its feature matrix and its label vector
x_train = df_train_true_label.drop(columns='label').to_numpy()
y_train = df_train_true_label['label'].to_numpy()

x_test = df_test_true_label.drop(columns='label').to_numpy()
y_test = df_test_true_label['label'].to_numpy()

x_valid = df_validation_true_label.drop(columns='label').to_numpy()
y_valid = df_validation_true_label['label'].to_numpy()


In [None]:
# Scale the pixel values into the [0, 1] range as float32
x_train = x_train.astype(np.float32) / 255.0
x_test = x_test.astype(np.float32) / 255.0
x_valid = x_valid.astype(np.float32) / 255.0

In [None]:
x_train

In [None]:
x_test

In [None]:
x_valid

In [None]:
# Number of values per image
num_pixels = 7500  # i.e. 50x50x3

# Restore the image layout: (samples, height 50, width 50, 3 channels)
x_train_3d = np.reshape(x_train, (len(x_train), 50, 50, 3))
x_test_3d = np.reshape(x_test, (len(x_test), 50, 50, 3))
x_valid_3d = np.reshape(x_valid, (len(x_valid), 50, 50, 3))


In [None]:
x_train.shape[0]

In [None]:
x_valid.shape[0]

In [None]:
x_test.shape[0]

In [None]:
# One-hot encode the true labels of every split
y_train_one_hot = to_categorical(y_train, num_classes=len(classes))
y_test_one_hot = to_categorical(y_test, num_classes=len(classes))
y_valid_one_hot = to_categorical(y_valid, num_classes=len(classes))

In [None]:
len(y_valid_one_hot)

# Arquitetura da Rede Neural e Treino

In [None]:
from keras.models import Sequential
from keras.layers import Conv2D, AveragePooling2D, Flatten, Dense

# Two convolution + average-pooling stages followed by a dense classifier
model = Sequential([
    Conv2D(6, kernel_size=(5, 5), activation='relu', input_shape=(50, 50, 3)),
    AveragePooling2D(pool_size=(2, 2)),
    Conv2D(16, kernel_size=(5, 5), activation='relu'),
    AveragePooling2D(pool_size=(2, 2)),
    # Turn the feature maps into a one-dimensional vector
    Flatten(),
    Dense(120, activation='relu'),
    Dense(84, activation='relu'),
    # Output layer: softmax over the two classes
    Dense(2, activation='softmax'),
])

In [None]:
from keras.utils import plot_model

# Render the architecture diagram to an image file
plot_model(
    model,
    to_file='cnn-CHEST_X_RAY.png',
    show_layer_names=True,
    show_shapes=True,
)


In [None]:
model.summary()

In [None]:
# Configure training: Adam optimizer, categorical cross-entropy loss and
# accuracy as the reported metric
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)


In [None]:
from keras.callbacks import ModelCheckpoint

In [None]:
# Save the model to Drive whenever the validation accuracy improves
checkpointer = ModelCheckpoint(
    filepath='/content/drive/My Drive/modelos/CHEST_X_RAY.hdf5',
    monitor='val_accuracy',
    save_best_only=True,
    verbose=1,
)

# Train on the training split and validate on the validation split
hist = model.fit(
    x_train_3d,
    y_train_one_hot,
    validation_data=(x_valid_3d, y_valid_one_hot),
    epochs=5,
    batch_size=200,
    shuffle=True,
    callbacks=[checkpointer],
    verbose=1,
)

# Mostrar os resultados

In [None]:
# Reload the model from the checkpoint file written during training
model = load_model(filepath="/content/drive/My Drive/modelos/CHEST_X_RAY.hdf5")

In [None]:
# Evaluate on the test split; index 1 of the result is the accuracy metric
score = model.evaluate(x=x_test_3d, y=y_test_one_hot, verbose=0)
print('\n', 'Test accuracy:', score[1])

In [None]:
# Class probabilities predicted for every test image
y_pred = model.predict(x=x_test_3d)

In [None]:
len(y_pred)

In [None]:
y_test_one_hot

In [None]:
y_pred

In [None]:
# Display names indexed by class id (0 = not cancer, 1 = cancer)
labels = ['not cancer', 'cancer']

# Figure holding a 4x8 grid of test samples
fig = plt.figure(figsize=(20, 10))

# Draw up to 32 distinct random test samples; fewer when the test set is
# smaller, since sampling without replacement cannot exceed its size
sample_count = min(32, x_test_3d.shape[0])
for i, idx in enumerate(np.random.choice(x_test_3d.shape[0], size=sample_count, replace=False)):
    ax = fig.add_subplot(4, 8, i + 1, xticks=[], yticks=[])

    # Show the image
    ax.imshow(np.squeeze(x_test_3d[idx]))

    # Predicted and true class ids. The true class is read from the one-hot
    # row: y_test holds scalar class ids, and argmax over a scalar is always 0.
    pred_idx = np.argmax(y_pred[idx])
    true_idx = np.argmax(y_test_one_hot[idx])

    # Title "<predicted> (<true>)", green when they agree and red otherwise
    title_text = "{} ({})".format(labels[pred_idx], labels[true_idx])
    title_color = "green" if pred_idx == true_idx else "red"
    ax.set_title(title_text, color=title_color, fontsize=12, fontweight='bold')

# Adjust the layout
plt.tight_layout()

# Legend explaining the title colours
legend_elements = [plt.Line2D([0], [0], marker='o', color='w', label='Correcto',
                               markerfacecolor='green', markersize=10),
                   plt.Line2D([0], [0], marker='o', color='w', label='Errado',
                               markerfacecolor='red', markersize=10)]
plt.legend(handles=legend_elements, loc='upper right', fontsize=12)

# Show the plot
plt.show()