# Melanoma Prediction (with & without) ImageDataGenerator
# Preprocessing, and combining categorical/numeric features with images by concatenating CNN and Dense models

# Importing Libraries

In [None]:
import numpy as np
import random
import pandas as pd 
import matplotlib.pyplot as plt
from PIL import Image
!pip3 install --upgrade imutils
from imutils import paths
import cv2
from keras.preprocessing.image import ImageDataGenerator
import os
from PIL import ImageFile

from keras.models import Sequential
from keras.layers import Activation,Dropout,Flatten,Conv2D,MaxPooling2D,Dense,concatenate
from tensorflow.keras import layers

from sklearn.utils import class_weight

import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
import keras
from tensorflow.keras.optimizers import Adam

# Preparing & Accessing data

In [None]:
df_gt = pd.read_csv("../input/isic-2019/ISIC_2019_Training_GroundTruth.csv")
df_gt.sample(5)

In [None]:
df_md = pd.read_csv("../input/isic-2019/ISIC_2019_Training_Metadata.csv")
df_md['target'] = df_gt['MEL']
df_md.sample(5)

In [None]:
#Preparing directories for ImageDataGenerator (one sub-folder per class)
# exist_ok=True keeps this cell re-runnable: a second execution no longer
# raises FileExistsError when the folders are already there.
for split_dir in ("Training_Data", "Test_Data"):
    for class_dir in ("MEL0", "MEL1"):
        os.makedirs(os.path.join(split_dir, class_dir), exist_ok=True)

In [None]:
#Function to populate train and test directories
import os
import shutil
import sys
def Make_Dir(src, dst, Data):
    """Copy every image listed in ``Data`` into its class folder under ``dst``.

    Args:
        src: Directory that holds the source ``<image>.jpg`` files.
        dst: Root directory; each file is copied into ``dst/MEL<target>``.
        Data: DataFrame with an ``image`` column (file name without the
            extension) and a ``target`` column (value convertible to int).

    Copying stops at the first failure, after a message has been printed.
    """
    for imagename, target in zip(Data['image'], Data['target']):
        src_path = src + '/'+ imagename + '.jpg'
        dst_path = dst + '/' + 'MEL' + str(int(target))
        try:
            # Make sure the class folder exists. Without it shutil.copy would
            # write a plain file called e.g. "MEL0" and overwrite it on every
            # iteration instead of filling a directory.
            os.makedirs(dst_path, exist_ok=True)
            shutil.copy(src_path, dst_path)
        except OSError:
            print("Unable to copy file {} to {}".format(src_path, dst_path))
            break
        except Exception:
            # Exception (not a bare except) so KeyboardInterrupt/SystemExit
            # still propagate and the copy loop can be interrupted.
            print("when try copy file {} to {}, unexpected error: {}".format(src_path, dst_path, sys.exc_info()))
            break

In [None]:
# Stratified hold-out: the dataset is unbalanced, so 20% is drawn from each
# target class separately to keep the class ratio equal in both partitions.
by_class = df_md.groupby('target', group_keys=False)
Test_data = by_class.apply(lambda grp: grp.sample(frac=0.2, random_state=0))
# Everything that was not drawn for the test partition is used for training.
Training_data = df_md.drop(index=Test_data.index)

In [None]:
# Copy the images of each partition into its class folders (training first, then test).
source_images = '../input/isic-2019/ISIC_2019_Training_Input/ISIC_2019_Training_Input'
for destination, partition in (('./Training_Data', Training_data), ('./Test_Data', Test_data)):
    Make_Dir(source_images, destination, partition)

In [None]:
#Check directories inside train and test directories
# Each entry: (heading, directory, show only the first five entries?)
listing_plan = [
    ("Test", './Test_Data', False),
    ("Test MEL0", './Test_Data/MEL0', True),
    ("Test MEL1", './Test_Data/MEL1', True),
    ("Training", './Training_Data', False),
    ("Training MEL0", './Training_Data/MEL0', True),
    ("Training MEL1", './Training_Data/MEL1', True),
]
for position, (heading, directory, preview_only) in enumerate(listing_plan):
    if position:
        print()  # blank line between sections, none after the last one
    print(heading)
    entries = os.listdir(directory)
    print(entries[0:5] if preview_only else entries)

In [None]:
imagaPaths = list(paths.list_images('./Training_Data'))
len(imagaPaths)

In [None]:
#visualizing some random images
the_array = random.sample(imagaPaths, 25)
plt.figure(figsize=(30, 20))  # overall figure size

# 5 rows x 5 columns = 25 images; subplot slots are 1-based
for slot, img_path in enumerate(the_array, start=1):
    plt.subplot(5, 5, slot)
    plt.imshow(np.asarray(plt.imread(img_path)))
    plt.axis('off')
plt.show()

In [None]:
df_gt.isnull().sum()

In [None]:
df_md.isnull().sum()

#### data contains null values

In [None]:
import plotly.graph_objects as go

# Pie chart of the class balance in df_md['target'].
# Labels are derived from the class value of each count instead of relying on
# value_counts() happening to return the majority class first.
class_names = {0: 'No Melanoma ', 1: 'Melanoma'}
class_counts = df_md['target'].value_counts()
labels = [class_names[int(class_value)] for class_value in class_counts.index]
# 'Hydrogen' is not a valid colour name, so the second slice was never
# coloured as specified; use a real CSS colour instead.
colors = ['gold', 'mediumturquoise']
fig = go.Figure(data=[go.Pie(labels=labels, values=class_counts.values, hole=0.3)])
# textinfo='value' is what the original ended up showing (it overwrote the
# 'label+percent' passed to go.Pie), so it is set only once here.
fig.update_traces(hoverinfo='label+percent', textinfo='value', textfont_size=20,
 marker=dict(colors=colors, line=dict(color='#000000', width=2)))
fig.show()

#### The data is imbalanced; this is handled later in the notebook using sklearn class weights

# Stratified Sampling (Splitting data)

In [None]:
#Stratified data sampling since dataset is unbalanced we need same percentage of classes in training and testing data
# NOTE: this repeats the split already computed earlier in the notebook; with
# the same frac and random_state it draws 20% of every 'target' group again.
Test_data =df_md.groupby('target', group_keys=False).apply(lambda x: x.sample(frac=0.2,random_state=0))
# Training rows are all rows whose index was not drawn for the test set.
Training_data = df_md.drop(Test_data.index)

# Building ImageDataGenerator

In [None]:
#Defining ImageDataGenerator
# Random augmentation settings plus pixel rescaling to [0, 1]; validation_split
# reserves 20% of a directory for flows created with subset='validation'.
augmentation_settings = dict(
    rotation_range=30,
    width_shift_range=0.1,
    height_shift_range=0.1,
    rescale=1/255,
    shear_range=0.1,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest',
    validation_split=0.2,
)
image_gen = ImageDataGenerator(**augmentation_settings)

#### Images without ImageDataGenerator

In [None]:
#visualizing some random images (as stored on disk, no augmentation)
the_array = random.sample(imagaPaths, 12)
plt.figure(figsize=(30, 10))  # overall figure size

# 2 rows x 6 columns = 12 images; subplot slots are 1-based
for slot, img_path in enumerate(the_array, start=1):
    plt.subplot(2, 6, slot)
    plt.imshow(np.asarray(plt.imread(img_path)))
    plt.axis('off')
plt.show()

#### Images after applying ImageDataGenerator

In [None]:
#visualizing same images but transformed
plt.figure(figsize=(30, 10))  # overall figure size

# 2 rows x 6 columns = 12 images, each passed through one random transform
for slot, img_path in enumerate(the_array[:12], start=1):
    original_pixels = np.asarray(plt.imread(img_path))
    plt.subplot(2, 6, slot)
    plt.imshow(image_gen.random_transform(original_pixels))
    plt.axis('off')
plt.show()

#### The generator applies slight random changes to the images in each epoch

# Building model

In [None]:
input_shape = (150,150,3)
batch_size = 64

In [None]:
# CNN used with the directory generators: four Conv2D/MaxPooling2D stages
# (32 -> 64 -> 128 -> 256 filters) with dropout in between, followed by a
# dense head that ends in a single sigmoid unit for the binary target.
model = Sequential([
    layers.Conv2D(32, (3,3), input_shape = input_shape , activation = 'relu'),
    layers.MaxPooling2D(2,2),

    layers.Conv2D(64, (3,3), activation = 'relu'),
    layers.MaxPooling2D(2,2),
    layers.Dropout(0.2),

    layers.Conv2D(128, (3,3), activation = 'relu'),
    layers.MaxPooling2D(2,2),
    layers.Dropout(0.2),

    layers.Conv2D(256, (3,3), activation = 'relu'),
    layers.MaxPooling2D(2,2),

    layers.Flatten(),
    layers.Dropout(0.2),
    layers.Dense(256, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])

adam_optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(optimizer=adam_optimizer, loss='binary_crossentropy', metrics=['acc'])

model.summary()

In [None]:
# Training batches. image_gen is configured with validation_split=0.2, so the
# flow must ask for subset='training'; without it every file in the directory
# is used, including the ones that the 'validation' subset of this same
# directory draws from, which leaks validation images into training.
train_image_gen = image_gen.flow_from_directory('./Training_Data',
                                                target_size = input_shape[0:2],
                                                batch_size = batch_size,
                                                class_mode = 'binary',
                                                subset = 'training')

In [None]:
# Test batches. Only the 1/255 rescaling is applied: the random rotations,
# shifts, zooms and flips of image_gen are training-time augmentation and
# would make the reported test score depend on random distortions.
# shuffle=False keeps the batch order aligned with test_image_gen.classes.
test_image_gen = ImageDataGenerator(rescale = 1/255).flow_from_directory('./Test_Data',
                                                target_size = input_shape[0:2],
                                                batch_size = batch_size,
                                                class_mode = 'binary',
                                                shuffle = False)

In [None]:
# Validation batches. A rescale-only generator is used so validation images are
# not randomly augmented; validation_split mirrors the 0.2 configured on
# image_gen so subset='validation' reserves the same share of the directory.
valid_image_gen = ImageDataGenerator(rescale = 1/255,
                                     validation_split = 0.2).flow_from_directory('./Training_Data',
                                                target_size = input_shape[0:2],
                                                batch_size = batch_size,
                                                class_mode = 'binary',
                                                subset='validation')

In [None]:
train_image_gen.class_indices

In [None]:
#Solving imbalanced data by class weight
# The weights are passed to model.fit on the training generator, so they must
# be derived from the training labels (the original used the test labels).
train_labels = train_image_gen.classes
label_values = np.unique(train_labels)
class_weights = class_weight.compute_class_weight('balanced', classes=label_values, y=train_labels)
# Map each class index to its weight, the form expected by fit(class_weight=...).
train_class_weights = {int(label): weight for label, weight in zip(label_values, class_weights)}

In [None]:
# Stop when the validation loss has not improved for 10 consecutive epochs.
early_stopping_monitor = EarlyStopping(monitor='val_loss', patience=10)

results = model.fit(
    train_image_gen,
    epochs=100,
    class_weight=train_class_weights,
    validation_data=valid_image_gen,
    callbacks=[early_stopping_monitor],
)

In [None]:
# evaluate() returns [loss, accuracy] for the metrics the model was compiled with.
score = model.evaluate(test_image_gen, verbose=0)
test_loss, test_accuracy = score[0], score[1]
print(f'Test loss: {test_loss} / Test accuracy: {test_accuracy}')

#### Accuracy is pretty good, but let's try without the image generator

# Preparing Data for the model without ImageDataGenerator

In [None]:
df_md['target'] = df_md['target'].astype('int')
df_md

In [None]:
#Preparing data to the model without ImageDataGenerator
# Collect, in sorted order, the full path of every directory entry and its
# name with the last four characters (the extension) stripped.
image_dir = '../input/isic-2019/ISIC_2019_Training_Input/ISIC_2019_Training_Input'
dir_entries = sorted(os.listdir(image_dir))
list_of_paths = [image_dir + '/' + entry for entry in dir_entries]
list_of_names = [entry[:-4] for entry in dir_entries]
list_of_paths[0:2]

In [None]:
list_of_names[0:2]

In [None]:
list_1 = df_md['image'].values.tolist()
list_1[0:2]

In [None]:
main_list = list(set(list_of_names) - set(list_1))
main_list

In [None]:
#Removing non image entries (ATTRIBUTION.txt, LICENSE.txt)
# Filtering instead of list.remove() keeps the cell re-runnable: remove()
# raises ValueError once the entry is gone or if a file is absent.
list_of_paths = [entry for entry in list_of_paths if not entry.lower().endswith('.txt')]

In [None]:
# Build each row's path from its own image id (same "<dir>/<image>.jpg"
# convention Make_Dir uses) rather than assuming the sorted directory listing
# lines up row-for-row with df_md.
image_dir = '../input/isic-2019/ISIC_2019_Training_Input/ISIC_2019_Training_Input/'
image_paths = image_dir + df_md['image'] + '.jpg'
if 'path' in df_md.columns:
    # Re-running the cell: overwrite instead of failing on a duplicate insert.
    df_md['path'] = image_paths
else:
    df_md.insert(0, 'path', image_paths)

### Taking a 20% sample because my notebook runs out of memory and Kaggle's GPU never works for me (it is better to work with all of the data if possible, e.g. by upgrading to Google Cloud AI notebooks)

In [None]:
# 20% random subset of the metadata. random_state makes the subset reproducible
# (as every other split in this notebook is), and .copy() gives an independent
# frame so new columns can be added to it safely.
df_md2 = df_md.sample(frac = .2, random_state = 0).copy()
df_md2

In [None]:
def _load_pixels(image_path, size=(128, 128)):
    """Return the image at ``image_path`` as an RGB array resized to ``size``."""
    # The context manager closes the file handle deterministically;
    # convert('RGB') guarantees three channels so all arrays stack to one shape.
    with Image.open(image_path) as img:
        return np.asarray(img.convert('RGB').resize(size))

df_md2['pixels'] = df_md2['path'].map(_load_pixels)

In [None]:
df_md2.head()

In [None]:
df_md2['pixels'] = df_md2['pixels']/255

In [None]:
yy = df_md2['target']

In [None]:
XX =  df_md2['pixels']

In [None]:
X_train_full, X_test, y_train_full, y_test = train_test_split(XX,yy, test_size=0.2,random_state=0,stratify = yy)

In [None]:
x_train_arr= np.asarray(X_train_full.tolist())
x_test_arr = np.asarray(X_test.tolist())

In [None]:
X_train, X_valid, y_train, y_valid = train_test_split(x_train_arr,y_train_full, test_size=0.2,random_state=0,stratify = y_train_full)

In [None]:
print("X_train dataset: ", X_train.shape)
print("y_train dataset: ", y_train.shape)
print("x_valid dataset: ", X_valid.shape)
print("y_valid dataset: ", y_valid.shape)
print("X_test dataset: ", x_test_arr.shape)
print("y_test dataset: ", y_test.shape)

In [None]:
pd.Series(y_train).value_counts()

In [None]:
pd.Series(y_test).value_counts()

In [None]:
pd.Series(y_valid).value_counts()

In [None]:
# CNN trained on the in-memory 128x128x3 arrays: three Conv2D/MaxPooling2D
# stages, then a dense head ending in one sigmoid unit for the binary target.
model = Sequential()

# Only the first layer declares the input shape. The later Conv2D layers take
# the pooled output of the previous stage, so the input_shape=(128,128,3) the
# original also passed to them was misleading and has been dropped.
model.add(Conv2D(filters=32, kernel_size=(3,3), input_shape =(128,128,3), activation='relu'))
model.add(MaxPooling2D(pool_size = (2,2)))

model.add(Conv2D(filters=64,kernel_size=(3,3), activation='relu'))
model.add(MaxPooling2D(pool_size = (2,2)))

model.add(Conv2D(filters=64,kernel_size=(3,3), activation='relu'))
model.add(MaxPooling2D(pool_size = (2,2)))

model.add(Flatten())
# NOTE(review): the hidden layer uses a sigmoid activation, kept as in the
# original; 'relu' is the usual choice for hidden layers, so confirm intent.
model.add(Dense(128,activation='sigmoid'))

model.add(Dropout(0.5))
model.add(Dense(1,activation='sigmoid'))

model.compile(loss = 'binary_crossentropy',
             optimizer = 'adam',
             metrics=['accuracy'])

In [None]:
class_weights = class_weight.compute_class_weight( 'balanced', classes=np.unique(y_train),  y=y_train)
train_class_weights = dict(enumerate(class_weights))

In [None]:
early_stopping_monitor = EarlyStopping(patience=10)

results = model.fit(X_train, y_train, validation_data=(X_valid,y_valid),epochs=100, class_weight = train_class_weights, batch_size=64,callbacks=[early_stopping_monitor])

In [None]:
score = model.evaluate(x_test_arr, y_test, verbose=1)
print(f'Test loss: {score[0]} / Test accuracy: {score[1]}')

#### With the image generator the model gave better accuracy, but note that this model acquired its accuracy with only 20% of the data

In [None]:
# One figure per metric (accuracy first, then loss): training curve followed by
# the validation curve stored under the 'val_' prefix.
for metric_name in ('accuracy', 'loss'):
    plt.plot(results.history[metric_name])
    plt.plot(results.history['val_' + metric_name])
    plt.title('model ' + metric_name)
    plt.ylabel(metric_name)
    plt.xlabel('epoch')
    plt.legend(['Train', 'Validation'], loc='upper left')
    plt.show()

In [None]:
pd.DataFrame(results.history).plot(figsize=(40,15))
plt.show()

# Preprocessing (age & sex) columns

In [None]:
df_md

In [None]:
# Tabular features: everything in df_md except identifiers, the label, the
# anatomical site and the file path; the label itself becomes yyy.
dropped_columns = ['image','lesion_id','target','anatom_site_general','path']
XXX = df_md.drop(columns=dropped_columns)
yyy = df_md['target']

# Encode sex numerically; missing values get their own code (2).
cleanup_nums = {"sex":     {"male": 1, "female": 0,np.nan : 2}}
XXX = XXX.replace(cleanup_nums)

In [None]:
XXX.sample(5)

In [None]:
yyy.sample(5)

In [None]:
X_train_full1, X_test1, y_train_full1, y_test1 = train_test_split(XXX,yyy, test_size=0.2,random_state=0,stratify = yyy)

In [None]:
X_train1, X_valid1, y_train1, y_valid1 = train_test_split(X_train_full1,y_train_full1, test_size=0.2,random_state=0,stratify = y_train_full1)

In [None]:
X_train1.isnull().sum()

In [None]:
import plotly.express as px
# Histogram of age_approx in the training split.
# (The unused sample dataset load `df = px.data.tips()` was removed.)
fig = px.histogram(X_train1, x="age_approx",nbins=10, color_discrete_sequence=['indianred'])
# The marker settings below replace the bar colour given to px.histogram above.
fig.update_traces(marker = dict(color = 'rgba(5, 7, 73, 0.8)',
line=dict(color='rgb(255,255,255)',width=1.5)))
# Transparent plot and paper backgrounds.
fig.update_layout({'plot_bgcolor': 'rgba(0, 0, 0, 0)','paper_bgcolor': 'rgba(0, 0, 0, 0)'})
fig.show()

In [None]:
X_train1.describe()

In [None]:
X_train1.mode()

In [None]:
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

# Two-step pipeline: fill missing values with the column mean, then scale
# the result with MinMaxScaler.
SEM = SimpleImputer(strategy='mean')
scaler = MinMaxScaler()
pipeline_steps = [('SEM', SEM), ('scaler', scaler)]
pipe = Pipeline(steps=pipeline_steps)

In [None]:
ct = make_column_transformer((pipe,['age_approx']),remainder='passthrough')

In [None]:
# ColumnTransformer outputs the transformed column(s) first and the passthrough
# remainder afterwards, so the output names are built in that order instead of
# reusing the input column order (which only matched by coincidence).
transformed_columns = ['age_approx'] + [col for col in X_train1.columns if col != 'age_approx']

# Fit on the training split only; validation and test reuse the fitted statistics.
X_transformed_train = pd.DataFrame(ct.fit_transform(X_train1), columns=transformed_columns)

X_transformed_valid = pd.DataFrame(ct.transform(X_valid1), columns=transformed_columns)

X_transformed_test = pd.DataFrame(ct.transform(X_test1), columns=transformed_columns)

In [None]:
X_transformed_train.sample(5)

In [None]:
sklearn_weights=class_weight.compute_class_weight(
                                        class_weight = "balanced",
                                        classes = np.unique(y_train1),
                                        y = y_train1)

sklearn_weights=dict(enumerate(sklearn_weights))

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

# Small fully connected classifier for the two tabular inputs (input_shape=(2,)).
model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(30, activation=tf.keras.activations.tanh, input_shape=(2, )))
model.add(tf.keras.layers.Dense(10, activation=tf.keras.activations.tanh))
model.add(tf.keras.layers.Dense(5, activation=tf.keras.activations.tanh))
model.add(tf.keras.layers.Dense(1, activation=tf.keras.activations.sigmoid))

model.compile(
    loss=tf.keras.losses.binary_crossentropy,
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.002),
    # compile() documents `metrics` as a list; a bare metric object is
    # rejected by newer Keras releases.
    metrics=[tf.keras.metrics.BinaryAccuracy(name='binary_accuracy')]
)

# Stop when the monitored validation loss has not improved for 10 epochs.
early_stopping_monitor = EarlyStopping(patience=10)

model.fit(X_transformed_train,y_train1,validation_data=(X_transformed_valid, y_valid1),epochs=200,callbacks=[early_stopping_monitor],class_weight=sklearn_weights)

In [None]:
model.evaluate(X_transformed_test, y_test1)

#### Accuracy isn't good for the numerical features alone; some parameter tuning might give better accuracy, but my goal is only to try a model on those two features (age, sex) and check how well it performs with and without concatenating it with the CNN model

# Concatenating two models, one for images and one for numerical features

In [None]:
#Preparing Data first
XXX.sample(5)

In [None]:
# Pick, for every image in the train/validation/test arrays, the tabular row
# with the same index label AND in the same order.
# The previous index-on-index pd.merge returned the rows in XXX's own order,
# while X_train / X_valid / x_test_arr follow the shuffled order of
# y_train / y_valid / y_test, so tabular rows were paired with the wrong images.
same_size_numeric_train_data = XXX.loc[y_train.index]
same_size_numeric_val_data = XXX.loc[y_valid.index]
same_size_numeric_test_data = XXX.loc[y_test.index]

# Labels of the tabular rows; identical to the image labels by construction.
y_train11 = y_train.copy()

y_test11 = y_test.copy()

y_valid11 = y_valid.copy()


In [None]:
same_size_numeric_test_data.head()

In [None]:
#Replacing null values with the column mean
# The imputer is fitted on the training rows only and then reused for the
# validation and test rows.
SEM = SimpleImputer(strategy = 'mean')

X_transformed_trainn = pd.DataFrame(SEM.fit_transform(same_size_numeric_train_data),
                                    columns=same_size_numeric_train_data.columns)

X_transformed_validd = pd.DataFrame(SEM.transform(same_size_numeric_val_data),
                                    columns=same_size_numeric_val_data.columns)

X_transformed_testt = pd.DataFrame(SEM.transform(same_size_numeric_test_data),
                                   columns=same_size_numeric_test_data.columns)

In [None]:
# Sanity check: the image arrays, tabular frames and labels of each split must
# have the same number of rows. Each label now prints the variable it names
# (the original printed y_train1 / y_valid1 and the Series X_test instead).
print("X_train : " + str(X_train.shape))
print("X_transformed_train : " + str(X_transformed_trainn.shape))
print("X_valid : " + str(X_valid.shape))
print("X_transformed_valid : " + str(X_transformed_validd.shape))
print("X_test : " + str(x_test_arr.shape))
print("X_transformed_test11 : " + str(X_transformed_testt.shape))
print("y_train : " + str(y_train.shape))
print("y_train11 : " + str(y_train11.shape))
print("y_valid : " + str(y_valid.shape))
print("y_valid11 : " + str(y_valid11.shape))
print("y_test : " + str(y_test.shape))
print("y_test11 : " + str(y_test11.shape))

In [None]:
# Image branch of the two-input model, built with the functional API.
# IN is the image input tensor and F1 the flattened feature vector; both are
# referenced by later cells when the branches are joined.
from keras.models import Model
# NOTE(review): this empty Sequential is not used by the functional graph below.
model = Sequential()
IN = layers.Input(shape=(128,128,3))
# Stage 1: 32 filters + 2x2 max pooling
COV1 = Conv2D(32, (3,3), activation = 'relu')(IN)
MAX1 = MaxPooling2D(2,2)(COV1)

# Stage 2: 64 filters + 2x2 max pooling
COV2 = Conv2D(64, (3,3), activation = 'relu')(MAX1)
MAX2 = MaxPooling2D(2,2)(COV2)

DR1 = Dropout(0.2)(MAX2)

# Stage 3: 128 filters + 2x2 max pooling, followed by dropout
COV3 = Conv2D(128, (3,3), activation = 'relu')(DR1)
MAX3 = MaxPooling2D(2,2)(COV3)
DR2 = Dropout(0.2)(MAX3)

# Stage 4: 256 filters + 2x2 max pooling
COV4 = Conv2D(256, (3,3), activation = 'relu')(DR2)
MAX4 = MaxPooling2D(2,2)(COV4)

# Flattened convolutional features; the dense head below is disabled in this branch.
F1 = Flatten()(MAX4)
#DR3 = Dropout(0.2)(F1)
#DEN1 = Dense(256, activation='relu')(DR3)
# OUT = Dense(1, activation='sigmoid')(DEN1)

In [None]:
# Tabular branch of the two-input model: a 2-feature input followed by three
# ReLU dense layers (30 -> 10 -> 5 units). al_3 is the branch output that a
# later cell concatenates with the image features.
IN2 = layers.Input(shape=(2,), name="IN2")
al_1 = Dense(30, activation = "relu",name ="a_layer_1")(IN2)
al_2 = Dense(10, activation="relu",name ="a_layer_2")(al_1)
al_3 = Dense(5, activation="relu",name ="a_layer_3")(al_2)

# A per-branch sigmoid output is disabled; the joint model has its own output layer.
# al_4 = Dense(1, activation="sigmoid",name ="a_layer_4")(al_3)
# OUT2 = Dense(1, activation="sigmoid",name ="a_output_layer")(al_4)

In [None]:
concatenated = concatenate([F1, al_3])

In [None]:
output = Dense(1, activation='sigmoid')(concatenated)

In [None]:
model_final = Model(inputs=[IN, IN2], outputs=[output])

In [None]:
# Binary cross-entropy on the single sigmoid output. `metrics` is passed as a
# list, the documented form; a bare metric object is rejected by newer Keras.
model_final.compile(loss = 'binary_crossentropy',
                    optimizer=tf.keras.optimizers.Adam(learning_rate=0.002),
                    metrics=[tf.keras.metrics.BinaryAccuracy(name='binary_accuracy')])

In [None]:
# Stop when the monitored validation loss has not improved for 10 epochs.
early_stopping_monitor = EarlyStopping(patience=10)
# model_final has two inputs (image array, tabular frame) but ONE output, so it
# takes exactly one target array per split. The original passed a list of two
# target arrays for that single output.
results = model_final.fit([X_train, X_transformed_trainn], y_train,
                          epochs = 200,
                          validation_data = ([X_valid, X_transformed_validd], y_valid),
                          callbacks=[early_stopping_monitor])

In [None]:
# Single-output model: evaluate against one target array (see the compile call,
# which defines one loss and one metric, hence the two returned values).
test_loss , test_acc = model_final.evaluate([x_test_arr, X_transformed_testt], y_test)
print('Loss : '+ str(test_loss))
print('Accuracy : '+ str(test_acc))

In [None]:
pd.DataFrame(results.history).plot(figsize=(40,15))
plt.show()

# Concatenating the CNN model (images) and the Dense model (age, sex) gives better accuracy. Note that we only used 20% of the data; using 100% of the data should give a much better test accuracy than 82%

In [None]:
#os.system("rm -r ./Training_Data/MEL0/ISIC_0024762.jpg")