# Group 49 Data Processing:
This portion will deal with the manipluation of the metadata and images to be suitable for use in our primary model.

In [9]:
# intial imports
import time
import pandas as pd
import numpy as np
from skimage.io import imread
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from torchvision import transforms
from torchvision.utils import save_image
import matplotlib.image as mpimg
import torch

## IF USING GOOGLE COLAB

Please note that the images will be pulled from the shared folder that this file is stored in, thus it is import that you **CHANGE LINE BELLOW** to the path in which the folder is located on your drive/computer.

Relative paths work a little wonky in google colab thus the absolute path is easier to work with.

In [10]:
from google.colab import drive # For when working in Colab for training purposes
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [11]:
# new data set
folder_path = "/content/drive/MyDrive/APS360 Project /Model" # no idea if this will work -- Cairo
#folder_path = "/content/drive/MyDrive/APS360/Project /Model" # -- Kate

In [18]:
import os # handling files, file paths and directories

# testing the google colab directories
directory_files = os.listdir(folder_path) # creates a list of all the files in the directory
print('Files in Model Directory:')
for i in range(len(directory_files)): # checking all the files in the current directory
    if os.path.isdir(directory_files[i]):
        image_directory = directory_files[i] # if the file is a directory, it will store the directory in this variable
    print(f"[{i+1}] {directory_files[i]}")

Files in Model Directory:
[1] Baseline Model
[2] Challenge 2019 Images
[3] HIBA Images - Test
[4] Ham10000
[5] __pycache__
[6] Saved Models
[7] Copy of CNN_primary_model.ipynb
[8] Copy of CNN_primary_model - Kate.ipynb
[9] CNN_primary_model.ipynb
[10] Model Training .gdoc
[11] training_data.csv
[12] validation_data.csv
[13] validation_data_onehot.csv
[14] training_data_onehot.csv
[15] test_data.csv
[16] test_data_onehot.csv
[17] Primary Autoencoder.ipynb
[18] AnastasiaPlayground.ipynb
[19] Copy of Primary_Autoencoder_Cairo.ipynb
[20] PROVe-AI
[21] Copy of CNN_model - Miranda.ipynb
[22] large_training_data.csv
[23] large_validation_data.csv
[24] large_validation_data_onehot.csv
[25] large_training_data_onehot.csv
[26] large_test_data.csv
[27] large_test_data_onehot.csv
[28] MobileNet_Cairo.ipynb
[29] utils.py
[30] Primary_Autoencoder_Cairo.ipynb
[31] MobileNet_Cairo_Metadata_Test.ipynb
[32] CNN_model - Miranda.ipynb
[33] demo_data_onehot.csv
[34] demo_data.csv
[35] Presentation_Demonstr

## Part 1: Verifying Images

When working this this set in the past, sometimes images have been missing despite being listed in the Metadata CSV.

Thus, we will doing an inital sanity check to make sure that if an image is listed in the CSV, the image is also present within the folder and vice versa.

Additionally, this step will also be used to check that all images within the training set are unique and that none of these images appear in the testing set.

In [19]:
categorical_features = ['anatom_site_general', 'benign_malignant', 'diagnosis', 'sex']

In [20]:
def img_transform(folder_path, dataframe, imgs_to_transform, final_amount):
  """
  This function takes in a cleaned dataset and transforms images in classes with less data points to balance the dataset
  -- folder_path is the file path only to the metadata csv
  -- imgs_to_transform is a list of the classes that need transformed images
  -- final amount is the final amount of each the desired classes (single int)
  """
  # load metadata
  #metadata_file_path = os.path.join(folder_path, 'metadata.csv')
  df_dataset = dataframe
  print(f'Initial Number of Images: {len(df_dataset)}')

  # get inital class stats
  init_class_stats = df_dataset['diagnosis'].value_counts()
  print(init_class_stats)

  #required_stats = []

  #for indx in init_class_stats.iteritems():
  #  if indx[0] in imgs_to_transform:
  #    required_stats.append(indx)

  #print(required_stats)

  num_squamous_cell_carcinoma = 0
  num_actinic_keratosis = 0
  num_dermatofibroma = 0
  num_vascular_lesion = 0
  num_basal = 0

  # dictionary for new data
  new_data = {
      'isic_id': [],
      'age_approx': [],
      'anatom_site_general': [],
      'benign_malignant': [],
      'diagnosis': [],
      'sex': []
  }

  # desired transformations being declared
  #transformations  = torch.nn.Sequential(
      #transforms.ToPILImage(),
      #transforms.RandomRotation(degrees=90),
      #transforms.ToTensor())
  transformations = transforms.RandomRotation(degrees = 90)
  # https://www.projectpro.io/recipes/convert-image-tensor-pytorch
  convert_tensor = transforms.ToTensor()

  count = 0
  # loop through metadata
  for i in range(40):
    for ind in df_dataset.index:

      #id, age, ana_site, b_m, diag
      diag = df_dataset['diagnosis'][ind]
      if diag in imgs_to_transform:

        id = df_dataset['isic_id'][ind]
        age = df_dataset['age_approx'][ind]
        ana_site = df_dataset['anatom_site_general'][ind]
        b_m = df_dataset['benign_malignant'][ind]
        sex = df_dataset['sex'][ind]

        #print(diag) #testing

        img_name = f'Transform{count}'
        #print(img_name) #testing
        og_img = os.path.join(folder_path, id + '.JPG')

        # source: https://discuss.pytorch.org/t/applying-transforms-to-a-single-image/56254
        #print(og_img) #testing
        image = mpimg.imread(og_img)
        new_img = transformations(convert_tensor(image)) # this step rotates the image

        new_img_filepath = os.path.join(folder_path, img_name +'.JPG') # this is where the image will be saved
        save_image(new_img, new_img_filepath) # should save the image in the declared file type (JPG in this case)

        # I had to hard code this next part for the sake of time

        count += 1
        if diag == 'squamous cell carcinoma' and num_squamous_cell_carcinoma<final_amount:
          new_data['isic_id'].append(img_name) # new file name
          new_data['age_approx'].append(age)
          new_data['anatom_site_general'].append(ana_site)
          new_data['benign_malignant'].append(b_m)
          new_data['diagnosis'].append(diag)
          new_data['sex'].append(sex)
          #required_stats[0][1] += 1
          num_squamous_cell_carcinoma += 1
          #print(diag) #testing

        if diag == 'actinic keratosis' and num_actinic_keratosis<final_amount:
          new_data['isic_id'].append(img_name)
          new_data['age_approx'].append(age)
          new_data['anatom_site_general'].append(ana_site)
          new_data['benign_malignant'].append(b_m)
          new_data['diagnosis'].append(diag)
          new_data['sex'].append(sex)
          #required_stats[1][1] += 1
          num_actinic_keratosis += 1
          #print(diag) #testing

        if diag == 'dermatofibroma' and num_dermatofibroma<final_amount:
          new_data['isic_id'].append(img_name)
          new_data['age_approx'].append(age)
          new_data['anatom_site_general'].append(ana_site)
          new_data['benign_malignant'].append(b_m)
          new_data['diagnosis'].append(diag)
          new_data['sex'].append(sex)
          #required_stats[2][1] += 1
          num_dermatofibroma += 1
          #print(diag) #testing

        if diag == 'vascular lesion' and num_vascular_lesion<final_amount:
          new_data['isic_id'].append(img_name)
          new_data['age_approx'].append(age)
          new_data['anatom_site_general'].append(ana_site)
          new_data['benign_malignant'].append(b_m)
          new_data['diagnosis'].append(diag)
          new_data['sex'].append(sex)
          #required_stats[3][1] += 1
          num_vascular_lesion += 1
          #print(diag) #testing

        if diag == 'basal cell carcinoma' and num_basal < final_amount:
          new_data['isic_id'].append(img_name)
          new_data['age_approx'].append(age)
          new_data['anatom_site_general'].append(ana_site)
          new_data['benign_malignant'].append(b_m)
          new_data['diagnosis'].append(diag)
          new_data['sex'].append(sex)
          #required_stats[3][1] += 1
          num_basal += 1
          print(diag) #testing
    if num_squamous_cell_carcinoma == final_amount and num_actinic_keratosis == final_amount and num_dermatofibroma == final_amount and num_vascular_lesion == final_amount and num_basal == final_amount: # not sure if that index is done correctly
        print(count) #testing
        break

  # appending the final dictionary to the csv file
  # source: https://www.geeksforgeeks.org/how-to-append-pandas-dataframe-to-existing-csv-file/
  data_to_add = pd.DataFrame(new_data)
  #data_to_add.to_csv(metadata_file_path, mode='a', index=False, header=False)
  print("Data transformations complete.")
  return data_to_add

In [21]:
def clean_metadata(dataset_folder_path, train=False):
  """
  This function will take in the dataset folder path as a string a return the clean data as a Panada Data Frame.
  """
  # Load the metadata
  dataset_metadata_path = os.path.join(dataset_folder_path, 'metadata.csv')
  df = pd.read_csv(dataset_metadata_path)
  print(f'Initial Number of Images: {len(df)}')

  ################################################################################
  # Keep only useful metadata columns
  df_dataset = df[['isic_id', 'age_approx', 'anatom_site_general', 'benign_malignant', 'diagnosis', 'sex' ]]

  ################################################################################
  # Removing any exculded classifications
  desired_classifications = ['actinic keratosis', 'basal cell carcinoma', 'dermatofibroma', 'melanoma', 'nevus', 'squamous cell carcinoma', 'vascular lesion']

  print(df_dataset['diagnosis'].value_counts)

  removed_classifications_indices = []
  removed_classifications = set()
  len_before_1 = len(df_dataset)
  i = 0
  nevus_count = 0
  melanoma_count = 0
  basel_count = 0
  for diag in df_dataset['diagnosis']:
    if diag not in desired_classifications:
      removed_classifications.add(diag)
      removed_classifications_indices.append(i)

    if train and diag == 'nevus' and nevus_count < 200:
      removed_classifications_indices.append(i)
      nevus_count += 1
    elif train and diag == 'melanoma' and melanoma_count < 0:
      removed_classifications_indices.append(i)
      melanoma_count += 1
    elif train and diag == 'basal cell carcinoma' and basel_count < 0:
      removed_classifications_indices.append(i)
      basel_count += 1

    i += 1
  df_dataset = df_dataset.drop(df_dataset.index[removed_classifications_indices])

  len_after_1 = len(df_dataset)
  num_diagnosis_removed = len_before_1 - len_after_1
  print(f'Number of Images removed due to out of scope diagnosis and balancing : {num_diagnosis_removed}')

  ################################################################################
  # Remove samples with missing metadata
  categorical_features = ['anatom_site_general', 'diagnosis', 'sex']
  #^ removed benign_malignant in list because it was removing too many images

  len_before_2 = len(df_dataset) # report on inital length

  missing = pd.concat([df_dataset[c].isnull() for c in categorical_features], axis=1).any(axis=1)
  df_dataset = df_dataset[~missing]

  len_after_2 = len(df_dataset) # report on length after
  num_missing_features = len_before_2 - len_after_2

  # Report on image removal
  print(f'Number of Images Removed due to Missing Features: {num_missing_features}')
  print(f'\nPercent Removed: {round(((num_missing_features + num_diagnosis_removed)/len_before_1)*100, 2)}%')
  print(f'Images remaining: {len(df_dataset)}')

  ################################################################################
  # Image transformations to balance the dataset
  if train == True:

    print('image tranformations started')
    imgs_to_transform = ['squamous cell carcinoma', 'actinic keratosis', 'dermatofibroma', 'vascular lesion']
    final_amount = 100
    transformed_data = img_transform(dataset_folder_path, df_dataset, imgs_to_transform, final_amount)

    # merging dataframes
    df_dataset = pd.concat([df_dataset, transformed_data], ignore_index=True)
    print('\ndataframes merged')

  ################################################################################
  # Class Statistics:
  print('\nLabel Breakdown: ')
  print(df_dataset['diagnosis'].value_counts())
  print(f"\n{df_dataset['benign_malignant'].value_counts()}")

  ################################################################################
  # One-Hot Encoding of Data
  dataset_onehot = pd.get_dummies(df_dataset[['age_approx','anatom_site_general', 'sex', 'benign_malignant', 'diagnosis']])
  dataset_onehot.insert(loc=0, column='isic_id', value=df_dataset['isic_id'])

  ################################################################################
  # Normlaize Age Column in onehot encoding data
  col = 'age_approx'
  dataset_onehot[col] = (dataset_onehot[col] - dataset_onehot[col].min()) / (dataset_onehot[col].max() - dataset_onehot[col].min())

  ################################################################################
  return dataset_onehot, df_dataset,

In [22]:
def print_stats(folder_path):

  #metadata_file_path = os.path.join(folder_path, 'metadata.csv')
  df_dataset = pd.read_csv(folder_path)

  print('\nLabel Breakdown: ')
  print(df_dataset['diagnosis'].value_counts())
  print(f"\n{df_dataset['benign_malignant'].value_counts()}")

In [None]:
# Establishing Directories based on the input
train_images_path = os.path.join(folder_path, 'Ham10000')
test_images_path = os.path.join(folder_path, 'HIBA Images - Test')

In [None]:
# Run Cleaning Function
train_onehot, train_data = clean_metadata(train_images_path, train=True)

Initial Number of Images: 11720
Number of Images removed due to out of scope diagnosis and balancing : 9155
Number of Images Removed due to Missing Features: 435

Percent Removed: 81.83%
Images remaining: 2130
image tranformations started
Initial Number of Images: 11720
nevus                         7737
pigmented benign keratosis    1338
melanoma                      1305
basal cell carcinoma           622
squamous cell carcinoma        229
vascular lesion                180
dermatofibroma                 160
actinic keratosis              149
Name: diagnosis, dtype: int64
Data transformations complete.

dataframes merged

Label Breakdown: 
nevus                      506
basal cell carcinoma       501
dermatofibroma             500
squamous cell carcinoma    500
vascular lesion            500
actinic keratosis          500
melanoma                   494
Name: diagnosis, dtype: int64

benign       506
malignant    494
Name: benign_malignant, dtype: int64


In [None]:
test_onehot, test_data = clean_metadata(test_images_path)

Initial Number of Images: 1635
Number of Images removed due to out of scope diagnosis and balancing : 88
Number of Images Removed due to Missing Features: 114

Percent Removed: 12.35%
Images remaining: 1433

Label Breakdown: 
nevus                      532
basal cell carcinoma       322
melanoma                   255
squamous cell carcinoma    157
actinic keratosis           63
dermatofibroma              61
vascular lesion             43
Name: diagnosis, dtype: int64

malignant    734
benign       699
Name: benign_malignant, dtype: int64


In [None]:
test_onehot_keep, test_onehot_drop, test_data_keep, test_data_drop = train_test_split(test_onehot, test_data, test_size=0.65, random_state=0)

In [None]:
test_data_keep['diagnosis'].value_counts()

nevus                      215
basal cell carcinoma       132
melanoma                   106
squamous cell carcinoma     55
actinic keratosis           28
dermatofibroma              21
vascular lesion             16
Name: diagnosis, dtype: int64

In [None]:
train_data, validation_data, train_onehot, validation_onehot = train_test_split(train_data, train_onehot, test_size=0.17, random_state=0)

In [None]:
total_data = len(train_data) + len(validation_data) + len(test_data_keep)
print(f'Total Number of Images {total_data}')

print(f'\nLength of Train Data: {len(train_data)}')
print(f'Length of Train Data Onehot: {len(train_onehot)}')
print(f'Percent of Total: {round(len(train_data)/total_data*100, 2)}%')

print(f'\nLength of Validation Data: {len(validation_data)}')
print(f'Length of Validation Data Onehot: {len(validation_onehot)}')
print(f'Percent of Total: {round(len(validation_data)/total_data*100, 2)}%')

print(f'\nLength of Test Data: {len(test_data_keep)}')
print(f'Length of Test Data Onehot: {len(test_onehot_keep)}')
print(f'Percent of Total: {round(len(test_data_keep)/total_data*100, 2)}%')

Total Number of Images 3406

Length of Train Data: 2411
Length of Train Data Onehot: 2411
Percent of Total: 70.79%

Length of Validation Data: 494
Length of Validation Data Onehot: 494
Percent of Total: 14.5%

Length of Test Data: 501
Length of Test Data Onehot: 501
Percent of Total: 14.71%


In [None]:
train_data.to_csv(os.path.join(folder_path, 'large_training_data.csv'))
train_onehot.to_csv(os.path.join(folder_path, 'large_training_data_onehot.csv'))

validation_data.to_csv(os.path.join(folder_path, 'large_validation_data.csv'))
validation_onehot.to_csv(os.path.join(folder_path, 'large_validation_data_onehot.csv'))

test_data_keep.to_csv(os.path.join(folder_path, 'large_test_data.csv'))
test_onehot_keep.to_csv(os.path.join(folder_path, 'large_test_data_onehot.csv'))

In [23]:
demo_onehot, demo_data = clean_metadata(os.path.join(folder_path, 'PROVe-AI'), True)
demo_onehot.to_csv(os.path.join(folder_path, 'demo_data_onehot.csv'))
demo_data.to_csv(os.path.join(folder_path, 'demo_data.csv'))

Initial Number of Images: 603
<bound method IndexOpsMixin.value_counts of 0         seborrheic keratosis
1                        nevus
2      squamous cell carcinoma
3                     melanoma
4      squamous cell carcinoma
                ...           
598       seborrheic keratosis
599             dermatofibroma
600        lichenoid keratosis
601                lentigo NOS
602                   melanoma
Name: diagnosis, Length: 603, dtype: object>
Number of Images removed due to out of scope diagnosis and balancing : 344
Number of Images Removed due to Missing Features: 0

Percent Removed: 57.05%
Images remaining: 259
image tranformations started
Initial Number of Images: 259
nevus                      112
melanoma                    95
actinic keratosis           19
squamous cell carcinoma     13
dermatofibroma              11
basal cell carcinoma         9
Name: diagnosis, dtype: int64


KeyboardInterrupt: ignored