# **Data Collection**

## Objectives

* Fetch the data from Kaggle and prepare it for further processing.

## Inputs

* Kaggle JSON file - the authentication token.

## Outputs

* Generate Dataset: inputs/skin_cancer_dataset

## Additional Comments

* No additional comments.


---

# Import packages

In [1]:
%pip install -r ../requirements.txt

You should consider upgrading via the '/Users/danielhamilton/.pyenv/versions/3.8.12/bin/python -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
import numpy
import os

# Change working directory

In [3]:
# Record the directory the kernel was started in; the next cell uses it to step up one level.
current_dir = os.getcwd()
current_dir

'/Users/danielhamilton/CI/project5/AI-DermDiagnosis/jupyter_notebooks'

In [4]:
# Step up to the parent of the directory captured above so that relative paths
# such as "inputs/..." used later in the notebook resolve from there.
# NOTE(review): re-running this cell after `current_dir` has been refreshed
# moves the working directory up one more level.
os.chdir(os.path.dirname(current_dir))
print("You set a new current directory")

You set a new current directory


Confirm the new current directory

In [5]:
# Re-read the working directory to verify that the change took effect.
current_dir = os.getcwd()
current_dir

'/Users/danielhamilton/CI/project5/AI-DermDiagnosis'

# Install Kaggle

In [6]:
%pip install kaggle==1.5.12

You should consider upgrading via the '/Users/danielhamilton/.pyenv/versions/3.8.12/bin/python -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [7]:
# Use the current working directory as the Kaggle configuration folder;
# kaggle.json (the authentication token) is expected to be located there.
os.environ['KAGGLE_CONFIG_DIR'] = os.getcwd()
# Restrict the token file to owner read/write only.
! chmod 600 kaggle.json

In [8]:
# Kaggle dataset identifier passed to the CLI's -d option.
KaggleDatasetPath = "kmader/skin-cancer-mnist-ham10000"
DestinationFolder = "inputs/skin_cancer_dataset"   # download folder, relative to the working directory
# Download the dataset archive into DestinationFolder (-p sets the target folder).
! kaggle datasets download -d {KaggleDatasetPath} -p {DestinationFolder}

Downloading skin-cancer-mnist-ham10000.zip to inputs/skin_cancer_dataset
100%|██████████████████████████████████████| 5.20G/5.20G [02:03<00:00, 49.4MB/s]
100%|██████████████████████████████████████| 5.20G/5.20G [02:03<00:00, 45.1MB/s]


In [9]:
import zipfile

# Location of the archive fetched by the Kaggle CLI in the previous cell.
archive_path = DestinationFolder + '/skin-cancer-mnist-ham10000.zip'

# Unpack everything next to the archive ...
with zipfile.ZipFile(archive_path, 'r') as archive:
    archive.extractall(DestinationFolder)

# ... then delete the archive, which is no longer needed once extracted.
os.remove(archive_path)

---

# Data Preparation

## Data Inspection

### Check missing data in csv

In [10]:
import pandas as pd

# Load the lesion metadata that ships with the dataset.
metadata_csv_path = f"{DestinationFolder}/HAM10000_metadata.csv"
df = pd.read_csv(metadata_csv_path)

# Count the missing entries in every column.
missing_values = df.isna().sum()
print(f"Missing values in each column:\n{missing_values}")

Missing values in each column:
lesion_id        0
image_id         0
dx               0
dx_type          0
age             57
sex              0
localization     0
dtype: int64


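Rows with missing data may need to be dropped. Only `age` has missing values (57 rows). Because age may be a factor in the prediction, this still needs to be checked, so the drop in the next cell is left commented out for now. If rows are dropped, their associated images will need to be dropped as well.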

In [11]:
# df = df.dropna()

### Check for non-image files

In [12]:
def check_for_non_image_file(my_data_dir):
    """Report how many image and non-image files each image folder holds.

    Every sub-directory of ``my_data_dir`` whose name contains ``'images'``
    is scanned (non-recursively). A file counts as an image when its name,
    compared case-insensitively, ends with ``.png``, ``.jpg`` or ``.jpeg``.
    Two lines are printed per folder; no file is removed.

    Args:
        my_data_dir: Path of the directory that contains the image folders.

    Returns:
        None. The counts are written to standard output.
    """
    image_extension = ('.png', '.jpg', '.jpeg')

    # Sorted so that the report order does not depend on the file system.
    for folder in sorted(os.listdir(my_data_dir)):
        folder_path = os.path.join(my_data_dir, folder)
        # Skip plain files whose name merely contains 'images' (e.g. an
        # archive); listing them would raise NotADirectoryError.
        if 'images' not in folder or not os.path.isdir(folder_path):
            continue

        image_count = 0
        non_image_count = 0
        for given_file in os.listdir(folder_path):
            if given_file.lower().endswith(image_extension):
                image_count += 1
            else:
                non_image_count += 1

        print(f"Folder: {folder} - has image file", image_count)
        print(f"Folder: {folder} - has non-image file", non_image_count)

In [13]:
# Scan the downloaded dataset folder and print image / non-image file counts per image folder.
check_for_non_image_file(my_data_dir='inputs/skin_cancer_dataset')

Folder: HAM10000_images_part_1 - has image file 5000
Folder: HAM10000_images_part_1 - has non-image file 0
Folder: HAM10000_images_part_2 - has image file 5015
Folder: HAM10000_images_part_2 - has non-image file 0


### Label Preparation

In [14]:
# List the distinct diagnosis codes found in the `dx` column.
diagnosis_unique = df['dx'].unique()
print(diagnosis_unique)

['bkl' 'nv' 'df' 'mel' 'vasc' 'bcc' 'akiec']


In [15]:
# One-hot encode the diagnosis column: one indicator column per `dx` value.
y = pd.get_dummies(df['dx'])

print(y.head())

   akiec  bcc  bkl  df  mel  nv  vasc
0      0    0    1   0    0   0     0
1      0    0    1   0    0   0     0
2      0    0    1   0    0   0     0
3      0    0    1   0    0   0     0
4      0    0    1   0    0   0     0


In [16]:
# Assuming df['dx'] is your column with the diagnosis
y = pd.get_dummies(df['dx'])

# Concatenate original DataFrame with the new DataFrame
df_new = pd.concat([df, y], axis=1)

# To see the new DataFrame
print(df_new.head())

     lesion_id      image_id   dx dx_type   age   sex localization  akiec  \
0  HAM_0000118  ISIC_0027419  bkl   histo  80.0  male        scalp      0   
1  HAM_0000118  ISIC_0025030  bkl   histo  80.0  male        scalp      0   
2  HAM_0002730  ISIC_0026769  bkl   histo  80.0  male        scalp      0   
3  HAM_0002730  ISIC_0025661  bkl   histo  80.0  male        scalp      0   
4  HAM_0001466  ISIC_0031633  bkl   histo  75.0  male          ear      0   

   bcc  bkl  df  mel  nv  vasc  
0    0    1   0    0   0     0  
1    0    1   0    0   0     0  
2    0    1   0    0   0     0  
3    0    1   0    0   0     0  
4    0    1   0    0   0     0  


## Move images into class-label folders

In [17]:
import shutil

# Load the metadata
metadata = pd.read_csv('inputs/skin_cancer_dataset/HAM10000_metadata.csv')

# Define the directories where the images are stored
directories = [
    'inputs/skin_cancer_dataset/HAM10000_images_part_1',
    'inputs/skin_cancer_dataset/HAM10000_images_part_2',
]

# Root folder that receives one sub-folder per class label
sorted_images_dir = 'inputs/skin_cancer_dataset/sorted_images'

# Create a new folder for each class label if it doesn't exist
for class_label in metadata['dx'].unique():
    os.makedirs(f'{sorted_images_dir}/{class_label}', exist_ok=True)

# Move each image listed in the metadata into the folder of its class label,
# keeping track of images that cannot be found in any source directory.
moved_count = 0
missing_image_ids = []
for image_id, class_label in zip(metadata['image_id'], metadata['dx']):
    file_name = f'{image_id}.jpg'
    for directory in directories:
        source_path = f'{directory}/{file_name}'
        # Check if the image file exists in this directory
        if os.path.isfile(source_path):
            # Move the file and stop checking the other directories
            shutil.move(source_path, f'{sorted_images_dir}/{class_label}/{file_name}')
            moved_count += 1
            break
    else:
        # for/else: no `break` happened, so no source directory held this image
        missing_image_ids.append(image_id)

print(f"Moved {moved_count} images into {sorted_images_dir}")
if missing_image_ids:
    print(f"Warning: {len(missing_image_ids)} images listed in the metadata were not found")

# Remove the original directories if they are empty. The isdir guard keeps
# this cell re-runnable after the directories have already been removed.
for directory in directories:
    if os.path.isdir(directory) and not os.listdir(directory):
        os.rmdir(directory)

### Split into train, validation and test sets

In [18]:
import random
def split_train_validation_test_images(data_dir, train_ratio, validation_ratio, test_ratio, seed=None):
    """Split every class folder of ``data_dir`` into train/validation/test folders.

    For each class folder ``<data_dir>/<label>`` the files are shuffled and
    moved to ``<data_dir>/train/<label>``, ``<data_dir>/validation/<label>``
    or ``<data_dir>/test/<label>``. The first ``int(n * train_ratio)`` files
    go to train, the next ``int(n * validation_ratio)`` to validation and the
    remainder to test. The emptied class folder is removed afterwards.

    Args:
        data_dir: Directory containing one sub-folder per class label.
        train_ratio: Fraction of each class assigned to the train set.
        validation_ratio: Fraction of each class assigned to the validation set.
        test_ratio: Fraction of each class assigned to the test set.
        seed: Optional seed. When given, the files of each class are sorted
            and shuffled with a dedicated generator seeded with this value,
            so the split is reproducible. When ``None`` (default) the
            module-level ``random.shuffle`` is used.

    Returns:
        None. If the ratios do not sum to 1.0 a message is printed and no
        file is moved.
    """
    # Compare with a tolerance: sums such as 0.6 + 0.3 + 0.1 are not exactly
    # 1.0 in binary floating point and would otherwise be rejected.
    if abs(train_ratio + validation_ratio + test_ratio - 1.0) > 1e-9:
        print("train_ratio + validation_ratio + test_ratio should sum to 1.0")
        return

    split_names = ('train', 'validation', 'test')

    # Class labels are the sub-folders of data_dir. Stray files (e.g.
    # .DS_Store) and split folders left by an earlier run are not labels.
    labels = [
        entry for entry in os.listdir(data_dir)
        if entry not in split_names and os.path.isdir(os.path.join(data_dir, entry))
    ]

    for label in labels:
        label_dir = data_dir + '/' + label
        files = os.listdir(label_dir)
        if seed is None:
            random.shuffle(files)
        else:
            # os.listdir order is arbitrary; sort first so the seed alone
            # determines the resulting order.
            files.sort()
            random.Random(seed).shuffle(files)

        train_files_qty = int(len(files) * train_ratio)
        validation_files_qty = int(len(files) * validation_ratio)

        for count, file_name in enumerate(files):
            if count < train_files_qty:
                split_name = 'train'
            elif count < (train_files_qty + validation_files_qty):
                split_name = 'validation'
            else:
                split_name = 'test'

            target_dir = data_dir + '/' + split_name + '/' + label
            os.makedirs(target_dir, exist_ok=True)
            shutil.move(label_dir + '/' + file_name, target_dir + '/' + file_name)

        # Every file has been moved out, so the class folder is now empty.
        os.rmdir(label_dir)

- The training set receives 70% (ratio 0.70) of the images in each class.
- The validation set receives 10% (ratio 0.10) of the images in each class.
- The test set receives the remaining images, about 20% (ratio 0.20) of each class.

In [19]:
# Split every class folder using the 0.7 / 0.1 / 0.2 ratios.
split_train_validation_test_images(
    data_dir='inputs/skin_cancer_dataset/sorted_images',
    train_ratio=0.7,
    validation_ratio=0.1,
    test_ratio=0.2,
)

---

# Push files to Repo