### **Deep Learning Group Project 23-24**

##### **Group 5**

**Elements**:
- Frederico Portela | R20181072 | r20181072@novaims.unl.pt
- Samuel Santos | 20220609 | 20220609@novaims.unl.pt
- Afonso Gorjão | 20230575 | 20230575@novaims.unl.pt
- Pedro Carvalho | 20230554 | 20230554@novaims.unl.pt
- Francisco Campos | 20230565 | 20230565@novaims.unl.pt


**In this notebook:**
1. Data Extraction from raw file.

**Index**
1. [Library imports](#library_imports)
2. [Data import](#data_import)
3. [Missing values](#missing_values)
4. [Checking ID variable](#check_id)
5. [Fixing URLs](#fixing_url)
6. [Removing nulls](#remove_nulls)
7. [Storing in TF-allowed way](#storing)

#### **Library imports**<a id='library_imports'></a>

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from PIL import Image
import requests
from io import BytesIO
from tqdm import tqdm  # progess bar (eventually remove)

#### **Data import**<a id='data_import'></a>

In [3]:
class Metadata:
    """
    Lookup table with a short description for each column of the raw CSV.

    Meant to be used directly on the class, without instantiating it:
    ``Metadata.get_description('label')``.
    """

    # column name -> description ('...' marks a column that has no description yet)
    descriptions = dict(
        md5hash='Original ID Column',
        fitzpatrick_scale='https://en.wikipedia.org/wiki/Fitzpatrick_scale',
        fitzpatrick_centaur='...',
        label='Target column; skin disease',
        nine_partition_label='1-level up in granularity',
        three_partition_label='2-level up in granularity',
        qc='...',
        url='URL of picture of skin condition',
        url_alphanum='pseudo-url',
    )

    @classmethod
    def get_description(cls, column_name):
        """
        Return the description stored for `column_name`.

        Raises KeyError when `column_name` is not a key of `descriptions`.
        """
        known_columns = cls.descriptions
        return known_columns[column_name]

# Read the raw Fitzpatrick17k table, then preview it: its dimensions,
# its first rows, and one complete image URL (the table view truncates them).
rawdata = pd.read_csv('data/fitzpatrick17k.csv')

print(f'Shape: {rawdata.shape}', end='\n\n')
display(rawdata.head())
print('', f"Example URL: {rawdata['url'].iloc[0]}", sep='\n')

Shape: (16577, 9)



Unnamed: 0,md5hash,fitzpatrick_scale,fitzpatrick_centaur,label,nine_partition_label,three_partition_label,qc,url,url_alphanum
0,5e82a45bc5d78bd24ae9202d194423f8,3,3,drug induced pigmentary changes,inflammatory,non-neoplastic,,https://www.dermaamin.com/site/images/clinical...,httpwwwdermaamincomsiteimagesclinicalpicmminoc...
1,fa2911a9b13b6f8af79cb700937cc14f,1,1,photodermatoses,inflammatory,non-neoplastic,,https://www.dermaamin.com/site/images/clinical...,httpwwwdermaamincomsiteimagesclinicalpicpphoto...
2,d2bac3c9e4499032ca8e9b07c7d3bc40,2,3,dermatofibroma,benign dermal,benign,,https://www.dermaamin.com/site/images/clinical...,httpwwwdermaamincomsiteimagesclinicalpicdderma...
3,0a94359e7eaacd7178e06b2823777789,1,1,psoriasis,inflammatory,non-neoplastic,,https://www.dermaamin.com/site/images/clinical...,httpwwwdermaamincomsiteimagesclinicalpicppsori...
4,a39ec3b1f22c08a421fa20535e037bba,1,1,psoriasis,inflammatory,non-neoplastic,,https://www.dermaamin.com/site/images/clinical...,httpwwwdermaamincomsiteimagesclinicalpicppsori...



Example URL: https://www.dermaamin.com/site/images/clinical-pic/m/minocycline-pigmentation/minocycline-pigmentation1.jpg


#### **Missing values**<a id='missing_values'></a>

In [4]:
# Number of missing entries in each column of the raw table.
null_counts = rawdata.isna().sum()
display(null_counts)

# NOTE: there are images with no URL

md5hash                      0
fitzpatrick_scale            0
fitzpatrick_centaur          0
label                        0
nine_partition_label         0
three_partition_label        0
qc                       16073
url                         41
url_alphanum                 0
dtype: int64

#### **Checking ID variable**<a id='check_id'></a>

In [7]:
# *********************************
# Checking ID variables
# *********************************

# Each line prints True when the column holds no repeated value; rows
# without a URL are left out of the URL check.
print(rawdata['md5hash'].is_unique)
print(rawdata['url'].dropna().is_unique)

# hashes and urls are unique

# *********************************
# Checking label variable
# *********************************

# Print every label that carries leading or trailing whitespace of any kind
# (spaces, tabs, newlines, ...), not only a plain space.
for label in rawdata['label']:
    if label != label.strip():
        print(label)

# label strings seem fine

True
True


#### **Fixing URLs**<a id='fixing_url'></a>

In [6]:
# Attempt to rebuild the URLs that are missing from the raw table.
# example of correct URL: https://www.dermaamin.com/site/images/clinical-pic/m/minocycline-pigmentation/minocycline-pigmentation1.jpg

# Pseudo-URLs of the images without a URL (presumably the `url_alphanum`
# values of the rows whose `url` is null -- TODO confirm).
# DO NOT CHANGE ORDER: entry i here is paired with entry i of `correct_url`.
missing_url = [
    'httpwwwdermaamincomsiteimagesclinicalpicLLichensimplexchronicusLichensimplexchronicus30jpg.jpg',
    'httpwwwdermaamincomsiteimagesclinicalpicooralmucosalmelanomaoralmucosalmelanoma2jpg.jpg',
    'httpwwwdermaamincomsiteimagesclinicalpicLLichensimplexchronicusLichensimplexchronicus34jpg.jpg',
    'httpwwwdermaamincomsiteimagesclinicalpicLLichensimplexchronicusLichensimplexchronicus36jpg.jpg',
    'httpwwwdermaamincomsiteimagesclinicalpicooralmucosalmelanomaoralmucosalmelanoma3jpg.jpg',
    'httpwwwdermaamincomsiteimagesclinicalpicLLichensimplexchronicusLichensimplexchronicus24jpg.jpg',
    'httpwwwdermaamincomsiteimagesclinicalpicLLichensimplexchronicusLichensimplexchronicus3jpg.jpg',
    'httpwwwdermaamincomsiteimageshistopicjjunctionnaevusjunctionnaevus3jpg.jpg',
    'httpwwwdermaamincomsiteimagesclinicalpicLLichensimplexchronicusLichensimplexchronicus11jpg.jpg',
    'httpwwwdermaamincomsiteimagesclinicalpicLLichensimplexchronicusLichensimplexchronicus35jpg.jpg',
    'httpwwwdermaamincomsiteimagesclinicalpicLLichensimplexchronicusLichensimplexchronicus18jpg.jpg',
    'httpwwwdermaamincomsiteimagesclinicalpicLLichensimplexchronicusLichensimplexchronicus10jpg.jpg',
    'httpwwwdermaamincomsiteimagesclinicalpicLLichensimplexchronicusLichensimplexchronicus28jpg.jpg',
    'httpwwwdermaamincomsiteimagesclinicalpicLLichensimplexchronicusLichensimplexchronicus19jpg.jpg',
    'httpwwwdermaamincomsiteimagesclinicalpicLLichensimplexchronicusLichensimplexchronicus27jpg.jpg',
    'httpwwwdermaamincomsiteimagesclinicalpicLLichensimplexchronicusLichensimplexchronicus4jpg.jpg',
    'httpwwwdermaamincomsiteimagesclinicalpicLLichensimplexchronicusLichensimplexchronicus1jpg.jpg',
    'httpwwwdermaamincomsiteimagesclinicalpicLLichensimplexchronicusLichensimplexchronicus29jpg.jpg',
    'httpwwwdermaamincomsiteimagesclinicalpicLLichensimplexchronicusLichensimplexchronicus21jpg.jpg',
    'httpwwwdermaamincomsiteimagesclinicalpicLLichensimplexchronicusLichensimplexchronicus32jpg.jpg',
    'httpwwwdermaamincomsiteimagesclinicalpicLLichensimplexchronicusLichensimplexchronicus23jpg.jpg',
    'httpwwwdermaamincomsiteimagesclinicalpicLLichensimplexchronicusLichensimplexchronicus13jpg.jpg',
    'httpwwwdermaamincomsiteimagesclinicalpicLLichensimplexchronicusLichensimplexchronicus9jpg.jpg',
    'httpwwwdermaamincomsiteimagesclinicalpicLLichensimplexchronicusLichensimplexchronicus26jpg.jpg',
    'httpwwwdermaamincomsiteimagesclinicalpicLLichensimplexchronicusLichensimplexchronicus14jpg.jpg',
    'httpwwwdermaamincomsiteimagesclinicalpicLLichensimplexchronicusLichensimplexchronicus15jpg.jpg',
    'httpwwwdermaamincomsiteimagesclinicalpicLLichensimplexchronicusLichensimplexchronicus8jpg.jpg',
    'httpwwwdermaamincomsiteimagesclinicalpicLLichensimplexchronicusLichensimplexchronicus16jpg.jpg',
    'httpwwwdermaamincomsiteimagesclinicalpicooralmucosalmelanomaoralmucosalmelanoma1jpg.jpg',
    'httpwwwdermaamincomsiteimagesclinicalpicLLichensimplexchronicusLichensimplexchronicus7jpg.jpg',
    'httpwwwdermaamincomsiteimagesclinicalpicLLichensimplexchronicusLichensimplexchronicus6jpg.jpg',
    'httpwwwdermaamincomsiteimagesclinicalpicLLichensimplexchronicusLichensimplexchronicus5jpg.jpg',
    'httpwwwdermaamincomsiteimagesclinicalpicLLichensimplexchronicusLichensimplexchronicus2jpg.jpg',
    'httpwwwdermaamincomsiteimagesclinicalpicLLichensimplexchronicusLichensimplexchronicus12jpg.jpg',
    'httpwwwdermaamincomsiteimagesclinicalpicLLichensimplexchronicusLichensimplexchronicus31jpg.jpg',
    'httpwwwdermaamincomsiteimagesclinicalpicLLichensimplexchronicusLichensimplexchronicus33jpg.jpg',
    'httpwwwdermaamincomsiteimageshistopicjjunctionnaevusjunctionnaevus2jpg.jpg',
    'httpwwwdermaamincomsiteimagesclinicalpicLLichensimplexchronicusLichensimplexchronicus20jpg.jpg',
    'httpwwwdermaamincomsiteimageshistopicjjunctionnaevusjunctionnaevus1jpg.jpg',
    'httpwwwdermaamincomsiteimagesclinicalpicLLichensimplexchronicusLichensimplexchronicus25jpg.jpg',
    'httpwwwdermaamincomsiteimagesclinicalpicLLichensimplexchronicusLichensimplexchronicus22jpg.jpg',
]

# Hand-written candidate URLs, one per entry of `missing_url` (same position).
# The first two entries try a lower-case/dashed and an underscored spelling;
# the remaining ones reuse the disease name exactly as it appears in the pseudo-URL.
# can't access the images, maybe they are null for a reason...
# does not work in upper/lower case, with dashes or underscores separating the disease names
correct_url = [
    'https://www.dermaamin.com/site/images/clinical-pic/l/lichen-simplex-chronicus/lichen-simplex-chronicus30.jpg',
    'https://www.dermaamin.com/site/images/clinical-pic/o/oral_mucosal_melanoma/oral_mucosal_melanoma2.jpg',
    'https://www.dermaamin.com/site/images/clinical-pic/L/Lichensimplexchronicus/Lichensimplexchronicus34.jpg',
    'https://www.dermaamin.com/site/images/clinical-pic/L/Lichensimplexchronicus/Lichensimplexchronicus36.jpg',
    'https://www.dermaamin.com/site/images/clinical-pic/o/oralmucosalmelanoma/oralmucosalmelanoma3.jpg',
    'https://www.dermaamin.com/site/images/clinical-pic/L/Lichensimplexchronicus/Lichensimplexchronicus24.jpg',
    'https://www.dermaamin.com/site/images/clinical-pic/L/Lichensimplexchronicus/Lichensimplexchronicus3.jpg',
    'https://www.dermaamin.com/site/images/histo-pic/j/junctionnaevus/junctionnaevus3.jpg',
    'https://www.dermaamin.com/site/images/clinical-pic/L/Lichensimplexchronicus/Lichensimplexchronicus11.jpg',
    'https://www.dermaamin.com/site/images/clinical-pic/L/Lichensimplexchronicus/Lichensimplexchronicus35.jpg',
    'https://www.dermaamin.com/site/images/clinical-pic/L/Lichensimplexchronicus/Lichensimplexchronicus18.jpg',
    'https://www.dermaamin.com/site/images/clinical-pic/L/Lichensimplexchronicus/Lichensimplexchronicus10.jpg',
    'https://www.dermaamin.com/site/images/clinical-pic/L/Lichensimplexchronicus/Lichensimplexchronicus28.jpg',
    'https://www.dermaamin.com/site/images/clinical-pic/L/Lichensimplexchronicus/Lichensimplexchronicus19.jpg',
    'https://www.dermaamin.com/site/images/clinical-pic/L/Lichensimplexchronicus/Lichensimplexchronicus27.jpg',
    'https://www.dermaamin.com/site/images/clinical-pic/L/Lichensimplexchronicus/Lichensimplexchronicus4.jpg',
    'https://www.dermaamin.com/site/images/clinical-pic/L/Lichensimplexchronicus/Lichensimplexchronicus1.jpg',
    'https://www.dermaamin.com/site/images/clinical-pic/L/Lichensimplexchronicus/Lichensimplexchronicus29.jpg',
    'https://www.dermaamin.com/site/images/clinical-pic/L/Lichensimplexchronicus/Lichensimplexchronicus21.jpg',
    'https://www.dermaamin.com/site/images/clinical-pic/L/Lichensimplexchronicus/Lichensimplexchronicus32.jpg',
    'https://www.dermaamin.com/site/images/clinical-pic/L/Lichensimplexchronicus/Lichensimplexchronicus23.jpg',
    'https://www.dermaamin.com/site/images/clinical-pic/L/Lichensimplexchronicus/Lichensimplexchronicus13.jpg',
    'https://www.dermaamin.com/site/images/clinical-pic/L/Lichensimplexchronicus/Lichensimplexchronicus9.jpg',
    'https://www.dermaamin.com/site/images/clinical-pic/L/Lichensimplexchronicus/Lichensimplexchronicus26.jpg',
    'https://www.dermaamin.com/site/images/clinical-pic/L/Lichensimplexchronicus/Lichensimplexchronicus14.jpg',
    'https://www.dermaamin.com/site/images/clinical-pic/L/Lichensimplexchronicus/Lichensimplexchronicus15.jpg',
    'https://www.dermaamin.com/site/images/clinical-pic/L/Lichensimplexchronicus/Lichensimplexchronicus8.jpg',
    'https://www.dermaamin.com/site/images/clinical-pic/L/Lichensimplexchronicus/Lichensimplexchronicus16.jpg',
    'https://www.dermaamin.com/site/images/clinical-pic/o/oralmucosalmelanoma/oralmucosalmelanoma1.jpg',
    'https://www.dermaamin.com/site/images/clinical-pic/L/Lichensimplexchronicus/Lichensimplexchronicus7.jpg',
    'https://www.dermaamin.com/site/images/clinical-pic/L/Lichensimplexchronicus/Lichensimplexchronicus6.jpg',
    'https://www.dermaamin.com/site/images/clinical-pic/L/Lichensimplexchronicus/Lichensimplexchronicus5.jpg',
    'https://www.dermaamin.com/site/images/clinical-pic/L/Lichensimplexchronicus/Lichensimplexchronicus2.jpg',
    'https://www.dermaamin.com/site/images/clinical-pic/L/Lichensimplexchronicus/Lichensimplexchronicus12.jpg',
    'https://www.dermaamin.com/site/images/clinical-pic/L/Lichensimplexchronicus/Lichensimplexchronicus31.jpg',
    'https://www.dermaamin.com/site/images/clinical-pic/L/Lichensimplexchronicus/Lichensimplexchronicus33.jpg',
    'https://www.dermaamin.com/site/images/histo-pic/j/junctionnaevus/junctionnaevus2.jpg',
    'https://www.dermaamin.com/site/images/clinical-pic/L/Lichensimplexchronicus/Lichensimplexchronicus20.jpg',
    'https://www.dermaamin.com/site/images/histo-pic/j/junctionnaevus/junctionnaevus1.jpg',
    'https://www.dermaamin.com/site/images/clinical-pic/L/Lichensimplexchronicus/Lichensimplexchronicus25.jpg',
    'https://www.dermaamin.com/site/images/clinical-pic/L/Lichensimplexchronicus/Lichensimplexchronicus22.jpg',
]

# NOTE: not possible to fix any of the missing URLs -- none of the candidates above could be accessed

#### **Removing nulls**<a id='remove_nulls'></a>

In [5]:
# Split the table on URL availability: rows lacking a URL are set aside in
# `missingdata`, and `rawdata` is narrowed to the rows that do have one.
has_url = rawdata['url'].notnull()
missingdata = rawdata.loc[~has_url, :].copy()  # will be useful later
rawdata = rawdata.loc[has_url, :]

# sanity check: number of null URLs still present in `rawdata`
print(rawdata['url'].isnull().sum())

0


#### **Storing in TF-allowed way**<a id='storing'></a>

In [None]:
# storing the data in a directory layout that tf functions, such as
# image_dataset_from_directory, can import

def download_url(url, timeout=30):
    """
    Download the image located at `url` and return it as a PIL image.

    Parameters
    ----------
    url : str
        Address of the image to fetch.
    timeout : float or tuple, optional
        Seconds `requests` waits on the server before giving up (default 30).
        Without a timeout a stalled server would block the call indefinitely.

    Returns
    -------
    PIL.Image.Image
        Image opened from the downloaded bytes.

    Raises
    ------
    requests.exceptions.RequestException
        On connection problems, timeouts, or an HTTP error status (4xx/5xx).
    Exception
        Whatever `Image.open` raises when the downloaded bytes cannot be
        read as an image.
    """
    # need to add user-agent otherwise we get 406 error
    # https://stackoverflow.com/questions/56101612/python-requests-http-response-406
    response = requests.get(  # get step takes the largest amount of time
        url,
        headers={'User-Agent': 'Mozilla'},
        timeout=timeout,
    )
    # fail here on an HTTP error instead of handing an error page to PIL
    response.raise_for_status()
    image = Image.open(BytesIO(response.content))
    return image

def class_indexes(col: pd.Series, label: str):
    """Return, as a numpy array, the index values of `col` whose entry equals `label`."""
    is_label = (col == label).to_numpy()
    matching_index = col.index[is_label]
    return np.array(matching_index)

# Download every image into data/all-images/<label>/<md5hash>.jpg, i.e. one
# sub-directory per class.
X = rawdata[['md5hash', 'url']]
y = rawdata['label']

# for testing
# n = 10
# X = X.iloc[:n]
# y = y.iloc[:n]

error_images = []  # storing the URLs of the images that return an error

image_dir = 'all-images'
# exist_ok: re-running the cell must not crash on directories left by an earlier run
os.makedirs(os.path.join('data', image_dir), exist_ok=True)
ulabels = np.unique(y)

# the context manager closes the progress bar even when the loop is interrupted
with tqdm(total=len(y), desc='Downloading images') as progress_bar:
    for ulabel in ulabels:
        label_dir = os.path.join('data', image_dir, ulabel)
        os.makedirs(label_dir, exist_ok=True)
        temp = X.loc[class_indexes(y, ulabel), :]
        for hash_, url in temp.values:
            image_path = os.path.join(label_dir, f'{hash_}.jpg')
            # images already on disk are skipped, so an interrupted run can be
            # resumed without downloading everything again
            if not os.path.exists(image_path):
                try:
                    image = download_url(url)
                    image.save(image_path)
                except Exception as e:
                    print(f"{e}: {url}")
                    error_images.append(url)
                    # do not leave a partial file behind: it would be taken
                    # for a finished download on the next run
                    if os.path.exists(image_path):
                        os.remove(image_path)
            progress_bar.update(1)

# also saving incorrect urls
(pd.DataFrame({'url': error_images})
 .to_csv('data/error_url.csv', index=False))

# Bookkeeping: the fitzpatrick csv has 16577 rows; 41 had null urls and were
# removed, and 10 more had urls that raised an error while downloading.
# We were left with 16577 - 41 - 10 = 16526 images.
# NOTE: the full download took about 7hr; on a re-run only the images that
# are not on disk yet are fetched

In [3]:
# *********************************
# Making sure everything is OK
# *********************************

# Collect the path of every file found anywhere below data/all-images.
images = [
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk(os.path.join('data', 'all-images'))
    for name in names
]

# one sub-directory per class, one file per downloaded image
print(f"No. of classes: {len(os.listdir(os.path.join('data', 'all-images')))}")
print(f"No. of observations: {len(images)}")

No. of classes: 114
No. of observations: 16526
