# Data augmentation and EDA

In [133]:
import os
import pandas as pd
import numpy as np

from PIL import Image

import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

In [82]:
images_path = "/media/dragoshh1984/Elements/kaggle/datasets/melanoma/images"
images_test_path = "/media/dragoshh1984/Elements/kaggle/datasets/melanoma/isic_2020/test"

In [50]:
def get_size(image_path):
    image = Image.open(image_path)
    
    return image.size[0]*image.size[1]

In [60]:
images_sizes = []
images_names = os.listdir(images_path)
df = pd.DataFrame(images_names)
for image_name in tqdm(images_names):
    images_sizes.append(get_size(os.path.join(images_path, image_name)))

HBox(children=(FloatProgress(value=0.0, max=59594.0), HTML(value='')))




In [84]:
images_test_sizes = []
images_test_names = os.listdir(images_test_path)
for image_name in tqdm(images_test_names):
    images_test_sizes.append(get_size(os.path.join(images_test_path, image_name)))

HBox(children=(FloatProgress(value=0.0, max=10982.0), HTML(value='')))




In [200]:
df = pd.DataFrame()
df["image_id"] = images_names
df["size"] = images_sizes

df_t = pd.DataFrame()
df_t["image_id"] = images_test_names
df_t["size"] = images_test_sizes

In [201]:
df["image_id"] = df["image_id"].str.replace(".jpg", "")
df_t["image_id"] = df_t["image_id"].str.replace(".jpg", "")

In [202]:
competition_path = "/home/dragoshh1984/repos/kaggle/datasets/melanomia_classification"

df_train = pd.read_csv(os.path.join(competition_path, "new_train.csv"))
df_test = pd.read_csv(os.path.join(competition_path, "new_test.csv"))

In [203]:
new_df_train = pd.merge(df_train, df, how='inner', on=['image_id'])
new_df_test = pd.merge(df_test, df_t, how='inner', on=['image_id'])

## Normalize size

In [204]:
size_train_min = new_df_train['size'].min()
size_train_max = new_df_train['size'].max()

size_test_min = new_df_test['size'].min()
size_test_max = new_df_test['size'].max()

In [205]:
new_df_train['size'] = new_df_train['size'].apply(lambda x: (x - size_train_min)/(size_train_max-size_train_min))

In [206]:
new_df_test['size'] = new_df_test['size'].apply(lambda x: (x - size_test_min)/(size_test_max-size_test_min))

In [112]:
new_df_train.to_csv(os.path.join(competition_path, "new_train_v2.csv"), index=False)
new_df_test.to_csv(os.path.join(competition_path, "new_test_v2.csv"), index=False)

## Normalize color mean

In [113]:
color_mean_train_min = new_df_train['color_mean'].min()
color_mean_train_max = new_df_train['color_mean'].max()
color_mean_test_min = new_df_test['color_mean'].min()
color_mean_test_max = new_df_test['color_mean'].max()

In [114]:
new_df_train['color_mean'] = new_df_train['color_mean'].apply(lambda x: (x - color_mean_train_min)/(color_mean_train_max-color_mean_train_min))

In [115]:
new_df_test['color_mean'] = new_df_test['color_mean'].apply(lambda x: (x - color_mean_test_min)/(color_mean_test_max-color_mean_test_min))

In [116]:
new_df_train.head

<bound method NDFrame.head of            image_id   patient_id  target  source  sex  age_approx  \
0      ISIC_2637011   IP_7279968       0  ISIC20    1        45.0   
1      ISIC_0015719   IP_3075186       0  ISIC20    0        45.0   
2      ISIC_0052212   IP_2842074       0  ISIC20    0        50.0   
3      ISIC_0068279   IP_6890425       0  ISIC20    0        45.0   
4      ISIC_0074268   IP_8723313       0  ISIC20    0        55.0   
...             ...          ...     ...     ...  ...         ...   
58452  ISIC_0073247  BCN_0003925       0  ISIC19    0        85.0   
58453  ISIC_0073248  BCN_0001819       0  ISIC19    1        65.0   
58454  ISIC_0073249  BCN_0001085       1  ISIC19    1        70.0   
58455  ISIC_0073251  BCN_0002083       0  ISIC19    0        55.0   
58456  ISIC_0073254  BCN_0001079       0  ISIC19    1        50.0   

       anatom_site_general_challenge  stratify_group  fold  color_mean  \
0                                  0              32     2    0.667

## Get color variance for each channel

In [157]:
image = Image.open(os.path.join(images_path, "ISIC_0000001.jpg"))
channels = np.array(image)
red = channels[0]
green = channels[1]
blue = channels[2]

red_mean = np.mean(red)
green_mean = np.mean(green)
blue_mean = np.mean(blue)

std_red = (np.sum(red) - len(red) * red_mean) / len(red)
std_green = (np.sum(green) - len(green) * green_mean) / len(green)
std_blue = (np.sum(blue) - len(blue) * blue_mean) / len(blue)
print(std_red)
print(std_green)
print(std_blue)

508.512067840835
331.54142204827133
346.94716242661445


In [159]:
def get_variances(image_path):
    image = Image.open(image_path)
    channels = np.array(image)
    red = channels[0]
    green = channels[1]
    blue = channels[2]

    red_mean = np.mean(red)
    green_mean = np.mean(green)
    blue_mean = np.mean(blue)

    std_red = (np.sum(red) - len(red) * red_mean) / len(red)
    std_green = (np.sum(green) - len(green) * green_mean) / len(green)
    std_blue = (np.sum(blue) - len(blue) * blue_mean) / len(blue)
    
    return std_red, std_green, std_blue

In [162]:
images_stds_red = []
images_stds_green = []
images_stds_blue = []

images_names = os.listdir(images_path)
for image_name in tqdm(images_names):
    red_std, green_std, blue_std = get_variances(os.path.join(images_path, image_name))
    images_stds_red.append(red_std)
    images_stds_green.append(green_std)
    images_stds_blue.append(blue_std)

HBox(children=(FloatProgress(value=0.0, max=59594.0), HTML(value='')))




In [164]:
images_test_stds_red = []
images_test_stds_green = []
images_test_stds_blue = []

images_test_names = os.listdir(images_test_path)
for image_name in tqdm(images_test_names):
    red_std, green_std, blue_std = get_variances(os.path.join(images_test_path, image_name))
    images_test_stds_red.append(red_std)
    images_test_stds_green.append(green_std)
    images_test_stds_blue.append(blue_std)

HBox(children=(FloatProgress(value=0.0, max=10982.0), HTML(value='')))




In [207]:
df = pd.DataFrame()
df["image_id"] = images_names
df["red_std"] = images_stds_red
df["green_std"] = images_stds_green
df["blue_std"] = images_stds_blue

df_t = pd.DataFrame()
df_t["image_id"] = images_test_names
df_t["red_std"] = images_test_stds_red
df_t["green_std"] = images_test_stds_green
df_t["blue_std"] = images_test_stds_blue

df["image_id"] = df["image_id"].str.replace(".jpg", "")
df_t["image_id"] = df_t["image_id"].str.replace(".jpg", "")

In [208]:
new_df_train = pd.merge(new_df_train, df, how='inner', on=['image_id'])
new_df_test = pd.merge(new_df_test, df_t, how='inner', on=['image_id'])

In [209]:
new_df_train["red_std"] = new_df_train["red_std"].apply(np.sqrt)
new_df_train["green_std"] = new_df_train["green_std"].apply(np.sqrt)
new_df_train["blue_std"] = new_df_train["blue_std"].apply(np.sqrt)

new_df_train

Unnamed: 0,image_id,patient_id,target,source,sex,age_approx,anatom_site_general_challenge,stratify_group,fold,color_mean,size,red_std,green_std,blue_std
0,ISIC_2637011,IP_7279968,0,ISIC20,1,45.0,0,32,2,169.748642,1.000000,17.384556,17.398164,17.411506
1,ISIC_0015719,IP_3075186,0,ISIC20,0,45.0,7,28,1,171.006387,1.000000,17.930105,17.936428,17.941234
2,ISIC_0052212,IP_2842074,0,ISIC20,0,50.0,2,8,1,158.517989,0.071691,18.678024,18.685068,18.680589
3,ISIC_0068279,IP_6890425,0,ISIC20,0,45.0,0,2,4,80.240191,0.071691,14.857910,14.866296,14.853163
4,ISIC_0074268,IP_8723313,0,ISIC20,0,55.0,7,28,3,183.893274,1.000000,18.937690,18.943738,18.948799
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58452,ISIC_0073247,BCN_0003925,0,ISIC19,0,85.0,0,0,4,92.605895,0.032810,1.523257,1.523257,1.523257
58453,ISIC_0073248,BCN_0001819,0,ISIC19,1,65.0,5,48,3,138.607723,0.032810,15.809123,15.816637,15.828035
58454,ISIC_0073249,BCN_0001085,1,ISIC19,1,70.0,2,37,3,141.509337,0.032810,19.015276,18.986889,18.963180
58455,ISIC_0073251,BCN_0002083,0,ISIC19,0,55.0,4,14,2,119.702019,0.032810,16.433301,16.435579,16.437163


In [210]:
new_df_test

Unnamed: 0,image_id,patient_id,sex,age_approx,anatom_site_general_challenge,color_mean,size,red_std,green_std,blue_std
0,ISIC_0052060,IP_3579794,1,70.0,5,189.277809,1.000000,372.718778,372.795778,372.944000
1,ISIC_0052349,IP_7782715,1,40.0,1,206.813796,1.000000,394.091556,394.146444,394.217222
2,ISIC_0058510,IP_7960270,0,55.0,4,218.775140,1.000000,434.765111,434.774000,434.858000
3,ISIC_0073313,IP_6375035,0,50.0,4,216.443701,1.000000,419.636333,419.614667,419.591778
4,ISIC_0073502,IP_0589375,0,45.0,1,146.235956,0.074554,244.300000,245.023958,245.562500
...,...,...,...,...,...,...,...,...,...,...
10977,ISIC_9992485,IP_4152479,1,40.0,4,140.586552,0.000000,267.165625,261.788542,258.226042
10978,ISIC_9996992,IP_4890115,1,35.0,4,153.304031,0.198833,222.658951,222.841049,223.198302
10979,ISIC_9997917,IP_2852390,1,25.0,6,164.849692,0.000000,442.215625,442.944792,443.786458
10980,ISIC_9998234,IP_8861963,1,65.0,1,191.778987,1.000000,378.845889,379.031667,379.313556


In [211]:
new_df_train.to_csv(os.path.join(competition_path, "new_train_v3.csv"), index=False)
new_df_test.to_csv(os.path.join(competition_path, "new_test_v3.csv"), index=False)