In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
import seaborn as sns
from matplotlib import pyplot as plt
from PIL import Image
import os
import glob
import random
from shutil import copyfile

In [8]:
names_file = 'training.csv'
ts = 0.29
wh = 64

In [9]:
df = pd.read_csv(names_file)
# df.drop('Unnamed: 0', axis=1, inplace=True)
df.head()

Unnamed: 0.1,Unnamed: 0,id,name,type,w,h,dataset,diagnosis,diagnosis_confirm_type,age,sex
0,0,5aaf12491165976913627e89,ISIC_0024306,dermoscopic,600,450,HAM10000,nevus,serial imaging showing no change,45,male
1,1,5aaf12491165976913627e95,ISIC_0024307,dermoscopic,600,450,HAM10000,nevus,serial imaging showing no change,50,male
2,2,5aaf12491165976913627ea0,ISIC_0024308,dermoscopic,600,450,HAM10000,nevus,serial imaging showing no change,55,female
3,3,5aaf12491165976913627eab,ISIC_0024309,dermoscopic,600,450,HAM10000,nevus,serial imaging showing no change,40,male
4,4,5aaf12491165976913627eb6,ISIC_0024310,dermoscopic,600,450,HAM10000,melanoma,histopathology,60,male


In [10]:
X = df['name']
y = df['diagnosis']
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=ts, random_state=42)

In [13]:
folder_names = list(y.unique())
folder_names

['nevus',
 'melanoma',
 'pigmented benign keratosis',
 'dermatofibroma',
 'squamous cell carcinoma',
 'basal cell carcinoma',
 'vascular lesion',
 'actinic keratosis']

In [None]:
def plot_hist(data, name=""):
    plt.figure(figsize=(16,5))
    plt.xticks(rotation=45)
    plt.title(name)
    sns.countplot(data);   

In [None]:
plot_hist(y_test, "Test")

In [None]:
plot_hist(y_train, "train")

In [None]:
for fn in folder_names:
    train_folder = 'train/' + fn
    valid_folder = 'test/' + fn
    if not os.path.exists(train_folder):
        os.makedirs(train_folder)
    if not os.path.exists(valid_folder):
        os.makedirs(valid_folder)

In [None]:
X_train_list = list(X_train)
y_train_list = list(y_train)
for i in range(len(X_train_list)):
    old_path = 'img/' + X_train_list[i] + ".jpg"
    new_path = 'train' + '/' + y_train_list[i] + '/' + X_train_list[i] + ".jpg"
    os.replace(old_path, new_path)

In [None]:
X_test_list = list(X_test)
y_test_list = list(y_test)
for i in range(len(X_test_list)):
    old_path = 'img/' + X_test_list[i] + ".jpg"
    new_path = 'test/' + y_test_list[i] + '/' + X_test_list[i] + ".jpg"
    os.replace(old_path, new_path)

## Обрез до WHxWH и копирование в новую папку

In [None]:
def resize_img(name, new_path, size):
    img = Image.open(name)
    img = img.resize((size,size))
    img.save(new_path)

In [None]:
for fn in folder_names:
    train_folder = f'train{wh}x{wh}/' + fn
    test_folder = f'test{wh}x{wh}/' + fn
    if not os.path.exists(train_folder):
        os.makedirs(train_folder)
    if not os.path.exists(test_folder):
        os.makedirs(test_folder)

In [None]:
train_size = 0
valid_size = 0
for fn in folder_names:
    train_size+=len(os.listdir('train/' + fn))
    valid_size+=len(os.listdir('test/' + fn))
train_size, valid_size

In [None]:
for fn in folder_names:
    folder = f'train/' + fn
    for img in os.listdir(folder):
        name = folder + '/' + img
        new_path = 'train{}/'.format(wh) + fn + '/'+ img.split('.')[0] + '_{}x{}.jpg'.format(wh,wh)
        resize_img(name, new_path, wh)
    print('Done {} / {}. Class - {}'.format(folder_names.index(fn) + 1, len(folder_names), fn))

In [None]:
for fn in folder_names:
    folder = 'valid/' + fn
    for img in os.listdir(folder):
        name = folder + '/' + img
        new_path = 'valid{}/'.format(wh) + fn + '/'+ img.split('.')[0] + '_{}x{}.jpg'.format(wh,wh)
        resize_img(name, new_path, wh)
    print('Done {} / {}. Class - {}'.format(folder_names.index(fn) + 1, len(folder_names), fn))

## Разбиение выборки на равные части

In [14]:
def get_segment_stat(test_folder_name, train_folder_name, folder_names_list):
    test_fn = test_folder_name
    train_fn = train_folder_name
    
    stat_dict = {'class_name' : folder_names_list, train_fn : [], test_fn : [] }
    
    for st in [train_fn, test_fn]:
        for name in os.listdir(st):
            stat_dict[st].append(len(glob.glob(f"{st}\\{name}\\*.jpg")))
            
    stat_df = pd.DataFrame(stat_dict)
    stat_df['all'] = stat_df[train_fn] + stat_df[test_fn]
    return stat_df

In [17]:
stat_df = get_segment_stat("test", "train", folder_names)
stat_df.set_index("class_name")

Unnamed: 0_level_0,train,test,all
class_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
nevus,93,37,130
melanoma,349,165,514
pigmented benign keratosis,82,33,115
dermatofibroma,717,396,1113
squamous cell carcinoma,4479,2226,6705
basal cell carcinoma,756,343,1099
vascular lesion,137,60,197
actinic keratosis,97,45,142


In [18]:
total_lim = stat_df['all'].min()
test_lim = int(total_lim * ts)
train_lim = total_lim - test_lim
print("total = train + test")
print(total_lim, " = ", train_lim, " + ", test_lim)

total = train + test
115  =  82  +  33


In [19]:
for fn in folder_names:
    train_folder = f'train_{total_lim}/' + fn
    test_folder = f'test_{total_lim}/' + fn
    if not os.path.exists(train_folder):
        os.makedirs(train_folder)
    if not os.path.exists(test_folder):
        os.makedirs(test_folder)

In [None]:
# train =  list[:train_lim]
# test = list[ (-1) * (len(list) - train_lim):]

In [20]:
train_fn = f'train'
print(train_fn)
for clname in os.listdir(train_fn):
    img_names = os.listdir(f"{train_fn}\\{clname}\\")
    random.shuffle(img_names)
    selected_names = img_names[:train_lim]
    print(clname, len(selected_names), "/", len(img_names))
    for name in selected_names:
        old_path = f"{train_fn}\\{clname}\\{name}"
        new_path = f"{train_fn}_{total_lim}\\{clname}\\{name}"
        copyfile(old_path, new_path)

train
actinic keratosis 82 / 93
basal cell carcinoma 82 / 349
dermatofibroma 82 / 82
melanoma 82 / 717
nevus 82 / 4479
pigmented benign keratosis 82 / 756
squamous cell carcinoma 82 / 137
vascular lesion 82 / 97


In [21]:
test_fn = f'test'
print(test_fn)
for clname in os.listdir(test_fn):
    img_names = os.listdir(f"{test_fn}\\{clname}\\")
    random.shuffle(img_names)
    selected_names = img_names[:test_lim]
    print(clname, len(selected_names), "/", len(img_names))
    for name in selected_names:
        old_path = f"{test_fn}\\{clname}\\{name}"
        new_path = f"{test_fn}_{total_lim}\\{clname}\\{name}"
        copyfile(old_path, new_path)

test
actinic keratosis 33 / 37
basal cell carcinoma 33 / 165
dermatofibroma 33 / 33
melanoma 33 / 396
nevus 33 / 2226
pigmented benign keratosis 33 / 343
squamous cell carcinoma 33 / 60
vascular lesion 33 / 45


In [None]:
get_segment_stat(f"test_{total_lim}", f"train_{total_lim}", folder_names)