In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
import seaborn as sns
from matplotlib import pyplot as plt
from PIL import Image
import os
import glob
import random
from shutil import copyfile
import itertools

In [8]:
names_file = 'training.csv'
ts = 0.29
wh = 64

In [9]:
df = pd.read_csv(names_file)
# df.drop('Unnamed: 0', axis=1, inplace=True)
df.head()

Unnamed: 0.1,Unnamed: 0,id,name,type,w,h,dataset,diagnosis,diagnosis_confirm_type,age,sex
0,0,5aaf12491165976913627e89,ISIC_0024306,dermoscopic,600,450,HAM10000,nevus,serial imaging showing no change,45,male
1,1,5aaf12491165976913627e95,ISIC_0024307,dermoscopic,600,450,HAM10000,nevus,serial imaging showing no change,50,male
2,2,5aaf12491165976913627ea0,ISIC_0024308,dermoscopic,600,450,HAM10000,nevus,serial imaging showing no change,55,female
3,3,5aaf12491165976913627eab,ISIC_0024309,dermoscopic,600,450,HAM10000,nevus,serial imaging showing no change,40,male
4,4,5aaf12491165976913627eb6,ISIC_0024310,dermoscopic,600,450,HAM10000,melanoma,histopathology,60,male


In [10]:
X = df['name']
y = df['diagnosis']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=ts, random_state=42)

In [13]:
folder_names = list(y.unique())
folder_names

['nevus',
 'melanoma',
 'pigmented benign keratosis',
 'dermatofibroma',
 'squamous cell carcinoma',
 'basal cell carcinoma',
 'vascular lesion',
 'actinic keratosis']

In [None]:
def plot_hist(data, name=""):
    plt.figure(figsize=(16,5))
    plt.xticks(rotation=45)
    plt.title(name)
    sns.countplot(data);   

In [None]:
plot_hist(y_test, "Test")

In [None]:
plot_hist(y_train, "train")

In [None]:
for fn in folder_names:
    train_folder = 'train/' + fn
    valid_folder = 'test/' + fn
    if not os.path.exists(train_folder):
        os.makedirs(train_folder)
    if not os.path.exists(valid_folder):
        os.makedirs(valid_folder)

In [None]:
X_train_list = list(X_train)
y_train_list = list(y_train)
for i in range(len(X_train_list)):
    old_path = 'img/' + X_train_list[i] + ".jpg"
    new_path = 'train' + '/' + y_train_list[i] + '/' + X_train_list[i] + ".jpg"
    os.replace(old_path, new_path)

In [None]:
X_test_list = list(X_test)
y_test_list = list(y_test)
for i in range(len(X_test_list)):
    old_path = 'img/' + X_test_list[i] + ".jpg"
    new_path = 'test/' + y_test_list[i] + '/' + X_test_list[i] + ".jpg"
    os.replace(old_path, new_path)

## Разбиение выборки на равные части

In [14]:
def get_segment_stat(test_folder_name, train_folder_name, folder_names_list):
    test_fn = test_folder_name
    train_fn = train_folder_name
    
    stat_dict = {'class_name' : folder_names_list, train_fn : [], test_fn : [] }
    
    for st in [train_fn, test_fn]:
        for name in os.listdir(st):
            stat_dict[st].append(len(glob.glob(f"{st}\\{name}\\*.jpg")))
            
    stat_df = pd.DataFrame(stat_dict)
    stat_df['all'] = stat_df[train_fn] + stat_df[test_fn]
    return stat_df

In [17]:
stat_df = get_segment_stat("test", "train", folder_names)
stat_df.set_index("class_name")

Unnamed: 0_level_0,train,test,all
class_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
nevus,93,37,130
melanoma,349,165,514
pigmented benign keratosis,82,33,115
dermatofibroma,717,396,1113
squamous cell carcinoma,4479,2226,6705
basal cell carcinoma,756,343,1099
vascular lesion,137,60,197
actinic keratosis,97,45,142


In [18]:
total_lim = stat_df['all'].min()
test_lim = int(total_lim * ts)
train_lim = total_lim - test_lim
print("total = train + test")
print(total_lim, " = ", train_lim, " + ", test_lim)

total = train + test
115  =  82  +  33


In [19]:
for fn in folder_names:
    train_folder = f'train_{total_lim}/' + fn
    test_folder = f'test_{total_lim}/' + fn
    if not os.path.exists(train_folder):
        os.makedirs(train_folder)
    if not os.path.exists(test_folder):
        os.makedirs(test_folder)

In [None]:
# train =  list[:train_lim]
# test = list[ (-1) * (len(list) - train_lim):]

In [20]:
train_fn = f'train'
print(train_fn)
for clname in os.listdir(train_fn):
    img_names = os.listdir(f"{train_fn}\\{clname}\\")
    random.shuffle(img_names)
    selected_names = img_names[:train_lim]
    print(clname, len(selected_names), "/", len(img_names))
    for name in selected_names:
        old_path = f"{train_fn}\\{clname}\\{name}"
        new_path = f"{train_fn}_{total_lim}\\{clname}\\{name}"
        copyfile(old_path, new_path)

train
actinic keratosis 82 / 93
basal cell carcinoma 82 / 349
dermatofibroma 82 / 82
melanoma 82 / 717
nevus 82 / 4479
pigmented benign keratosis 82 / 756
squamous cell carcinoma 82 / 137
vascular lesion 82 / 97


In [21]:
test_fn = f'test'
print(test_fn)
for clname in os.listdir(test_fn):
    img_names = os.listdir(f"{test_fn}\\{clname}\\")
    random.shuffle(img_names)
    selected_names = img_names[:test_lim]
    print(clname, len(selected_names), "/", len(img_names))
    for name in selected_names:
        old_path = f"{test_fn}\\{clname}\\{name}"
        new_path = f"{test_fn}_{total_lim}\\{clname}\\{name}"
        copyfile(old_path, new_path)

test
actinic keratosis 33 / 37
basal cell carcinoma 33 / 165
dermatofibroma 33 / 33
melanoma 33 / 396
nevus 33 / 2226
pigmented benign keratosis 33 / 343
squamous cell carcinoma 33 / 60
vascular lesion 33 / 45


In [22]:
get_segment_stat(f"test_{total_lim}", f"train_{total_lim}", folder_names)

Unnamed: 0,class_name,train_115,test_115,all
0,nevus,82,33,115
1,melanoma,82,33,115
2,pigmented benign keratosis,82,33,115
3,dermatofibroma,82,33,115
4,squamous cell carcinoma,82,33,115
5,basal cell carcinoma,82,33,115
6,vascular lesion,82,33,115
7,actinic keratosis,82,33,115


## Обогощение выборки до максимума 

In [3]:
path = "C:/Users/Dima/PyFiles/MedNN/img/train_enriched"
cl_names = os.listdir(path)

In [7]:
dataset = {}
for cl in cl_names:
    dataset[cl] = os.listdir(os.path.join(path, cl))

In [5]:
cnt = {}
for cl in cl_names:
    cnt[cl] = len(dataset[cl])

In [6]:
max_cnt = max([v for v in cnt.values()])

In [7]:
add = {}
for cl in cl_names:
    add[cl] = max_cnt - cnt[cl]

In [8]:
def transform(newPth, oldPth):
    img = Image.open(oldPth)
    deg = random.randint(-45, 45)
    hf = bool(random.randint(0,1))
    vf = bool(random.randint(0,1))
    img = img.rotate(deg)
    if hf:
        img = ImageOps.mirror(img)
    if vf:
        img = ImageOps.flip(img)
    img.save(newPth)

In [9]:
for cl in cl_names:
    iters = int(add[cl] / cnt[cl])
    remain = add[cl] - cnt[cl] * iters
    
    for i in range(iters):
        for name in dataset[cl]:
            oldPth = path + '/' + cl + '/' + name
            newPth = path + '/' + cl + '/' + f"{i}_{name}"
            transform(newPth, oldPth)
            
    orig_names = dataset[cl]
    
    for j in range(remain):
        oldPth = path + '/' + cl + '/' + orig_names[j]
        newPth = path + '/' + cl + '/' + f"{i}-{orig_names[j]}"
        transform(newPth, oldPth)

In [5]:
for cl in cl_names:
    print(cl, ':', len(os.listdir(os.path.join(path, cl))))

actinic keratosis : 4479
basal cell carcinoma : 4479
dermatofibroma : 4479
melanoma : 4479
nevus : 4479
pigmented benign keratosis : 4479
squamous cell carcinoma : 4479
vascular lesion : 4479


In [12]:
def grouper(n, iterable):
    it = iter(iterable)
    while True:
       chunk = tuple(itertools.islice(it, n))
       if not chunk:
           return
       yield chunk

In [14]:
for gr in grouper(122, dataset['actinic keratosis']):
    for g in gr:
        print(g)

0_ISIC_0024468.jpg
0_ISIC_0024511.jpg
0_ISIC_0024646.jpg
0_ISIC_0024654.jpg
0_ISIC_0024800.jpg
0_ISIC_0024913.jpg
0_ISIC_0025368.jpg
0_ISIC_0025605.jpg
0_ISIC_0025780.jpg
0_ISIC_0025803.jpg
0_ISIC_0025825.jpg
0_ISIC_0025953.jpg
0_ISIC_0025957.jpg
0_ISIC_0025992.jpg
0_ISIC_0026040.jpg
0_ISIC_0026149.jpg
0_ISIC_0026171.jpg
0_ISIC_0026212.jpg
0_ISIC_0026457.jpg
0_ISIC_0026468.jpg
0_ISIC_0026525.jpg
0_ISIC_0026575.jpg
0_ISIC_0026626.jpg
0_ISIC_0026702.jpg
0_ISIC_0026709.jpg
0_ISIC_0026765.jpg
0_ISIC_0026857.jpg
0_ISIC_0026905.jpg
0_ISIC_0027172.jpg
0_ISIC_0027334.jpg
0_ISIC_0027447.jpg
0_ISIC_0027536.jpg
0_ISIC_0027562.jpg
0_ISIC_0027650.jpg
0_ISIC_0027668.jpg
0_ISIC_0027802.jpg
0_ISIC_0027829.jpg
0_ISIC_0027896.jpg
0_ISIC_0027950.jpg
0_ISIC_0027958.jpg
0_ISIC_0028063.jpg
0_ISIC_0028076.jpg
0_ISIC_0028190.jpg
0_ISIC_0028314.jpg
0_ISIC_0028370.jpg
0_ISIC_0028393.jpg
0_ISIC_0028517.jpg
0_ISIC_0028619.jpg
0_ISIC_0028854.jpg
0_ISIC_0028990.jpg
0_ISIC_0029025.jpg
0_ISIC_0029041.jpg
0_ISIC_00291