Notebook for generating new images for upsampling:

- Minority classes augmented using standard augmentations
- Minority classes augmented using AugMix
- Minority classes augmented using SMOTE

Code for GAN-oversampling minority classes is located in PlaqueGAN directory.

In [14]:
import torch
from torchvision import transforms
import torchvision.utils as vutils

from PIL import Image
import pandas as pd

import os

from tqdm import tqdm

import numpy as np

# Upsampling via standard augmentations

In [2]:
# first need to get the csv data for simple upsampled data - to know which items have been duplicated
csv_path = './CSVs/train_simple_up.csv'
df_up = pd.read_csv(csv_path)
df_up.head()

Unnamed: 0,id,imagename,cored,diffuse,CAA,negative,flag,notsure
0,10748,NA4757-02_AB/NA4757-02_AB_18_25_61.jpg,1.0,0.0,0.0,0.0,0.0,0.0
1,29503,NA4918-02_AB17-24/NA4918-02_AB17-24_9_18_12.jpg,0.0,2.832462,0.0,0.0,0.0,0.0
2,42524,NA4885-02_AB17-24/NA4885-02_AB17-24_4_23_50.jpg,1.0,1.0,0.0,0.0,1.0,0.0
3,34432,NA4749-02_AB/NA4749-02_AB_17_12_50.jpg,0.0,3.77027,0.0,0.0,0.0,0.0
4,4137,NA4751-02_AB/NA4751-02_AB_19_6_34.jpg,0.0,1.0,2.0,0.0,2.0,0.0


In [3]:
df_augment = df_up.loc[df_up.duplicated()==True,:]
df_augment = df_augment.sort_values(by='imagename', ignore_index=True).reset_index()
df_augment.head()

Unnamed: 0,index,id,imagename,cored,diffuse,CAA,negative,flag,notsure
0,0,14215,NA4009-02_AB/NA4009-02_AB_10_24_0.jpg,1.0,0.0,0.0,0.0,0.0,0.0
1,1,14215,NA4009-02_AB/NA4009-02_AB_10_24_0.jpg,1.0,0.0,0.0,0.0,0.0,0.0
2,2,14215,NA4009-02_AB/NA4009-02_AB_10_24_0.jpg,1.0,0.0,0.0,0.0,0.0,0.0
3,3,14215,NA4009-02_AB/NA4009-02_AB_10_24_0.jpg,1.0,0.0,0.0,0.0,0.0,0.0
4,4,14215,NA4009-02_AB/NA4009-02_AB_10_24_0.jpg,1.0,0.0,0.0,0.0,0.0,0.0


In [4]:
# create name of images to save 
num_imgs = df_augment.shape[0]
img_names_aug = [f'std_aug/{num}.jpg' for num in range(num_imgs)]

In [5]:
df_augment['imagename_new'] = img_names_aug

In [6]:
df_augment

Unnamed: 0,index,id,imagename,cored,diffuse,CAA,negative,flag,notsure,imagename_new
0,0,14215,NA4009-02_AB/NA4009-02_AB_10_24_0.jpg,1.0,0.000000,0.0,0.0,0.0,0.0,std_aug/0.jpg
1,1,14215,NA4009-02_AB/NA4009-02_AB_10_24_0.jpg,1.0,0.000000,0.0,0.0,0.0,0.0,std_aug/1.jpg
2,2,14215,NA4009-02_AB/NA4009-02_AB_10_24_0.jpg,1.0,0.000000,0.0,0.0,0.0,0.0,std_aug/2.jpg
3,3,14215,NA4009-02_AB/NA4009-02_AB_10_24_0.jpg,1.0,0.000000,0.0,0.0,0.0,0.0,std_aug/3.jpg
4,4,14215,NA4009-02_AB/NA4009-02_AB_10_24_0.jpg,1.0,0.000000,0.0,0.0,0.0,0.0,std_aug/4.jpg
...,...,...,...,...,...,...,...,...,...,...
93864,93864,14846,NA_4888_02_AB17-24/NA_4888_02_AB17-24_8_26_15.jpg,1.0,0.743902,0.0,0.0,0.0,0.0,std_aug/93864.jpg
93865,93865,14846,NA_4888_02_AB17-24/NA_4888_02_AB17-24_8_26_15.jpg,1.0,0.743902,0.0,0.0,0.0,0.0,std_aug/93865.jpg
93866,93866,14846,NA_4888_02_AB17-24/NA_4888_02_AB17-24_8_26_15.jpg,1.0,0.743902,0.0,0.0,0.0,0.0,std_aug/93866.jpg
93867,93867,14846,NA_4888_02_AB17-24/NA_4888_02_AB17-24_8_26_15.jpg,1.0,0.743902,0.0,0.0,0.0,0.0,std_aug/93867.jpg


In [84]:
#transforms to apply:
trans_standard = transforms.Compose([
                            transforms.RandomHorizontalFlip(),
                            transforms.RandomVerticalFlip(),
                            transforms.RandomResizedCrop(256, scale=(0.8,1)),
                            transforms.RandomApply([transforms.RandomRotation((90, 90))], p=0.5),
                            transforms.ColorJitter(brightness=0.1, contrast=0.2,saturation=0.2, hue=0.02)])
#                             transforms.RandomAffine(0, translate=(0.05,0.05), scale=(0.9,1.1), shear=10)])

In [90]:
img_path = '../Plaquebox/plaquebox-paper-master/data/tiles/train_and_val/'

for i, single_image_name in tqdm(enumerate(df_augment['imagename']), total=num_imgs):
#     print(single_image_name)
    img = Image.open(img_path + single_image_name)
    img = trans_standard(img)
    img.save(os.path.join(img_path, img_names_aug[i]), quality=95)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 93869/93869 [22:13<00:00, 70.38it/s]


In [7]:
df_augment.reset_index(drop=True, inplace=True)

In [13]:
df_augment.drop(labels=['index', 'imagename'], axis=1, inplace=True)

In [22]:
df_augment.columns=['id', 'cored', 'diffuse', 'CAA', 'negative', 'flag', 'notsure', 'imagename']

In [23]:
df_augment= df_augment[['id','imagename','cored','diffuse','CAA','negative','flag','notsure']]

In [24]:
df_augment

Unnamed: 0,id,imagename,cored,diffuse,CAA,negative,flag,notsure
0,14215,std_aug/0.jpg,1.0,0.000000,0.0,0.0,0.0,0.0
1,14215,std_aug/1.jpg,1.0,0.000000,0.0,0.0,0.0,0.0
2,14215,std_aug/2.jpg,1.0,0.000000,0.0,0.0,0.0,0.0
3,14215,std_aug/3.jpg,1.0,0.000000,0.0,0.0,0.0,0.0
4,14215,std_aug/4.jpg,1.0,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
93864,14846,std_aug/93864.jpg,1.0,0.743902,0.0,0.0,0.0,0.0
93865,14846,std_aug/93865.jpg,1.0,0.743902,0.0,0.0,0.0,0.0
93866,14846,std_aug/93866.jpg,1.0,0.743902,0.0,0.0,0.0,0.0
93867,14846,std_aug/93867.jpg,1.0,0.743902,0.0,0.0,0.0,0.0


In [25]:
df_augment.to_csv('./CSVs/train_stdaug_up.csv')

# AugMix

In [4]:
from augmix import augment_and_mix, augmentations

augmentations.IMAGE_SIZE=256

In [5]:
# create name of images to save 
num_imgs = df_augment.shape[0]
img_names_aug = [f'augmix/{num}.jpg' for num in range(num_imgs)]
df_augment['imagename_new'] = img_names_aug

In [94]:
trans = transforms.Compose([transforms.RandomHorizontalFlip(),
                            transforms.RandomVerticalFlip(),
                            transforms.RandomResizedCrop(256, scale=(0.8,1)),
                            transforms.RandomApply([transforms.RandomRotation((90, 90))], p=0.5)])
preprocess = transforms.Compose([transforms.ToTensor()])

In [95]:
args = {'all_ops': True, 'mixture_width': 3, 'mixture_depth':-1, 'aug_severity': 3}

In [96]:
def aug(image, preprocess, args):
    """Perform AugMix augmentations and compute mixture.
    Args:
    image: PIL.Image input image
    preprocess: Preprocessing function which should return a torch tensor.
    Returns:
    mixed: Augmented and mixed image.
    """
    aug_list = augmentations.augmentations
    if args['all_ops']:
        aug_list = augmentations.augmentations_all

    ws = np.float32(np.random.dirichlet([1] * args['mixture_width']))
    m = np.float32(np.random.beta(1, 1))

    mix = torch.zeros_like(preprocess(image))
    for i in range(args['mixture_width']):
        image_aug = image.copy()
        depth = args['mixture_depth'] if args['mixture_depth'] > 0 else np.random.randint(
            1, 4)
        for _ in range(depth):
            op = np.random.choice(aug_list)
            image_aug = op(image_aug, args['aug_severity'])
        # Preprocessing commutes since all coefficients are convex
        mix += ws[i] * preprocess(image_aug)

    mixed = (1 - m) * preprocess(image) + m * mix
    return mixed

In [139]:
args = {'all_ops': True, 'mixture_width': 3, 'mixture_depth':-1, 'aug_severity': 3}
img_path = '../Plaquebox/plaquebox-paper-master/data/tiles/train_and_val/'

trans = transforms.Compose([transforms.RandomHorizontalFlip(),
                            transforms.RandomVerticalFlip(),
                            transforms.RandomResizedCrop(256, scale=(0.8,1)),
                            transforms.RandomApply([transforms.RandomRotation((90, 90))], p=0.5)])
preprocess = transforms.Compose([transforms.ToTensor()])

for i, single_image_name in tqdm(enumerate(df_augment['imagename']), total=num_imgs):
#     print(single_image_name)
    img = Image.open(img_path + single_image_name)
    img = aug(trans(img), preprocess, args)
    img = transforms.ToPILImage()(img)
    img.save(os.path.join(img_path, img_names_aug[i]), quality=95)

100%|████████████████████████████████████████████████████████████████████████████| 93869/93869 [36:17<00:00, 43.11it/s]


In [140]:
df_augment.reset_index(drop=True, inplace=True)
df_augment.drop(labels=['index', 'imagename'], axis=1, inplace=True)
df_augment.columns=['id', 'cored', 'diffuse', 'CAA', 'negative', 'flag', 'notsure', 'imagename']
df_augment= df_augment[['id','imagename','cored','diffuse','CAA','negative','flag','notsure']]
df_augment.to_csv('./CSVs/train_augmix_up.csv',index=False)

# SMOTE

In [4]:
# create name of images to save 
num_imgs = df_augment.shape[0]
img_names_aug = [f'smote/{num}.jpg' for num in range(num_imgs)]

In [5]:
from imblearn.over_sampling import SMOTE

In [6]:
# oversample each of the classes
# first need to get the csv data for the non-upsampled training data
csv_path = './CSVs/train.csv'
labels_df = pd.read_csv(csv_path)
morphologies = ["cored","diffuse","CAA"]
morphologies_filter = ["cored","CAA",["cored","diffuse"],["CAA","diffuse"], ["cored","CAA"]]

labels_df = labels_df[["imagename","cored","diffuse","CAA"]]

# convert labels to 1s and 0s
labels_df[["cored","diffuse","CAA"]] = labels_df[["cored","diffuse","CAA"]].gt(0.99).astype(int)
labels_df.head()

Unnamed: 0,imagename,cored,diffuse,CAA
0,NA4009-02_AB/neg_NA4009-02_AB_0_10_4.jpg,0,0,0
1,NA4009-02_AB/neg_NA4009-02_AB_0_11_2.jpg,0,0,0
2,NA4009-02_AB/neg_NA4009-02_AB_0_16_2.jpg,0,0,0
3,NA4009-02_AB/neg_NA4009-02_AB_0_25_2.jpg,0,0,0
4,NA4009-02_AB/NA4009-02_AB_10_10_14.jpg,0,1,0


In [7]:
img_path = '../Plaquebox/plaquebox-paper-master/data/tiles/train_and_val/'
overall_array = []
for morphs in morphologies_filter:
    if type(morphs)==list:
        morph_name = '-'.join(morphs)
    else:
        morph_name = morphs
        morphs = [morphs]
    query_str = ''
    
    for morph in morphologies:
        if morph in morphs:
            query_str = query_str + morph + '==1 & '
        else:
            query_str = query_str + morph + '==0 & '

    query_str = query_str[:-3]
    temp_df = labels_df.query(query_str)
    temp_df.reset_index(drop=True, inplace=True)
    temp_array = np.ndarray((temp_df.shape[0], 256 * 256 * 3), dtype='float16')
    for i, single_image_name in tqdm(enumerate(temp_df['imagename']), total=temp_df.shape[0]):
        img = Image.open(img_path + single_image_name)
        temp_array[i] = np.array(img).reshape(-1)/255
    overall_array.append(temp_array)

100%|█████████████████████████████████████████████████████████████████████████████| 1624/1624 [00:06<00:00, 264.65it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 1855/1855 [00:07<00:00, 233.92it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 509/509 [00:02<00:00, 241.67it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 364/364 [00:02<00:00, 174.57it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 254.40it/s]


In [56]:
overall_array = np.concatenate(overall_array, axis=0)

In [9]:
overall_y = [0*np.ones(1624,dtype='int'), 1*np.ones(1855,dtype='int'), 2*np.ones(509,dtype='int'), 3*np.ones(364,dtype='int'), 4*np.ones(7,dtype='int')]

In [144]:
def sample_smote(overall_array, overall_y, save_dir = '../Plaquebox/plaquebox-paper-master/data/tiles/train_and_val'):
    # classes are:
    # cored: 0
    # caa:   1
    # cored + diffuse: 2
    # caa + diffuse: 3
    # cored + caa: 4
    
    # number of each class to generate
    target_gen = np.array([35728, 38955, 11198, 7644, 301])
    
    # doing SMOTE in mini batches to speed up and keep memory low
    sample_fracs = np.array([0.3, 0.3, 1, 1, 1])
    
    num_gen = np.array([0, 0, 0, 0, 0])
    morphs = [0,1,2,3,4,5]
    morphs_map = {0: [1, 0, 0], 1: [0, 0, 1], 2: [1, 1, 0], 3: [0, 1, 1], 4: [1, 0, 1]}
    
    save_counter = 0
    save_paths = []
    labels_stacked = []
    while np.any(num_gen < target_gen):
        print(f'Currently generated: {num_gen}')
        
        classes_sample = np.where(num_gen<target_gen)[0]
        classes_sample = classes_sample.astype(np.int64)
        if classes_sample.shape[0] == 1:
            classes_sample = np.insert(classes_sample,1,4)
        sampled_im_array = []
        sampled_y_array = []
        num_selects = []
        morphs_sample = []
        num_orig = 0
        for class2sample in classes_sample:
            im_array = overall_array[class2sample]
            y_array = overall_y[class2sample]
            sample_frac = sample_fracs[class2sample]
            morph = morphs[class2sample]
            morphs_sample.append(morph)
            num_select = int(np.ceil(sample_frac*im_array.shape[0]))
            num_orig += num_select
            num_selects.append(num_select*2)
            idx_select = np.random.choice(im_array.shape[0], num_select, replace=False)
            sampled_im_array.append(im_array[idx_select])
            sampled_y_array.append(y_array[idx_select])
        
        strat = {k:v for k,v in zip(morphs_sample,num_selects)}
        sm = SMOTE(sampling_strategy=strat)
        
        # generate samples
        sampled_im_array = np.concatenate(sampled_im_array, axis=0)
        sampled_y_array = np.concatenate(sampled_y_array, axis=0)
        X_res, y_res = sm.fit_resample(sampled_im_array, sampled_y_array)

        # get rid of original training data!
        X_res = X_res[num_orig:]
        y_res = y_res[num_orig:]

        # reshape and save
        for img, label in zip(X_res, y_res):
            # reshape and save image
            if num_gen[label]< target_gen[label]:
                pil_img = Image.fromarray((img.reshape(256,256,3)*255).astype(np.uint8))
                img_name = 'smote/' + f'{save_counter}.jpg'
                pil_img.save(os.path.join(save_dir, img_name), quality=95)
                num_gen[label]+=1
                labels_stacked.append(morphs_map[label])
                save_paths.append(img_name)
                save_counter +=1
            else:
                continue
    
    return save_paths, labels_stacked

In [145]:
smote_paths, labels_stacked = sample_smote(overall_array, overall_y, save_dir = '../Plaquebox/plaquebox-paper-master/data/tiles/train_and_val')

Currently generated: [0 0 0 0 0]




Currently generated: [488 557 509 364   7]




Currently generated: [ 976 1114 1018  728   14]




Currently generated: [1464 1671 1527 1092   21]




Currently generated: [1952 2228 2036 1456   28]




Currently generated: [2440 2785 2545 1820   35]




Currently generated: [2928 3342 3054 2184   42]




Currently generated: [3416 3899 3563 2548   49]




Currently generated: [3904 4456 4072 2912   56]




Currently generated: [4392 5013 4581 3276   63]




Currently generated: [4880 5570 5090 3640   70]




Currently generated: [5368 6127 5599 4004   77]




Currently generated: [5856 6684 6108 4368   84]




Currently generated: [6344 7241 6617 4732   91]




Currently generated: [6832 7798 7126 5096   98]




Currently generated: [7320 8355 7635 5460  105]




Currently generated: [7808 8912 8144 5824  112]




Currently generated: [8296 9469 8653 6188  119]




Currently generated: [ 8784 10026  9162  6552   126]




Currently generated: [ 9272 10583  9671  6916   133]




Currently generated: [ 9760 11140 10180  7280   140]




Currently generated: [10248 11697 10689  7644   147]




Currently generated: [10736 12254 11198  7644   154]




Currently generated: [11224 12811 11198  7644   161]




Currently generated: [11712 13368 11198  7644   168]




Currently generated: [12200 13925 11198  7644   175]




Currently generated: [12688 14482 11198  7644   182]




Currently generated: [13176 15039 11198  7644   189]




Currently generated: [13664 15596 11198  7644   196]




Currently generated: [14152 16153 11198  7644   203]




Currently generated: [14640 16710 11198  7644   210]




Currently generated: [15128 17267 11198  7644   217]




Currently generated: [15616 17824 11198  7644   224]




Currently generated: [16104 18381 11198  7644   231]




Currently generated: [16592 18938 11198  7644   238]




Currently generated: [17080 19495 11198  7644   245]




Currently generated: [17568 20052 11198  7644   252]




Currently generated: [18056 20609 11198  7644   259]




Currently generated: [18544 21166 11198  7644   266]




Currently generated: [19032 21723 11198  7644   273]




Currently generated: [19520 22280 11198  7644   280]




Currently generated: [20008 22837 11198  7644   287]




Currently generated: [20496 23394 11198  7644   294]




Currently generated: [20984 23951 11198  7644   301]




Currently generated: [21472 24508 11198  7644   301]




Currently generated: [21960 25065 11198  7644   301]




Currently generated: [22448 25622 11198  7644   301]




Currently generated: [22936 26179 11198  7644   301]




Currently generated: [23424 26736 11198  7644   301]




Currently generated: [23912 27293 11198  7644   301]




Currently generated: [24400 27850 11198  7644   301]




Currently generated: [24888 28407 11198  7644   301]




Currently generated: [25376 28964 11198  7644   301]




Currently generated: [25864 29521 11198  7644   301]




Currently generated: [26352 30078 11198  7644   301]




Currently generated: [26840 30635 11198  7644   301]




Currently generated: [27328 31192 11198  7644   301]




Currently generated: [27816 31749 11198  7644   301]




Currently generated: [28304 32306 11198  7644   301]




Currently generated: [28792 32863 11198  7644   301]




Currently generated: [29280 33420 11198  7644   301]




Currently generated: [29768 33977 11198  7644   301]




Currently generated: [30256 34534 11198  7644   301]




Currently generated: [30744 35091 11198  7644   301]




Currently generated: [31232 35648 11198  7644   301]




Currently generated: [31720 36205 11198  7644   301]




Currently generated: [32208 36762 11198  7644   301]




Currently generated: [32696 37319 11198  7644   301]




Currently generated: [33184 37876 11198  7644   301]




Currently generated: [33672 38433 11198  7644   301]




Currently generated: [34160 38955 11198  7644   301]




Currently generated: [34648 38955 11198  7644   301]




Currently generated: [35136 38955 11198  7644   301]




Currently generated: [35624 38955 11198  7644   301]




In [147]:
smote_paths

['smote/0.jpg',
 'smote/1.jpg',
 'smote/2.jpg',
 'smote/3.jpg',
 'smote/4.jpg',
 'smote/5.jpg',
 'smote/6.jpg',
 'smote/7.jpg',
 'smote/8.jpg',
 'smote/9.jpg',
 'smote/10.jpg',
 'smote/11.jpg',
 'smote/12.jpg',
 'smote/13.jpg',
 'smote/14.jpg',
 'smote/15.jpg',
 'smote/16.jpg',
 'smote/17.jpg',
 'smote/18.jpg',
 'smote/19.jpg',
 'smote/20.jpg',
 'smote/21.jpg',
 'smote/22.jpg',
 'smote/23.jpg',
 'smote/24.jpg',
 'smote/25.jpg',
 'smote/26.jpg',
 'smote/27.jpg',
 'smote/28.jpg',
 'smote/29.jpg',
 'smote/30.jpg',
 'smote/31.jpg',
 'smote/32.jpg',
 'smote/33.jpg',
 'smote/34.jpg',
 'smote/35.jpg',
 'smote/36.jpg',
 'smote/37.jpg',
 'smote/38.jpg',
 'smote/39.jpg',
 'smote/40.jpg',
 'smote/41.jpg',
 'smote/42.jpg',
 'smote/43.jpg',
 'smote/44.jpg',
 'smote/45.jpg',
 'smote/46.jpg',
 'smote/47.jpg',
 'smote/48.jpg',
 'smote/49.jpg',
 'smote/50.jpg',
 'smote/51.jpg',
 'smote/52.jpg',
 'smote/53.jpg',
 'smote/54.jpg',
 'smote/55.jpg',
 'smote/56.jpg',
 'smote/57.jpg',
 'smote/58.jpg',
 'smote

In [150]:
df_out = pd.DataFrame(data=smote_paths,columns=['imagename'])
df_out[['cored','diffuse','CAA']] = labels_stacked
df_out.head()

Unnamed: 0,imagename,cored,diffuse,CAA
0,smote/0.jpg,1,0,0
1,smote/1.jpg,1,0,0
2,smote/2.jpg,1,0,0
3,smote/3.jpg,1,0,0
4,smote/4.jpg,1,0,0


In [151]:
df_out.to_csv('./CSVs/train_smote_up.csv',index=False)