# **Generating Noisy-Label Image(nette/woof) Datasets**
Tanishq Abraham, 12/28/2020

# imports

In [1]:
from fastai.vision.all import *
# Seed the random number generators up front so the noisy labels generated
# below can be repeated.
# NOTE(review): generate_noisy_labels draws from np.random -- confirm that
# fastai's set_seed also seeds NumPy.
set_seed(42,reproducible=True)

# helper functions

In [2]:
def get_labels(files):
    """Return the label of every entry in `files`, in input order.

    Each label is whatever fastai's `parent_label` returns for the file,
    i.e. the name of the directory that directly contains it.
    """
    return [parent_label(path) for path in files]

In [3]:
def generate_noisy_labels(labels,unique_labels,pct_noise):
    """Return a copy of `labels` in which a random fraction has been replaced.

    A random permutation of the positions is drawn with `np.random`. Every
    position whose rank `i` in that permutation satisfies
    `i < pct_noise * len(labels)` gets a new label: entries of
    `unique_labels` are drawn uniformly at random until one differs from the
    current label. `labels` itself is not modified.

    Args:
        labels: sequence of labels; must provide `.copy()` (e.g. a list).
        unique_labels: indexable sequence of candidate replacement labels.
        pct_noise: fraction of labels to corrupt, e.g. 0.05 for 5%.

    Returns:
        `(noisy_labels, noisy_idxs)`: the corrupted copy and the positions
        that were changed, in the order they were processed.

    Raises:
        ValueError: if a label has to be corrupted but `unique_labels`
            contains no different label to swap in.
    """
    noisy_labels = labels.copy()
    num_labels = len(labels)
    num_classes = len(unique_labels)
    noisy_idxs = []
    indices = np.random.permutation(num_labels)
    for i, idx in enumerate(indices):
        # Only the leading part of the permutation is corrupted; once the
        # quota is reached there is nothing left to do.
        if i >= pct_noise * num_labels:
            break
        before_label = noisy_labels[idx]
        # Without at least one alternative the redraw loop below would never
        # terminate, so fail loudly instead of hanging.
        if not any(candidate != before_label for candidate in unique_labels):
            raise ValueError(
                f'cannot corrupt label {before_label!r}: unique_labels '
                'contains no different label'
            )
        noisy_idxs.append(idx)
        # Redraw until the label actually changes (same sequence of
        # np.random calls as a plain rejection loop, so seeded runs match).
        new_label = before_label
        while new_label == before_label:
            new_label = unique_labels[np.random.randint(num_classes)]
        noisy_labels[idx] = new_label
    return noisy_labels, noisy_idxs

In [4]:
def get_imagenette_relative_path(files):
    """Reduce every path in `files` to its last three '/'-separated parts.

    Each entry is converted with `str`, split on '/', and its final three
    components are re-joined with `os.path.join`; for a file stored as
    `<root>/train/<class>/<name>` that gives `train/<class>/<name>`.
    Entries with fewer than three components keep all of them.
    """
    return [os.path.join(*str(path).split('/')[-3:]) for path in files]

# load imagenette data

In [5]:
# Fetch the Imagenette data (URLs.IMAGENETTE_320); `source` is used below as
# the dataset root (`source/'train'`, `source/'val'`).
source = untar_data(URLs.IMAGENETTE_320)

In [6]:
# Collect the image files under the train/ folder (fastai `get_image_files`).
train_files = get_image_files(source/'train')

In [7]:
# True label of every training image, in the same order as train_files.
labels = get_labels(train_files)
# Sort the class names: iterating a set of strings has no stable order across
# interpreter runs (string hashing is randomised), and generate_noisy_labels
# indexes into this list with random integers, so an unsorted list would make
# the generated noise irreproducible even with a fixed seed.
unique_labels = sorted(set(labels))

# create noisy labels for imagenette

In [8]:
# Corrupt 1% of the training labels; noisy_idxs_1 records which positions
# were changed.
noisy_labels_1, noisy_idxs_1 = generate_noisy_labels(labels, unique_labels, 0.01)

In [9]:
# Sanity check: the realised noise rate should be close to the requested 1%.
print(f'percentage noise: {100*len(noisy_idxs_1)/len(noisy_labels_1)}%')

percentage noise: 1.0032738409546942%


In [10]:
# Spot-check one corrupted sample: prints its noisy label followed by its
# original label.
# NOTE: this draws from np.random, so it advances the RNG state that the
# generate_noisy_labels calls in the following cell start from.
example_idx = np.random.randint(len(noisy_idxs_1))
print(noisy_labels_1[noisy_idxs_1[example_idx]], labels[noisy_idxs_1[example_idx]])

n03394916 n03000684


In [11]:
# Same procedure at 5%, 25% and 50% noise. Every call starts from the clean
# `labels` and draws its own permutation, so the corrupted subsets of the
# different noise levels are chosen independently of each other.
noisy_labels_5, noisy_idxs_5 = generate_noisy_labels(labels, unique_labels, 0.05)
noisy_labels_25, noisy_idxs_25 = generate_noisy_labels(labels, unique_labels, 0.25)
noisy_labels_50, noisy_idxs_50 = generate_noisy_labels(labels, unique_labels, 0.50)

In [12]:
# Keep only the last three path components (split/class/file name) of every
# training file for the `path` column.
_files = get_imagenette_relative_path(train_files)

In [13]:
# One row per training image: its relative path, the label at each noise
# level, and is_valid=False to mark it as a training row.
train_df = pd.DataFrame({'path': _files, 
              'noisy_labels_1': noisy_labels_1, 
              'noisy_labels_5': noisy_labels_5, 
              'noisy_labels_25': noisy_labels_25,
              'noisy_labels_50': noisy_labels_50,
              'is_valid': [False]*len(_files)
             })

In [14]:
# Collect the image files under the val/ folder.
val_files = get_image_files(source/'val')

In [15]:
# Labels and relative paths of the validation images.
# NOTE: this rebinds `labels` and `_files`; the training values are replaced.
labels = get_labels(val_files)
_files = get_imagenette_relative_path(val_files)

In [16]:
# Validation rows get no noise: every noisy_labels_* column holds the
# unmodified label, so val_df has the same columns as train_df.
val_df = pd.DataFrame({'path': _files, 
              'noisy_labels_1': labels, 
              'noisy_labels_5': labels, 
              'noisy_labels_25': labels,
              'noisy_labels_50': labels,
              'is_valid': [True]*len(_files)
             })

In [17]:
# Stack the training and validation rows into one table. ignore_index=True
# gives the combined frame a fresh 0..n-1 index; without it the validation
# rows would repeat index values already used by the training rows.
df = pd.concat([train_df,val_df], ignore_index=True)

In [18]:
# Preview the first rows of the combined table.
df.head()

Unnamed: 0,path,noisy_labels_1,noisy_labels_5,noisy_labels_25,noisy_labels_50,is_valid
0,train/n02979186/n02979186_9036.JPEG,n02979186,n02979186,n02979186,n02979186,False
1,train/n02979186/n02979186_11957.JPEG,n02979186,n02979186,n02979186,n03000684,False
2,train/n02979186/n02979186_9715.JPEG,n02979186,n02979186,n03417042,n03000684,False
3,train/n02979186/n02979186_21736.JPEG,n02979186,n02979186,n02979186,n03417042,False
4,train/n02979186/ILSVRC2012_val_00046953.JPEG,n02979186,n02979186,n02979186,n03394916,False


In [19]:
# Write the table to noisy_imagenette.csv; index=False leaves the row index
# out of the file.
df.to_csv('noisy_imagenette.csv', index=False)

# load imagewoof data

In [20]:
# Fetch the Imagewoof data (URLs.IMAGEWOOF_320). This rebinds `source`, so
# everything below works on Imagewoof.
source = untar_data(URLs.IMAGEWOOF_320)

In [21]:
# Collect the image files under the train/ folder (fastai `get_image_files`).
train_files = get_image_files(source/'train')

In [22]:
# True label of every training image, in the same order as train_files.
labels = get_labels(train_files)
# Sort the class names: iterating a set of strings has no stable order across
# interpreter runs (string hashing is randomised), and generate_noisy_labels
# indexes into this list with random integers, so an unsorted list would make
# the generated noise irreproducible even with a fixed seed.
unique_labels = sorted(set(labels))

# create noisy labels for imagewoof

In [23]:
# Corrupt 1% of the training labels; noisy_idxs_1 records which positions
# were changed.
noisy_labels_1, noisy_idxs_1 = generate_noisy_labels(labels, unique_labels, 0.01)

In [24]:
# Sanity check: the realised noise rate should be close to the requested 1%.
print(f'percentage noise: {100*len(noisy_idxs_1)/len(noisy_labels_1)}%')

percentage noise: 1.0083102493074791%


In [25]:
# Spot-check one corrupted sample: prints its noisy label followed by its
# original label.
# NOTE: this draws from np.random, so it advances the RNG state that the
# generate_noisy_labels calls in the following cell start from.
example_idx = np.random.randint(len(noisy_idxs_1))
print(noisy_labels_1[noisy_idxs_1[example_idx]], labels[noisy_idxs_1[example_idx]])

n02089973 n02087394


In [26]:
# Same procedure at 5%, 25% and 50% noise. Every call starts from the clean
# `labels` and draws its own permutation, so the corrupted subsets of the
# different noise levels are chosen independently of each other.
noisy_labels_5, noisy_idxs_5 = generate_noisy_labels(labels, unique_labels, 0.05)
noisy_labels_25, noisy_idxs_25 = generate_noisy_labels(labels, unique_labels, 0.25)
noisy_labels_50, noisy_idxs_50 = generate_noisy_labels(labels, unique_labels, 0.50)

In [27]:
# Despite its name, the helper only keeps the last three path components
# (split/class/file name), so it is reused unchanged for Imagewoof files.
_files = get_imagenette_relative_path(train_files)

In [28]:
# One row per training image: its relative path, the label at each noise
# level, and is_valid=False to mark it as a training row.
train_df = pd.DataFrame({'path': _files, 
              'noisy_labels_1': noisy_labels_1, 
              'noisy_labels_5': noisy_labels_5, 
              'noisy_labels_25': noisy_labels_25,
              'noisy_labels_50': noisy_labels_50,
              'is_valid': [False]*len(_files)
             })

In [29]:
# Collect the image files under the val/ folder.
val_files = get_image_files(source/'val')

In [30]:
# Labels and relative paths of the validation images.
# NOTE: this rebinds `labels` and `_files`; the training values are replaced.
labels = get_labels(val_files)
_files = get_imagenette_relative_path(val_files)

In [31]:
# Validation rows get no noise: every noisy_labels_* column holds the
# unmodified label, so val_df has the same columns as train_df.
val_df = pd.DataFrame({'path': _files, 
              'noisy_labels_1': labels, 
              'noisy_labels_5': labels, 
              'noisy_labels_25': labels,
              'noisy_labels_50': labels,
              'is_valid': [True]*len(_files)
             })

In [32]:
# Stack the training and validation rows into one table. ignore_index=True
# gives the combined frame a fresh 0..n-1 index; without it the validation
# rows would repeat index values already used by the training rows.
df = pd.concat([train_df,val_df], ignore_index=True)

In [33]:
# Preview the first rows of the combined table.
df.head()

Unnamed: 0,path,noisy_labels_1,noisy_labels_5,noisy_labels_25,noisy_labels_50,is_valid
0,train/n02115641/n02115641_3995.JPEG,n02115641,n02115641,n02115641,n02115641,False
1,train/n02115641/n02115641_843.JPEG,n02115641,n02105641,n02115641,n02088364,False
2,train/n02115641/n02115641_2953.JPEG,n02115641,n02115641,n02111889,n02099601,False
3,train/n02115641/n02115641_6458.JPEG,n02115641,n02115641,n02093754,n02115641,False
4,train/n02115641/n02115641_19414.JPEG,n02115641,n02115641,n02115641,n02088364,False


In [34]:
# Write the table to noisy_imagewoof.csv; index=False leaves the row index
# out of the file.
df.to_csv('noisy_imagewoof.csv', index=False)