In [24]:
from nltk.corpus import wordnet as wn
import nltk
import csv
import time
import random

### 1. Download wordnet and files with class names of ImageNet-(2)1k 

In [5]:
# Fetch the WordNet corpus that `wn` reads from.
# The return value is bound to `_` so it is not echoed as the cell's output.
_ = nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mathe\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Read ids of classes in ImageNet-21k and ImageNet-1k.

In [23]:
# Load the ImageNet-21k class table into two parallel lists:
# column 0 of each row is the synset id, column 1 is the class name.
synset_ids_21k, class_names_21k = [], []
with open('res/classes_in_imagenet_21k.csv', newline='') as table_file:
    records = csv.reader(table_file)
    next(records)  # the first line of the file is skipped, not parsed
    for record in records:
        synset_ids_21k.append(record[0])  # e.g. 'n00004475'
        # Spaces inside the class name become underscores.
        underscored_name = '_'.join(record[1].split(' '))
        class_names_21k.append(underscored_name)  # e.g. 'organism'

In [7]:
# Read synset ids and names of classes in ImageNet-1k.
# Only the first CSV field of each row is used. The parsing below assumes it
# has the form '<synset id>:<class name>' (optionally with a space after the
# colon) -- TODO confirm against the actual file.
synset_ids_1k = []
class_names_1k = []
with open('res/classes_in_imagenet_1k.csv', newline='') as csvfile:
    for row in csv.reader(csvfile):
        if not row:
            continue  # csv.reader yields an empty list for blank lines
        fields = row[0].split(':')
        synset_ids_1k.append(fields[0])
        # Keep only the first comma-separated alias. Whitespace is stripped
        # explicitly rather than dropping the first character unconditionally,
        # which would eat a real letter when no space follows the colon.
        class_name = fields[1].split(',')[0].strip().replace(' ', '_')
        class_names_1k.append(class_name)

### 2. Generate synset_ids for out-of-distribution classes 

In [10]:
def is_far_from_1k(category, class_names_1k, thresh=0.5):
    """Check whether ``category`` is distant from every given reference class.

    Closeness is measured with WordNet path similarity between the first
    synset returned for ``category`` and the first synset returned for each
    reference class name.

    Args:
        category: Class name to test (WordNet lemma form, e.g. with
            underscores instead of spaces).
        class_names_1k: Iterable of reference class names. Empty names and
            names for which WordNet returns no synset are skipped.
        thresh: Largest path similarity that still counts as "far".

    Returns:
        True if no reference class has a path similarity to ``category``
        greater than ``thresh``. False if at least one does, or if
        ``category`` itself has no synset: its distance cannot be verified,
        so it is conservatively rejected instead of raising IndexError.
    """
    candidate_synsets = wn.synsets(category)
    if not candidate_synsets:
        return False
    obj1 = candidate_synsets[0]

    for some_class in class_names_1k:
        if not some_class:
            continue
        reference_synsets = wn.synsets(some_class)
        if not reference_synsets:
            continue  # reference name unknown to WordNet: nothing to compare
        similarity = obj1.path_similarity(reference_synsets[0])
        # NLTK documents that path_similarity returns None when no path
        # connects the two synsets; comparing None with a float would raise
        # TypeError, so treat it as "not similar".
        if similarity is not None and similarity > thresh:
            return False
    return True

Randomly draw synset ids from `synset_ids_21k` that are not in `synset_ids_1k` and whose WordNet path similarity to every ImageNet-1k class is at most 0.2, i.e. they keep a minimum path distance from all classes in ImageNet-1k.

In [12]:
# Pick `nr_ood_classes` distinct ImageNet-21k classes that are not part of
# ImageNet-1k and are far (path similarity <= 0.2) from every 1k class.
# Call random.seed(...) beforehand if the selection must be reproducible.
nr_ood_classes = 200 # int(sys.argv[1])
max_id = len(synset_ids_21k)
known_1k_ids = set(synset_ids_1k)  # set membership instead of scanning the list

ood_synset_ids = []
start = time.time()
# Visit each 21k index at most once, in random order. Compared with repeated
# random.randint(0, max_id) draws this can never index one past the end
# (randint includes its upper bound), never selects the same class twice, and
# terminates even when fewer than nr_ood_classes classes qualify.
for random_idx in random.sample(range(max_id), max_id):
    if len(ood_synset_ids) >= nr_ood_classes:
        break
    random_synset_id = synset_ids_21k[random_idx]
    random_synset_name = class_names_21k[random_idx]
    if random_synset_id not in known_1k_ids and is_far_from_1k(random_synset_name, class_names_1k, thresh=0.2):
        ood_synset_ids.append(random_synset_id)
end = time.time()

# Same layout as before: every id is followed by a single space.
ood_synset_ids_str = ''.join(synset_id + ' ' for synset_id in ood_synset_ids)

if len(ood_synset_ids) < nr_ood_classes:
    print('Only {} of the requested {} OOD classes satisfy the criteria.'.format(len(ood_synset_ids), nr_ood_classes))
print('Done generating {} OOD classes after {:.2f} seconds!'.format(len(ood_synset_ids), end-start))

Done generating 200 OOD classes after 82.37 seconds!


### 3. Generate in-distribution (ID) synset_ids 

Generate random synset_ids from ImageNet-1k. This will be the in-distribution data used.

In [13]:
# Pick `nr_id_classes` distinct ImageNet-1k classes as in-distribution data.
# Call random.seed(...) beforehand if the selection must be reproducible.
nr_id_classes = 300 #int(sys.argv[2])
max_id = len(synset_ids_1k)

# random.sample draws without replacement, so no class is selected twice and
# no index can fall one past the end of the list (random.randint(0, max_id)
# includes max_id). The count is capped at the number of available classes.
id_synset_ids = random.sample(synset_ids_1k, min(nr_id_classes, max_id))

# Same layout as before: every id is followed by a single space.
id_synset_ids_str = ''.join(synset_id + ' ' for synset_id in id_synset_ids)

In [2]:
# Demonstrates WordNet path similarity on three word pairs, always comparing
# the first synset returned for each word.
for left_word, right_word in (('ambulance', 'dog'), ('cat', 'dog'), ('cat', 'human')):
    left_synset = wn.synsets(left_word)[0]
    right_synset = wn.synsets(right_word)[0]
    score = left_synset.path_similarity(right_synset)
    print('The similarity between {} and {} is: {}'.format(left_word, right_word, score))

The similarity between ambulance and dog is: 0.07142857142857142
The similarity between cat and dog is: 0.2
The similarity between cat and human is: 0.14285714285714285


In [14]:
# Display the space-separated OOD synset ids as the cell output.
ood_synset_ids_str

'n10690421 n02146201 n10185483 n03708962 n02563182 n12031388 n10696101 n13092240 n07853560 n02376542 n07852532 n02848921 n10672540 n09697401 n11722036 n03543511 n02390938 n00443231 n10387836 n03255167 n13205058 n02968333 n09830400 n10328123 n04450243 n08558770 n09445289 n00470966 n07768068 n02787120 n03982642 n01712008 n04282494 n02582721 n02765028 n01805321 n04951186 n09754217 n10382302 n02680638 n03772584 n04290259 n13158605 n01751215 n12120347 n02645538 n10265891 n03901338 n09894143 n12882158 n07574176 n03677115 n03854722 n02068541 n10015792 n01319467 n12626674 n13193856 n02814116 n03931980 n12528768 n07844786 n13154494 n12957803 n07585906 n10328328 n12166929 n10252222 n12066018 n10358124 n03020034 n03992703 n13031193 n06266710 n01324610 n10700201 n13083023 n10577710 n10366966 n12067029 n02473857 n00445226 n04553389 n10274173 n10655986 n12522678 n12674895 n10252354 n12054195 n13132486 n13148384 n07617611 n03095965 n10277638 n13906767 n12766869 n12727518 n04035912 n09466678 n03694639