# Custom detector predictions outside U

Here, we check whether the custom detectors trained on subsets of the Tiny Images dataset behave similarly on a random subset of data outside U.

In [2]:
import pickle
import numpy as np

## Load indices of U

In [18]:
# U: the 500k Tiny Images entries selected by Percy Liang's detector as most
# similar to CIFAR-10.
aux_path = '../data/ti_500K_pseudo_labeled.pickle'

# NOTE: pickle.load can execute arbitrary code; only load this file from a trusted source.
with open(aux_path, 'rb') as aux_file:
    aux = pickle.load(aux_file)

# Tiny Images indices of the entries in U.
aux_indices = aux['ti_index']

In [19]:
# Peek at the loaded index array.
aux_indices

array([ 2503931, 71093249, 41345827, ..., 71911843, 62792441, 68810450],
      dtype=int64)

In [5]:
# Uniformly sample distinct random Tiny Images indices that are not in aux_indices.
np.random.seed(4)

# Exclusive upper bound passed to np.random.randint.
# NOTE(review): confirm 79302016 is the number of images and not the last valid index,
# since `high` itself is never drawn.
TI_INDEX_UPPER_BOUND = 79302016
NUM_NOT_AUX_SAMPLES = 250000

# A set gives O(1) membership tests; `idx in <ndarray>` scans the whole array on
# every draw. Accepted draws are added to the same set so that no index is
# selected twice (with 250k draws from ~79M values, repeats are otherwise expected).
excluded_indices = set(aux_indices.tolist())

not_aux_indices = []
while len(not_aux_indices) < NUM_NOT_AUX_SAMPLES:
    idx = np.random.randint(low=0, high=TI_INDEX_UPPER_BOUND)
    if idx in excluded_indices:
        # Either part of U or already sampled: redraw.
        continue
    excluded_indices.add(idx)
    not_aux_indices.append(idx)

not_aux_indices.sort()

In [6]:
# Confirm how many indices were sampled.
len(not_aux_indices)

250000

In [7]:
# Persist the sampled indices as a pickled Python list.
with open('ti_250k_not_aux_indices.pickle', 'wb') as pickle_out:
    pickle.dump(not_aux_indices, pickle_out)

In [9]:
# Also write the sampled indices as plain text, one index per line
# (no trailing newline after the last one).
index_lines = map(str, not_aux_indices)
with open('ti_250k_not_aux_indices.txt', 'w') as text_out:
    text_out.write('\n'.join(index_lines))

## Make the custom detector predictions on these images

In [21]:
# Rebind to a sorted copy instead of sorting in place: aux_indices is the same
# array object as aux['ti_index'], so an in-place sort would also reorder the
# array inside `aux` and could misalign it with any parallel arrays stored there.
aux_indices = np.sort(aux_indices)

In [23]:
# Number of indices in U.
len(aux_indices)

500000

In [27]:
# First five entries of the index array (the smallest ones once it is sorted).
aux_indices[:5]

array([ 70,  71, 112, 176, 178], dtype=int64)

In [31]:
# Collect the first 250000 indices in [1000000, 1350000) that are not in U.
# Membership is tested against a set: `i not in <ndarray>` is a linear scan of
# all of aux_indices for every candidate index.
aux_index_set = set(aux_indices.tolist())

out_U = []
for i in range(1000000, 1350000):
    if i in aux_index_set:
        continue
    out_U.append(i)
    if len(out_U) == 250000:
        break
# Prints fewer than 250000 if the scanned range did not contain enough indices outside U.
print(len(out_U))

250000


In [33]:
# Last five selected indices.
out_U[-5:]

[1253282, 1253283, 1253284, 1253285, 1253286]

In [34]:
# First five selected indices.
out_U[:5]

[1000000, 1000001, 1000002, 1000003, 1000004]

In [35]:
# Convert each absolute index into a 1-based position relative to 1000000,
# the start of the scanned range.
out_U_pos = [(idx - 1000000) + 1 for idx in out_U]

In [36]:
# First five 1-based positions.
out_U_pos[:5]

[1, 2, 3, 4, 5]

In [37]:
# Last five 1-based positions.
out_U_pos[-5:]

[253283, 253284, 253285, 253286, 253287]

In [41]:
# Collapse the increasing positions in out_U_pos into inclusive (first, last)
# runs of consecutive values.
chunks = []
if out_U_pos:  # guard: an empty list has no first/last element
    # Start from the actual first position rather than assuming it is 1, so the
    # first chunk is still correct when the first scanned index belongs to U.
    start = out_U_pos[0]
    for prev_pos, pos in zip(out_U_pos, out_U_pos[1:]):
        if pos != prev_pos + 1:
            # Gap found: close the current run and open a new one.
            chunks.append((start, prev_pos))
            start = pos
    chunks.append((start, out_U_pos[-1]))

print(chunks)

[(1, 2134), (2136, 2525), (2527, 2646), (2648, 2722), (2724, 2856), (2858, 2904), (2906, 3269), (3271, 3380), (3382, 3463), (3465, 3503), (3505, 3535), (3537, 3542), (3544, 3755), (3757, 4492), (4494, 4512), (4514, 5260), (5262, 5414), (5416, 5584), (5586, 5604), (5606, 5658), (5660, 5742), (5744, 5929), (5931, 6197), (6199, 6348), (6350, 6410), (6412, 7274), (7276, 7612), (7614, 7930), (7932, 7936), (7938, 7940), (7942, 7945), (7948, 7953), (7955, 7959), (7961, 7964), (7966, 8019), (8021, 8130), (8132, 8154), (8156, 8219), (8221, 8244), (8246, 8305), (8307, 8349), (8351, 8369), (8371, 8410), (8412, 8412), (8414, 8470), (8472, 8497), (8499, 8523), (8525, 8534), (8536, 8573), (8575, 8643), (8645, 8656), (8658, 8703), (8705, 8728), (8730, 8730), (8732, 8737), (8739, 8777), (8779, 8797), (8799, 8816), (8818, 8878), (8880, 8939), (8942, 8949), (8951, 8983), (8985, 9114), (9116, 9123), (9125, 9131), (9133, 9137), (9139, 9173), (9175, 9179), (9181, 9233), (9235, 9287), (9289, 9348), (9350, 9

In [42]:
# Number of contiguous chunks found.
print(len(chunks))

3017


In [43]:
# Sanity check: the inclusive chunk lengths summed together give the total
# number of positions covered by the chunks.
count = sum(last - first + 1 for first, last in chunks)
print(count)

250000
