## **Init**

In [1]:
import os
from tqdm.notebook import tqdm
import tensorflow.compat.v1 as tf
import numpy as np
import re
import collections
import sklearn.metrics as sk
from scipy.spatial.distance import cdist

In [2]:
from google.colab import drive
drive.mount('/content/gdrive/')

Mounted at /content/gdrive/


In [3]:
base_path = 'gdrive/My Drive/CS769_Assignments/OOD_NLP'
if not os.path.exists(base_path):
  print("Err : Invalid base path")
else:
  print("Base path is valid!")

Base path is valid!


In [4]:
SAVE = True

## **Load dataset & model**

In [5]:
def load_data(self='./data/r8-train.txt'):
    '''
    :param self: the system location of the data to load
    :return: the text (x) and its label (y)
             the text is a list of words and is not processed
    '''

    # stop words taken from nltk
    stop_words = ['i','me','my','myself','we','our','ours','ourselves','you','your','yours',
                  'yourself','yourselves','he','him','his','himself','she','her','hers','herself',
                  'it','its','itself','they','them','their','theirs','themselves','what','which',
                  'who','whom','this','that','these','those','am','is','are','was','were','be',
                  'been','being','have','has','had','having','do','does','did','doing','a','an',
                  'the','and','but','if','or','because','as','until','while','of','at','by','for',
                  'with','about','against','between','into','through','during','before','after',
                  'above','below','to','from','up','down','in','out','on','off','over','under',
                  'again','further','then','once','here','there','when','where','why','how','all',
                  'any','both','each','few','more','most','other','some','such','no','nor','not',
                  'only','own','same','so','than','too','very','s','t','can','will','just','don',
                  'should','now','d','ll','m','o','re','ve','y','ain','aren','couldn','didn',
                  'doesn','hadn','hasn','haven','isn','ma','mightn','mustn','needn','shan',
                  'shouldn','wasn','weren','won','wouldn']

    x, y = [], []
    with open(self, "r") as f:
        for line in f:
            line = re.sub(r'\W+', ' ', line).strip()
            x.append(line[1:])
            x[-1] = ' '.join(word for word in x[-1].split() if word not in stop_words)
            y.append(line[0])
    return x, np.array(y, dtype=int)

def get_vocab(dataset):
    '''
    :param dataset: the text from load_data

    :return: a _ordered_ dictionary from words to counts
    '''
    vocab = {}

    # create a counter for each word
    for example in dataset:
        example_as_list = example.split()
        for word in example_as_list:
            vocab[word] = 0

    for example in dataset:
        example_as_list = example.split()
        for word in example_as_list:
            vocab[word] += 1
    
    # sort from greatest to least by count
    return collections.OrderedDict(sorted(vocab.items(), key=lambda x: x[1], reverse=True))

def text_to_rank(dataset, _vocab, desired_vocab_size=1000):
    '''
    :param dataset: the text from load_data
    :vocab: a _ordered_ dictionary of vocab words and counts from get_vocab
    :param desired_vocab_size: the desired vocabulary size
    words no longer in vocab become UUUNNNKKK
    :return: the text corpus with words mapped to their vocab rank,
    with all sufficiently infrequent words mapped to UUUNNNKKK; UUUNNNKKK has rank desired_vocab_size
    (the infrequent word cutoff is determined by desired_vocab size)
    '''
    _dataset = dataset[:]     # aliasing safeguard
    vocab_ordered = list(_vocab)
    count_cutoff = _vocab[vocab_ordered[desired_vocab_size-2]] # get word by its rank and map to its count
    
    word_to_rank = {}
    for i in range(len(vocab_ordered)):
        # we add one to make room for any future padding symbol with value 0
        word_to_rank[vocab_ordered[i]] = i
    
    for i in range(len(_dataset)):
        example = _dataset[i]
        example_as_list = example.split()
        for j in range(len(example_as_list)):
            try:
                if _vocab[example_as_list[j]] >= count_cutoff and word_to_rank[example_as_list[j]] < desired_vocab_size:
                    # we need to ensure that other words below the word on the edge of our desired_vocab size
                    # are not also on the count cutoff
                    example_as_list[j] = word_to_rank[example_as_list[j]] 
                else:
                    example_as_list[j] = desired_vocab_size-1  # UUUNNNKKK
            except:
                example_as_list[j] = desired_vocab_size-1  # UUUNNNKKK
        _dataset[i] = example_as_list

    return _dataset

def text_to_matrix(dataset, _vocab, desired_vocab_size=1000):
    sequences = text_to_rank(dataset, _vocab, desired_vocab_size)
    
    mat = np.zeros((len(sequences), desired_vocab_size), dtype=int)
    
    for i, seq in enumerate(sequences):
        for token in seq:
            mat[i][token] = 1
    
    return mat

def get_vocab(dataset):
    '''
    :param dataset: the text from load_data

    :return: a _ordered_ dictionary from words to counts
    '''
    vocab = {}

    # create a counter for each word
    for example in dataset:
        example_as_list = example.split()
        for word in example_as_list:
            vocab[word] = 0

    for example in dataset:
        example_as_list = example.split()
        for word in example_as_list:
            vocab[word] += 1

    # sort from greatest to least by count
    return collections.OrderedDict(sorted(vocab.items(), key=lambda x: x[1], reverse=True))

In [6]:
def partion_data_in_two(dataset, dataset_labels, in_sample_labels, oos_labels):
    '''
    :param dataset: the text from text_to_rank
    :param dataset_labels: dataset labels
    :param in_sample_labels: a list of newsgroups which the network will/did train on
    :param oos_labels: the complement of in_sample_labels; these newsgroups the network has never seen
    :return: the dataset partitioned into in_sample_examples, in_sample_labels,
    oos_examples, and oos_labels in that order
    '''
    _dataset = dataset[:]     # aliasing safeguard
    _dataset_labels = dataset_labels
    
    in_sample_idxs = np.zeros(np.shape(_dataset_labels), dtype=bool)
    ones_vec = np.ones(np.shape(_dataset_labels), dtype=int)
    for label in in_sample_labels:
        in_sample_idxs = np.logical_or(in_sample_idxs, _dataset_labels == label * ones_vec)

    
    return _dataset[in_sample_idxs], _dataset_labels[in_sample_idxs],\
        _dataset[np.logical_not(in_sample_idxs)], _dataset_labels[np.logical_not(in_sample_idxs)]

In [7]:
# our network trains only on a subset of classes, say 6, but class number 7 might still
# be an in-sample label: we need to squish the labels to be in {0,...,5}
def relabel_in_sample_labels(labels):
    labels_as_list = labels.tolist()
    
    set_of_labels = []
    for label in labels_as_list:
        set_of_labels.append(label)
    labels_ordered = sorted(list(set(set_of_labels)))
    
    relabeled = np.zeros(labels.shape, dtype=int)
    for i in range(len(labels_as_list)):
        relabeled[i] = labels_ordered.index(labels_as_list[i])
    
    return relabeled

In [8]:
batch_size = 32
vocab_size = 1000
num_epochs = 5
n_hidden = 512
nclasses_to_exclude = 2  # 0-3

In [9]:
random_classes = np.arange(8)
np.random.shuffle(random_classes)
to_include = list(random_classes[:8-nclasses_to_exclude])
to_exclude = list(random_classes[8-nclasses_to_exclude:])

In [10]:
print('Loading Data')
X_train, Y_train = load_data(os.path.join(base_path, 'Baseline/Categorization/data/r8-train.txt'))
X_test, Y_test = load_data(os.path.join(base_path, 'Baseline/Categorization/data/r8-test.txt'))

vocab = get_vocab(X_train)
X_train = text_to_matrix(X_train, vocab, vocab_size)
X_test = text_to_matrix(X_test, vocab, vocab_size)

# shuffle
indices = np.arange(X_train.shape[0])
np.random.shuffle(indices)
X_train = X_train[indices]
Y_train = Y_train[indices]

indices = np.arange(X_test.shape[0])
np.random.shuffle(indices)
X_test = X_test[indices]
Y_test = Y_test[indices]

# split into train/dev
X_dev = X_train[-500:]
Y_dev = Y_train[-500:]
X_train = X_train[:-500]
Y_train = Y_train[:-500]

in_sample_examples, in_sample_labels, oos_examples, oos_labels =\
partion_data_in_two(X_train, Y_train, to_include, to_exclude)
dev_in_sample_examples, dev_in_sample_labels, dev_oos_examples, dev_oos_labels =\
partion_data_in_two(X_dev, Y_dev, to_include, to_exclude)
test_in_sample_examples, test_in_sample_labels, test_oos_examples, dev_oos_labels =\
partion_data_in_two(X_test, Y_test, to_include, to_exclude)

# safely assumes there is an example for each in_sample class in both the training and dev class 
in_sample_labels = relabel_in_sample_labels(in_sample_labels)
dev_in_sample_labels = relabel_in_sample_labels(dev_in_sample_labels)
test_in_sample_labels = relabel_in_sample_labels(test_in_sample_labels)

num_examples = in_sample_labels.shape[0]
num_batches = num_examples//batch_size

print('Data loaded')

Loading Data
Data loaded


In [11]:
graph = tf.Graph()

with graph.as_default():
    x = tf.placeholder(dtype=tf.float32, shape=[None, vocab_size])
    y = tf.placeholder(dtype=tf.int64, shape=[None])
    is_training = tf.placeholder(tf.bool)
    
    # add one to vocab size for the padding symbol

    W_h = tf.Variable(tf.nn.l2_normalize(tf.random_normal([vocab_size, n_hidden]), 0)/tf.sqrt(1 + 0.45))
    b_h = tf.Variable(tf.zeros([n_hidden]))
    
    def gelu_fast(_x):
        return 0.5 * _x * (1 + tf.tanh(tf.sqrt(2 / np.pi) * (_x + 0.044715 * tf.pow(_x, 3))))
    
    h = tf.cond(is_training,
                lambda: tf.nn.dropout(gelu_fast(tf.matmul(x, W_h) + b_h), 0.5),
                lambda: gelu_fast(tf.matmul(x, W_h) + b_h))
    
    W_out = tf.Variable(tf.nn.l2_normalize(tf.random_normal([n_hidden, 8-nclasses_to_exclude]), 0)/tf.sqrt(0.45 + 1))
    b_out = tf.Variable(tf.zeros([8-nclasses_to_exclude]))
    
    logits = tf.matmul(h, W_out) + b_out
    
    loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=y))

    global_step = tf.Variable(0, trainable=False)
    lr = tf.train.exponential_decay(1e-3, global_step, 4*num_batches, 0.1, staircase=True)
    optimizer = tf.train.AdamOptimizer(learning_rate=lr).minimize(loss, global_step=global_step)

    acc = 100*tf.reduce_mean(tf.to_float(tf.equal(tf.argmax(logits, 1), y)))

Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use `tf.cast` instead.


In [12]:
# initialize
sess = tf.InteractiveSession(graph=graph)
tf.initialize_all_variables().run()
# create saver to train model
saver = tf.train.Saver(max_to_keep=1)

print('Initialized')

Instructions for updating:
Use `tf.global_variables_initializer` instead.
Initialized


In [13]:
best_acc = 0

for epoch in range(num_epochs):
    # shuffle data every epoch
    indices = np.arange(num_examples)
    np.random.shuffle(indices)
    in_sample_examples = in_sample_examples[indices]
    in_sample_labels = in_sample_labels[indices]

    for i in range(num_batches):
        offset = i * batch_size

        x_batch = in_sample_examples[offset:offset + batch_size]
        y_batch = in_sample_labels[offset:offset + batch_size]

        _, l, batch_acc = sess.run([optimizer, loss, acc], feed_dict={x: x_batch, y: y_batch, is_training: True})


    curr_dev_acc = sess.run(
        acc, feed_dict={x: dev_in_sample_examples, y: dev_in_sample_labels, is_training: False})
    if best_acc < curr_dev_acc:
        best_acc = curr_dev_acc
        saver.save(sess, os.path.join(base_path, "Baseline/Categorization/data/best_r8_model.ckpt"))

    print('Epoch %d | Minibatch loss %.3f | Minibatch accuracy %.3f | Dev accuracy %.3f' %
          (epoch+1, l, batch_acc, curr_dev_acc))

Epoch 1 | Minibatch loss 0.131 | Minibatch accuracy 93.750 | Dev accuracy 96.026
Epoch 2 | Minibatch loss 0.087 | Minibatch accuracy 100.000 | Dev accuracy 96.909
Epoch 3 | Minibatch loss 0.018 | Minibatch accuracy 100.000 | Dev accuracy 96.247
Epoch 4 | Minibatch loss 0.022 | Minibatch accuracy 100.000 | Dev accuracy 96.689
Epoch 5 | Minibatch loss 0.018 | Minibatch accuracy 100.000 | Dev accuracy 96.689


In [14]:
# restore variables from disk
saver.restore(sess, os.path.join(base_path, "Baseline/Categorization/data/best_r8_model.ckpt"))
print("Best model restored!")

print('Dev accuracy:', sess.run(acc, feed_dict={x: dev_in_sample_examples, y: dev_in_sample_labels, is_training:False}))

INFO:tensorflow:Restoring parameters from gdrive/My Drive/CS769_Assignments/OOD_NLP/Baseline/Categorization/data/best_r8_model.ckpt
Best model restored!
Dev accuracy: 96.90949


## **Initialize output folders**

In [15]:
#init names
OUTPUT_FOLDER_NAME = os.path.join(base_path, "outputs")
DATA_NAME = "Reuters6"
OOD_DATA_NAME = "Reuters2"

if(not os.path.exists(OUTPUT_FOLDER_NAME)):
    os.makedirs(OUTPUT_FOLDER_NAME)
    
print(OUTPUT_FOLDER_NAME)

gdrive/My Drive/CS769_Assignments/OOD_NLP/outputs


In [16]:
#init features folders
FEATURES_FOLDER = os.path.join(OUTPUT_FOLDER_NAME, "features")
FEATURES_DATA_FOLDER = os.path.join(FEATURES_FOLDER, DATA_NAME)
OOD_FEATURES_DATA_FOLDER = os.path.join(FEATURES_FOLDER, OOD_DATA_NAME)
    
if(not os.path.exists(FEATURES_DATA_FOLDER)):
    os.makedirs(FEATURES_DATA_FOLDER)

if(not os.path.exists(OOD_FEATURES_DATA_FOLDER)):
    os.makedirs(OOD_FEATURES_DATA_FOLDER)
    
#init distances folders
DISTS_FOLDER = os.path.join(OUTPUT_FOLDER_NAME, "dists")
DISTS_DATA_FOLDER = os.path.join(DISTS_FOLDER, DATA_NAME)
OOD_DISTS_DATA_FOLDER = os.path.join(DISTS_FOLDER, OOD_DATA_NAME)

if(not os.path.exists(DISTS_DATA_FOLDER)):
    os.makedirs(DISTS_DATA_FOLDER)

if(not os.path.exists(OOD_DISTS_DATA_FOLDER)):
    os.makedirs(OOD_DISTS_DATA_FOLDER)
    
#init closest class folders
CLOSEST_CLASS_FOLDER = os.path.join(OUTPUT_FOLDER_NAME, "closest_classes")
CLOSEST_CLASS_DATA_FOLDER = os.path.join(CLOSEST_CLASS_FOLDER, DATA_NAME)
OOD_CLOSEST_CLASS_DATA_FOLDER = os.path.join(CLOSEST_CLASS_FOLDER, OOD_DATA_NAME)

if(not os.path.exists(CLOSEST_CLASS_DATA_FOLDER)):
    os.makedirs(CLOSEST_CLASS_DATA_FOLDER)

if(not os.path.exists(OOD_CLOSEST_CLASS_DATA_FOLDER)):
    os.makedirs(OOD_CLOSEST_CLASS_DATA_FOLDER)
    
#init labels folders
LABELS_FOLDER = os.path.join(OUTPUT_FOLDER_NAME, "labels")
LABELS_DATA_FOLDER = os.path.join(LABELS_FOLDER, DATA_NAME)

if(not os.path.exists(LABELS_DATA_FOLDER)):
    os.makedirs(LABELS_DATA_FOLDER)
    
#init means folders
MEANS_FOLDER = os.path.join(OUTPUT_FOLDER_NAME, "means")
MEANS_DATA_FOLDER = os.path.join(MEANS_FOLDER, DATA_NAME)

if(not os.path.exists(MEANS_DATA_FOLDER)):
    os.makedirs(MEANS_DATA_FOLDER)
    
#init radius folders
RADIUS_FOLDER = os.path.join(OUTPUT_FOLDER_NAME, "radius")
RADIUS_DATA_FOLDER = os.path.join(RADIUS_FOLDER, DATA_NAME)

if(not os.path.exists(RADIUS_DATA_FOLDER)):
    os.makedirs(RADIUS_DATA_FOLDER)

## **Classes Info**

In [17]:
classes = list(set(in_sample_labels))
NUM_CLASSES = len(classes)
print(classes)
print(NUM_CLASSES)

[0, 1, 2, 3, 4, 5]
6


## **Generating features**

In [18]:
FEATURE_LAYER_SIZE = h.shape[-1]
print(FEATURE_LAYER_SIZE)

512


In [19]:
total = 0 
per_class_examples = []
for classID in classes:
    in_samples_class = in_sample_examples[in_sample_labels==classID]
    ss = len(in_samples_class)
    total+=ss
    per_class_examples.append(ss)
print(per_class_examples)
print(total == len(in_sample_examples))

[1431, 2598, 39, 189, 99, 233]
True


In [20]:
t = tqdm(classes)
for classID in classes:
    in_samples_class = in_sample_examples[in_sample_labels==classID]
    train_set_features = sess.run([h], feed_dict={x: in_samples_class, is_training: False})
    train_set_features = np.squeeze(np.asarray(train_set_features))
    if SAVE:
      save_name = "{0}_train_features.npy".format(classID, train_set_features.shape[0])
      save_location =  os.path.join(FEATURES_DATA_FOLDER, save_name)
      np.save(save_location, train_set_features)
      print("Saved generated features for " + str(classID) + " at " + save_location + " having shape - " + str(train_set_features.shape))
if not SAVE:
  print("Calculated but didn't save train_set_features")

  0%|          | 0/6 [00:00<?, ?it/s]

Saved generated features for 0 at gdrive/My Drive/CS769_Assignments/OOD_NLP/outputs/features/Reuters6/0_train_features.npy having shape - (1431, 512)
Saved generated features for 1 at gdrive/My Drive/CS769_Assignments/OOD_NLP/outputs/features/Reuters6/1_train_features.npy having shape - (2598, 512)
Saved generated features for 2 at gdrive/My Drive/CS769_Assignments/OOD_NLP/outputs/features/Reuters6/2_train_features.npy having shape - (39, 512)
Saved generated features for 3 at gdrive/My Drive/CS769_Assignments/OOD_NLP/outputs/features/Reuters6/3_train_features.npy having shape - (189, 512)
Saved generated features for 4 at gdrive/My Drive/CS769_Assignments/OOD_NLP/outputs/features/Reuters6/4_train_features.npy having shape - (99, 512)
Saved generated features for 5 at gdrive/My Drive/CS769_Assignments/OOD_NLP/outputs/features/Reuters6/5_train_features.npy having shape - (233, 512)


In [21]:
test_set_features = sess.run([h], feed_dict={x: test_in_sample_examples, is_training: False})
test_set_features = np.squeeze(np.asarray(test_set_features))
if SAVE:
  save_name = "test_features.npy"
  save_location =  os.path.join(FEATURES_DATA_FOLDER, save_name)
  np.save(save_location, test_set_features)
  print("Saved generated test set features" + " at " + save_location + " having len - " + str(test_set_features.shape))
else:
  print("Calculated but didn't save test_set_features")

Saved generated test set features at gdrive/My Drive/CS769_Assignments/OOD_NLP/outputs/features/Reuters6/test_features.npy having len - (1987, 512)


In [22]:
ood_set_features = sess.run([h], feed_dict={x: test_oos_examples, is_training: False})
ood_set_features = np.squeeze(np.asarray(ood_set_features))
if SAVE:
  save_name = "ood_set_features.npy"
  save_location =  os.path.join(OOD_FEATURES_DATA_FOLDER, save_name)
  np.save(save_location, ood_set_features)
  print("Saved generated test set features" + " at " + save_location + " having len - " + str(ood_set_features.shape))
else:
  print("Calculated but didn't save ood_set_features")

Saved generated test set features at gdrive/My Drive/CS769_Assignments/OOD_NLP/outputs/features/Reuters2/ood_set_features.npy having len - (202, 512)


## **Generating class means**

In [23]:
class_means=[]
t = tqdm(classes)
for classID in classes:
    t.set_description("Calculating mean for class - " + str(classID))
    save_name = "{0}_train_features.npy".format(classID)
    class_features_path = os.path.join(FEATURES_DATA_FOLDER, save_name)
    class_features = np.load(class_features_path)
    cMean = np.mean(class_features, axis=0)
    class_means.append(cMean)
class_means = np.asarray(class_means)
if SAVE:
  save_name = "train_class_means.npy"
  save_location =  os.path.join(MEANS_DATA_FOLDER, save_name)
  np.save(save_location, class_means)
  print("Saved generated means at " + save_location + " having shape - " + str(class_means.shape))
else:
  print("Calculated but didn't save class_means")

  0%|          | 0/6 [00:00<?, ?it/s]

Saved generated means at gdrive/My Drive/CS769_Assignments/OOD_NLP/outputs/means/Reuters6/train_class_means.npy having shape - (6, 512)


## **Calculate distances & closest classes**

In [24]:
t = tqdm(classes)
for classID in classes:
    train_dists = []
    train_closest_classes = []
    t.set_description("Calculating distance for class - " + str(classID))
    save_name = "{0}_train_features.npy".format(classID)
    save_location = os.path.join(FEATURES_DATA_FOLDER, save_name)
    train_set_features = np.load(save_location)
    for feature in train_set_features:
        d = cdist(class_means, np.expand_dims(feature, axis=0), metric='cosine')
        idx = np.argmin(d)
        d = d[idx][0]
        train_closest_classes.append(idx)
        train_dists.append(d)
        
    train_dists = np.asarray(train_dists)
    train_closest_classes = np.asarray(train_closest_classes)
    if SAVE:
      # distances
      save_name = "{0}_train_distances.npy".format(classID)
      save_location =  os.path.join(DISTS_DATA_FOLDER, save_name)
      np.save(save_location, train_dists)
      print("Saved distances for " + str(classID) + " at " + save_location + " having shape - " + str(train_dists.shape))
      # closest classes
      save_name = "{0}_train_closest_classes.npy".format(classID)
      save_location =  os.path.join(CLOSEST_CLASS_DATA_FOLDER, save_name)
      np.save(save_location, train_closest_classes)
      print("Saved closest classes for " + str(classID) + " at " + save_location + " having shape - " + str(train_closest_classes.shape))

if not SAVE:
  print("Calculated but didn't save train_distances and train_closest_classes")

  0%|          | 0/6 [00:00<?, ?it/s]

Saved distances for 0 at gdrive/My Drive/CS769_Assignments/OOD_NLP/outputs/dists/Reuters6/0_train_distances.npy having shape - (1431,)
Saved closest classes for 0 at gdrive/My Drive/CS769_Assignments/OOD_NLP/outputs/closest_classes/Reuters6/0_train_closest_classes.npy having shape - (1431,)
Saved distances for 1 at gdrive/My Drive/CS769_Assignments/OOD_NLP/outputs/dists/Reuters6/1_train_distances.npy having shape - (2598,)
Saved closest classes for 1 at gdrive/My Drive/CS769_Assignments/OOD_NLP/outputs/closest_classes/Reuters6/1_train_closest_classes.npy having shape - (2598,)
Saved distances for 2 at gdrive/My Drive/CS769_Assignments/OOD_NLP/outputs/dists/Reuters6/2_train_distances.npy having shape - (39,)
Saved closest classes for 2 at gdrive/My Drive/CS769_Assignments/OOD_NLP/outputs/closest_classes/Reuters6/2_train_closest_classes.npy having shape - (39,)
Saved distances for 3 at gdrive/My Drive/CS769_Assignments/OOD_NLP/outputs/dists/Reuters6/3_train_distances.npy having shape - (

In [25]:
test_dists = []
test_closest_classes = []
save_name = "test_features.npy"
save_location = os.path.join(FEATURES_DATA_FOLDER, save_name)
test_set_features = np.load(save_location)
for feature in test_set_features:
    d = cdist(class_means, np.expand_dims(feature, axis=0), metric='cosine')
    idx = np.argmin(d)
    d = d[idx][0]
    test_closest_classes.append(idx)
    test_dists.append(d)
test_dists = np.asarray(test_dists)
test_closest_classes = np.asarray(test_closest_classes)

if SAVE:
  # dists
  save_name = "test_distances.npy"
  save_location =  os.path.join(DISTS_DATA_FOLDER, save_name)
  np.save(save_location, test_dists)
  print("Saved distances for test data at " + save_location + " having shape - " + str(test_dists.shape))
  # closest classes
  save_name = "test_closest_classes.npy"
  save_location =  os.path.join(CLOSEST_CLASS_DATA_FOLDER, save_name)
  np.save(save_location, test_closest_classes)
  print("Saved closest classes for test data at " + save_location + " having shape - " + str(test_closest_classes.shape))
else:
  print("Calculated but didn't save test_dists and test_closest_classes")

Saved distances for test data at gdrive/My Drive/CS769_Assignments/OOD_NLP/outputs/dists/Reuters6/test_distances.npy having shape - (1987,)
Saved closest classes for test data at gdrive/My Drive/CS769_Assignments/OOD_NLP/outputs/closest_classes/Reuters6/test_closest_classes.npy having shape - (1987,)


In [29]:
ood_set_dists = []
ood_set_closest_classes = []
save_name = "ood_set_features.npy"
save_location = os.path.join(OOD_FEATURES_DATA_FOLDER, save_name)
ood_set_features = np.load(save_location)
for feature in ood_set_features:
    d = cdist(class_means, np.expand_dims(feature, axis=0), metric='cosine')
    idx = np.argmin(d)
    d = d[idx][0]
    ood_set_closest_classes.append(idx)
    ood_set_dists.append(d)
ood_set_dists = np.asarray(ood_set_dists)
ood_set_closest_classes = np.asarray(ood_set_closest_classes)

if SAVE:
  # dists
  save_name = "ood_set_distances.npy"
  save_location =  os.path.join(OOD_DISTS_DATA_FOLDER, save_name)
  np.save(save_location, ood_set_dists)
  print("Saved distances for ood set data at " + save_location + " having shape - " + str(ood_set_dists.shape))
  # closest classes
  save_name = "ood_set_closest_classes.npy"
  save_location =  os.path.join(OOD_CLOSEST_CLASS_DATA_FOLDER, save_name)
  np.save(save_location, ood_set_closest_classes)
  print("Saved closest classes for ood set data at " + save_location + " having shape - " + str(ood_set_closest_classes.shape))
else:
  print("Calculated but didn't save ood_set_dists and ood_set_closest_classes")

Saved distances for ood set data at gdrive/My Drive/CS769_Assignments/OOD_NLP/outputs/dists/Reuters2/ood_set_distances.npy having shape - (202,)
Saved closest classes for ood set data at gdrive/My Drive/CS769_Assignments/OOD_NLP/outputs/closest_classes/Reuters2/ood_set_closest_classes.npy having shape - (202,)


## **Generating class radius**

In [26]:
FRACTION_TO_COVER = 0.95

In [27]:
#load class means, if not already loaded
save_name = "train_class_means.npy"
save_location =  os.path.join(MEANS_DATA_FOLDER, save_name)
class_means = np.load(save_location)

In [28]:
t = tqdm(classes)
class_radii = []
for classID in classes:
  t.set_description("Calculating radius for class - " + str(classID))
  class_size = per_class_examples[classID]
  modified_class_size = int(class_size*FRACTION_TO_COVER)
  #load distances
  save_name = "{0}_train_distances.npy".format(classID, class_size)
  save_location = os.path.join(DISTS_DATA_FOLDER, save_name)
  class_distances = np.load(save_location)
  # set radius
  class_radii.append(class_distances[modified_class_size])

class_radii = np.asarray(class_radii)
save_name = "train_class_radii.npy"
save_location =  os.path.join(RADIUS_DATA_FOLDER, save_name)
np.save(save_location, class_radii)
print("Saved generated radii at " + save_location + " having shape - " + str(class_radii.shape))

  0%|          | 0/6 [00:00<?, ?it/s]

Saved generated radii at gdrive/My Drive/CS769_Assignments/OOD_NLP/outputs/radius/Reuters6/train_class_radii.npy having shape - (6,)
