In [15]:
import nmslib
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence

from dataset import *

## 0. Loading

<b>Load the trained model and formatted test data.</b>

In [2]:
# Restore the trained model object from disk.
# NOTE(review): torch.load unpickles Python objects, so only load checkpoints
# from a trusted source. The model's class presumably has to be importable in
# this kernel (likely via `from dataset import *`) — confirm.
mdl = torch.load('model.pt')

In [3]:
# Put the whole model into inference mode. `eval()` recurses into every
# submodule, whereas assigning `mdl.training = False` only flips the flag on the
# top-level module and would leave nested layers (dropout, batch-norm, if any)
# in training mode. The trailing semicolon hides the model repr in the output.
mdl.eval();

In [4]:
# Restore the pre-built test data loader that was saved alongside the model.
# NOTE(review): this is a pickle-based load — only use trusted files.
test_loader = torch.load('test_loader.pt')

## 1. Calculate Object Encoding

<b>Here we associate every object with the encoding vector calculated on the MergeRNN layer.</b>

This is easily achieved by passing `get_state=True` to the model's forward call, which makes it return the encodings.

In [5]:
# Encode every batch yielded by the test loader; get_state=True asks the model
# for the object encodings. Gradients are not needed for inference, so autograd
# is switched off to avoid keeping a computation graph alive for each stored
# batch. The model is called directly (rather than via .forward) so the
# standard module call path is used.
with torch.no_grad():
    obj_encs = [mdl(x, get_state=True) for x in test_loader]

In [6]:
# Merge the per-batch encodings into a single tensor. Taking only obj_encs[0]
# silently discards every batch after the first; concatenating along dim 0
# yields the same values when there is one batch and stays correct for several.
# Assumes each batch is a 2-D tensor (objects x features) — TODO confirm.
# The isinstance guard keeps this cell idempotent when it is re-run.
if isinstance(obj_encs, list):
    obj_encs = torch.cat(obj_encs, dim=0)

In [7]:
# Build a frame from the encodings: .tolist() converts them to nested Python
# lists, giving one row per object and one column per encoding dimension.
df = pd.DataFrame(obj_encs.tolist())

In [8]:
# Peek at the first three encoding rows (still on the default integer index).
df.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,118,119,120,121,122,123,124,125,126,127
0,2.35042,0.112386,-0.807134,-1.599127,-0.467856,1.501227,-0.602747,0.110598,2.738788,-0.085319,...,-0.150545,-0.14148,-0.108402,-0.476537,-1.837693,0.081284,0.101653,-0.26507,0.110261,-1.224855
1,1.87604,0.112386,-0.807134,-1.599127,-0.509676,1.18197,-0.602747,2.009841,3.352639,-0.085319,...,-0.150545,-0.233311,-0.108402,-0.489228,-1.837693,0.081284,0.101653,-0.230116,0.110261,-1.224855
2,1.21678,0.112386,-0.807134,-1.599127,-0.624903,0.540378,-0.602747,2.496461,2.882096,-0.085319,...,-0.150545,-0.221824,-0.108402,-0.502505,-1.837693,0.081284,0.101653,-0.275354,0.110261,-1.224855


In [9]:
# Load the saved row labels for the test set (shown as `object_id` below).
# NOTE(review): read_pickle executes pickle data — only load trusted files.
ix = pd.read_pickle('test_xs_ix.pkl')

In [10]:
# Label each encoding row with its object id. This mutates df in place; pandas
# raises if the number of labels differs from the number of rows.
df.index = ix

In [11]:
# Preview the first rows, now labelled by object_id.
df.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,118,119,120,121,122,123,124,125,126,127
object_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
13,2.35042,0.112386,-0.807134,-1.599127,-0.467856,1.501227,-0.602747,0.110598,2.738788,-0.085319,...,-0.150545,-0.14148,-0.108402,-0.476537,-1.837693,0.081284,0.101653,-0.26507,0.110261,-1.224855
14,1.87604,0.112386,-0.807134,-1.599127,-0.509676,1.18197,-0.602747,2.009841,3.352639,-0.085319,...,-0.150545,-0.233311,-0.108402,-0.489228,-1.837693,0.081284,0.101653,-0.230116,0.110261,-1.224855
17,1.21678,0.112386,-0.807134,-1.599127,-0.624903,0.540378,-0.602747,2.496461,2.882096,-0.085319,...,-0.150545,-0.221824,-0.108402,-0.502505,-1.837693,0.081284,0.101653,-0.275354,0.110261,-1.224855
23,1.868699,0.112386,-0.807134,-1.599127,-0.767239,0.385271,-0.602747,3.361639,3.606521,-0.085319,...,-0.150545,-0.251225,-0.108402,-0.611845,-1.837693,0.081284,0.101653,-0.282142,0.110261,-1.224855
34,1.413542,0.112386,-0.807134,-1.599127,-0.692696,0.898524,-0.602747,0.430164,2.741134,-0.085319,...,-0.150545,-0.183926,-0.108402,-0.526038,-1.837693,0.081284,0.101653,-0.258345,0.110261,-1.224855


### 1.1 Create NMSLIB Index 

In [16]:
# Create a fresh nmslib index in cosine-similarity space (an HNSW index, as the
# saved file name below also indicates), load every object encoding into it,
# then build it with the index-time parameters below.
# NOTE(review): no ids are passed, so neighbour ids returned by queries are
# presumably row positions rather than object_id values — confirm.
index_time_params = {'post': 2}
index = nmslib.init(space='cosinesimil')
index.addDataPointBatch(df)
index.createIndex(index_time_params, print_progress=True)

In [17]:
# Persist the built index so it can be reloaded without re-indexing.
index.saveIndex('object_encoding_index.hnsw')

### 1.2 Query Index for Similar Objects

The query will return the k most similar objects in the whole dataset based on cosine similarity.

We classify an object as unknown if its nearest neighbours have relatively low similarity scores.

In [19]:
def get_knns(index, vecs, k=3, num_threads=8):
    """Query the index for the nearest neighbours of every vector in ``vecs``.

    Args:
        index: Object exposing ``knnQueryBatch(vecs, k=..., num_threads=...)``
            (in this notebook, an nmslib index).
        vecs: Batch of query vectors; passed to the index unchanged.
        k: Number of neighbours requested per query vector (default 3).
        num_threads: Thread count handed to the batch query (default 8).

    Returns:
        A ``zip`` iterator that transposes the per-query results. When the
        batch query yields one ``(ids, distances)`` pair per vector, it unpacks
        as ``ids, dists = get_knns(...)``. The iterator is single-use.
    """
    return zip(*index.knnQueryBatch(vecs, k=k, num_threads=num_threads))

In [24]:
# Query the index with every encoding it contains (3 neighbours per object).
# Since each query vector is itself in the index, its own entry is presumably
# among the results — TODO confirm. `res` is a single-use zip iterator.
res = get_knns(index=index, vecs=df)

In [25]:
# Split the query result into neighbour ids and distances. The ids get their
# own name so they do not overwrite `ix`, which holds the object-id index that
# was assigned to df earlier. `res` is a one-shot iterator, so this cell can
# only run once per query.
neighbor_ids, dists = res

In [31]:
# Turn each object's cosine distances into cosine similarities (1 - distance).
# NOTE(review): this rebinds `dists` to similarities, so re-running the cell
# flips the values back to distances.
dists = [1-x for x in dists]

In [33]:
# For every object keep the lowest similarity among its returned neighbours,
# i.e. the score of the furthest of them.
furthest = list(map(np.min, dists))

In [44]:
# One row per object (same index as df) holding its minimum neighbour similarity.
obj_similarities = pd.DataFrame(furthest, index=df.index, columns=['min_similarity'])

<b>Top 5 most dissimilar objects based on this method:</b>

In [45]:
# Rank objects from least to most similar (ascending is the default order)
# and display the first five.
ranked_by_similarity = obj_similarities.sort_values(by='min_similarity')
ranked_by_similarity.head()

Unnamed: 0_level_0,min_similarity
object_id,Unnamed: 1_level_1
12656,0.67595
1304,0.867995
12689,0.874829
31238,0.877676
5217,0.885842


Pick a threshold based on the distribution of similarity scores to decide which objects count as unknown.

In [51]:
# Similarity cut-off: objects whose min_similarity is below this value are
# treated as unknown. Hand-picked value — tune it against the score distribution.
threshold = 0.95

In [52]:
# Keep only the objects whose minimum neighbour similarity is below the cut-off.
is_below_threshold = obj_similarities['min_similarity'] < threshold
dissimlar_objs = obj_similarities.loc[is_below_threshold]

In [54]:
# Reduce the filtered frame to just its index, i.e. the object ids.
# NOTE(review): this rebinds the name from a DataFrame to its index, so the
# cell cannot be re-run without first re-running the filter cell above.
dissimlar_objs = dissimlar_objs.index

In [55]:
# Number of objects flagged as unknown at the chosen threshold.
len(dissimlar_objs)

33

In [56]:
# Save the ids of the flagged objects for downstream use.
pd.to_pickle(dissimlar_objs, 'unknown_objects.pkl')