<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><ul class="toc-item"><li><ul class="toc-item"><li><span><a href="#_compute_dist-Euclidean-distance" data-toc-modified-id="_compute_dist-Euclidean-distance-0.0.1"><span class="toc-item-num">0.0.1&nbsp;&nbsp;</span>_compute_dist Euclidean distance</a></span></li></ul></li><li><span><a href="#Computing-matches-Euclidean-similarity" data-toc-modified-id="Computing-matches-Euclidean-similarity-0.1"><span class="toc-item-num">0.1&nbsp;&nbsp;</span>Computing matches Euclidean similarity</a></span></li><li><span><a href="#Computing-matches-Euclidean-distance" data-toc-modified-id="Computing-matches-Euclidean-distance-0.2"><span class="toc-item-num">0.2&nbsp;&nbsp;</span>Computing matches Euclidean distance</a></span></li><li><span><a href="#Computing-matches-Cosine-similarity" data-toc-modified-id="Computing-matches-Cosine-similarity-0.3"><span class="toc-item-num">0.3&nbsp;&nbsp;</span>Computing matches Cosine similarity</a></span></li></ul></li><li><span><a href="#Document-Memmap" data-toc-modified-id="Document-Memmap-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Document Memmap</a></span></li></ul></div>

In [3]:
from jina import Document, DocumentArray
import numpy as np
from jina.types.arrays.neuraloperations import cosine_distance
from jina.types.arrays.neuraloperations import euclidean_distance_squared
from jina.types.arrays.neuraloperations import compute_distances

In [4]:
#%load_ext autoreload
#%autoreload 2

In [5]:
# NOTE(review): n_features is not referenced by any cell visible in this
# notebook (the embeddings below have 5 components) — confirm before removing.
n_features = 100

def get_document_arrays():
    """Build the two small DocumentArrays used throughout this notebook.

    Returns:
        A tuple ``(D, D2)``. ``D`` holds four documents with integer
        embeddings. ``D2`` holds five documents: the first four are the
        vectors of ``D`` with the second coordinate shifted by 0.1 or 0.2,
        the fifth is an additional vector.
    """
    base_vectors = [
        [0, 0, 0, 0, 0],
        [1, 0, 0, 0, 0],
        [1, 1, 1, 1, 0],
        [1, 2, 2, 1, 0],
    ]
    shifted_vectors = [
        [0, 0.1, 0, 0, 0],
        [1, 0.1, 0, 0, 0],
        [1, 1.2, 1, 1, 0],
        [1, 2.2, 2, 1, 0],
        [4, 5.2, 2, 1, 0],
    ]

    # One Document per vector, created in the same order as listed above.
    base_docs = [Document(embedding=np.array(vec)) for vec in base_vectors]
    shifted_docs = [Document(embedding=np.array(vec)) for vec in shifted_vectors]

    D = DocumentArray(base_docs)
    D2 = DocumentArray(shifted_docs)
    return D, D2

In [6]:
# Fresh D (4 documents) and D2 (5 documents) for the cells below.
D, D2 = get_document_arrays()

In [7]:
# Stack the embeddings of D into a single array, one row per document, and
# display for each row the column indices that sort that row in ascending order.
E = np.stack(D.get_attributes('embedding'))
E.argsort(axis=1)

array([[0, 1, 2, 3, 4],
       [1, 2, 3, 4, 0],
       [4, 0, 1, 2, 3],
       [4, 0, 3, 1, 2]])

#### _compute_dist Euclidean distance

In [8]:
# Embedding matrices: X has one row per document of D, Y one row per document of D2.
X = np.stack(D.get_attributes('embedding'))
Y = np.stack(D2.get_attributes('embedding'))

# Euclidean distances between X and Y (presumably one row per document of D,
# one column per document of D2 — TODO confirm), then keep the 2 smallest per row.
# NOTE(review): _get_sorted_smallest_k is a private helper of DocumentArray.
dists = compute_distances(X, Y, 'euclidean')
idx, dist = D._get_sorted_smallest_k(dists, 2)
dist

array([[0.1       , 1.00498756],
       [0.1       , 1.00498756],
       [0.2       , 1.56204994],
       [0.2       , 1.28062485]])

In [9]:
# Same selection as in the previous cell, now keeping the 3 smallest distances
# per row; reuses `dists` computed there.
idx, dist = D._get_sorted_smallest_k(dists,3)
dist

array([[0.1       , 1.00498756, 2.10713075],
       [0.1       , 1.00498756, 1.8547237 ],
       [0.2       , 1.56204994, 1.67630546],
       [0.2       , 1.28062485, 2.93428015]])

### Computing matches Euclidean similarity

By default `D.match` uses a similarity metric.

This means the matched results are ordered from the highest to the lowest score.

In [10]:
# Rebuild the arrays, then match every document of D against D2 with the
# euclidean metric, keeping 3 matches per document (is_distance left at its default).
D, D2 = get_document_arrays()
D.match(D2, metric='euclidean', limit=3)

In [11]:
# Show the third document of D and, for each of its matches, the matched
# embedding together with the value stored under the 'euclidean' score key.
print(f'query={D[2].embedding}')
for m in D[2].matches:
    print(m.embedding, 'score =', m.scores['euclidean'].value)

query=[1 1 1 1 0]
[1.  1.2 1.  1.  0. ] score = 0.8333333134651184
[1.  2.2 2.  1.  0. ] score = 0.39031246304512024
[1.  0.1 0.  0.  0. ] score = 0.3736494183540344


In [12]:
# Same report for the second document of D.
print(f'query={D[1].embedding}')
for m in D[1].matches:
    print(m.embedding, 'score =', m.scores['euclidean'].value)

query=[1 0 0 0 0]
[1.  0.1 0.  0.  0. ] score = 0.9090909361839294
[0.  0.1 0.  0.  0. ] score = 0.49875620007514954
[1.  1.2 1.  1.  0. ] score = 0.35029658675193787


### Computing matches Euclidean distance

If we want to use a distance we need to specify **`is_distance=True`**.

If we do so, the results should be ordered from the lowest to the highest distance.

In [13]:
# Rebuild the arrays so D carries no matches from the previous section, then
# match with is_distance=True so the stored values are euclidean distances.
D, D2 = get_document_arrays()
D.match(D2, metric='euclidean', limit=3, is_distance=True)

In [14]:
# Show the third document of D and the euclidean distance stored on each match.
print(f'query={D[2].embedding}')
for m in D[2].matches:
    print(m.embedding, 'distance =', m.scores['euclidean'].value)

query=[1 1 1 1 0]
[1.  1.2 1.  1.  0. ] distance = 0.20000000298023224
[1.  2.2 2.  1.  0. ] distance = 1.5620499849319458
[1.  0.1 0.  0.  0. ] distance = 1.6763054132461548


### Computing matches Cosine similarity

In [15]:
# Rebuild the arrays and match with the cosine metric (is_distance left at its default).
# NOTE(review): the first document of D has an all-zero embedding, which is
# presumably what triggers the runtime warning recorded below this cell.
D, D2 = get_document_arrays()
D.match(D2, metric='cosine', limit=3)

  return 1 - np.dot(X, Y.T) / np.outer(


In [16]:
# Show the third document of D and the value stored under the 'cosine' key of each match.
# NOTE(review): this section matches in similarity mode, so the printed values
# are similarities even though the label reads 'dist='.
print(f'query={D[2].embedding}')
for m in D[2].matches:
    print(m.embedding, 'dist=', m.scores['cosine'].value)

query=[1 1 1 1 0]
[1.  1.2 1.  1.  0. ] dist= 0.9966158866882324
[1.  2.2 2.  1.  0. ] dist= 0.9415579438209534
[4.  5.2 2.  1.  0. ] dist= 0.8800925612449646


In [17]:
# Cosine values of the matches of D[2], in the order the matches are stored.
scores = [match.scores['cosine'].value for match in D[2].matches]
# The stored order must already be non-increasing (best match first).
scores == sorted(scores, reverse=True)

True

If we want cosine distance we need to specify `is_distance=True`

In [79]:
# Rebuild the arrays and match with the cosine metric in distance mode.
D, D2 = get_document_arrays()
D.match(D2, metric='cosine', limit=3, is_distance=True)

  return 1 - np.dot(X, Y.T) / np.outer(


In [80]:
# Show the third document of D and the cosine distance stored on each match.
print(f'query={D[2].embedding}')
for m in D[2].matches:
    print(m.embedding, 'dist=', m.scores['cosine'].value)

query=[1 1 1 1 0]
[1.  1.2 1.  1.  0. ] dist= 0.0033841044642031193
[1.  2.2 2.  1.  0. ] dist= 0.05844205245375633
[4.  5.2 2.  1.  0. ] dist= 0.11990746855735779


In [81]:
# Cosine distances of the matches of D[2]; in distance mode the stored order
# must already be non-decreasing (closest match first).
distances = [m.scores['cosine'].value for m in D[2].matches]
distances ==sorted(distances)

True

In [82]:
# Flatten the cosine distances of every match of every document in D.
# NOTE(review): the first document of D is the all-zero vector; the nan values
# in the recorded output presumably come from dividing by its zero norm.
[m.scores['cosine'].value for d in D for m in d.matches]

[nan,
 nan,
 nan,
 0.0049628098495304585,
 0.42289015650749207,
 0.5254210233688354,
 0.0033841044642031193,
 0.05844205245375633,
 0.11990746855735779,
 0.0011076244991272688,
 0.03951964154839516,
 0.1148839220404625]

## Document Memmap

In [218]:
from jina.types.arrays.memmap import DocumentArrayMemmap

##### test_docarraymemmap_match_docarray

In [225]:
D1, D2 = get_document_arrays()
D1.match(D2, metric='euclidean', limit=3, is_distance=True)
values_docarray = [m.scores['euclidean'].value for d in D1 for m in d.matches]

!rm -rf ./my-memmap

from jina import Document

D1memmap = DocumentArrayMemmap('./my-memmap')
D1memmap.extend(D)
D1memmap.match(D2, metric='euclidean', limit=3, is_distance=True)

print(f'query={dam[2].embedding}')
for m in D1memmap[2].matches:
    print(m.embedding, 'distance =', m.scores['euclidean'].value)
    
values_docarraymemmap = [m.scores['euclidean'].value for d in D1memmap for m in d.matches]

query=[1 1 1 1 0]
[1.  1.2 1.  1.  0. ] distance = 0.20000000298023224
[1.  2.2 2.  1.  0. ] distance = 1.5620499849319458
[1.  0.1 0.  0.  0. ] distance = 1.6763054132461548


In [226]:
# The memmap-backed query array must yield the same distances as the in-memory one.
values_docarray == values_docarraymemmap

True

##### test_match_docarray_docarraymemmap

In [227]:
# Reference result: DocumentArray matched against a DocumentArray.
metric = 'euclidean'
D1, D2 = get_document_arrays()
D1.match(D2, metric=metric, limit=3, is_distance=True)
values_docarray = [m.scores[metric].value for d in D1 for m in d.matches]

# Same match, but with the right-hand side stored in a DocumentArrayMemmap.
# The store directory is removed first and the inputs are rebuilt so D1
# carries no matches from the reference run.
!rm -rf ./my-memmap
D1, D2 = get_document_arrays()
D2memmap = DocumentArrayMemmap('./my-memmap')
D2memmap.extend(D2)
D1.match(D2memmap, metric=metric, limit=3, is_distance=True)
values_docarraymemmap = [m.scores[metric].value for d in D1 for m in d.matches]

In [228]:
# Compare the in-memory reference with the memmap-backed result. Comparing
# values_docarraymemmap with itself would be True regardless of the outcome.
values_docarray == values_docarraymemmap

True

##### test_docarraymemmap_match_docarraymemmap

In [229]:
metric = 'euclidean'
is_distance = 'False'
D1, D2 = get_document_arrays()
D1.match(D2, metric=metric, limit=3, is_distance=True)
values_docarray = [m.scores[metric].value for d in D1 for m in d.matches]

!rm -rf ./my-memmap1
!rm -rf ./my-memmap2
D1, D2 = get_document_arrays()
D1memmap = DocumentArrayMemmap('./my-memmap1')
D1memmap.extend(D1)
D2memmap = DocumentArrayMemmap('./my-memmap2')
D2memmap.extend(D2)
D1memmap.match(D2memmap, metric=metric, limit=3, is_distance=is_distance)

#values_docarraymemmap = [m.scores[metric].value for d in D1memmap for m in d.matches]


In [230]:
# Flatten the scores of every match of every document in D1memmap.
# NOTE(review): the recorded output is an empty list while values_docarray
# holds 12 values — the memmap-vs-memmap match does not appear to leave matches
# on the documents read back from D1memmap; investigate before relying on it.
values_docarraymemmap = [m.scores[metric].value for d in D1memmap for m in d.matches]
values_docarraymemmap

[]

In [231]:
# Inspect the matches of the first memmap document (recorded output: length 0).
D1memmap[0].matches

<jina.types.arrays.match.MatchArray length=0 at 140270561565280>

In [232]:
# Reference distances from the in-memory run, for comparison with the memmap result.
values_docarray

[0.10000000149011612,
 1.0049875974655151,
 2.107130765914917,
 0.10000000149011612,
 1.0049875974655151,
 1.8547236919403076,
 0.20000000298023224,
 1.5620499849319458,
 1.6763054132461548,
 0.20000000298023224,
 1.2806248664855957,
 2.9342801570892334]