### Install Annoy
To install, simply do pip install --user annoy to pull down the latest version from PyPI.

In [2]:
from annoy import AnnoyIndex
import random

In [3]:


f = 40
t = AnnoyIndex(f,'angular')  # Length of item vector that will be indexed
for i in range(1000):
    v = [random.gauss(0, 1) for z in range(f)]
    t.add_item(i, v)

t.build(10) # 10 trees
t.save('test4_angular.ann')

# ...



True

#### 現在有1000個40維的點


In [4]:
f = 40
u = AnnoyIndex(f,'angular')
u.load('test4_angular.ann') # super fast, will just mmap the file
print(u.get_nns_by_item(0, 10)) # will find the 10 nearest neighbors

[0, 919, 588, 92, 79, 760, 852, 118, 789, 402]


In [5]:
# 第一個點的最近鄰的索引:用索引找
u.get_nns_by_item(0, 10) # will find the 10 nearest neighbors

[0, 919, 588, 92, 79, 760, 852, 118, 789, 402]

In [6]:
# 第一個點的最近鄰的索引:用向量找
v=u.get_item_vector(0)
u.get_nns_by_vector(v,10)

[0, 919, 588, 92, 79, 760, 852, 118, 789, 402]

In [8]:
u.get_nns_by_item(0, 10) # will find the 10 nearest neighbors

[0, 919, 588, 92, 79, 760, 852, 118, 789, 402]

### Full Python API

<ul>
<li><code>AnnoyIndex(f, metric)</code> returns a new index that's read-write and stores vector of <code>f</code> dimensions. Metric can be <code>"angular"</code>, <code>"euclidean"</code>, <code>"manhattan"</code>, <code>"hamming"</code>, or <code>"dot"</code>.</li>
<li><code>a.add_item(i, v)</code> adds item <code>i</code> (any nonnegative integer) with vector <code>v</code>. Note that it will allocate memory for <code>max(i)+1</code> items.</li>
<li><code>a.build(n_trees)</code> builds a forest of <code>n_trees</code> trees. More trees gives higher precision when querying. After calling <code>build</code>, no more items can be added.</li>
<li><code>a.save(fn, prefault=False)</code> saves the index to disk and loads it (see next function). After saving, no more items can be added.</li>
<li><code>a.load(fn, prefault=False)</code> loads (mmaps) an index from disk. If prefault is set to True, it will pre-read the entire file into memory (using mmap with MAP_POPULATE). Default is False.</li>
<li><code>a.unload()</code> unloads.</li>
<li><code>a.get_nns_by_item(i, n, search_k=-1, include_distances=False)</code> returns the <code>n</code> closest items. During the query it will inspect up to <code>search_k</code> nodes which defaults to <code>n_trees * n</code> if not provided. <code>search_k</code> gives you a run-time tradeoff between better accuracy and speed. If you set <code>include_distances</code> to <code>True</code>, it will return a 2 element tuple with two lists in it: the second one containing all corresponding distances.</li>
<li><code>a.get_nns_by_vector(v, n, search_k=-1, include_distances=False)</code> same but query by vector <code>v</code>.</li>
<li><code>a.get_item_vector(i)</code> returns the vector for item <code>i</code> that was previously added.</li>
<li><code>a.get_distance(i, j)</code> returns the distance between items <code>i</code> and <code>j</code>. NOTE: this used to return the <em>squared</em> distance, but has been changed as of Aug 2016.</li>
<li><code>a.get_n_items()</code> returns the number of items in the index.</li>
<li><code>a.get_n_trees()</code> returns the number of trees in the index.</li>
<li><code>a.on_disk_build(fn)</code> prepares annoy to build the index in the specified file instead of RAM (execute before adding items, no need to save after build)</li>
<li><code>a.set_seed(seed)</code> will initialize the random number generator with the given seed.  Only used for building up the tree, i. e. only necessary to pass this before adding the items.  Will have no effect after calling a.build(n_trees) or a.load(fn).</li>
</ul>
<p>Notes:</p>
<ul>
<li>There's no bounds checking performed on the values so be careful.</li>
<li>Annoy uses Euclidean distance of normalized vectors for its angular distance, which for two vectors u,v is equal to <code>sqrt(2(1-cos(u,v)))</code></li>
</ul>


####  Angular distance:
Annoy用sqrt(2(1-cos_sim(u,v)))作為Angular distance    
cosine_sim的值域[-1,1]  
Angular distance(A_D)落在[0,2]   
這邊我假設:
Angular similarity:1-A_D/2


In [113]:
from scipy import spatial
import math
id0=u.get_item_vector(0)
id1=u.get_item_vector(139)

cos_sim = 1 - spatial.distance.cosine(id0, id1)
angular_d=math.sqrt(2*(1-cos_sim))
print('cos_sim:',cos_sim)
print('angular_d:',angular_d)
print('Annoy_angular_d:',u.get_distance(0,139))

cos_sim: 0.4543499348601068
angular_d: 1.044653114808828
Annoy_angular_d: 1.0446531772613525


### id和distance存在dictionary的方式 

In [9]:
result=u.get_nns_by_item(0, 10, include_distances=True)

In [10]:
result[1][1:5]

[1.022614598274231, 1.0473228693008423, 1.0828430652618408, 1.0877268314361572]

In [11]:
bag=[]
for row in zip(result[0],result[1]):
    bag.append(
        {
            'id':row[0],
            'Similarity':row[1]
        }
        
    )

In [12]:
bag

[{'id': 0, 'Similarity': 0.0},
 {'id': 919, 'Similarity': 1.022614598274231},
 {'id': 588, 'Similarity': 1.0473228693008423},
 {'id': 92, 'Similarity': 1.0828430652618408},
 {'id': 79, 'Similarity': 1.0877268314361572},
 {'id': 760, 'Similarity': 1.1008208990097046},
 {'id': 852, 'Similarity': 1.1060607433319092},
 {'id': 118, 'Similarity': 1.1178791522979736},
 {'id': 789, 'Similarity': 1.1275181770324707},
 {'id': 402, 'Similarity': 1.1500883102416992}]