# faiss基础模块
faiss中的索引基于几个基础算法构建，faiss为这些算法提供了高效的实现。它们分别是k-means聚类、PCA降维和PQ编码/解码。

## k-means聚类

In [7]:

import faiss
import numpy as np

# Synthetic data set: n_data vectors of dimension d whose components are
# drawn i.i.d. from a normal distribution N(mu, sigma^2), stored as float32.
d = 512          # vector dimension
n_data = 2000    # number of vectors
mu = 3
sigma = 0.1
np.random.seed(0)  # fixed seed so the generated data is reproducible
# A single vectorized draw replaces the per-row Python loop; the values come
# from the same seeded stream in row-major order, so `data` is unchanged.
data = np.random.normal(mu, sigma, (n_data, d)).astype('float32')

# Clustering parameters.
ncentroids = 1024
niter = 20
verbose = True
# NOTE(review): 2000 training points for 1024 centroids is very few (faiss's
# default clustering settings are documented to want many more points per
# centroid and may print a warning) - confirm. Left unchanged so the recorded
# outputs of this notebook stay valid.
# Pass the variables declared above instead of repeating the literals, so
# that editing `niter` / `verbose` actually takes effect.
kmeans = faiss.Kmeans(d, ncentroids, niter=niter, verbose=verbose)
kmeans.train(data)

# Print the cluster centres: number of centroids, their dimension, then the
# centroid matrix itself.
print(len(kmeans.centroids))
print(len(kmeans.centroids[0]))
print(kmeans.centroids)

1024
512
[[3.046999  3.0121088 3.0124333 ... 3.0203993 3.011947  2.9342847]
 [2.7589808 3.0725713 2.9360871 ... 3.0773525 2.902585  2.995511 ]
 [3.1167192 2.9537685 2.9987445 ... 3.0199993 2.9278672 3.050025 ]
 ...
 [2.9502757 3.0440164 2.9121387 ... 2.9652288 3.2078865 3.009649 ]
 [2.9459333 3.0297534 2.9002755 ... 2.9255435 2.8951385 2.9468067]
 [2.9947238 3.1082706 2.9418213 ... 3.0144033 3.046606  2.9184723]]


In [4]:
# Show the built-in documentation of the faiss.Kmeans wrapper class.
help(faiss.Kmeans)

Help on class Kmeans in module faiss:

class Kmeans(builtins.object)
 |  Kmeans(d, k, **kwargs)
 |  
 |  shallow wrapper around the Clustering object. The important method
 |  is train().
 |  
 |  Methods defined here:
 |  
 |  __init__(self, d, k, **kwargs)
 |      d: input dimension, k: nb of centroids. Additional
 |      parameters are passed on the ClusteringParameters object,
 |      including niter=25, verbose=False, spherical = False
 |  
 |  assign(self, x)
 |  
 |  train(self, x)
 |  
 |  ----------------------------------------------------------------------
 |  Data descriptors defined here:
 |  
 |  __dict__
 |      dictionary for instance variables (if defined)
 |  
 |  __weakref__
 |      list of weak references to the object (if defined)



In [8]:
# Find which cluster each of the first five vectors belongs to by asking the
# k-means index for its single nearest centroid. D receives the distances
# and I the positions of the matching centroids.
# NOTE(review): the index is an IndexFlatL2, which per the faiss docs reports
# *squared* L2 distances - confirm before interpreting D.
first_five = data[:5]
D, I = kmeans.index.search(first_five, 1)
print(D, I, sep="\n")

[[4.899538 ]
 [2.2404225]
 [3.0874515]
 [4.472025 ]
 [2.1018007]]
[[ 61]
 [767]
 [393]
 [415]
 [175]]


In [9]:
# Show the documentation of the search method on the index held by kmeans.
help(kmeans.index.search)

Help on method replacement_search in module faiss:

replacement_search(x, k) method of faiss.swigfaiss.IndexFlatL2 instance



In [11]:
# Number of cluster centres (rows of the centroid matrix).
len(kmeans.centroids)

1024

In [13]:
# For every cluster centre, retrieve the 5 data vectors closest to it.
# A flat L2 index is built over the whole data set and queried with the
# centroids; D receives the distances and I the row positions within `data`.
index = faiss.IndexFlatL2(d)
index.add(data)
n_neighbors = 5
D, I = index.search(kmeans.centroids, n_neighbors)
print(D, I, sep="\n")

[[3.2480469 4.0878906 4.2226562 4.2304688 4.3759766]
 [0.        8.475586  8.520508  8.709961  8.7421875]
 [0.        8.439453  8.458008  8.519531  8.62207  ]
 ...
 [0.        8.826172  8.837891  8.891602  8.931641 ]
 [2.5205078 3.046875  3.2646484 5.705078  5.932617 ]
 [2.2167969 2.2207031 6.609375  6.6591797 6.665039 ]]
[[1083  472  356 1892   34]
 [1411  414  198  620 1129]
 [ 140  317 1686   24  402]
 ...
 [ 753 1776  331  389  279]
 [ 432 1096  240  879  329]
 [ 625 1211  751  106 1318]]


## PCA降维

In [14]:
# Show the built-in documentation of the faiss.PCAMatrix transform class.
help(faiss.PCAMatrix)

Help on class PCAMatrix in module faiss.swigfaiss:

class PCAMatrix(LinearTransform)
 |  PCAMatrix(d_in=0, d_out=0, eigen_power=0, random_rotation=False)
 |  
 |  Method resolution order:
 |      PCAMatrix
 |      LinearTransform
 |      VectorTransform
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __del__ lambda self
 |  
 |  __getattr__ lambda self, name
 |  
 |  __init__(self, d_in=0, d_out=0, eigen_power=0, random_rotation=False)
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  __repr__ = _swig_repr(self)
 |  
 |  __setattr__ lambda self, name, value
 |  
 |  apply_py = apply_method(self, x)
 |  
 |  copy_from(self, other)
 |  
 |  prepare_Ab(self)
 |  
 |  train = replacement_vt_train(self, x)
 |  
 |  train_c = train(self, n, x)
 |  
 |  ----------------------------------------------------------------------
 |  Static methods defined here:
 |  
 |  __swig_destroy__ = delete_PCAMatrix(...)
 |  
 |  ----------------------------------

In [17]:
# Reduce the data with PCA from its original dimension (512 here) to 64.
d_in = data.shape[1]   # taken from the data instead of a hard-coded 512
d_out = 64             # target dimension after reduction
mat = faiss.PCAMatrix(d_in, d_out)
mat.train(data)
assert mat.is_trained  # sanity check that training completed
# Apply the trained transform; the result has one d_out-dimensional row per
# input vector.
tr = mat.apply_py(data)
print(tr.shape)

(2000, 64)


## PQ编码/解码
ProductQuantizer对象可以将向量编码为紧凑的code，也可以将code解码还原为近似的向量。

In [15]:
# Inspect the first data vector.
data[0]

array([3.1764052, 3.0400157, 3.0978737, 3.2240894, 3.186756 , 2.9022722,
       3.0950089, 2.9848642, 2.9896781, 3.0410597, 3.0144043, 3.1454275,
       3.0761037, 3.0121675, 3.0443864, 3.0333674, 3.1494079, 2.979484 ,
       3.0313067, 2.9145904, 2.744701 , 3.065362 , 3.0864437, 2.9257834,
       3.2269754, 2.8545635, 3.004576 , 2.9812815, 3.1532779, 3.146936 ,
       3.0154948, 3.0378163, 2.9112215, 2.8019204, 2.9652088, 3.015635 ,
       3.123029 , 3.120238 , 2.9612672, 2.9697697, 2.8951447, 2.8579981,
       2.829373 , 3.1950777, 2.9490347, 2.9561925, 2.8747206, 3.077749 ,
       2.8386102, 2.978726 , 2.9104533, 3.0386903, 2.9489195, 2.8819368,
       2.997182 , 3.042833 , 3.0066516, 3.0302472, 2.9365678, 2.9637258,
       2.932754 , 2.9640446, 2.9186854, 2.8273718, 3.0177426, 2.959822 ,
       2.83698  , 3.0462782, 2.90927  , 3.0051944, 3.072909 , 3.0128982,
       3.11394  , 2.8765175, 3.040234 , 2.931519 , 2.9129202, 2.942115 ,
       2.9688447, 3.0056164, 2.883485 , 3.0900826, 

In [19]:
d = 512    # data dimension
cs = 4     # code size (bytes)
nbits = 8  # third ProductQuantizer argument, named instead of a bare literal

xt = data  # training set
x = data   # dataset to encode (could be same as train)

# Train the product quantizer on the training set.
pq = faiss.ProductQuantizer(d, cs, nbits)
pq.train(xt)

# Encode the vectors into codes, then decode the codes back into vectors.
codes = pq.compute_codes(x)
x2 = pq.decode(codes)

# Relative reconstruction error: squared difference between the original and
# the encode-decode round trip, normalised by the squared norm of the data.
squared_residual = np.square(x - x2).sum()
squared_norm = np.square(x).sum()
avg_relative_error = squared_residual / squared_norm
print(avg_relative_error)

0.0008765541


标量量化器（scalar quantizer）与之类似。

In [20]:
d = 512    # data dimension

xt = data  # training set
x = data   # dataset to encode (could be same as train)

# QT_8bit allocates 8 bits per dimension (QT_4bit also works)
qtype = faiss.ScalarQuantizer.QT_8bit
sq = faiss.ScalarQuantizer(d, qtype)
sq.train(xt)

# Encode the vectors into codes, then decode the codes back into vectors.
codes = sq.compute_codes(x)
x2 = sq.decode(codes)

# Relative reconstruction error: squared difference between the original and
# the encode-decode round trip, normalised by the squared norm of the data.
squared_residual = np.square(x - x2).sum()
squared_norm = np.square(x).sum()
avg_relative_error = squared_residual / squared_norm
print(avg_relative_error)

6.7287445e-08
