# Product Quantization

In [5]:
# English CoNLL17 corpus, Word2Vec Continuous Skipgram, 4027169 word embeddings 
# from http://vectors.nlpl.eu/repository/
f = open('data/word2vec/model.txt', encoding = "ISO-8859-1")
lines = f.readlines()

In [2]:
print(len(lines))
print(lines[0])
print(lines[3])

4027170
4027169 100

the -0.145301 -0.150899 0.389752 0.019979 0.209313 -0.059876 -0.037605 -0.191378 -0.492421 0.094606 0.471501 0.171619 -0.091895 -0.188857 -0.187415 0.102550 0.521365 0.492694 0.014640 0.324407 0.132155 0.392447 0.177873 -0.233233 -0.194923 -0.393363 0.018775 0.244165 -0.285453 -0.175422 -0.149823 -0.130606 -0.216950 0.308142 -0.192615 0.212360 0.090488 -0.229107 0.118502 0.217839 -0.379018 -0.042318 -0.315532 -0.186368 -0.028538 0.253762 0.487518 -0.055428 -0.239519 -0.209573 0.140636 -0.090901 -0.384449 0.447566 -0.184971 0.261921 0.440821 0.062585 0.181714 -0.252114 0.122724 0.015310 -0.143186 -0.209463 -0.174111 0.143348 0.295857 -0.156869 0.169965 0.038492 -0.122283 0.095772 0.314591 0.047793 -0.162416 -0.008667 -0.258904 0.129512 -0.086891 -0.131997 -0.256616 -0.071309 0.175144 0.060490 0.043357 0.224987 0.263239 0.236568 0.264060 0.109120 0.076898 0.172123 -0.248498 0.042334 0.053334 0.048210 0.239207 -0.083211 0.214255 -0.118595 



In [3]:
import numpy as np

keys = []
embeddings = []
for i in range(1, 100001): # use len(lines) for the whole table, testing on first 1000000
    line = lines[i].split(" ")
    keys.append(line[0])
    embeddings.append(line[1:-1])

In [15]:
# keys = []
# embeddings = []
# with open("data/word2vec/model.txt", encoding="ISO-8859-1") as infile:
#     first = True
#     for line in infile:
#         if first:
#             first = False
#         else:
#             vector = line.split(" ")
#             keys.append(vector[0])
#             embeddings.append(vector[1:-1])

In [4]:
print(keys[:50]) # first 50 keys

['</s>', ',', 'the', '.', 'of', 'and', 'to', 'a', 'in', '-', ')', '(', ':', 'for', 'is', '"', 'on', 'i', 'that', 'with', 'it', 'was', 'by', 'as', "'s", 'at', 'you', 'this', 'from', 'are', 'be', 'or', 'he', 'have', 'not', 'an', 'but', 'his', 'all', ';', 'your', 'they', '...', 'one', 'we', '?', 'has', 'more', 'new', 'will']


In [5]:
print(embeddings[-1]) # vector embedding example

['-0.118295', '0.203800', '0.394878', '-0.848517', '0.486469', '-0.115450', '-0.455054', '-0.370111', '0.170637', '0.406961', '0.257320', '0.353440', '0.297030', '0.325485', '0.283583', '0.042222', '0.270777', '0.308950', '-0.182712', '-0.291722', '0.228428', '0.192821', '-0.161166', '-0.512930', '0.222564', '-0.263916', '-0.051140', '-0.048887', '0.289932', '-0.560604', '-0.130937', '0.034237', '-0.247062', '0.746098', '-0.407875', '0.033511', '-0.092927', '0.202167', '-0.634714', '-0.013026', '-0.184374', '0.180221', '0.017293', '-0.190451', '0.300834', '0.291095', '0.734695', '-0.347320', '-0.269567', '-0.335337', '0.061642', '-0.046950', '0.151776', '-0.195496', '0.035068', '-0.249855', '0.360329', '-0.049098', '-0.221995', '-0.235719', '-0.176848', '0.029172', '0.044547', '-0.108529', '-0.552660', '0.313222', '-0.304578', '0.418278', '0.646111', '0.241757', '-0.310834', '0.110718', '0.272438', '0.529656', '0.274987', '0.132159', '-0.241227', '0.137113', '-0.302926', '-0.391801', '

In [3]:
import math
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

def product_quantization(embeddings, M, k, verbose=False, silhouette_scoring=False, inertia_scoring=False):
    """
    keys: embeddings keys
    embeddings: embedding vectors
    M: size of vector subsections
    k: number of clusters for k-means clustering

    Runs product quantization on the embeddings.
    Returns each embedding as an array of integers, each representing the id of a centroid in that region
    """
    if verbose:
        print(embeddings)
    # build split embeddings
    # split_embeddings[i] -> list of vectors, each which represents the ith section of M units of embeddings
    num_subsections = math.ceil(len(embeddings[0]) / M)
    print(f"Splitting {len(embeddings)} embeddings of size {len(embeddings[0])} into {num_subsections} subsections of size {M}")
    split_embeddings = [[] for _ in range(num_subsections)]
    for embedding in embeddings:
        subsections = [embedding[i:i + M] for i in range(0, len(embedding), M)]
        for i in range(len(subsections)):
            split_embeddings[i].append(subsections[i])

    if verbose:
        print(split_embeddings)
    
    print(f"Performing k means search with k = {k}")
    embeddings_as_centroid_ids = [[] for _ in range(len(embeddings))]

    # given a subsection index and a centroid id within that subsection, return the centroid
    # codebooks = [
    #   [centroid0, centroid1, ..., centroidk], # subsection_0
    #   [centroid0, centroid1, ..., centroidk], # subsection_1
    #   ... 
    #   [centroid0, centroid1, ..., centroidk], # subsection_num_subsections
    # ]
    codebooks = [[] for _ in range(num_subsections)]
    section_index = 0
    for section in split_embeddings:
        print(f"Starting k means for section {section_index}")
        X = np.array(section)
        if verbose:
            print("performing k means with ...")
            print(X)
        kmeans = KMeans(n_clusters=k, random_state=0).fit(X)
        labels = kmeans.predict(X)
        centroids = kmeans.cluster_centers_
        if silhouette_scoring:
            return silhouette_score(X, kmeans.labels_, metric='euclidean')
        if inertia_scoring:
            return kmeans.inertia_
        for i in range(len(labels)):
            centroid_id = labels[i]
            embeddings_as_centroid_ids[i].append(centroid_id)
        for i in range(len(centroids)):
            codebooks[section_index].append(centroids[i])
        section_index += 1
    
    return embeddings_as_centroid_ids, codebooks

In [None]:
# verified output from example here: https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html
# product_quantization([[1, 2], [1, 4], [1, 0], [10, 2], [10, 4], [10, 0]], 1, 2, verbose=True)

In [None]:
# same example as above. with subsections of size M=2, centroids should be the same as above but repeated for the 2 subsections
# product_quantization([[1, 2, 1, 2], [1, 4, 1, 4], [1, 0, 1, 0], [10, 2, 10, 2], [10, 4, 10, 4], [10, 0, 10, 0]], 2, 2, verbose=True)

In [None]:
# quantized, codebooks = product_quantization([[1, 2, 1, 2, 10, 2], [1, 4, 1, 4, 10, 4], [1, 0, 1, 0, 10, 0], [10, 2, 10, 2, 1, 2], [10, 4, 10, 4, 1, 4], [10, 0, 10, 0, 1, 0]], 3, 2, verbose=True)
# quantized, codebooks

# Theoretical Memory Use
* The size of the new centroid_id based embedding table should be N embeddings times (D / M) dimensional vectors times k for size of integers as opposed to N * D * 32
* The size of the codebooks should be 


# Silhouette Method and Elbow Method to determine optimal k for k-Means

Very low silhouette scores and the lack of abnormal drops in the inertia curve signals to me that there is no single "best" value of k for k means. Given these observations, choosing k=256 seems to be the most moderate choice in balancing embedding variance and compression. K=256 also seems to be a good choice for the "elbow" of the inertia curve. 

In [11]:
# import random
# sampled_embeddings = random.sample(embeddings, 10000)
# silhouette_scores = []
# for k in [16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192]:
#     silhouette_scores.append(product_quantization(sampled_embeddings, M=10, k=k, silhouette_scoring=True))
# silhouette_scores
# [0.07371708125729268,
#  0.06497374546551112,
#  0.06076052094252413,
#  0.06372546359032381,
#  0.06776646181674541,
#  0.07199628263885234,
#  0.07976291826725852,
#  0.08442804342327115,
#  0.0766254825841382,
#  0.04187258368671491]

In [12]:
# distortion_scores = []
# for k in [16, 32, 64, 90, 128, 180, 256, 375, 512, 1024, 2048, 4096]:
#    distortion_scores.append(product_quantization(sampled_embeddings, M=10, k=k, inertia_scoring=True))
# distortion_scores

In [13]:
# import matplotlib.pyplot as plt
# plt.scatter([16, 32, 64, 90, 128, 180, 256, 375, 512, 1024, 2048, 4096], distortion_scores, marker='o');

In [10]:
quantized, codebooks = product_quantization([[1, 2], [3, 4]], M=20, k=2)
print(quantized)
keys = [1, 2]

Splitting 2 embeddings of size 2 into 1 subsections of size 20
Performing k means search with k = 2
[[0], [1]]


In [11]:
np.savetxt('data/word2vec/keys.txt', np.array(keys), fmt="%s")
np.savetxt('data/word2vec/quantized.txt', quantized, fmt='%i')
np.savetxt('data/word2vec/codebooks.txt', np.array(codebooks).reshape(np.array(codebooks).shape[0], -1), fmt='%i')
print(len(codebooks[0][0]))

2
