# Product Quantization

In [1]:
# English CoNLL17 corpus, Word2Vec Continuous Skipgram, 4027169 word embeddings 
# from http://vectors.nlpl.eu/repository/
f = open('data/word2vec/model.txt', encoding = "ISO-8859-1")
lines = f.readlines()

In [1]:
print(len(lines))
print(lines[0]
print(lines[3])

SyntaxError: invalid syntax (2071990274.py, line 3)

In [None]:
import numpy as np

keys = []
embeddings = []
for i in range(1, 100001): # use len(lines) for the whole table, testing on first 1000000
    line = lines[i].split(" ")
    keys.append(line[0])
    embeddings.append(line[1:-1])

In [4]:
keys = []
embeddings = []
with open("data/genomes/genomes_minH_block100.npy.xz", encoding="ISO-8859-1") as infile:
    first = True
    for line in infile:
        if first:
            first = False
        else:
            vector = line.split(" ")
            keys.append(vector[0])
            embeddings.append(vector[1:-1])

In [25]:
import faiss

class FaissKMeans:
    def __init__(self, n_clusters=8, n_init=10, max_iter=300):
        self.n_clusters = n_clusters
        self.n_init = n_init
        self.max_iter = max_iter
        self.kmeans = None
        self.cluster_centers_ = None
        self.inertia_ = None

    def fit(self, X):
        self.kmeans = faiss.Kmeans(d=X.shape[1],
                                   k=self.n_clusters,
                                   niter=self.max_iter,
                                   nredo=self.n_init)
        self.kmeans.train(X.astype(np.float32))
        self.cluster_centers_ = self.kmeans.centroids
        self.inertia_ = self.kmeans.obj[-1]

    def predict(self, X):
        return self.kmeans.index.search(X.astype(np.float32), 1)[1]

In [41]:
import math
from sklearn.metrics import silhouette_score

def product_quantization(embeddings, M, k, verbose=False, silhouette_scoring=False, inertia_scoring=False):
    """
    embeddings: embedding vectors
    M: size of vector subsections
    k: number of clusters for k-means clustering

    Runs product quantization on the embeddings.
    Returns each embedding as an array of integers, each representing the id of a centroid in that region
    """
    # build split embeddings
    # split_embeddings[i] -> list of vectors, each which represents the ith section of M units of embeddings
    num_subsections = math.ceil(len(embeddings[0]) / M)
    print(
        f"Splitting {len(embeddings)} embeddings of size {len(embeddings[0])} into {num_subsections} subsections of size {M}")
    split_embeddings = [[] for _ in range(num_subsections)]
    for embedding in embeddings:
        subsections = [embedding[i:i + M] for i in range(0, len(embedding), M)]
        for i in range(len(subsections)):
            split_embeddings[i].append(subsections[i])

    print(f"Performing k means search with k = {k}")
    embeddings_as_centroid_ids = [[] for _ in range(len(embeddings))]

    # given a subsection index and a centroid id within that subsection, return the centroid
    # codebooks = [
    #   [centroid0, centroid1, ..., centroidk], # subsection_0
    #   [centroid0, centroid1, ..., centroidk], # subsection_1
    #   ...
    #   [centroid0, centroid1, ..., centroidk], # subsection_num_subsections
    # ]
    codebooks = [[] for _ in range(num_subsections)]
    section_index = 0
    for section in split_embeddings:
        print(f"Starting k means for section {section_index}")
        X = np.array(section)
        kmeans = FaissKMeans(n_clusters=k)
        kmeans.fit(X)
        labels = kmeans.predict(X)
        centroids = kmeans.cluster_centers_
        if silhouette_scoring:
            return silhouette_score(X, kmeans.labels_, metric='euclidean')
        if inertia_scoring:
            return kmeans.inertia_
        for i in range(len(labels)):
            centroid_id = labels[i]
            embeddings_as_centroid_ids[i].append(centroid_id[0])
        for i in range(len(centroids)):
            codebooks[section_index].append(centroids[i])
        section_index += 1

    return embeddings_as_centroid_ids, codebooks

In [None]:
# verified output from example here: https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html
# product_quantization([[1, 2], [1, 4], [1, 0], [10, 2], [10, 4], [10, 0]], 1, 2, verbose=True)

In [None]:
# testing faiss
# product_quantization([[1, 2], [1, 4], [1, 0], [10, 2], [10, 4], [10, 0]], 1, 2, verbose=True)

In [None]:
# same example as above. with subsections of size M=2, centroids should be the same as above but repeated for the 2 subsections
# product_quantization([[1, 2, 1, 2], [1, 4, 1, 4], [1, 0, 1, 0], [10, 2, 1, 1], [10, 4, 10, 4], [10, 0, 10, 0]], 2, 2, verbose=True)

In [None]:
quantized, codebooks = product_quantization([[1, 2, 1, 2, 10, 2], [1, 4, 1, 4, 10, 4], [1, 0, 1, 0, 10, 0], [10, 2, 10, 2, 1, 2], [10, 4, 10, 4, 1, 4], [10, 0, 10, 0, 1, 0]], 3, 2, verbose=True)


# Theoretical Memory Use
* The size of the new centroid_id based embedding table should be N embeddings times (D / M) dimensional vectors times k for size of integers as opposed to N * D * 32
* The size of the codebooks should be 


# Silhouette Method and Elbow Method to determine optimal k for k-Means

Very low silhouette scores and the lack of abnormal drops in the inertia curve signals to me that there is no single "best" value of k for k means. Given these observations, choosing k=256 seems to be the most moderate choice in balancing embedding variance and compression. K=256 also seems to be a good choice for the "elbow" of the inertia curve. 

In [None]:
# import random
# sampled_embeddings = random.sample(embeddings, 10000)
# silhouette_scores = []
# for k in [16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192]:
#     silhouette_scores.append(product_quantization(sampled_embeddings, M=10, k=k, silhouette_scoring=True))
# silhouette_scores
# [0.07371708125729268,
#  0.06497374546551112,
#  0.06076052094252413,
#  0.06372546359032381,
#  0.06776646181674541,
#  0.07199628263885234,
#  0.07976291826725852,
#  0.08442804342327115,
#  0.0766254825841382,
#  0.04187258368671491]

In [None]:
# distortion_scores = []
# for k in [16, 32, 64, 90, 128, 180, 256, 375, 512, 1024, 2048, 4096]:
#    distortion_scores.append(product_quantization(sampled_embeddings, M=10, k=k, inertia_scoring=True))
# distortion_scores

In [None]:
# import matplotlib.pyplot as plt
# plt.scatter([16, 32, 64, 90, 128, 180, 256, 375, 512, 1024, 2048, 4096], distortion_scores, marker='o');
import numpy as np

In [None]:
quantized, codebooks = product_quantization([[1.4, 2.77, 5], [3, 4, 7], [6, 8, 3], [5.5, 8, 2]], M=1, k=2)
print(codebooks)
keys = [1, 2]

In [None]:
np.savetxt('data/word2vec/keys.txt', np.array(keys), fmt="%s")
np.savetxt('data/word2vec/quantized.txt', quantized, fmt='%i')
np.savetxt('data/word2vec/codebooks.txt', np.array(codebooks).reshape(np.array(codebooks).shape[0], -1), fmt='%1.3f')
print(len(codebooks[0][0]))

In [None]:
f = open('data/word2vec/quantized.txt', encoding = "ISO-8859-1")
line = f.readlines()[0]
print(len(line.split(" ")))

In [13]:
import numpy as np
result = np.load("/home/det4/embedding/data/amazonPolarity/amazon_polarity_test.npy", allow_pickle=True)
np.savetxt("/home/det4/embedding/data/amazonPolarity/amazon_polarity_test.txt", result, fmt='%i')

In [14]:
import numpy as np
result = np.load("/home/det4/embedding/data/genomes/genomes_minH_block100.npy", allow_pickle=True)
np.savetxt("/home/det4/embedding/data/genomes/genomes_minH_block100.txt", result, fmt='%i')