In [None]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans

In [1]:
class ProductQuantization:
    """
    Product quantizer: splits vectors into contiguous subvectors, learns one
    KMeans codebook per subvector position, and encodes a vector as the list of
    nearest-centroid indices (one index per subvector).

    Subvector boundaries always follow ``np.array_split``, so training and
    encoding agree even when the dimension is not divisible by the number of
    subvectors (the leading subvectors are then one element longer).
    """

    def __init__(self, num_sub, num_clu):
        """
        Initialize the PQ class.

        Parameters:
        num_sub (int): Number of subvectors each vector is split into.
        num_clu (int): Number of clusters for each subvector space.
        """
        self.num_subvectors = num_sub
        self.num_clusters = num_clu
        # One (num_clu, sub_dim) centroid array per subvector; filled by kmeans().
        self.codebooks = []

    def cut(self, data):
        """
        Split vectors into multiple subvectors.

        Parameters:
        data (List): Input vector data, one vector per row.

        Returns:
        Array of shape (num_vectors, num_subvectors, sub_dim) holding the
        subvectors of every input vector.

        Raises:
        ValueError: If the subvectors do not all have the same length (the
        dimension is not divisible by the number of subvectors), because they
        cannot be stacked into one regular array.
        """
        # Splitting the columns once is equivalent to splitting every row
        # separately, but avoids a Python-level loop over the rows.
        column_blocks = np.array_split(np.asarray(data), self.num_subvectors, axis=1)
        return np.stack(column_blocks, axis=1)

    def kmeans(self, data):
        """
        Perform KMeans clustering for each subvector space and construct the codebook.

        Any codebooks from a previous call are replaced, so the method can be
        called again to refit on new data.

        Parameters:
        data : Input vector data, one vector per row.
        """
        # Column block i holds the i-th subvector of every training vector.
        column_blocks = np.array_split(np.asarray(data), self.num_subvectors, axis=1)
        # Build a fresh list instead of appending: appending would leave stale
        # codebooks in the first slots after a second fit.
        self.codebooks = [
            KMeans(n_clusters=self.num_clusters, random_state=42).fit(block).cluster_centers_
            for block in column_blocks
        ]

    def quantize(self, vector):
        """
        Quantize the original vector into indices of the codebook.

        Parameters:
        vector : The input vector.

        Returns:
        List: Quantized indices, one per subvector.

        Raises:
        IndexError: If fewer codebooks than subvectors are available
        (for example when kmeans() has not been called yet).
        ValueError: If a subvector's length does not match its codebook.
        """
        # Use the same splitting rule as training so that subvector i always
        # lines up with codebook i, including for non-divisible dimensions.
        subvectors = np.array_split(np.asarray(vector), self.num_subvectors)
        quantized_indices = []
        for i, subvector in enumerate(subvectors):
            codebook = np.asarray(self.codebooks[i])
            if subvector.shape[0] != codebook.shape[1]:
                # Without this check NumPy broadcasting could silently compare
                # a short subvector against every centroid component.
                raise ValueError(
                    f"Subvector {i} has length {subvector.shape[0]}, "
                    f"but its codebook expects {codebook.shape[1]}"
                )
            distances = np.linalg.norm(codebook - subvector, axis=1)
            quantized_indices.append(np.argmin(distances))
        return quantized_indices

    def reconstruct(self, quantized_indices):
        """
        Reconstruct the vector based on the quantized indices.

        Parameters:
        quantized_indices (List): The list of quantized indices.

        Returns:
        The reconstructed vector (the selected centroids concatenated in order).
        """
        centroids = [self.codebooks[i][idx] for i, idx in enumerate(quantized_indices)]
        return np.concatenate(centroids)

    def compute_error(self, original_vector, reconstructed_vector):
        """
        Compute the absolute and relative error between original and reconstructed vector.

        The absolute error is the Euclidean norm of the difference; the relative
        error is the absolute error divided by the norm of the original vector.

        Parameters:
        original_vector : The original vector.
        reconstructed_vector : The reconstructed vector.

        Returns:
        Tuple: Absolute error, relative error. For an all-zero original vector
        the relative error is 0.0 when the reconstruction is exact and inf
        otherwise.
        """
        original = np.asarray(original_vector)
        absolute_error = np.linalg.norm(original - np.asarray(reconstructed_vector))
        original_norm = np.linalg.norm(original)
        if original_norm == 0:
            # Avoid 0/0 -> nan (and the accompanying RuntimeWarning), which
            # would otherwise poison the dataset-wide mean.
            relative_error = np.float64(0.0) if absolute_error == 0 else np.float64(np.inf)
        else:
            relative_error = absolute_error / original_norm
        return absolute_error, relative_error

    def evaluate_dataset(self, data):
        """
        Evaluate the average error across the dataset.

        Parameters:
        data (List): Input dataset, one vector per row.

        Returns:
        Tuple: Mean absolute error, mean relative error.
        """
        absolute_errors = []
        relative_errors = []
        for vector in data:
            reconstructed_vector = self.reconstruct(self.quantize(vector))
            abs_err, rel_err = self.compute_error(vector, reconstructed_vector)
            absolute_errors.append(abs_err)
            relative_errors.append(rel_err)
        return np.mean(absolute_errors), np.mean(relative_errors)

    def print_codebooks(self):
        """
        Display the codebooks for each subvector.

        Each codebook is printed as a DataFrame with one row per centroid.
        """
        for i, codebook in enumerate(self.codebooks):
            df = pd.DataFrame(codebook)
            print(f"Codebook for subvector {i+1}:")
            print(df)

# Generate data: 10000 standard-normal vectors of dimension 768.
# A seeded generator keeps the reported errors reproducible between runs.
rng = np.random.default_rng(42)
data = rng.normal(loc=0, scale=1, size=(10000, 768))

# num_sub = 4 denotes 4 subspaces and num_clu = 16 denotes 16 classes clustered in each subspace
pq = ProductQuantization(num_sub=4, num_clu=16)
pq.kmeans(data)

# Quantize and reconstruct a single vector (the first one) as a spot check
original_vector = data[0]
quantized_indices = pq.quantize(original_vector)
reconstructed_vector = pq.reconstruct(quantized_indices)

# Codebook output as a DataFrame for manual verification
pq.print_codebooks()

# Calculate the mean error for the entire dataset
mean_absolute_error, mean_relative_error = pq.evaluate_dataset(data)
print(f"Mean Absolute Error: {mean_absolute_error}")
print(f"Mean Relative Error: {mean_relative_error}")

Codebook for subvector 1:
         0         1         2         3         4         5         6    \
0   0.030480 -0.119639 -0.096747  0.025427 -0.094753  0.417124 -0.051446   
1  -0.052896 -0.263104  0.189923  0.226979 -0.084419 -0.234670 -0.173113   
2  -0.311142 -0.200950  0.023813  0.037042  0.032061 -0.191592  0.063583   
3  -0.057471 -0.206931 -0.090752  0.122274 -0.042809  0.027891 -0.264854   
4  -0.333751  0.034957  0.013319  0.362404  0.225175  0.049865 -0.308281   
5   0.002198 -0.055979  0.081299  0.172955 -0.085070 -0.154067 -0.141806   
6  -0.012632  0.117362 -0.108425 -0.141804  0.014180  0.036264 -0.085298   
7  -0.056948  0.222933  0.330165 -0.011275  0.000293 -0.018705  0.311166   
8   0.149619  0.060697 -0.045302  0.078821 -0.090821 -0.024372 -0.085892   
9  -0.019026  0.017143 -0.162784 -0.065920  0.075649 -0.050894  0.083884   
10  0.345531 -0.060016  0.362170 -0.196723 -0.208362  0.114162 -0.119513   
11  0.067610  0.086755  0.232063  0.149892  0.128611  0.070223

The mean absolute error is 27.35, which is large: the expected norm of a 768-dimensional standard-normal vector is about √768 ≈ 27.7, so the difference between the original and reconstructed vectors is almost as large as the vectors themselves.
The mean relative error is close to 1, which means the reconstruction error is nearly as large as the norm of the original vector, i.e. the reconstructed vector is far from the original and retains very little of its information.
A likely explanation is that independent, normally distributed data has no cluster structure, so KMeans cannot find effective cluster centres (only 16 centroids for each 192-dimensional subspace), which leads to high reconstruction errors.