In [8]:
import numpy as np
from sklearn.decomposition import PCA
from utils.typing import Tensor, N, D

# Fix the global NumPy RNG state so the random draws in later cells are reproducible.
seed = 42
np.random.seed(seed)

In [9]:
# Problem size: number of distinct rows, feature dimension, and PCA components to keep.
n_sample_types = 3
d = 3
n_components = 1

# Order matters: both draws consume the global NumPy RNG, so swapping them changes the data.
# One random row per sample type, shape (n_sample_types, d).
data_types = np.random.rand(n_sample_types, d)
# How many times each row is repeated; randint's upper bound is exclusive, so counts are 1..9.
sample_counts = np.random.randint(1, 10, size=n_sample_types)

In [10]:
# Show the distinct rows before they are repeated.
data_types

array([[0.37454012, 0.95071431, 0.73199394],
       [0.59865848, 0.15601864, 0.15599452],
       [0.05808361, 0.86617615, 0.60111501]])

In [11]:
# Expand the distinct rows into an explicit dataset: row i appears sample_counts[i] times.
data_repeated = data_types.repeat(sample_counts, axis=0)
data_repeated

array([[0.37454012, 0.95071431, 0.73199394],
       [0.37454012, 0.95071431, 0.73199394],
       [0.37454012, 0.95071431, 0.73199394],
       [0.37454012, 0.95071431, 0.73199394],
       [0.37454012, 0.95071431, 0.73199394],
       [0.37454012, 0.95071431, 0.73199394],
       [0.37454012, 0.95071431, 0.73199394],
       [0.37454012, 0.95071431, 0.73199394],
       [0.59865848, 0.15601864, 0.15599452],
       [0.59865848, 0.15601864, 0.15599452],
       [0.59865848, 0.15601864, 0.15599452],
       [0.05808361, 0.86617615, 0.60111501],
       [0.05808361, 0.86617615, 0.60111501],
       [0.05808361, 0.86617615, 0.60111501],
       [0.05808361, 0.86617615, 0.60111501],
       [0.05808361, 0.86617615, 0.60111501],
       [0.05808361, 0.86617615, 0.60111501]])

In [12]:
# Reference result: ordinary PCA fitted on the explicitly repeated rows,
# then used to project each distinct row once.
pca = PCA(n_components=n_components, svd_solver="full").fit(data_repeated)
pca.transform(data_types)

array([[ 0.1838945 ],
       [-0.81311361],
       [ 0.16136415]])

In [13]:
class WPCA:
    """Principal component analysis with one non-negative weight per sample.

    Fitting on rows ``X`` with weights ``w`` gives the same mean and principal
    axes (up to sign) as ordinary PCA fitted on a dataset in which row ``i``
    occurs proportionally to ``w[i]``.

    Attributes set by ``fit``:
        mean_: weighted mean of the rows, shape (D,).
        components_: leading right singular vectors of the weighted, centred
            data, shape (min(n_components, N, D), D).
    """

    def __init__(self, n_components=2):
        """Store how many leading principal axes ``fit`` should keep."""
        self.n_components = n_components

    def fit(self, X: "Tensor[N, D]", weights: "Tensor[N]") -> "WPCA":
        """Estimate the weighted mean and principal axes of ``X``.

        Args:
            X: data matrix with one sample per row, shape (N, D).
            weights: one non-negative, finite weight per row, shape (N,).
                They are normalised to sum to one, so raw counts work too.

        Returns:
            This instance, so calls can be chained.

        Raises:
            ValueError: if ``X`` is not 2-D, ``weights`` does not have one
                entry per row, or the weights are negative, non-finite, or
                sum to zero.
        """
        X = np.asarray(X, dtype=float)
        weights = np.asarray(weights, dtype=float)

        if X.ndim != 2:
            raise ValueError(f"X must be 2-D (N, D), got shape {X.shape}")
        if weights.shape != (X.shape[0],):
            raise ValueError(
                f"weights must have shape ({X.shape[0]},), got {weights.shape}"
            )
        # Negative or NaN weights would silently turn into NaN under the sqrt below.
        if not np.all(np.isfinite(weights)) or np.any(weights < 0):
            raise ValueError("weights must be finite and non-negative")
        total = np.sum(weights)
        if total <= 0:
            raise ValueError("weights must not sum to zero")

        weights = weights / total
        self.mean_ = np.average(X, axis=0, weights=weights)

        # Scaling each centred row by sqrt(w_i) makes A.T @ A the weighted
        # covariance, so the right singular vectors of A are its eigenvectors.
        # Signs are whatever np.linalg.svd returns; no sign normalisation is applied.
        _, _, Vt = np.linalg.svd(
            np.sqrt(weights)[:, np.newaxis] * (X - self.mean_),
            full_matrices=False,
        )
        self.components_ = Vt[:self.n_components]
        return self

    def transform(self, X: "Tensor[N, D]") -> np.ndarray:
        """Project rows of ``X`` onto the fitted principal axes.

        Args:
            X: data matrix with one sample per row, shape (N, D).

        Returns:
            Array of shape (N, K) where K is the number of rows in
            ``components_``; column j holds the coordinate along axis j.
        """
        return (np.asarray(X) - self.mean_) @ self.components_.T

In [14]:
# Weight each distinct row by its share of the repeated dataset.
weights = sample_counts / np.sum(sample_counts)
# Use the shared n_components setting (not a literal) so this model keeps the
# same number of axes as the reference PCA if the setting is changed.
wpca = WPCA(n_components=n_components)
wpca.fit(data_types, weights)
wpca.transform(data_types)

array([[ 0.1838945 ],
       [-0.81311361],
       [ 0.16136415]])

In [15]:
# Check that the weighted fit on distinct rows reproduces the projections of
# ordinary PCA fitted on the repeated rows, element by element.
bool(
    np.isclose(
        wpca.transform(data_types),
        pca.transform(data_types),
        atol=1e-6,
    ).all()
)

True