Merge pull request #62 from guilgautier/alpha-dpp-dev
Incorporate alpha dpp sampler

Green lights from Travis https://travis-ci.com/github/guilgautier/DPPy/builds/203807250
danielecc committed Nov 24, 2020
2 parents 37ca8e3 + 58625f9 commit d89d7e9
Showing 13 changed files with 1,550 additions and 527 deletions.
14 changes: 7 additions & 7 deletions docs/bibliography/biblio.bib
@@ -179,17 +179,17 @@ @InProceedings{Dere19
abstract = { Given a fixed $n\times d$ matrix $\mathbf{X}$, where $n\gg d$, we study the complexity of sampling from a distribution over all subsets of rows where the probability of a subset is proportional to the squared volume of the parallelepiped spanned by the rows (a.k.a. a determinantal point process). In this task, it is important to minimize the preprocessing cost of the procedure (performed once) as well as the sampling cost (performed repeatedly). To that end, we propose a new determinantal point process algorithm which has the following two properties, both of which are novel: (1) a preprocessing step which runs in time $O\big(\text{number-of-non-zeros}(\mathbf{X})\cdot\log n\big)+\text{poly}(d)$, and (2) a sampling step which runs in $\text{poly}(d)$ time, independent of the number of rows $n$. We achieve this by introducing a new \textit{regularized} determinantal point process (R-DPP), which serves as an intermediate distribution in the sampling procedure by reducing the number of rows from $n$ to $\text{poly}(d)$. Crucially, this intermediate distribution does not distort the probabilities of the target sample. Our key novelty in defining the R-DPP is the use of a Poisson random variable for controlling the probabilities of different subset sizes, leading to new determinantal formulas such as the normalization constant for this distribution. Our algorithm has applications in many diverse areas where determinantal point processes have been used, such as machine learning, stochastic optimization, data summarization and low-rank matrix reconstruction.}
}
@inproceedings{DeCaVa19,
abstract = {We study the complexity of sampling from a distribution over all index subsets of the set {\{}1, ..., n{\}} with the probability of a subset S proportional to the determinant of the submatrix $L_S$ of some n × n p.s.d. matrix L, where $L_S$ corresponds to the entries of L indexed by S. Known as a determinantal point process (DPP), this distribution is widely used in machine learning to induce diversity in subset selection. In practice, we often wish to sample multiple subsets S with small expected size $k = E[|S|] \ll n$ from a very large matrix L, so it is important to minimize the pre-processing cost of the procedure (performed once) as well as the sampling cost (performed repeatedly). To that end, we propose a new algorithm which, given access to L, samples exactly from a DPP while satisfying the following two properties: (1) its preprocessing cost is n {\textperiodcentered} poly(k) (sublinear in the size of L) and (2) its sampling cost is poly(k) (independent of the size of L). Prior to this work, state-of-the-art exact samplers required $O(n^3)$ preprocessing time and sampling time linear in n or dependent on L's spectrum.},
archivePrefix = {arXiv},
arxivId = {1905.13476},
author = {Derezi{\'{n}}ski, Michal and Calandriello, Daniele and Valko, Michal},
booktitle = {Workshop on Negative Dependence in Machine Learning, International Conference on Machine Learning (ICML)},
eprint = {1905.13476},
file = {:Users/ggautier/Documents/Mendeley Desktop/Derez{\'{i}}, Calandriello, Valko - 2019 - Exact sampling of determinantal point processes with sublinear time preprocessing.pdf:pdf},
title = {{Exact sampling of determinantal point processes with sublinear time preprocessing}},
url = {https://negative-dependence-in-ml-workshop.lids.mit.edu/wp-content/uploads/sites/29/2019/06/icml.pdf},
booktitle={Advances in Neural Information Processing Systems},
year = {2019}
}
@inproceedings{CaDeVa20,
author = {Calandriello, Daniele and Derezi{\'{n}}ski, Michal and Valko, Michal},
title = {{Sampling from a k-DPP without looking at all items}},
booktitle={Advances in Neural Information Processing Systems},
year = {2020}
}
@incollection{DeWaHs18,
title = {Leveraged volume sampling for linear regression},
author = {Derezinski, Michal and Warmuth, Manfred K and Hsu, Daniel J},
2 changes: 1 addition & 1 deletion docs/continuous_dpps/sampling.rst
@@ -162,7 +162,7 @@ where :math:`\mathbf{K}_{i-1} = \left[\left\langle \Phi(x_p)^{\top} \Phi(x_q) \r
- Some :ref:`beta_ensembles_definition_OPE` (specific instances of projection DPPs) can be :ref:`sampled <beta_ensembles_sampling>` in :math:`\mathcal{O}(r^2)` by computing the eigenvalues of properly randomised tridiagonal matrices (see the sketch after this list).
- The :ref:`multivariate Jacobi ensemble <multivariate_jacobi_ope>` whose :py:meth:`~dppy.multivariate_jacobi_ope.MultivariateJacobiOPE.sample` method relies on the chain rule described by :eq:`eq:continuous_dpps_exact_sampling_projection_DPP_chain_rule_geometric`.
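
For intuition on the tridiagonal route, here is a minimal NumPy/SciPy sketch of the Dumitriu--Edelman model for the Hermite :math:`\beta`-ensemble. It is only meant to illustrate the :math:`\mathcal{O}(r^2)` cost quoted above; the normalisation may differ from the one used by DPPy's beta ensembles module, and the function name is purely illustrative.

.. code-block:: python

    import numpy as np
    from scipy.linalg import eigh_tridiagonal

    def hermite_beta_ensemble(r, beta=2, seed=None):
        # Dumitriu-Edelman: the eigenvalues of a random symmetric
        # tridiagonal matrix follow the Hermite beta-ensemble.
        rng = np.random.default_rng(seed)
        diag = rng.normal(scale=np.sqrt(2.0), size=r)
        offdiag = np.sqrt(rng.chisquare(beta * np.arange(r - 1, 0, -1)))
        # Computing all eigenvalues of a symmetric tridiagonal matrix costs O(r^2).
        return eigh_tridiagonal(diag, offdiag, eigvals_only=True) / np.sqrt(2.0)

    sample = hermite_beta_ensemble(r=100, beta=2, seed=0)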

.. _finite_dpps_exact_sampling_spectral_method:
.. _continuous_dpps_exact_sampling_spectral_method:

Generic DPPs: the spectral method
---------------------------------
39 changes: 32 additions & 7 deletions docs/finite_dpps/exact_sampling.rst
@@ -8,11 +8,35 @@ Exact sampling
Consider a finite DPP defined by its correlation kernel :math:`\mathbf{K}` :eq:`eq:inclusion_proba_DPP_K` or likelihood kernel :math:`\mathbf{L}` :eq:`eq:likelihood_DPP_L`.
There exist three main types of exact sampling procedures:

1. The spectral method requires the eigendecomposition of the correlation kernel :math:`\mathbf{K}` or the likelihood kernel :math:`\mathbf{L}`. It is based on the fact that :ref:`generic DPPs are mixtures of projection DPPs <finite_dpps_mixture>` together with the application of the chain rule to sample projection DPPs. It is presented in Section :ref:`finite_dpps_exact_sampling_spectral_method`.

2. A Cholesky-based procedure which requires the correlation kernel :math:`\mathbf{K}` (even non-Hermitian!). It boilds down to applying the chain rule on sets; where each item in turn is decided to be excluded or included in the sample. It is presented in Section :ref:`finite_dpps_exact_sampling_cholesky_method`.

3. Recently, :cite:`DeCaVa19` have also proposed an alternative method to get exact samples: first sample an intermediate distribution and correct the bias by thinning the intermediate sample using a carefully designed DPP. It is presented in Section :ref:`finite_dpps_exact_sampling_intermediate_sampling_method`.
1. The spectral method (used by default) requires the eigendecomposition of the correlation kernel :math:`\mathbf{K}` or the likelihood kernel :math:`\mathbf{L}`. It is based on the fact that :ref:`generic DPPs are mixtures of projection DPPs <finite_dpps_mixture>` together with the application of the chain rule to sample projection DPPs. It is presented in Section :ref:`finite_dpps_exact_sampling_spectral_method`.

2. A Cholesky-based procedure which requires the correlation kernel :math:`\mathbf{K}` (even non-Hermitian!). It boils down to applying the chain rule on sets; where each item in turn is decided to be excluded or included in the sample. It is presented in Section :ref:`finite_dpps_exact_sampling_cholesky_method`.

3. Recently, :cite:`DeCaVa19` have also proposed an alternative method to get exact samples: first sample an intermediate distribution and correct the bias by thinning the intermediate sample using a carefully designed DPP. This approach does not require a Cholesky or eigendecomposition of the DPP kernel; instead, its runtime scales with the expected sample size of the DPP (see :ref:`finite_dpps_number_of_points`). It is presented in Section :ref:`finite_dpps_exact_sampling_intermediate_sampling_method`. A more refined procedure based on this approach was introduced in :cite:`CaDeVa20` for k-DPP sampling.

In general, for small :math:`N` (e.g., less than 1000) the spectral or Cholesky samplers
are recommended for numerical stability. For larger :math:`N` (e.g., up to millions)
and moderate :math:`k` (e.g., in the hundreds), intermediate sampling is recommended for scalability.

The following table summarizes the complexity of all exact samplers currently available,
where the expected sample size :math:`\mathbb{E}[|X|]` is equal to :math:`k` for k-DPPs
and :math:`d_{\text{eff}}` for random-sized DPPs.

+-----------------------+--------------+--------------------------------------+--------------------------------------+--------------------------------------+--------------------------------------+------------------------------------------------------------------------------+
| | ``mode=`` | Time to first sample |Time to subsequent samples | Notes |
+ + +--------------------------------------+--------------------------------------+--------------------------------------+--------------------------------------+ +
| | | DPP | k-DPP | DPP | k-DPP | |
+=======================+==============+======================================+======================================+======================================+======================================+==============================================================================+
| Spectral sampler |``"GS"``, | :math:`O(N^3)` |:math:`O(N^3)` |:math:`O(N d_{\text{eff}}^2)` |:math:`O(N k^2)` | The three variants differ slightly, |
| |``"GS_bis"``, | | | | | and depending on the DPP they can |
| |``"KuTa12"`` | | | | | have different numerical stability. |
+-----------------------+--------------+--------------------------------------+--------------------------------------+--------------------------------------+--------------------------------------+------------------------------------------------------------------------------+
| Cholesky sampler | ``"chol"`` | :math:`O(N^3)` |:math:`O(N^3)` |:math:`O(N^3)` |:math:`O(N^3)` | Also works for non-Hermitian DPPs. |
+-----------------------+--------------+--------------------------------------+--------------------------------------+--------------------------------------+--------------------------------------+------------------------------------------------------------------------------+
| Intermediate sampler | ``"vfx"`` | :math:`O(N d_{\text{eff}}^6)` |:math:`O(N k^{10} + k^{15})` |:math:`O(d_{\text{eff}}^6)` |:math:`O(k^6)` | For ``"alpha"`` we report worst case runtime, but depending on the DPP |
+ +--------------+--------------------------------------+--------------------------------------+--------------------------------------+--------------------------------------+ structure best case runtime can be much faster than ``"vfx"``. For |
| | ``"alpha"`` | :math:`O(N d_{\text{eff}}^5)` |:math:`O(N k^6/d_{\text{eff}} + k^9)` |:math:`O(d_{\text{eff}}^6)` |:math:`O(k^6)` | particularly ill-posed DPPs ``"vfx"`` can be more numerically stable. |
+-----------------------+--------------+--------------------------------------+--------------------------------------+--------------------------------------+--------------------------------------+------------------------------------------------------------------------------+
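
As an illustration of the ``mode=`` column above, the following sketch draws samples with the spectral and Cholesky samplers, assuming the likelihood kernel :math:`\mathbf{L}` is available as a dense matrix (the intermediate samplers ``"vfx"`` and ``"alpha"`` are typically built from an evaluation function for :math:`\mathbf{L}` together with the data; see the intermediate sampling section below and the API documentation for the exact constructor arguments).

.. code-block:: python

    import numpy as np
    from numpy.random import RandomState
    from dppy.finite_dpps import FiniteDPP

    rng = RandomState(0)
    r, N = 5, 100
    Phi = rng.randn(r, N)
    L = Phi.T.dot(Phi)                              # likelihood kernel, rank r

    DPP_L = FiniteDPP('likelihood', **{'L': L})
    X_spectral = DPP_L.sample_exact(mode='GS')      # spectral sampler (default)
    X_kdpp = DPP_L.sample_exact_k_dpp(size=3)       # k-DPP of fixed size k=3

    K = L.dot(np.linalg.inv(L + np.eye(N)))         # correlation kernel
    DPP_K = FiniteDPP('correlation', **{'K': K})
    X_chol = DPP_K.sample_exact(mode='chol')        # Cholesky sampler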

.. note::

@@ -551,17 +575,18 @@ This method is based on the concept of a **distortion-free intermediate sample**
\mathbb{P}[i \in \mathcal{X}] = \big[\mathbf{L}(I + \mathbf{L})^{-1}\big]_{ii}=\tau_i,\quad i\text{th 1-ridge leverage score}.
Suppose that we draw a sample of :math:`t` points i.i.d. proportional to ridge leverage scores, i.e., :math:`\sigma=(\sigma_1, \sigma_2,...,\sigma_t)` such that :math:`\mathbb{P}[\sigma_j=i]\propto\tau_i`. Intuitively, this sample is similar fo :math:`\mathcal{X}\sim \operatorname{DPP}(\mathbf{L})` except that it "ignores" all the dependencies between the points. However, if we sample sufficiently many points i.i.d. according to RLS, then a proper sample :math:`\mathcal{X}` will likely be contained within it. This can be formally shown for :math:`t = O(\mathbb{E}[|\mathcal{X}|]^2)`. When :math:`\mathbb{E}[|\mathcal{X}|]^2\ll N`, then this allows us to reduce the size of the DPP kernel :math:`\mathbf{L}` from :math:`N\times N` to a much smaller size :math:`\mathbf{\tilde{L}}` :math:`t\times t`. Making this sampling exact requires considerably more care, because even with a large :math:`t` there is always a small probability that the i.i.d. sample :math:`\sigma` is not sufficiently diverse. We guard against this possibility by rejection sampling.
Suppose that we draw a sample :math:`\sigma` of :math:`t` points i.i.d. proportional to ridge leverage scores, i.e., :math:`\sigma=(\sigma_1, \sigma_2,...,\sigma_t)` such that :math:`\mathbb{P}[\sigma_j=i]\propto\tau_i`. Intuitively, this sample is similar to :math:`\mathcal{X}\sim \operatorname{DPP}(\mathbf{L})` because the marginals are the same, but it "ignores" all the dependencies between the points. However, if we sample sufficiently many points i.i.d. according to RLS, then a proper sample :math:`\mathcal{X}` will likely be contained within :math:`\sigma`. This can be formally shown for :math:`t = O(\mathbb{E}[|\mathcal{X}|]^2)`. When :math:`\mathbb{E}[|\mathcal{X}|]^2\ll N`, this allows us to replace the :math:`N\times N` DPP kernel :math:`\mathbf{L}` with a much smaller :math:`t\times t` kernel :math:`\mathbf{\tilde{L}}`. Making this sampling exact requires considerably more care, because even with a large :math:`t` there is always a small probability that the i.i.d. sample :math:`\sigma` is not sufficiently diverse. We guard against this possibility by rejection sampling.
.. important::
Use this method for sampling :math:`\mathcal{X} \sim \operatorname{DPP}(\mathbf{L})` when :math:`\mathbb{E}\left[|\mathcal{X}|\right]\ll\sqrt{N}`.

- Preprocessing costs :math:`\mathcal{O}\big(N\cdot \text{poly}(\mathbb{E}\left[|\mathcal{X}|\right])\, \text{polylog}(N)\big)`.
- Each sample costs :math:`\mathcal{O}\big(\mathbb{E}[|\mathcal{X}|]^6\big)`.

There are two implementations of intermediate sampling available in :code:`dppy`: the :code:`mode='vfx'` sampler and the :code:`mode='alpha'` sampler.
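
To make the intermediate-distribution idea concrete, the sketch below computes the 1-ridge leverage scores explicitly and draws the oversampled i.i.d. set :math:`\sigma`. It is illustrative only: the actual ``'vfx'`` and ``'alpha'`` samplers approximate the leverage scores without forming :math:`(I+\mathbf{L})^{-1}`, and they add the rejection step that makes the procedure exact.

.. code-block:: python

    import numpy as np

    rng = np.random.default_rng(0)
    N, r = 1000, 4
    Phi = rng.standard_normal((N, r))
    L = Phi @ Phi.T                                  # likelihood kernel

    tau = np.diag(L @ np.linalg.inv(L + np.eye(N)))  # 1-ridge leverage scores
    d_eff = tau.sum()                                # E[|X|]

    t = int(np.ceil(d_eff ** 2))                     # oversampling, t = O(E[|X|]^2)
    sigma = rng.choice(N, size=t, p=tau / tau.sum()) # i.i.d. intermediate sample
    L_tilde = L[np.ix_(sigma, sigma)]                # reduced t x t kernel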

In practice
===========

.. testcode::

from numpy.random import RandomState
2 changes: 2 additions & 0 deletions docs/finite_dpps/properties.rst
@@ -154,6 +154,8 @@ Expectation
= \sum_{n=1}^N \lambda_n
= \sum_{n=1}^N \frac{\gamma_n}{1+\gamma_n}.
The expected size of a DPP with likelihood matrix :math:`\mathbf{L}` coincides with the effective dimension :math:`d_{\text{eff}}(\mathbf{L}) = \operatorname{trace} (\mathbf{L}(\mathbf{L}+\mathbf{I})^{-1})= \operatorname{trace} \mathbf{K} = \mathbb{E}[|\mathcal{X}|]` of :math:`\mathbf{L}`, a quantity with many applications in randomized numerical linear algebra and statistical learning theory (see e.g., :cite:`DeCaVa19`).
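
The identity above is easy to check numerically; here is a standalone sketch (plain NumPy, independent of DPPy):

.. code-block:: python

    import numpy as np

    rng = np.random.default_rng(0)
    N = 6
    A = rng.standard_normal((N, N))
    L = A @ A.T                                   # likelihood kernel (PSD)
    K = L @ np.linalg.inv(L + np.eye(N))          # correlation kernel

    gamma = np.linalg.eigvalsh(L)                 # eigenvalues of L
    # E[|X|] = trace K = sum_n gamma_n / (1 + gamma_n) = d_eff(L)
    assert np.isclose(np.trace(K), np.sum(gamma / (1 + gamma)))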

Variance
--------

2 changes: 1 addition & 1 deletion docs/index.rst
@@ -103,7 +103,7 @@ How to cite this work?
======================

We wrote a companion paper to
`DPPy <https://github.com/guilgautier/DPPy>`__ which got accepted for publication in the `MLOSS <http://www.jmlr.org/mloss/>`__ track of JMLR.
`DPPy <https://github.com/guilgautier/DPPy>`__ which got accepted for publication in the `MLOSS <http://www.jmlr.org/mloss/>`__ track of JMLR, see :cite:`GPBV19`.

If you use this package, please consider citing it with this piece of
BibTeX: