fix reference formatting
ddbourgin committed Sep 11, 2019
1 parent 9f36064 commit 712dfe4
Showing 3 changed files with 37 additions and 27 deletions.
Binary file added docs/img/decision_tree.png
12 changes: 12 additions & 0 deletions docs/numpy_ml.trees.rst
@@ -10,6 +10,18 @@ is associated with a decision rule, which dictates how to divide the data the
node inherits from its parent among each of its children. Each leaf node is
associated with at least one data point from the original training set.

.. figure:: img/decision_tree.png
:width: 95%
:align: center

A binary decision tree trained on the dataset :math:`X = \{ \mathbf{x}_1,
\ldots, \mathbf{x}_{10} \}`. Each example in the dataset is a 5-dimensional
vector of real-valued features labeled :math:`x_1, \ldots, x_5`. Unshaded
circles correspond to internal decision nodes, while shaded circles
correspond to leaf nodes. Each leaf node is associated with a subset of the
examples in `X`, selected based on the decision rules along the path from
root to leaf.

At test time, new examples travel from the tree root to one of the leaves,
their paths through the tree determined by the decision rules at each of the
nodes they visit. When a test example arrives at a leaf node, the targets for
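The docs added above describe how a fitted tree routes an example from the root to a leaf. As a rough, standalone illustration of that traversal (the `Node` fields and `predict_one` helper below are hypothetical and are not the `numpy_ml.trees` implementation; a leaf's stored value stands in for the targets of the training examples associated with that leaf):

import numpy as np

class Node:
    """Hypothetical binary decision-tree node (illustration only)."""
    def __init__(self, feature=None, threshold=None, left=None, right=None, value=None):
        self.feature = feature      # index of the feature tested by this node's decision rule
        self.threshold = threshold  # split point for that feature
        self.left = left            # child taken when x[feature] <= threshold
        self.right = right          # child taken when x[feature] > threshold
        self.value = value          # target stored at a leaf; None for internal nodes

def predict_one(root, x):
    """Route a single example from the root to a leaf and return the leaf's target."""
    node = root
    while node.value is None:  # internal node: apply its decision rule
        node = node.left if x[node.feature] <= node.threshold else node.right
    return node.value          # leaf node: return the target it stores

# A two-leaf tree over 5-dimensional examples, echoing the figure's setup
root = Node(feature=2, threshold=0.5,
            left=Node(value=0.0), right=Node(value=1.0))
print(predict_one(root, np.array([0.1, 0.9, 0.3, 0.4, 0.7])))  # -> 0.0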
52 changes: 25 additions & 27 deletions numpy_ml/neural_nets/losses/losses.py
@@ -246,7 +246,7 @@ def __init__(self):
References
----------
.. [1] Kingma & Welling (2014). "Auto-encoding variational Bayes".
.. [1] Kingma, D. P. & Welling, M. (2014). "Auto-encoding variational Bayes".
*arXiv preprint arXiv:1312.6114.* https://arxiv.org/pdf/1312.6114.pdf
"""
super().__init__()
@@ -331,7 +331,7 @@ def grad(y, y_pred, t_mean, t_log_var):
class WGAN_GPLoss(ObjectiveBase):
def __init__(self, lambda_=10):
"""
The loss function for a Wasserstein GAN with gradient penalty.
The loss function for a Wasserstein GAN [*]_ [*]_ with gradient penalty.
Notes
-----
@@ -364,19 +364,20 @@ def __init__(self, lambda_=10):
\mathbf{z} &\sim \mathcal{N}(0, \mathbb{1}) \\\\
\\alpha &\sim \\text{Uniform}(0, 1)
References
----------
.. [*] Gulrajani, I., Ahmed, F., Arjovsky, M., Dumoulin, V., &
Courville, A. (2017) "Improved training of Wasserstein GANs"
*Advances in Neural Information Processing Systems, 31*: 5769-5779.
.. [*] Goodfellow, I. J., Pouget-Abadie, J., Mirza, M., Xu, B.,
Warde-Farley, D., Ozair, S., Courville, A., & Bengio, Y. (2014)
"Generative adversarial nets" *Advances in Neural Information
Processing Systems, 27*: 2672-2680.
Parameters
----------
lambda_ : float
The gradient penalty coefficient. Default is 10.
References
----------
.. [1] Gulrajani et al. (2017) "Improved training of Wasserstein GANs"
*Advances in Neural Information Processing Systems, 31*, 5769-5779.
https://arxiv.org/pdf/1704.00028.pdf
.. [2] Goodfellow et al. (2014) "Generative adversarial nets" *Advances in
Neural Information Processing Systems, 27*, 2672-2680.
https://papers.nips.cc/paper/5423-generative-adversarial-nets.pdf
"""
self.lambda_ = lambda_
super().__init__()
@@ -424,11 +425,9 @@ def loss(self, Y_fake, module, Y_real=None, gradInterp=None):
Whether to calculate the loss for the critic ('C') or the generator
('G'). If calculating loss for the critic, `Y_real` and
`gradInterp` must not be None.
Y_real : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex,)`, or
None
Y_real : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex,)` or None
The output of the critic for `X_real`. Default is None.
gradInterp : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex,
n_feats)` or None
gradInterp : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, n_feats)` or None
The gradient of the critic output for `X_interp` wrt. `X_interp`.
Default is None.
@@ -468,11 +467,9 @@ def grad(self, Y_fake, module, Y_real=None, gradInterp=None):
Whether to calculate the gradient for the critic loss ('C') or the
generator loss ('G'). If calculating grads for the critic, `Y_real`
and `gradInterp` must not be None.
Y_real : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex,)`, or
None
Y_real : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex,)` or None
The output of the critic for `X_real`. Default is None.
gradInterp : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex,
n_feats)`, or None
gradInterp : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, n_feats)` or None
The gradient of the critic output on `X_interp` wrt. `X_interp`.
Default is None.
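The WGAN-GP docstrings above spell out the critic and generator objectives and the gradient penalty on the interpolated points. A condensed NumPy sketch of that computation may help orient readers; it assumes `Y_fake`, `Y_real`, and `gradInterp` have the shapes documented above, and the `wgan_gp_loss_sketch` helper is illustrative only, not a drop-in for the class's `loss` method:

import numpy as np

def wgan_gp_loss_sketch(Y_fake, module, Y_real=None, gradInterp=None, lambda_=10.0):
    """Illustrative WGAN-GP objectives for the critic ('C') and generator ('G')."""
    if module == "G":
        # the generator tries to make the critic score its samples highly
        return -Y_fake.mean()
    # the critic separates real from fake and is penalized when the norm of its
    # gradient on the interpolated points deviates from 1
    grad_norm = np.linalg.norm(gradInterp, axis=1)          # shape (n_ex,)
    grad_penalty = lambda_ * ((grad_norm - 1) ** 2).mean()
    return Y_fake.mean() - Y_real.mean() + grad_penalty

# e.g., critic loss on a toy batch of 4 examples with 3 features
rng = np.random.RandomState(0)
C_loss = wgan_gp_loss_sketch(rng.randn(4), "C", rng.randn(4), rng.randn(4, 3))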
@@ -566,6 +563,14 @@ def __init__(
function, and :math:`Q(x)` corresponds to the probability of the values
in `x` under `Q`.
References
----------
.. [1] Gutmann, M. & Hyvarinen, A. (2010). Noise-contrastive
estimation: A new estimation principle for unnormalized statistical
models. *AISTATS, 13*: 297-304.
.. [2] Mnih, A. & Teh, Y. W. (2012). A fast and simple algorithm for
training neural probabilistic language models. *ICML, 29*: 1751-1758.
Parameters
----------
n_classes : int
@@ -581,7 +586,7 @@ def __init__(
optimizer : str, :doc:`Optimizer <numpy_ml.neural_nets.optimizers>` object, or None
The optimization strategy to use when performing gradient updates
within the :meth:`update` method. If None, use the :class:`SGD
<numpy_ml.neural_nets.optimizers.optimizers.SGD>` optimizer with
<numpy_ml.neural_nets.optimizers.SGD>` optimizer with
default parameters. Default is None.
subtract_log_label_prob : bool
Whether to subtract the log of the probability of each label under
@@ -598,13 +603,6 @@ def __init__(
The loss hyperparameter values.
derived_variables: dict
Useful intermediate values computed during the loss computation.
References
----------
.. [1] Gutmann & Hyvarinen (2010). Noise-contrastive estimation: A new
estimation principle for unnormalized statistical models. *AISTATS 13*.
.. [2] Minh & Teh (2012). A fast and simple algorithm for training neural
probabilistic language models. *ICML 29*.
"""
super().__init__()
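For the noise-contrastive estimation loss documented above, a minimal sketch of the binary objective from Gutmann & Hyvarinen (2010) follows. The array shapes and the `nce_binary_loss_sketch` helper are assumptions made for illustration; the actual `NCELoss` class additionally manages its own weights, noise sampling, and optimizer:

import numpy as np

def nce_binary_loss_sketch(scores_true, scores_noise, q_true, q_noise,
                           subtract_log_label_prob=True):
    """Illustrative NCE objective.

    scores_true  : (n_ex,)   unnormalized model scores for the observed targets
    scores_noise : (n_ex, k) unnormalized model scores for k noise samples each
    q_true, q_noise : probabilities of those targets / samples under the noise dist Q
    """
    k = scores_noise.shape[-1]
    if subtract_log_label_prob:
        # fold the (scaled) noise probabilities into the logits
        scores_true = scores_true - np.log(k * q_true)
        scores_noise = scores_noise - np.log(k * q_noise)
    p_true = 1.0 / (1.0 + np.exp(-scores_true))    # P(observed target "came from the data")
    p_noise = 1.0 / (1.0 + np.exp(-scores_noise))  # P(noise sample "came from the data")
    # logistic loss: push observed targets toward 1, noise samples toward 0
    return -(np.log(p_true).sum() + np.log(1.0 - p_noise).sum())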
