In [1]:
%%javascript
MathJax.Hub.Config({
    TeX: { equationNumbers: { autoNumber: "AMS" } }
});

<IPython.core.display.Javascript object>

## f-divergence

GANs can be trained with different $f$-divergences \cite{nowozin2016f}. In statistics and probability theory, an $f$-divergence is a function $D_{f}\left( P \parallel Q \right)$ that measures the difference between two probability distributions $P$ and $Q$ \cite{csiszar2004information, liese2006divergences}. If $P$ and $Q$ are absolutely continuous with respect to a reference measure $dx$ on $\mathcal{X}$, with probability density functions $p$ and $q$ respectively, then the $f$-divergence is defined as

\begin{equation} \label{eq:fdiv}
    D_f(P \parallel Q) = \int_{\mathcal{X}} q(x) f \left( \frac{p(x)}{q(x)} \right) dx
\end{equation}
    

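where the _generator function_ $f: \mathbb{R}_{+} \mapsto \mathbb{R}$ is a convex, lower-semicontinuous function satisfying
$f(1) = 0$. Every convex, lower-semicontinuous function $f$ has a _convex conjugate_ function $f^{*}$, also known as the _Fenchel conjugate_ \cite{hiriart2012fundamentals}. It is defined as $f^{*}(t) = \sup\limits_{u \in \text{dom}_{f}} \{ut -  f(u)\}$. The conjugate $f^{*}$ is again convex and lower-semicontinuous, and conjugating twice recovers the original function, $f^{**} = f$, so $f$ can be represented as $f(u) = \sup\limits_{t \in \text{dom}_{f^{*}}} \{tu - f^{*}(t)\}$.

Using this representation of $f$ in (\ref{eq:fdiv}),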

\begin{align*}
    D_f(P \parallel Q) &= \int_{\mathcal{X}} q(x) \sup\limits_{t \in \text{dom}_{f^{*}} } \left\{ t \frac{p(x)}{q(x)} - f^{*}(t) \right\}  dx \\
    & \text{By Jensen Inequality,} \\
     &\geq \sup\limits_{T \in \mathcal{T}} \left( \int_{\mathcal{X}}p(x)T(x)dx - \int_{\mathcal{X}}q(x)f^{*}(T(x)) dx \right) \\
    &= \sup\limits_{T \in \mathcal{T}} \left( \mathbb{E}_{x \sim P} \left[T(x)\right] - \mathbb{E}_{x \sim Q} \left[f^{*}(T(x))\right] \right)
\end{align*}

where $\mathcal{T}$ is an arbitrary class of functions $T : \mathcal{X} \mapsto \mathbb{R}$.
The lower bound is tight for $T^{*}(x) = f^{'} \left( \frac{p(x)}{q(x)} \right)$ \cite{nguyen2010estimating} where $f'$ is the first order derivative of $f$.

If we have a model $q_{\theta}(x)$ that should match the true distribution $p(x)$, we need to adjust the parameter $\theta$ using gradient descent to minimize the $f$-divergence. Our goal will be to find the best parameter $\theta^{*}$ using 

\begin{align}
    \theta^{*} &= \underset{\theta}{\arg \min}\ D_{f} \left( P \parallel Q_{\theta} \right) \nonumber \\
    &= \underset{\theta}{\arg \min}\ \mathbb{E}_{x \sim P} \left[ f^{'} \left( \frac{p(x)}{q_{\theta}(x)} \right) \right] - \mathbb{E}_{x \sim Q_{\theta}} \left[f^{*}\left(f^{'} \left( \frac{p(x)}{q_{\theta}(x)} \right)\right)\right] \label{eq:theta-f-div}
\end{align}

Nowozin et al. \cite{nowozin2016f} introduce the $f$-GAN objective function. $Q$ is our generative model, parametrized by $\theta$, which takes a random vector as input and outputs a sample of interest. $T$ is our discriminator function, parametrized by $\omega$, which takes a sample as input and returns a scalar.

\begin{align*}
    F(\theta, \omega) &= \mathbb{E}_{x \sim P} \left[T_{\omega}(x) \right] - \mathbb{E}_{x \sim Q_{\theta}} \left[f^{*}(T_{\omega}(x)) \right]
\end{align*}
The output of the discriminator $T_\omega(x)$ needs to respect the domain $\text{dom}_{f^{*}}$ of the conjugate function $f^{*}$. Therefore, we need an output activation function of the discriminator specific to the $f$-divergence used. 
\begin{align*}
    F(\theta, \omega) &= \mathbb{E}_{x \sim P} \left[g_f(V_{\omega}(x)) \right] - \mathbb{E}_{x \sim Q_{\theta}} \left[f^{*}(g_f(V_{\omega}(x))) \right] \\
    &= \mathbb{E}_{x \sim P, \hat{x} \sim Q_{\theta}} \left[ g_f(V_{\omega}(x)) - f^{*}(g_f(V_{\omega}(\hat{x}))) \right] \\
\end{align*}
where $V_\omega : \mathcal{X} \mapsto \mathbb{R}$ without any range constraints on the output and $g_f : \mathbb{R} \mapsto \text{dom}_{f^{*}}$ is an output activation function of the discriminator specific to the $f$-divergence used.

Now, we find the saddle point of $F(\theta, \omega)$ by minimizing with respect to $\theta$ and maximizing with respect to $\omega$.
\begin{equation}
    \theta^{*} = \underset{\theta}{\arg\min}\ \max_{\omega} F(\theta, \omega) \label{eq:loss-f-div}
\end{equation}





\begin{array}{l l l l l } 
\text{Name} & \text{Generator } f(u) & \text{Conjugate }f^{*}(t) & \text{Output activation }g_f & T^*(x)\\ 
 \hline
\text{Kullback-Leibler (KL)} & u \log u & \exp{(t - 1)} & v & 1 + \log \frac{p(x)}{q(x)} \\
\text{Reverse KL} & -\log u & -1-\log(-t) & -\exp(v) & -\frac{q(x)}{p(x)}\\
\text{Jensen-Shannon} & -(u+1)\log \frac{1 + u}{2} + u \log u & -\log(2-\exp(t)) & \log(2) - \log(1 + \exp(-v)) & \log \frac{2p(x)}{p(x) + q(x)}
\end{array}


The table above lists the common $f$-divergences that we consider in this paper, together with their Fenchel conjugates $f^{*}$, output activation functions $g_f$ and optimal discriminators $T^{*}$.

### Kullback-Leibler Divergence

Substituting in (\ref{eq:theta-f-div}),

\begin{align*}
    \theta^{*} &=  \underset{\theta}{\arg\min}\ \mathbb{E}_{x \sim P} \left[ 1 + \log \left( \frac{p(x)}{q_{\theta}(x)} \right) \right] - \mathbb{E}_{x \sim Q_{\theta}} \left[ \frac{p(x)}{q_{\theta}(x)} \right] \\
    &=  \underset{\theta}{\arg\min}\ \mathbb{E}_{x \sim P} \left[ 1 + \log \left( \frac{p(x)}{q_{\theta}(x)} \right) \right] - \mathbb{E}_{x \sim P} \left[ 1 \right] \\
    &=  \underset{\theta}{\arg\min}\ \mathbb{E}_{x \sim P} \left[ \log \left( \frac{p(x)}{q_{\theta}(x)} \right) \right] \\
    &=  \underset{\theta}{\arg\min}\ \mathbb{E}_{x \sim P} \left[ \log{p(x)} - \log{q_{\theta}(x)} \right] \\
    &= \underset{\theta}{\arg\min}\ \mathbb{E}_{x \sim P}  \left[ - \log{q_{\theta}(x)} \right] \\
    &= \underset{\theta}{\arg\max}\ \mathbb{E}_{x \sim P}  \left[ \log{q_{\theta}(x)} \right] \\
\end{align*}

Substituting in (\ref{eq:loss-f-div}), we get the loss function for Kullback-Leibler GAN,

\begin{align*}
    \theta^{*} &= \underset{\theta}{\arg\min}\ \max_{\omega} \mathbb{E}_{x \sim P, \hat{x} \sim Q_{\theta}} \left[ V_{\omega}(x) - \exp(V_{\omega}(\hat{x})-1) \right]
\end{align*}

### Reverse KL Divergence

Substituting in (\ref{eq:theta-f-div}),

\begin{align*}
    \theta^{*} &=  \underset{\theta}{\arg\min}\ \mathbb{E}_{x \sim P} \left[ - \frac{q_{\theta}(x)}{p(x)} \right] - \mathbb{E}_{x \sim Q_{\theta}} \left[ -1 - \log \left(\frac{q_{\theta}(x)}{p(x)}\right) \right] \\
    &=  \underset{\theta}{\arg\min}\ \mathbb{E}_{x \sim Q_{\theta}} \left[ -1 \right] + \mathbb{E}_{x \sim Q_{\theta}} \left[ 1 + \log \left(\frac{q_{\theta}(x)}{p(x)}\right) \right] \\
    &=  \underset{\theta}{\arg\min}\ \mathbb{E}_{x \sim Q_{\theta}} \left[\log \left(\frac{q_{\theta}(x)}{p(x)}\right) \right] \\
    &=  \underset{\theta}{\arg\min}\ \mathbb{E}_{x \sim Q_{\theta}} \left[\log q_{\theta}(x) - \log{p(x)}\right] \\
    &=  \underset{\theta}{\arg\min}\ \mathbb{E}_{x \sim Q_{\theta}} \left[\log q_{\theta}(x) \right] - \mathbb{E}_{x \sim Q_{\theta}} \left[ \log{p(x)}\right] \\
    &=  \underset{\theta}{\arg\max}\ -\mathbb{E}_{x \sim Q_{\theta}} \left[\log q_{\theta}(x) \right] + \mathbb{E}_{x \sim Q_{\theta}} \left[ \log{p(x)}\right] \\
\end{align*}

Substituting in (\ref{eq:loss-f-div}), we get the loss function for Reverse KL GAN

\begin{align*}
    \theta^{*} &= \underset{\theta}{\arg\min}\ \max_{\omega} \mathbb{E}_{x \sim P, \hat{x} \sim Q_{\theta}} \left[ -\exp(V_{\omega}(x)) + 1 + V_{\omega}(\hat{x}) \right]
\end{align*}

### Jensen-Shannon Divergence

Substituting in (\ref{eq:theta-f-div}),

\begin{align*}
    \theta^{*} &= \underset{\theta}{\arg\min}\ \mathbb{E}_{x \sim P} \left[ \log{\frac{2p(x)}{p(x) + q_{\theta}(x)}} \right] - \mathbb{E}_{x \sim Q_{\theta}} \left[  - \log \left( 2 - \frac{2p(x)}{p(x) + q_{\theta}(x)} \right) \right] \\
    &= \underset{\theta}{\arg\min}\ \mathbb{E}_{x \sim P} \left[ \log{\frac{2p(x)}{p(x) + q_{\theta}(x)}} \right] + \mathbb{E}_{x \sim Q_{\theta}} \left[   \log \frac{2q_{\theta}(x)}{p(x) + q_{\theta}(x)}  \right] \\
\end{align*}

Substituting in (\ref{eq:loss-f-div}), we get the loss function for Jensen-Shannon GAN

\begin{align*}
    \theta^{*} &= \underset{\theta}{\arg\min}\ \max_{\omega} \mathbb{E}_{x \sim P, \hat{x} \sim Q_{\theta}} \left[ \log{\frac{2}{1 + \exp{(-V_{\omega}(x))}}} + \log \left(2 - \frac{2}{1 + \exp{(-V_{\omega}(\hat{x}))}}\right)\right] \\
    &= \underset{\theta}{\arg\min}\ \max_{\omega} \mathbb{E}_{x \sim P, \hat{x} \sim Q_{\theta}} \left[ \log{\frac{2}{1 + \exp{(-V_{\omega}(x))}}} + \log \left(\frac{2\exp{(-V_{\omega}(\hat{x}))}}{1 + \exp{(-V_{\omega}(\hat{x}))}}\right)\right] 
\end{align*}
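This loss function is very similar to the loss function of the original GAN: writing $D_{\omega}(x) = 1 / \left(1 + \exp(-V_{\omega}(x))\right)$ for the sigmoid of the discriminator output, the expression inside the expectation equals $\log D_{\omega}(x) + \log \left(1 - D_{\omega}(\hat{x})\right) + 2 \log 2$, i.e. the original GAN objective up to an additive constant.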