Copyright 2021 Google LLC

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    https://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

# Example of taking gradients with respect to flax variables inside a flax module

## Setting up our environment

Here we provide the code needed to set up the environment for our notebook.

In [1]:
# Install the latest JAXlib version.
!pip install --upgrade -q pip jax jaxlib
# Install Flax at head:
!pip install --upgrade -q git+https://github.com/google/flax.git

/bin/sh: pip: command not found
/bin/sh: pip: command not found


In [2]:
import jax
from typing import Any, Callable, Sequence, Optional
from jax import lax, random, numpy as jnp
import flax
from flax.core import freeze, unfreeze
from flax import linen as nn

from jax.config import config
config.enable_omnistaging() # Linen requires enabling omnistaging

### Create a toy flax module with parameters and variable

In [49]:
class ToyModule(nn.Module):
  """Toy flax module with one dense layer and one flax variable."""

  @nn.compact
  def __call__(self, x):
    is_initialized = self.has_variable('stats', 'counter')
    # add a variable
    v = self.variable('stats', 'counter', jnp.zeros, x.shape[1:])
    # update variable if initialized
    if is_initialized:
      v.value += 1
    x = v.value * x
    x = nn.Dense(features=3)(x)
    x = nn.log_softmax(x)
    return x

In [50]:
def onehot(labels, num_classes=3):
  x = (labels[..., None] == jnp.arange(num_classes)[None])
  return x.astype(jnp.float32)

def cross_entropy_loss(logits, labels):
  return -jnp.mean(jnp.sum(onehot(labels) * logits, axis=-1))

def loss_fn(variables):
  dummy_input = jax.random.normal(key, (1, 10))
  dummy_label = jnp.array([1])
  # Mark variable collection as mutable
  logits, updated_state = ToyModule().apply(variables, dummy_input, mutable=['stats'])
  print(logits)
  loss = cross_entropy_loss(logits, dummy_label)
  return loss, logits

### Initialize model

In [51]:
key = jax.random.PRNGKey(0)
init_shape = jnp.ones((1, 10), jnp.float32)
params_and_variables = ToyModule().init(key, init_shape)

### Call loss_fn with initial parameters and variables

In [52]:
loss_fn(params_and_variables)

[[-0.7990668 -1.3123766 -1.2691445]]


(DeviceArray(1.3123766, dtype=float32),
 DeviceArray([[-0.7990668, -1.3123766, -1.2691445]], dtype=float32))

### Prepare gradient function of `loss_fn`

In [53]:
grad_fn = jax.value_and_grad(loss_fn, has_aux=True)

### Compute gradients wrt all parameters and variables

In [54]:
(_, logits), grad = grad_fn(params_and_variables)

Traced<ConcreteArray([[-0.7990668 -1.3123766 -1.2691445]])>with<JVPTrace(level=2/0)>
  with primal = DeviceArray([[-0.7990668, -1.3123766, -1.2691445]], dtype=float32)
       tangent = Traced<ShapedArray(float32[1,3]):JaxprTrace(level=1/0)>


### Gradients wrt to variable

In [55]:
grad['stats']['counter'].shape

(10,)

In [56]:
grad['stats']['counter']

DeviceArray([ 0.10620452,  0.04868598,  0.01061121,  0.05297215,
             -0.08144682, -0.02126385,  0.23443498, -0.27031758,
              0.18989496, -0.02676392], dtype=float32)

### Gradients wrt to params

In [58]:
grad['params']['Dense_0']['kernel'].shape

(10, 3)

In [57]:
grad['params']['Dense_0']['kernel']

DeviceArray([[-0.16735636,  0.2719463 , -0.10458997],
             [ 0.11883753, -0.19310547,  0.07426795],
             [-0.08209157,  0.133395  , -0.05130343],
             [-0.3313836 ,  0.53848296, -0.2070994 ],
             [-0.198026  ,  0.32178307, -0.12375707],
             [-0.06842665,  0.11119014, -0.04276349],
             [-0.30194026,  0.4906389 , -0.18869866],
             [-0.2657403 ,  0.43181565, -0.16607536],
             [ 0.3290759 , -0.53473306,  0.20565718],
             [ 0.25514343, -0.41459623,  0.15945281]], dtype=float32)