In [6]:
import os
import urllib.request
from urllib.error import HTTPError

# Github URL where python scripts are stored.
base_url = "https://raw.githubusercontent.com/phlippe/uvadlc_notebooks/master/docs/tutorial_notebooks/scaling/JAX/"
# Files to download.
python_files = ["single_gpu.py", "utils.py"]
# For each file, check whether it already exists. If not, try downloading it.
for file_name in python_files:
    if not os.path.isfile(file_name):
        file_url = base_url + file_name
        print(f"Downloading {file_url}...")
        try:
            urllib.request.urlretrieve(file_url, file_name)
        except HTTPError as e:
            print(
                "Something went wrong. Please try to download the file directly from the GitHub repository, or contact the author with the full output including the following error:\n",
                e,
            )

In [7]:
from utils import simulate_CPU_devices

simulate_CPU_devices()

In [1]:
import functools
from pprint import pprint
from typing import Any, Callable, Dict, Sequence, Tuple

import flax.linen as nn
import jax
import jax.numpy as jnp
import numpy as np
import optax
from absl import logging
from jax import lax
from jax.experimental.shard_map import shard_map
from jax.sharding import Mesh
from jax.sharding import PartitionSpec as P
from ml_collections import ConfigDict
from flax.training import train_state
from single_gpu import TrainState
import time

PyTree = Any
Metrics = Dict[str, Tuple[jax.Array, ...]]

In [2]:
class DPClassifier(nn.Module):
    config: ConfigDict

    @nn.compact
    def __call__(self, x: jax.Array, train: bool) -> jax.Array:
        x = nn.Dense(
            features=self.config.hidden_size,
            dtype=self.config.dtype,
            name="input_dense",
        )(x)
        x = nn.silu(x)
        x = nn.Dropout(rate=self.config.dropout_rate, deterministic=not train)(x)
        x = nn.Dense(
            features=self.config.num_classes,
            dtype=self.config.dtype,
            name="output_dense",
        )(x)
        x = x.astype(jnp.float32)
        return x

In [3]:
data_config = ConfigDict(
    dict(
        batch_size=16,
        num_classes=8,
        input_size=32,
    )
)
model_config = ConfigDict(
    dict(
        hidden_size=8,
        dropout_rate=0.1,
        dtype=jnp.bfloat16,
        num_classes=data_config.num_classes,
        data_axis_name="data",
    )
)
optimizer_config = ConfigDict(
    dict(
        learning_rate=1e-3,
        num_minibatches=4,
    )
)
config = ConfigDict(
    dict(
        model=model_config,
        optimizer=optimizer_config,
        data=data_config,
        data_axis_name=model_config.data_axis_name,
        seed=42,
    )
)

In [4]:
class KeyState:
    def __init__(self, base_key: jax.random.key):
        self.key = jax.random.key(base_key)

    def __call__(self, num: int = 2):
        self.key, rng = jax.random.split(self.key, num=num)
        return rng

In [5]:
model = DPClassifier(config=config.model)
optimizer = optax.adamw(
    learning_rate=config.optimizer.learning_rate,
)
class TrainStateWithRNG(train_state.TrainState):
        rng: Any

In [6]:
key = KeyState(config.seed)
x=jax.random.normal(key(), (config.data.batch_size, config.data.input_size))
y = jax.random.randint(key(), (config.data.batch_size,), 0, config.data.num_classes)
variables = model.init({"params": key()}, x, train=False)
params = variables.pop("params")
device_array = np.array(jax.devices())
mesh = Mesh(device_array, ("x",))
print(jax.tree.reduce(lambda acc, current: acc + current.size, jax.tree.leaves(params), 0))
jax.devices()

336


[CudaDevice(id=0), CudaDevice(id=1)]

In [7]:
def init_device(params, rng, local_model, config):
        tx = optax.chain(
            optax.clip_by_global_norm(1),
            optax.inject_hyperparams(optax.adam)(learning_rate=1e-3),
        )
        state = TrainStateWithRNG.create(
            apply_fn=local_model.apply,
            params=params,
            tx=tx,
            rng=rng,
        )
        return state

In [8]:
sharded_init = shard_map(
            functools.partial(init_device, rng=key(), local_model=model, config=model_config),
            mesh,
            in_specs=(P()),
            out_specs=(P()),
        )

state_initialized = sharded_init(params)

In [9]:
def fold_key(key, axis):
        axis_index = jax.lax.axis_index(axis)
        return jax.random.fold_in(key, axis_index)

In [10]:
def cross_entropy_loss(model, params, key, x, y, train=True):
        dropout_key = fold_key(key, "x")
        B, T = x.shape
        pred = model.apply({'params': params}, x, train=train, rngs={'dropout': dropout_key})
        log_prob = jax.nn.log_softmax(pred, axis=-1)
        print(f'b shape {B}')
        loss = -jnp.mean(log_prob[jnp.arange(B), y])
        loss = jax.lax.pmean(loss, 'x')
        print(f'loss shape {loss.shape}')
        return loss
#loss = cross_entropy_loss(model, params, key(), x, y)

In [None]:
def train_step(loss_fn, params, key, *args, **kwargs):
        loss_grad = jax.value_and_grad(
            loss_fn,
            argnums=0,
            has_aux=False
        )
        loss, grads = loss_grad(params, key, *args, **kwargs, train=True)
        # don't need cache in training
        print('got grads')
        metrics = {
            'loss': loss,
        }
        return grads, metrics

In [13]:
def accumulate_grads(key, x, y, state):
        print("starting training")
        loss_fn = jax.tree_util.Partial(cross_entropy_loss, model)

        start = time.time()
        train_loss = 0.0

        grads = None
        acc_metrics = None
        for i in range(2):
            print(f"iteration {i}")
            grads_step, metrics =  train_step(loss_fn, state.params, key, x, y)
            grads = grads_step if grads is None else jax.tree.map(
                lambda x, y: x + y, grads, grads_step
            )
            acc_metrics = metrics if acc_metrics is None else jax.tree.map(jnp.add, acc_metrics, metrics)
        print(f"accumulated grads {grads}")
        grads = jax.tree.map(lambda x: x / 2, grads)
        acc_metrics = jax.tree.map(lambda x : x/2, acc_metrics)

        return grads, acc_metrics

In [14]:
def train_step_device(state, x, y):
        key, step_key = jax.random.split(state.rng)
        grads, step_metrics = accumulate_grads(step_key, x, y, state)
        new_state = state.apply_gradients(grads=grads, rng=key)

        return new_state, step_metrics


In [15]:
train_step_dp_fn =  shard_map(
            train_step_device,
            mesh,
            in_specs=(P(), P("x",), P("x",)),
            out_specs=(P(), P()),
        )

In [None]:
state, metrics = train_step_dp_fn(state_initialized, x, y)
state

starting training
iteration 0
b shape 8


In [None]:
print("DP Parameters")
pprint(jax.tree.map(lambda x: (x.shape, x.sharding), state_initialized.params))

DP Parameters
{'input_dense': {'bias': ((8,),
                          NamedSharding(mesh=Mesh('x': 8, axis_types=(Auto,)), spec=PartitionSpec(), memory_kind=unpinned_host)),
                 'kernel': ((32, 8),
                            NamedSharding(mesh=Mesh('x': 8, axis_types=(Auto,)), spec=PartitionSpec(), memory_kind=unpinned_host))},
 'output_dense': {'bias': ((8,),
                           NamedSharding(mesh=Mesh('x': 8, axis_types=(Auto,)), spec=PartitionSpec(), memory_kind=unpinned_host)),
                  'kernel': ((8, 8),
                             NamedSharding(mesh=Mesh('x': 8, axis_types=(Auto,)), spec=PartitionSpec(), memory_kind=unpinned_host))}}


In [None]:
state = state_initialized
for _ in range(100):
    state_dp, metrics_dp = train_step_dp_fn(state, x, y)
    state = state_dp
    print(metrics_dp)
state_dp, final_metrics_dp = train_step_dp_fn(state_dp, x, y)
print(final_metrics_dp)

starting training
b shape 2
b shape 2
{'loss': Array(2.3031156, dtype=float32)}
starting training
b shape 2
b shape 2
{'loss': Array(2.24708, dtype=float32)}
starting training
b shape 2
b shape 2
{'loss': Array(2.2780728, dtype=float32)}
starting training
b shape 2
b shape 2
{'loss': Array(2.227149, dtype=float32)}
starting training
b shape 2
b shape 2
{'loss': Array(2.1909137, dtype=float32)}
starting training
b shape 2
b shape 2
{'loss': Array(2.224891, dtype=float32)}
starting training
b shape 2
b shape 2
{'loss': Array(2.1924787, dtype=float32)}
starting training
b shape 2
b shape 2
{'loss': Array(2.2086816, dtype=float32)}
starting training
b shape 2
b shape 2
{'loss': Array(2.1944566, dtype=float32)}
starting training
b shape 2
b shape 2
{'loss': Array(2.1665606, dtype=float32)}
starting training
b shape 2
b shape 2
{'loss': Array(2.1524467, dtype=float32)}
starting training
b shape 2
b shape 2
{'loss': Array(2.1309204, dtype=float32)}
starting training
b shape 2
b shape 2
{'loss

In [None]:
print(state)

{'input_dense': {'bias': Array([-0.05493164,  0.0703125 ,  0.10253906,  0.10351562,  0.01184082,
        0.06005859,  0.13085938,  0.07275391], dtype=float32), 'kernel': Array([[ 0.05273438, -0.04614258, -0.06933594, -0.13964844, -0.03833008,
        -0.01757812, -0.06933594,  0.00747681],
       [ 0.10449219,  0.04125977,  0.01916504, -0.05541992,  0.03564453,
         0.04882812, -0.1484375 ,  0.04663086],
       [ 0.08496094,  0.05517578,  0.00106812, -0.0300293 ,  0.03222656,
         0.09423828,  0.02941895,  0.00457764],
       [-0.02563477, -0.05615234,  0.10302734,  0.00665283,  0.04418945,
        -0.06030273, -0.03491211,  0.06225586],
       [-0.06591797,  0.01416016, -0.02746582,  0.02087402, -0.00337219,
         0.05004883,  0.09716797,  0.00543213],
       [-0.04711914,  0.02038574, -0.00531006,  0.06347656,  0.01806641,
        -0.02807617,  0.05834961, -0.02404785],
       [-0.02282715,  0.04174805,  0.08105469,  0.06591797,  0.03930664,
        -0.02038574, -0.0092773

In [None]:
p = state.params['input_dense']['kernel']
jax.debug.visualize_array_sharding(x)

In [None]:
print("DP Parameters")
pprint(jax.tree.map(lambda x: (x.shape, x.sharding), state.params))
print("Metrics")
pprint(jax.tree.map(lambda x: (x.shape, x.sharding), metrics))

DP Parameters
{'input_dense': {'bias': ((8,),
                          NamedSharding(mesh=Mesh('x': 8, axis_types=(Auto,)), spec=PartitionSpec(), memory_kind=unpinned_host)),
                 'kernel': ((32, 8),
                            NamedSharding(mesh=Mesh('x': 8, axis_types=(Auto,)), spec=PartitionSpec(), memory_kind=unpinned_host))},
 'output_dense': {'bias': ((8,),
                           NamedSharding(mesh=Mesh('x': 8, axis_types=(Auto,)), spec=PartitionSpec(), memory_kind=unpinned_host)),
                  'kernel': ((8, 8),
                             NamedSharding(mesh=Mesh('x': 8, axis_types=(Auto,)), spec=PartitionSpec(), memory_kind=unpinned_host))}}
Metrics
{'loss': ((),
          NamedSharding(mesh=Mesh('x': 8, axis_types=(Auto,)), spec=PartitionSpec(), memory_kind=unpinned_host))}


In [None]:
from jax.sharding import NamedSharding

In [None]:
jax.debug.visualize_sharding((16,32), NamedSharding(mesh, P("x")))