From 8961cf357621a1202aa3508b92e57af6108523a5 Mon Sep 17 00:00:00 2001 From: ddbourgin Date: Sun, 10 May 2020 02:18:18 -0400 Subject: [PATCH 01/18] typo fixes --- numpy_ml/bandits/policies.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/numpy_ml/bandits/policies.py b/numpy_ml/bandits/policies.py index a3f3bb6..6d4c4b9 100644 --- a/numpy_ml/bandits/policies.py +++ b/numpy_ml/bandits/policies.py @@ -202,13 +202,12 @@ def __init__(self, C=1, ev_prior=0.5): \text{UCB}(a, t) = \text{EV}_t(a) + C \sqrt{\frac{2 \log t}{N_t(a)}} - where :math:`\text{UCB}(a, t)` is the upper confidence bound on the - expected value of arm `a` at time `t`, :math:`\text{EV}_t(a)` is the - average of the rewards recieved so far from pulling arm `a`, `C` is a - parameter controlling the confidence upper bound of the estimate for - :math:`\text{UCB}(a, t)` (for logarithmic regret bounds, `C` must - equal 1), and :math:`N_t(a)` is the number of times arm `a` has been - pulled during the previous `t - 1` timesteps. + where :math:`\text{EV}_t(a)` is the average of the rewards recieved so + far from pulling arm `a`, `C` is a free parameter controlling the + "optimism" of the confidence upper bound for :math:`\text{UCB}(a, t)` + (for logarithmic regret bounds, `C` must equal 1), and :math:`N_t(a)` + is the number of times arm `a` has been pulled during the previous `t - + 1` timesteps. References ---------- @@ -220,7 +219,8 @@ def __init__(self, C=1, ev_prior=0.5): ---------- C : float in (0, +infinity) A confidence/optimisim parameter affecting the degree of - exploration. The UCB1 algorithm assumes `C=1`. Default is 1. + exploration, where larger values encourage greater exploration. The + UCB1 algorithm assumes `C=1`. Default is 1. ev_prior : float The starting expected value for each arm before any data has been observed. Default is 0.5. @@ -292,10 +292,10 @@ def __init__(self, alpha=1, beta=1): where :math:`k \in \{1,\ldots,K \}` indexes arms in the MAB and :math:`\theta_k` is the parameter of the Bernoulli likelihood for arm `k`. The sampler begins by selecting an arm with probability - proportional to it's payoff probability under the initial Beta prior. + proportional to its payoff probability under the initial Beta prior. After pulling the sampled arm and receiving a reward, `r`, the sampler computes the posterior over the model parameters (arm payoffs) via - Bayes' rule, and then samples a new action in proportion to it's payoff + Bayes' rule, and then samples a new action in proportion to its payoff probability under this posterior. This process (i.e., sample action from posterior, take action and receive reward, compute updated posterior) is repeated until the number of trials is exhausted. 
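The selection rules described in the two docstrings touched by this patch are compact enough to state directly. The snippet below is a minimal sketch for illustration only, not the numpy-ml implementation: the `ev_estimates`/`pull_counts` arrays, the 1-indexed timestep `t`, and the per-arm `alphas`/`betas` Beta parameters are assumed bookkeeping, and tie-breaking and other edge cases are ignored.

import numpy as np

def ucb1_select(ev_estimates, pull_counts, t, C=1.0, ev_prior=0.5):
    # UCB(a, t) = EV_t(a) + C * sqrt(2 * log(t) / N_t(a)), with t >= 1
    ev = np.asarray(ev_estimates, dtype=float)      # EV_t(a): average reward per arm
    counts = np.asarray(pull_counts, dtype=float)   # N_t(a): pulls per arm so far
    # arms that have never been pulled fall back to the prior EV and receive an
    # infinite exploration bonus, so every arm is tried at least once
    ev = np.where(counts > 0, ev, ev_prior)
    bonus = np.where(
        counts > 0, C * np.sqrt(2 * np.log(t) / np.maximum(counts, 1)), np.inf
    )
    return int(np.argmax(ev + bonus))

def thompson_select(alphas, betas, rng=np.random):
    # sample a payoff probability for each arm from its Beta posterior and
    # pull the arm whose sampled probability is largest
    return int(np.argmax(rng.beta(alphas, betas)))

For logarithmic regret bounds the UCB1 score uses `C = 1`, matching the default noted in the parameter description. After pulling an arm and observing a Bernoulli reward `r`, the Beta-Bernoulli posterior update in the Thompson sketch is simply `alphas[arm] += r; betas[arm] += 1 - r`.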
From 78533fb8745d0eb9eb98ce2bac04cc569c297048 Mon Sep 17 00:00:00 2001 From: ddbourgin Date: Sun, 7 Jun 2020 18:50:21 -0400 Subject: [PATCH 02/18] rename tests to plots --- numpy_ml/hmm/{tests.py => plots.py} | 1 + numpy_ml/lda/{tests.py => plots.py} | 1 + numpy_ml/rl_models/{tests.py => plots.py} | 1 + 3 files changed, 3 insertions(+) rename numpy_ml/hmm/{tests.py => plots.py} (99%) rename numpy_ml/lda/{tests.py => plots.py} (99%) rename numpy_ml/rl_models/{tests.py => plots.py} (99%) diff --git a/numpy_ml/hmm/tests.py b/numpy_ml/hmm/plots.py similarity index 99% rename from numpy_ml/hmm/tests.py rename to numpy_ml/hmm/plots.py index 35fce5a..f400d5d 100644 --- a/numpy_ml/hmm/tests.py +++ b/numpy_ml/hmm/plots.py @@ -1,3 +1,4 @@ +# flake8: noqa import numpy as np import matplotlib diff --git a/numpy_ml/lda/tests.py b/numpy_ml/lda/plots.py similarity index 99% rename from numpy_ml/lda/tests.py rename to numpy_ml/lda/plots.py index 1ff7b2f..51eda85 100644 --- a/numpy_ml/lda/tests.py +++ b/numpy_ml/lda/plots.py @@ -1,3 +1,4 @@ +# flake8: noqa import numpy as np import matplotlib diff --git a/numpy_ml/rl_models/tests.py b/numpy_ml/rl_models/plots.py similarity index 99% rename from numpy_ml/rl_models/tests.py rename to numpy_ml/rl_models/plots.py index 9d73863..8b5469c 100644 --- a/numpy_ml/rl_models/tests.py +++ b/numpy_ml/rl_models/plots.py @@ -1,3 +1,4 @@ +# flake8: noqa import gym from .trainer import Trainer From cfc89d4f3295df6bacf29000e358b5eb28e86b3a Mon Sep 17 00:00:00 2001 From: ddbourgin Date: Sun, 7 Jun 2020 18:53:50 -0400 Subject: [PATCH 03/18] add test dir --- numpy_ml/tests/__init__.py | 1 + numpy_ml/tests/nn_torch_models.py | 2292 +++++++++++++++++++++++ numpy_ml/tests/test_ngram.py | 254 +++ numpy_ml/tests/test_nn.py | 2447 +++++++++++++++++++++++++ numpy_ml/tests/test_nn_activations.py | 337 ++++ numpy_ml/tests/test_nonparametric.py | 119 ++ numpy_ml/tests/test_preprocessing.py | 252 +++ numpy_ml/tests/test_trees.py | 355 ++++ numpy_ml/tests/test_utils.py | 274 +++ 9 files changed, 6331 insertions(+) create mode 100644 numpy_ml/tests/__init__.py create mode 100644 numpy_ml/tests/nn_torch_models.py create mode 100644 numpy_ml/tests/test_ngram.py create mode 100644 numpy_ml/tests/test_nn.py create mode 100644 numpy_ml/tests/test_nn_activations.py create mode 100644 numpy_ml/tests/test_nonparametric.py create mode 100644 numpy_ml/tests/test_preprocessing.py create mode 100644 numpy_ml/tests/test_trees.py create mode 100644 numpy_ml/tests/test_utils.py diff --git a/numpy_ml/tests/__init__.py b/numpy_ml/tests/__init__.py new file mode 100644 index 0000000..20ff959 --- /dev/null +++ b/numpy_ml/tests/__init__.py @@ -0,0 +1 @@ +"""Unit tests for various numpy-ml modules""" diff --git a/numpy_ml/tests/nn_torch_models.py b/numpy_ml/tests/nn_torch_models.py new file mode 100644 index 0000000..a5ae3dc --- /dev/null +++ b/numpy_ml/tests/nn_torch_models.py @@ -0,0 +1,2292 @@ +# flake8: noqa + +import torch +import torch.nn as nn +import torch.nn.functional as F + +import tensorflow as tf + +import numpy as np + +####################################################################### +# Gold-standard implementations for testing custom layers # +# (Requires Pytorch) # +####################################################################### + + +def torchify(var, requires_grad=True): + return torch.autograd.Variable(torch.FloatTensor(var), requires_grad=requires_grad) + + +def torch_gradient_generator(fn, **kwargs): + def get_grad(z): + z1 = torch.autograd.Variable(torch.FloatTensor(z), 
requires_grad=True) + z2 = fn(z1, **kwargs).sum() + z2.backward() + grad = z1.grad.numpy() + return grad + + return get_grad + + +def torch_xe_grad(y, z): + z = torch.autograd.Variable(torch.FloatTensor(z), requires_grad=True) + y = torch.LongTensor(y.argmax(axis=1)) + loss = F.cross_entropy(z, y, reduction="sum") + loss.backward() + grad = z.grad.numpy() + return grad + + +def torch_mse_grad(y, z, act_fn): + y = torch.FloatTensor(y) + z = torch.autograd.Variable(torch.FloatTensor(z), requires_grad=True) + y_pred = act_fn(z) + loss = F.mse_loss(y_pred, y, reduction="sum") # size_average=False).sum() + loss.backward() + grad = z.grad.numpy() + return grad + + +class TorchVAELoss(nn.Module): + def __init__(self): + super(TorchVAELoss, self).__init__() + + def extract_grads(self, X, X_recon, t_mean, t_log_var): + eps = np.finfo(float).eps + X = torchify(X, requires_grad=False) + X_recon = torchify(np.clip(X_recon, eps, 1 - eps)) + t_mean = torchify(t_mean) + t_log_var = torchify(t_log_var) + + BCE = torch.sum(F.binary_cross_entropy(X_recon, X, reduction="none"), dim=1) + + # see Appendix B from VAE paper: + # Kingma and Welling. Auto-Encoding Variational Bayes. ICLR, 2014 + # https://arxiv.org/abs/1312.6114 + # 0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2) + KLD = -0.5 * torch.sum(1 + t_log_var - t_mean.pow(2) - t_log_var.exp(), dim=1) + + loss = torch.mean(BCE + KLD) + loss.backward() + + grads = { + "loss": loss.detach().numpy(), + "dX_recon": X_recon.grad.numpy(), + "dt_mean": t_mean.grad.numpy(), + "dt_log_var": t_log_var.grad.numpy(), + } + return grads + + +class TorchWGANGPLoss(nn.Module): + def __init__(self, lambda_=10): + self.lambda_ = torchify([lambda_]) + super(TorchWGANGPLoss, self).__init__() + + def forward(self, Y_real, Y_fake, gradInterp): + GY_fake = Y_fake.copy() + self.Y_real = torchify(Y_real) + self.Y_fake = torchify(Y_fake) + self.GY_fake = torchify(GY_fake) + self.gradInterp = torchify(gradInterp) + + # calc grad penalty + norm = self.gradInterp.norm(2, dim=1) + self.norm1 = torch.sqrt(torch.sum(self.gradInterp.pow(2), dim=1)) + assert torch.allclose(norm, self.norm1) + + self.gpenalty = self.lambda_ * ((self.norm1 - 1).pow(2)).mean() + self.C_loss = self.Y_fake.mean() - self.Y_real.mean() + self.gpenalty + self.G_loss = -self.GY_fake.mean() + + def extract_grads(self, Y_real, Y_fake, gradInterp): + self.forward(Y_real, Y_fake, gradInterp) + + self.C_loss.backward() + self.G_loss.backward() + + grads = { + "Y_real": self.Y_real.detach().numpy(), + "Y_fake": self.Y_fake.detach().numpy(), + "gradInterp": self.gradInterp.detach().numpy(), + "GP": self.gpenalty.detach().numpy(), + "C_loss": self.C_loss.detach().numpy(), + "G_loss": self.G_loss.detach().numpy(), + "C_dY_real": self.Y_real.grad.numpy(), + "C_dGradInterp": self.gradInterp.grad.numpy(), + "C_dY_fake": self.Y_fake.grad.numpy(), + "G_dY_fake": self.GY_fake.grad.numpy(), + } + return grads + + +class TorchLinearActivation(nn.Module): + def __init__(self): + super(TorchLinearActivation, self).__init__() + pass + + @staticmethod + def forward(input): + return input + + @staticmethod + def backward(grad_output): + return torch.ones_like(grad_output) + + +class TorchBatchNormLayer(nn.Module): + def __init__(self, n_in, params, mode, momentum=0.9, epsilon=1e-5): + super(TorchBatchNormLayer, self).__init__() + + scaler = params["scaler"] + intercept = params["intercept"] + + if mode == "1D": + self.layer1 = nn.BatchNorm1d( + num_features=n_in, momentum=1 - momentum, eps=epsilon, affine=True + ) + elif mode == "2D": + 
self.layer1 = nn.BatchNorm2d( + num_features=n_in, momentum=1 - momentum, eps=epsilon, affine=True + ) + + self.layer1.weight = nn.Parameter(torch.FloatTensor(scaler)) + self.layer1.bias = nn.Parameter(torch.FloatTensor(intercept)) + + def forward(self, X): + # (N, H, W, C) -> (N, C, H, W) + if X.ndim == 4: + X = np.moveaxis(X, [0, 1, 2, 3], [0, -2, -1, -3]) + + if not isinstance(X, torch.Tensor): + X = torchify(X) + + self.X = X + self.Y = self.layer1(self.X) + self.Y.retain_grad() + + def extract_grads(self, X, Y_true=None): + self.forward(X) + + if isinstance(Y_true, np.ndarray): + Y_true = np.moveaxis(Y_true, [0, 1, 2, 3], [0, -2, -1, -3]) + self.loss1 = ( + 0.5 * F.mse_loss(self.Y, torchify(Y_true), size_average=False).sum() + ) + else: + self.loss1 = self.Y.sum() + + self.loss1.backward() + + X_np = self.X.detach().numpy() + Y_np = self.Y.detach().numpy() + dX_np = self.X.grad.numpy() + dY_np = self.Y.grad.numpy() + + if self.X.dim() == 4: + orig, X_swap = [0, 1, 2, 3], [0, -1, -3, -2] + if isinstance(Y_true, np.ndarray): + Y_true = np.moveaxis(Y_true, orig, X_swap) + X_np = np.moveaxis(X_np, orig, X_swap) + Y_np = np.moveaxis(Y_np, orig, X_swap) + dX_np = np.moveaxis(dX_np, orig, X_swap) + dY_np = np.moveaxis(dY_np, orig, X_swap) + + grads = { + "loss": self.loss1.detach().numpy(), + "X": X_np, + "momentum": 1 - self.layer1.momentum, + "epsilon": self.layer1.eps, + "intercept": self.layer1.bias.detach().numpy(), + "scaler": self.layer1.weight.detach().numpy(), + "running_mean": self.layer1.running_mean.detach().numpy(), + "running_var": self.layer1.running_var.detach().numpy(), + "y": Y_np, + "dLdy": dY_np, + "dLdIntercept": self.layer1.bias.grad.numpy(), + "dLdScaler": self.layer1.weight.grad.numpy(), + "dLdX": dX_np, + } + if isinstance(Y_true, np.ndarray): + grads["Y_true"] = Y_true + return grads + + +class TorchLayerNormLayer(nn.Module): + def __init__(self, feat_dims, params, mode, epsilon=1e-5): + super(TorchLayerNormLayer, self).__init__() + + self.layer1 = nn.LayerNorm( + normalized_shape=feat_dims, eps=epsilon, elementwise_affine=True + ) + + scaler = params["scaler"] + intercept = params["intercept"] + + if mode == "2D": + scaler = np.moveaxis(scaler, [0, 1, 2], [-2, -1, -3]) + intercept = np.moveaxis(intercept, [0, 1, 2], [-2, -1, -3]) + + assert scaler.shape == self.layer1.weight.shape + assert intercept.shape == self.layer1.bias.shape + self.layer1.weight = nn.Parameter(torch.FloatTensor(scaler)) + self.layer1.bias = nn.Parameter(torch.FloatTensor(intercept)) + + def forward(self, X): + # (N, H, W, C) -> (N, C, H, W) + if X.ndim == 4: + X = np.moveaxis(X, [0, 1, 2, 3], [0, -2, -1, -3]) + + if not isinstance(X, torch.Tensor): + X = torchify(X) + + self.X = X + self.Y = self.layer1(self.X) + self.Y.retain_grad() + + def extract_grads(self, X, Y_true=None): + self.forward(X) + + if isinstance(Y_true, np.ndarray): + Y_true = np.moveaxis(Y_true, [0, 1, 2, 3], [0, -2, -1, -3]) + self.loss1 = ( + 0.5 * F.mse_loss(self.Y, torchify(Y_true), size_average=False).sum() + ) + else: + self.loss1 = self.Y.sum() + + self.loss1.backward() + + X_np = self.X.detach().numpy() + Y_np = self.Y.detach().numpy() + dX_np = self.X.grad.numpy() + dY_np = self.Y.grad.numpy() + intercept_np = self.layer1.bias.detach().numpy() + scaler_np = self.layer1.weight.detach().numpy() + dIntercept_np = self.layer1.bias.grad.numpy() + dScaler_np = self.layer1.weight.grad.numpy() + + if self.X.dim() == 4: + orig, X_swap = [0, 1, 2, 3], [0, -1, -3, -2] + orig_p, p_swap = [0, 1, 2], [-1, -3, -2] + if 
isinstance(Y_true, np.ndarray): + Y_true = np.moveaxis(Y_true, orig, X_swap) + X_np = np.moveaxis(X_np, orig, X_swap) + Y_np = np.moveaxis(Y_np, orig, X_swap) + dX_np = np.moveaxis(dX_np, orig, X_swap) + dY_np = np.moveaxis(dY_np, orig, X_swap) + scaler_np = np.moveaxis(scaler_np, orig_p, p_swap) + intercept_np = np.moveaxis(intercept_np, orig_p, p_swap) + dScaler_np = np.moveaxis(dScaler_np, orig_p, p_swap) + dIntercept_np = np.moveaxis(dIntercept_np, orig_p, p_swap) + + grads = { + "loss": self.loss1.detach().numpy(), + "X": X_np, + "epsilon": self.layer1.eps, + "intercept": intercept_np, + "scaler": scaler_np, + "y": Y_np, + "dLdy": dY_np, + "dLdIntercept": dIntercept_np, + "dLdScaler": dScaler_np, + "dLdX": dX_np, + } + if isinstance(Y_true, np.ndarray): + grads["Y_true"] = Y_true + return grads + + +class TorchAddLayer(nn.Module): + def __init__(self, act_fn, **kwargs): + super(TorchAddLayer, self).__init__() + self.act_fn = act_fn + + def forward(self, Xs): + self.Xs = [] + x = Xs[0].copy() + if not isinstance(x, torch.Tensor): + x = torchify(x) + + self.sum = x.clone() + x.retain_grad() + self.Xs.append(x) + + for i in range(1, len(Xs)): + x = Xs[i] + if not isinstance(x, torch.Tensor): + x = torchify(x) + + x.retain_grad() + self.Xs.append(x) + self.sum += x + + self.sum.retain_grad() + self.Y = self.act_fn(self.sum) + self.Y.retain_grad() + return self.Y + + def extract_grads(self, X): + self.forward(X) + self.loss = self.Y.sum() + self.loss.backward() + grads = { + "Xs": X, + "Sum": self.sum.detach().numpy(), + "Y": self.Y.detach().numpy(), + "dLdY": self.Y.grad.numpy(), + "dLdSum": self.sum.grad.numpy(), + } + grads.update( + {"dLdX{}".format(i + 1): xi.grad.numpy() for i, xi in enumerate(self.Xs)} + ) + return grads + + +class TorchMultiplyLayer(nn.Module): + def __init__(self, act_fn, **kwargs): + super(TorchMultiplyLayer, self).__init__() + self.act_fn = act_fn + + def forward(self, Xs): + self.Xs = [] + x = Xs[0].copy() + if not isinstance(x, torch.Tensor): + x = torchify(x) + + self.prod = x.clone() + x.retain_grad() + self.Xs.append(x) + + for i in range(1, len(Xs)): + x = Xs[i] + if not isinstance(x, torch.Tensor): + x = torchify(x) + + x.retain_grad() + self.Xs.append(x) + self.prod *= x + + self.prod.retain_grad() + self.Y = self.act_fn(self.prod) + self.Y.retain_grad() + return self.Y + + def extract_grads(self, X): + self.forward(X) + self.loss = self.Y.sum() + self.loss.backward() + grads = { + "Xs": X, + "Prod": self.prod.detach().numpy(), + "Y": self.Y.detach().numpy(), + "dLdY": self.Y.grad.numpy(), + "dLdProd": self.prod.grad.numpy(), + } + grads.update( + {"dLdX{}".format(i + 1): xi.grad.numpy() for i, xi in enumerate(self.Xs)} + ) + return grads + + +class TorchSkipConnectionIdentity(nn.Module): + def __init__(self, act_fn, pad1, pad2, params, hparams, momentum=0.9, epsilon=1e-5): + super(TorchSkipConnectionIdentity, self).__init__() + + self.conv1 = nn.Conv2d( + hparams["in_ch"], + hparams["out_ch"], + hparams["kernel_shape1"], + padding=pad1, + stride=hparams["stride1"], + bias=True, + ) + + self.act_fn = act_fn + + self.batchnorm1 = nn.BatchNorm2d( + num_features=hparams["out_ch"], + momentum=1 - momentum, + eps=epsilon, + affine=True, + ) + + self.conv2 = nn.Conv2d( + hparams["out_ch"], + hparams["out_ch"], + hparams["kernel_shape2"], + padding=pad2, + stride=hparams["stride2"], + bias=True, + ) + + self.batchnorm2 = nn.BatchNorm2d( + num_features=hparams["out_ch"], + momentum=1 - momentum, + eps=epsilon, + affine=True, + ) + + orig, W_swap = [0, 1, 2, 3], 
[-2, -1, -3, -4] + # (f[0], f[1], n_in, n_out) -> (n_out, n_in, f[0], f[1]) + W = params["components"]["conv1"]["W"] + b = params["components"]["conv1"]["b"] + W = np.moveaxis(W, orig, W_swap) + assert self.conv1.weight.shape == W.shape + assert self.conv1.bias.shape == b.flatten().shape + self.conv1.weight = nn.Parameter(torch.FloatTensor(W)) + self.conv1.bias = nn.Parameter(torch.FloatTensor(b.flatten())) + + scaler = params["components"]["batchnorm1"]["scaler"] + intercept = params["components"]["batchnorm1"]["intercept"] + self.batchnorm1.weight = nn.Parameter(torch.FloatTensor(scaler)) + self.batchnorm1.bias = nn.Parameter(torch.FloatTensor(intercept)) + + # (f[0], f[1], n_in, n_out) -> (n_out, n_in, f[0], f[1]) + W = params["components"]["conv2"]["W"] + b = params["components"]["conv2"]["b"] + W = np.moveaxis(W, orig, W_swap) + assert self.conv2.weight.shape == W.shape + assert self.conv2.bias.shape == b.flatten().shape + self.conv2.weight = nn.Parameter(torch.FloatTensor(W)) + self.conv2.bias = nn.Parameter(torch.FloatTensor(b.flatten())) + + scaler = params["components"]["batchnorm2"]["scaler"] + intercept = params["components"]["batchnorm2"]["intercept"] + self.batchnorm2.weight = nn.Parameter(torch.FloatTensor(scaler)) + self.batchnorm2.bias = nn.Parameter(torch.FloatTensor(intercept)) + + def forward(self, X): + if not isinstance(X, torch.Tensor): + # (N, H, W, C) -> (N, C, H, W) + X = np.moveaxis(X, [0, 1, 2, 3], [0, -2, -1, -3]) + X = torchify(X) + + self.X = X + self.X.retain_grad() + + self.conv1_out = self.conv1(self.X) + self.conv1_out.retain_grad() + + self.act_fn1_out = self.act_fn(self.conv1_out) + self.act_fn1_out.retain_grad() + + self.batchnorm1_out = self.batchnorm1(self.act_fn1_out) + self.batchnorm1_out.retain_grad() + + self.conv2_out = self.conv2(self.batchnorm1_out) + self.conv2_out.retain_grad() + + self.batchnorm2_out = self.batchnorm2(self.conv2_out) + self.batchnorm2_out.retain_grad() + + self.layer3_in = self.batchnorm2_out + self.X + self.layer3_in.retain_grad() + + self.Y = self.act_fn(self.layer3_in) + self.Y.retain_grad() + + def extract_grads(self, X): + self.forward(X) + self.loss = self.Y.sum() + self.loss.backward() + + orig, X_swap, W_swap = [0, 1, 2, 3], [0, -1, -3, -2], [-1, -2, -4, -3] + grads = { + # layer parameters + "conv1_W": np.moveaxis(self.conv1.weight.detach().numpy(), orig, W_swap), + "conv1_b": self.conv1.bias.detach().numpy().reshape(1, 1, 1, -1), + "bn1_intercept": self.batchnorm1.bias.detach().numpy(), + "bn1_scaler": self.batchnorm1.weight.detach().numpy(), + "bn1_running_mean": self.batchnorm1.running_mean.detach().numpy(), + "bn1_running_var": self.batchnorm1.running_var.detach().numpy(), + "conv2_W": np.moveaxis(self.conv2.weight.detach().numpy(), orig, W_swap), + "conv2_b": self.conv2.bias.detach().numpy().reshape(1, 1, 1, -1), + "bn2_intercept": self.batchnorm2.bias.detach().numpy(), + "bn2_scaler": self.batchnorm2.weight.detach().numpy(), + "bn2_running_mean": self.batchnorm2.running_mean.detach().numpy(), + "bn2_running_var": self.batchnorm2.running_var.detach().numpy(), + # layer inputs/outputs (forward step) + "X": np.moveaxis(self.X.detach().numpy(), orig, X_swap), + "conv1_out": np.moveaxis(self.conv1_out.detach().numpy(), orig, X_swap), + "act1_out": np.moveaxis(self.act_fn1_out.detach().numpy(), orig, X_swap), + "bn1_out": np.moveaxis(self.batchnorm1_out.detach().numpy(), orig, X_swap), + "conv2_out": np.moveaxis(self.conv2_out.detach().numpy(), orig, X_swap), + "bn2_out": 
np.moveaxis(self.batchnorm2_out.detach().numpy(), orig, X_swap), + "add_out": np.moveaxis(self.layer3_in.detach().numpy(), orig, X_swap), + "Y": np.moveaxis(self.Y.detach().numpy(), orig, X_swap), + # layer gradients (backward step) + "dLdY": np.moveaxis(self.Y.grad.numpy(), orig, X_swap), + "dLdAdd": np.moveaxis(self.layer3_in.grad.numpy(), orig, X_swap), + "dLdBn2_out": np.moveaxis(self.batchnorm2_out.grad.numpy(), orig, X_swap), + "dLdConv2_out": np.moveaxis(self.conv2_out.grad.numpy(), orig, X_swap), + "dLdBn1_out": np.moveaxis(self.batchnorm1_out.grad.numpy(), orig, X_swap), + "dLdActFn1_out": np.moveaxis(self.act_fn1_out.grad.numpy(), orig, X_swap), + "dLdConv1_out": np.moveaxis(self.act_fn1_out.grad.numpy(), orig, X_swap), + "dLdX": np.moveaxis(self.X.grad.numpy(), orig, X_swap), + # layer parameter gradients (backward step) + "dLdBn2_intercept": self.batchnorm2.bias.grad.numpy(), + "dLdBn2_scaler": self.batchnorm2.weight.grad.numpy(), + "dLdConv2_W": np.moveaxis(self.conv2.weight.grad.numpy(), orig, W_swap), + "dLdConv2_b": self.conv2.bias.grad.numpy().reshape(1, 1, 1, -1), + "dLdBn1_intercept": self.batchnorm1.bias.grad.numpy(), + "dLdBn1_scaler": self.batchnorm1.weight.grad.numpy(), + "dLdConv1_W": np.moveaxis(self.conv1.weight.grad.numpy(), orig, W_swap), + "dLdConv1_b": self.conv1.bias.grad.numpy().reshape(1, 1, 1, -1), + } + return grads + + +class TorchCausalConv1d(torch.nn.Conv1d): + """https://github.com/pytorch/pytorch/issues/1333 + + NB: this is only ensures that the convolution out length is the same as + the input length IFF stride = 1. Otherwise, in/out lengths will differ. + """ + + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + dilation=1, + groups=1, + bias=True, + ): + self.__padding = (kernel_size - 1) * dilation + + super(TorchCausalConv1d, self).__init__( + in_channels, + out_channels, + kernel_size=kernel_size, + stride=stride, + padding=self.__padding, + dilation=dilation, + groups=groups, + bias=bias, + ) + + def forward(self, input): + result = super(TorchCausalConv1d, self).forward(input) + if self.__padding != 0: + return result[:, :, : -self.__padding] + return result + + +class TorchWavenetModule(nn.Module): + def __init__(self, params, hparams, conv_1x1_pad): + super(TorchWavenetModule, self).__init__() + self.conv_dilation = TorchCausalConv1d( + in_channels=hparams["components"]["conv_dilation"]["in_ch"], + out_channels=hparams["components"]["conv_dilation"]["out_ch"], + kernel_size=hparams["components"]["conv_dilation"]["kernel_width"], + stride=hparams["components"]["conv_dilation"]["stride"], + dilation=hparams["components"]["conv_dilation"]["dilation"] + 1, + bias=True, + ) + + self.conv_1x1 = nn.Conv1d( + in_channels=hparams["components"]["conv_1x1"]["in_ch"], + out_channels=hparams["components"]["conv_1x1"]["out_ch"], + kernel_size=hparams["components"]["conv_1x1"]["kernel_width"], + stride=hparams["components"]["conv_1x1"]["stride"], + padding=conv_1x1_pad, + dilation=hparams["components"]["conv_1x1"]["dilation"] + 1, + bias=True, + ) + + W = params["components"]["conv_dilation"]["W"] + b = params["components"]["conv_dilation"]["b"] + # (f[0], n_in, n_out) -> (n_out, n_in, f[0]) + W = np.moveaxis(W, [0, 1, 2], [-1, -2, -3]) + self.conv_dilation.weight = nn.Parameter(torch.FloatTensor(W)) + self.conv_dilation.bias = nn.Parameter(torch.FloatTensor(b.flatten())) + assert self.conv_dilation.weight.shape == W.shape + assert self.conv_dilation.bias.shape == b.flatten().shape + + W = 
params["components"]["conv_1x1"]["W"] + b = params["components"]["conv_1x1"]["b"] + # (f[0], n_in, n_out) -> (n_out, n_in, f[0]) + W = np.moveaxis(W, [0, 1, 2], [-1, -2, -3]) + self.conv_1x1.weight = nn.Parameter(torch.FloatTensor(W)) + self.conv_1x1.bias = nn.Parameter(torch.FloatTensor(b.flatten())) + assert self.conv_1x1.weight.shape == W.shape + assert self.conv_1x1.bias.shape == b.flatten().shape + + def forward(self, X_main, X_skip): + # (N, W, C) -> (N, C, W) + self.X_main = np.moveaxis(X_main, [0, 1, 2], [0, -1, -2]) + self.X_main = torchify(self.X_main) + self.X_main.retain_grad() + + self.conv_dilation_out = self.conv_dilation(self.X_main) + self.conv_dilation_out.retain_grad() + + self.tanh_out = torch.tanh(self.conv_dilation_out) + self.sigm_out = torch.sigmoid(self.conv_dilation_out) + + self.tanh_out.retain_grad() + self.sigm_out.retain_grad() + + self.multiply_gate_out = self.tanh_out * self.sigm_out + self.multiply_gate_out.retain_grad() + + self.conv_1x1_out = self.conv_1x1(self.multiply_gate_out) + self.conv_1x1_out.retain_grad() + + self.X_skip = torch.zeros_like(self.conv_1x1_out) + if X_skip is not None: + self.X_skip = torchify(np.moveaxis(X_skip, [0, 1, 2], [0, -1, -2])) + self.X_skip.retain_grad() + + self.Y_skip = self.X_skip + self.conv_1x1_out + self.Y_main = self.X_main + self.conv_1x1_out + + self.Y_skip.retain_grad() + self.Y_main.retain_grad() + + def extract_grads(self, X_main, X_skip): + self.forward(X_main, X_skip) + self.loss = (self.Y_skip + self.Y_main).sum() + self.loss.backward() + + # W (theirs): (n_out, n_in, f[0]) -> W (mine): (f[0], n_in, n_out) + # X (theirs): (N, C, W) -> X (mine): (N, W, C) + # Y (theirs): (N, C, W) -> Y (mine): (N, W, C) + orig, X_swap, W_swap = [0, 1, 2], [0, -1, -2], [-1, -2, -3] + grads = { + "X_main": np.moveaxis(self.X_main.detach().numpy(), orig, X_swap), + "X_skip": np.moveaxis(self.X_skip.detach().numpy(), orig, X_swap), + "conv_dilation_W": np.moveaxis( + self.conv_dilation.weight.detach().numpy(), orig, W_swap + ), + "conv_dilation_b": self.conv_dilation.bias.detach() + .numpy() + .reshape(1, 1, -1), + "conv_1x1_W": np.moveaxis( + self.conv_1x1.weight.detach().numpy(), orig, W_swap + ), + "conv_1x1_b": self.conv_1x1.bias.detach().numpy().reshape(1, 1, -1), + "conv_dilation_out": np.moveaxis( + self.conv_dilation_out.detach().numpy(), orig, X_swap + ), + "tanh_out": np.moveaxis(self.tanh_out.detach().numpy(), orig, X_swap), + "sigm_out": np.moveaxis(self.sigm_out.detach().numpy(), orig, X_swap), + "multiply_gate_out": np.moveaxis( + self.multiply_gate_out.detach().numpy(), orig, X_swap + ), + "conv_1x1_out": np.moveaxis( + self.conv_1x1_out.detach().numpy(), orig, X_swap + ), + "Y_main": np.moveaxis(self.Y_main.detach().numpy(), orig, X_swap), + "Y_skip": np.moveaxis(self.Y_skip.detach().numpy(), orig, X_swap), + "dLdY_skip": np.moveaxis(self.Y_skip.grad.numpy(), orig, X_swap), + "dLdY_main": np.moveaxis(self.Y_main.grad.numpy(), orig, X_swap), + "dLdConv_1x1_out": np.moveaxis( + self.conv_1x1_out.grad.numpy(), orig, X_swap + ), + "dLdConv_1x1_W": np.moveaxis( + self.conv_1x1.weight.grad.numpy(), orig, W_swap + ), + "dLdConv_1x1_b": self.conv_1x1.bias.grad.numpy().reshape(1, 1, -1), + "dLdMultiply_out": np.moveaxis( + self.multiply_gate_out.grad.numpy(), orig, X_swap + ), + "dLdTanh_out": np.moveaxis(self.tanh_out.grad.numpy(), orig, X_swap), + "dLdSigm_out": np.moveaxis(self.sigm_out.grad.numpy(), orig, X_swap), + "dLdConv_dilation_out": np.moveaxis( + self.conv_dilation_out.grad.numpy(), orig, X_swap + ), + 
"dLdConv_dilation_W": np.moveaxis( + self.conv_dilation.weight.grad.numpy(), orig, W_swap + ), + "dLdConv_dilation_b": self.conv_dilation.bias.grad.numpy().reshape( + 1, 1, -1 + ), + "dLdX_main": np.moveaxis(self.X_main.grad.numpy(), orig, X_swap), + "dLdX_skip": np.moveaxis(self.X_skip.grad.numpy(), orig, X_swap), + } + + return grads + + +class TorchSkipConnectionConv(nn.Module): + def __init__( + self, act_fn, pad1, pad2, pad_skip, params, hparams, momentum=0.9, epsilon=1e-5 + ): + super(TorchSkipConnectionConv, self).__init__() + + self.conv1 = nn.Conv2d( + hparams["in_ch"], + hparams["out_ch1"], + hparams["kernel_shape1"], + padding=pad1, + stride=hparams["stride1"], + bias=True, + ) + + self.act_fn = act_fn + + self.batchnorm1 = nn.BatchNorm2d( + num_features=hparams["out_ch1"], + momentum=1 - momentum, + eps=epsilon, + affine=True, + ) + + self.conv2 = nn.Conv2d( + hparams["out_ch1"], + hparams["out_ch2"], + hparams["kernel_shape2"], + padding=pad2, + stride=hparams["stride2"], + bias=True, + ) + + self.batchnorm2 = nn.BatchNorm2d( + num_features=hparams["out_ch2"], + momentum=1 - momentum, + eps=epsilon, + affine=True, + ) + + self.conv_skip = nn.Conv2d( + hparams["in_ch"], + hparams["out_ch2"], + hparams["kernel_shape_skip"], + padding=pad_skip, + stride=hparams["stride_skip"], + bias=True, + ) + + self.batchnorm_skip = nn.BatchNorm2d( + num_features=hparams["out_ch2"], + momentum=1 - momentum, + eps=epsilon, + affine=True, + ) + + orig, W_swap = [0, 1, 2, 3], [-2, -1, -3, -4] + # (f[0], f[1], n_in, n_out) -> (n_out, n_in, f[0], f[1]) + W = params["components"]["conv1"]["W"] + b = params["components"]["conv1"]["b"] + W = np.moveaxis(W, orig, W_swap) + assert self.conv1.weight.shape == W.shape + assert self.conv1.bias.shape == b.flatten().shape + self.conv1.weight = nn.Parameter(torch.FloatTensor(W)) + self.conv1.bias = nn.Parameter(torch.FloatTensor(b.flatten())) + + scaler = params["components"]["batchnorm1"]["scaler"] + intercept = params["components"]["batchnorm1"]["intercept"] + self.batchnorm1.weight = nn.Parameter(torch.FloatTensor(scaler)) + self.batchnorm1.bias = nn.Parameter(torch.FloatTensor(intercept)) + + # (f[0], f[1], n_in, n_out) -> (n_out, n_in, f[0], f[1]) + W = params["components"]["conv2"]["W"] + b = params["components"]["conv2"]["b"] + W = np.moveaxis(W, orig, W_swap) + assert self.conv2.weight.shape == W.shape + assert self.conv2.bias.shape == b.flatten().shape + self.conv2.weight = nn.Parameter(torch.FloatTensor(W)) + self.conv2.bias = nn.Parameter(torch.FloatTensor(b.flatten())) + + scaler = params["components"]["batchnorm2"]["scaler"] + intercept = params["components"]["batchnorm2"]["intercept"] + self.batchnorm2.weight = nn.Parameter(torch.FloatTensor(scaler)) + self.batchnorm2.bias = nn.Parameter(torch.FloatTensor(intercept)) + + W = params["components"]["conv_skip"]["W"] + b = params["components"]["conv_skip"]["b"] + W = np.moveaxis(W, orig, W_swap) + assert self.conv_skip.weight.shape == W.shape + assert self.conv_skip.bias.shape == b.flatten().shape + self.conv_skip.weight = nn.Parameter(torch.FloatTensor(W)) + self.conv_skip.bias = nn.Parameter(torch.FloatTensor(b.flatten())) + + scaler = params["components"]["batchnorm_skip"]["scaler"] + intercept = params["components"]["batchnorm_skip"]["intercept"] + self.batchnorm_skip.weight = nn.Parameter(torch.FloatTensor(scaler)) + self.batchnorm_skip.bias = nn.Parameter(torch.FloatTensor(intercept)) + + def forward(self, X): + if not isinstance(X, torch.Tensor): + # (N, H, W, C) -> (N, C, H, W) + X = 
np.moveaxis(X, [0, 1, 2, 3], [0, -2, -1, -3]) + X = torchify(X) + + self.X = X + self.X.retain_grad() + + self.conv1_out = self.conv1(self.X) + self.conv1_out.retain_grad() + + self.act_fn1_out = self.act_fn(self.conv1_out) + self.act_fn1_out.retain_grad() + + self.batchnorm1_out = self.batchnorm1(self.act_fn1_out) + self.batchnorm1_out.retain_grad() + + self.conv2_out = self.conv2(self.batchnorm1_out) + self.conv2_out.retain_grad() + + self.batchnorm2_out = self.batchnorm2(self.conv2_out) + self.batchnorm2_out.retain_grad() + + self.c_skip_out = self.conv_skip(self.X) + self.c_skip_out.retain_grad() + + self.bn_skip_out = self.batchnorm_skip(self.c_skip_out) + self.bn_skip_out.retain_grad() + + self.layer3_in = self.batchnorm2_out + self.bn_skip_out + self.layer3_in.retain_grad() + + self.Y = self.act_fn(self.layer3_in) + self.Y.retain_grad() + + def extract_grads(self, X): + self.forward(X) + self.loss = self.Y.sum() + self.loss.backward() + + orig, X_swap, W_swap = [0, 1, 2, 3], [0, -1, -3, -2], [-1, -2, -4, -3] + grads = { + # layer parameters + "conv1_W": np.moveaxis(self.conv1.weight.detach().numpy(), orig, W_swap), + "conv1_b": self.conv1.bias.detach().numpy().reshape(1, 1, 1, -1), + "bn1_intercept": self.batchnorm1.bias.detach().numpy(), + "bn1_scaler": self.batchnorm1.weight.detach().numpy(), + "bn1_running_mean": self.batchnorm1.running_mean.detach().numpy(), + "bn1_running_var": self.batchnorm1.running_var.detach().numpy(), + "conv2_W": np.moveaxis(self.conv2.weight.detach().numpy(), orig, W_swap), + "conv2_b": self.conv2.bias.detach().numpy().reshape(1, 1, 1, -1), + "bn2_intercept": self.batchnorm2.bias.detach().numpy(), + "bn2_scaler": self.batchnorm2.weight.detach().numpy(), + "bn2_running_mean": self.batchnorm2.running_mean.detach().numpy(), + "bn2_running_var": self.batchnorm2.running_var.detach().numpy(), + "conv_skip_W": np.moveaxis( + self.conv_skip.weight.detach().numpy(), orig, W_swap + ), + "conv_skip_b": self.conv_skip.bias.detach().numpy().reshape(1, 1, 1, -1), + "bn_skip_intercept": self.batchnorm_skip.bias.detach().numpy(), + "bn_skip_scaler": self.batchnorm_skip.weight.detach().numpy(), + "bn_skip_running_mean": self.batchnorm_skip.running_mean.detach().numpy(), + "bn_skip_running_var": self.batchnorm_skip.running_var.detach().numpy(), + # layer inputs/outputs (forward step) + "X": np.moveaxis(self.X.detach().numpy(), orig, X_swap), + "conv1_out": np.moveaxis(self.conv1_out.detach().numpy(), orig, X_swap), + "act1_out": np.moveaxis(self.act_fn1_out.detach().numpy(), orig, X_swap), + "bn1_out": np.moveaxis(self.batchnorm1_out.detach().numpy(), orig, X_swap), + "conv2_out": np.moveaxis(self.conv2_out.detach().numpy(), orig, X_swap), + "bn2_out": np.moveaxis(self.batchnorm2_out.detach().numpy(), orig, X_swap), + "conv_skip_out": np.moveaxis( + self.c_skip_out.detach().numpy(), orig, X_swap + ), + "bn_skip_out": np.moveaxis(self.bn_skip_out.detach().numpy(), orig, X_swap), + "add_out": np.moveaxis(self.layer3_in.detach().numpy(), orig, X_swap), + "Y": np.moveaxis(self.Y.detach().numpy(), orig, X_swap), + # layer gradients (backward step) + "dLdY": np.moveaxis(self.Y.grad.numpy(), orig, X_swap), + "dLdAdd": np.moveaxis(self.layer3_in.grad.numpy(), orig, X_swap), + "dLdBnSkip_out": np.moveaxis(self.bn_skip_out.grad.numpy(), orig, X_swap), + "dLdConvSkip_out": np.moveaxis(self.c_skip_out.grad.numpy(), orig, X_swap), + "dLdBn2_out": np.moveaxis(self.batchnorm2_out.grad.numpy(), orig, X_swap), + "dLdConv2_out": np.moveaxis(self.conv2_out.grad.numpy(), orig, X_swap), + 
"dLdBn1_out": np.moveaxis(self.batchnorm1_out.grad.numpy(), orig, X_swap), + "dLdActFn1_out": np.moveaxis(self.act_fn1_out.grad.numpy(), orig, X_swap), + "dLdConv1_out": np.moveaxis(self.act_fn1_out.grad.numpy(), orig, X_swap), + "dLdX": np.moveaxis(self.X.grad.numpy(), orig, X_swap), + # layer parameter gradients (backward step) + "dLdBnSkip_intercept": self.batchnorm_skip.bias.grad.numpy(), + "dLdBnSkip_scaler": self.batchnorm_skip.weight.grad.numpy(), + "dLdConvSkip_W": np.moveaxis( + self.conv_skip.weight.grad.numpy(), orig, W_swap + ), + "dLdConvSkip_b": self.conv_skip.bias.grad.numpy().reshape(1, 1, 1, -1), + "dLdBn2_intercept": self.batchnorm2.bias.grad.numpy(), + "dLdBn2_scaler": self.batchnorm2.weight.grad.numpy(), + "dLdConv2_W": np.moveaxis(self.conv2.weight.grad.numpy(), orig, W_swap), + "dLdConv2_b": self.conv2.bias.grad.numpy().reshape(1, 1, 1, -1), + "dLdBn1_intercept": self.batchnorm1.bias.grad.numpy(), + "dLdBn1_scaler": self.batchnorm1.weight.grad.numpy(), + "dLdConv1_W": np.moveaxis(self.conv1.weight.grad.numpy(), orig, W_swap), + "dLdConv1_b": self.conv1.bias.grad.numpy().reshape(1, 1, 1, -1), + } + return grads + + +class TorchBidirectionalLSTM(nn.Module): + def __init__(self, n_in, n_out, params, **kwargs): + super(TorchBidirectionalLSTM, self).__init__() + + self.layer1 = nn.LSTM( + input_size=n_in, + hidden_size=n_out, + num_layers=1, + bidirectional=True, + bias=True, + ) + + Wiu = params["components"]["cell_fwd"]["Wu"][n_out:, :].T + Wif = params["components"]["cell_fwd"]["Wf"][n_out:, :].T + Wic = params["components"]["cell_fwd"]["Wc"][n_out:, :].T + Wio = params["components"]["cell_fwd"]["Wo"][n_out:, :].T + W_ih_f = np.vstack([Wiu, Wif, Wic, Wio]) + + Whu = params["components"]["cell_fwd"]["Wu"][:n_out, :].T + Whf = params["components"]["cell_fwd"]["Wf"][:n_out, :].T + Whc = params["components"]["cell_fwd"]["Wc"][:n_out, :].T + Who = params["components"]["cell_fwd"]["Wo"][:n_out, :].T + W_hh_f = np.vstack([Whu, Whf, Whc, Who]) + + assert self.layer1.weight_ih_l0.shape == W_ih_f.shape + assert self.layer1.weight_hh_l0.shape == W_hh_f.shape + + self.layer1.weight_ih_l0 = nn.Parameter(torch.FloatTensor(W_ih_f)) + self.layer1.weight_hh_l0 = nn.Parameter(torch.FloatTensor(W_hh_f)) + + Wiu = params["components"]["cell_bwd"]["Wu"][n_out:, :].T + Wif = params["components"]["cell_bwd"]["Wf"][n_out:, :].T + Wic = params["components"]["cell_bwd"]["Wc"][n_out:, :].T + Wio = params["components"]["cell_bwd"]["Wo"][n_out:, :].T + W_ih_b = np.vstack([Wiu, Wif, Wic, Wio]) + + Whu = params["components"]["cell_bwd"]["Wu"][:n_out, :].T + Whf = params["components"]["cell_bwd"]["Wf"][:n_out, :].T + Whc = params["components"]["cell_bwd"]["Wc"][:n_out, :].T + Who = params["components"]["cell_bwd"]["Wo"][:n_out, :].T + W_hh_b = np.vstack([Whu, Whf, Whc, Who]) + + assert self.layer1.weight_ih_l0_reverse.shape == W_ih_b.shape + assert self.layer1.weight_hh_l0_reverse.shape == W_hh_b.shape + + self.layer1.weight_ih_l0_reverse = nn.Parameter(torch.FloatTensor(W_ih_b)) + self.layer1.weight_hh_l0_reverse = nn.Parameter(torch.FloatTensor(W_hh_b)) + + b_f = np.concatenate( + [ + params["components"]["cell_fwd"]["bu"], + params["components"]["cell_fwd"]["bf"], + params["components"]["cell_fwd"]["bc"], + params["components"]["cell_fwd"]["bo"], + ], + axis=-1, + ).flatten() + + assert self.layer1.bias_ih_l0.shape == b_f.shape + assert self.layer1.bias_hh_l0.shape == b_f.shape + + self.layer1.bias_ih_l0 = nn.Parameter(torch.FloatTensor(b_f)) + self.layer1.bias_hh_l0 = 
nn.Parameter(torch.FloatTensor(b_f)) + + b_b = np.concatenate( + [ + params["components"]["cell_bwd"]["bu"], + params["components"]["cell_bwd"]["bf"], + params["components"]["cell_bwd"]["bc"], + params["components"]["cell_bwd"]["bo"], + ], + axis=-1, + ).flatten() + + assert self.layer1.bias_ih_l0_reverse.shape == b_b.shape + assert self.layer1.bias_hh_l0_reverse.shape == b_b.shape + + self.layer1.bias_ih_l0_reverse = nn.Parameter(torch.FloatTensor(b_b)) + self.layer1.bias_hh_l0_reverse = nn.Parameter(torch.FloatTensor(b_b)) + + def forward(self, X): + # (batch, input_size, seq_len) -> (seq_len, batch, input_size) + self.X = np.moveaxis(X, [0, 1, 2], [-2, -1, -3]) + + if not isinstance(self.X, torch.Tensor): + self.X = torchify(self.X) + + self.X.retain_grad() + + # initial hidden state is 0 + n_ex, n_in, n_timesteps = self.X.shape + n_out, n_out = self.layer1.weight_hh_l0.shape + + # forward pass + self.A, (At, Ct) = self.layer1(self.X) + self.A.retain_grad() + return self.A + + def extract_grads(self, X): + self.forward(X) + self.loss = self.A.sum() + self.loss.backward() + + # forward + w_ii, w_if, w_ic, w_io = self.layer1.weight_ih_l0.chunk(4, 0) + w_hi, w_hf, w_hc, w_ho = self.layer1.weight_hh_l0.chunk(4, 0) + bu_f, bf_f, bc_f, bo_f = self.layer1.bias_ih_l0.chunk(4, 0) + + Wu_f = torch.cat([torch.t(w_hi), torch.t(w_ii)], dim=0) + Wf_f = torch.cat([torch.t(w_hf), torch.t(w_if)], dim=0) + Wc_f = torch.cat([torch.t(w_hc), torch.t(w_ic)], dim=0) + Wo_f = torch.cat([torch.t(w_ho), torch.t(w_io)], dim=0) + + dw_ii, dw_if, dw_ic, dw_io = self.layer1.weight_ih_l0.grad.chunk(4, 0) + dw_hi, dw_hf, dw_hc, dw_ho = self.layer1.weight_hh_l0.grad.chunk(4, 0) + dbu_f, dbf_f, dbc_f, dbo_f = self.layer1.bias_ih_l0.grad.chunk(4, 0) + + dWu_f = torch.cat([torch.t(dw_hi), torch.t(dw_ii)], dim=0) + dWf_f = torch.cat([torch.t(dw_hf), torch.t(dw_if)], dim=0) + dWc_f = torch.cat([torch.t(dw_hc), torch.t(dw_ic)], dim=0) + dWo_f = torch.cat([torch.t(dw_ho), torch.t(dw_io)], dim=0) + + # backward + w_ii, w_if, w_ic, w_io = self.layer1.weight_ih_l0_reverse.chunk(4, 0) + w_hi, w_hf, w_hc, w_ho = self.layer1.weight_hh_l0_reverse.chunk(4, 0) + bu_b, bf_b, bc_b, bo_b = self.layer1.bias_ih_l0_reverse.chunk(4, 0) + + Wu_b = torch.cat([torch.t(w_hi), torch.t(w_ii)], dim=0) + Wf_b = torch.cat([torch.t(w_hf), torch.t(w_if)], dim=0) + Wc_b = torch.cat([torch.t(w_hc), torch.t(w_ic)], dim=0) + Wo_b = torch.cat([torch.t(w_ho), torch.t(w_io)], dim=0) + + dw_ii, dw_if, dw_ic, dw_io = self.layer1.weight_ih_l0_reverse.grad.chunk(4, 0) + dw_hi, dw_hf, dw_hc, dw_ho = self.layer1.weight_hh_l0_reverse.grad.chunk(4, 0) + dbu_b, dbf_b, dbc_b, dbo_b = self.layer1.bias_ih_l0_reverse.grad.chunk(4, 0) + + dWu_b = torch.cat([torch.t(dw_hi), torch.t(dw_ii)], dim=0) + dWf_b = torch.cat([torch.t(dw_hf), torch.t(dw_if)], dim=0) + dWc_b = torch.cat([torch.t(dw_hc), torch.t(dw_ic)], dim=0) + dWo_b = torch.cat([torch.t(dw_ho), torch.t(dw_io)], dim=0) + + orig, X_swap = [0, 1, 2], [-1, -3, -2] + grads = { + "X": np.moveaxis(self.X.detach().numpy(), orig, X_swap), + "Wu_f": Wu_f.detach().numpy(), + "Wf_f": Wf_f.detach().numpy(), + "Wc_f": Wc_f.detach().numpy(), + "Wo_f": Wo_f.detach().numpy(), + "bu_f": bu_f.detach().numpy().reshape(-1, 1), + "bf_f": bf_f.detach().numpy().reshape(-1, 1), + "bc_f": bc_f.detach().numpy().reshape(-1, 1), + "bo_f": bo_f.detach().numpy().reshape(-1, 1), + "Wu_b": Wu_b.detach().numpy(), + "Wf_b": Wf_b.detach().numpy(), + "Wc_b": Wc_b.detach().numpy(), + "Wo_b": Wo_b.detach().numpy(), + "bu_b": 
bu_b.detach().numpy().reshape(-1, 1), + "bf_b": bf_b.detach().numpy().reshape(-1, 1), + "bc_b": bc_b.detach().numpy().reshape(-1, 1), + "bo_b": bo_b.detach().numpy().reshape(-1, 1), + "y": np.moveaxis(self.A.detach().numpy(), orig, X_swap), + "dLdA": self.A.grad.numpy(), + "dLdWu_f": dWu_f.numpy(), + "dLdWf_f": dWf_f.numpy(), + "dLdWc_f": dWc_f.numpy(), + "dLdWo_f": dWo_f.numpy(), + "dLdBu_f": dbu_f.numpy().reshape(-1, 1), + "dLdBf_f": dbf_f.numpy().reshape(-1, 1), + "dLdBc_f": dbc_f.numpy().reshape(-1, 1), + "dLdBo_f": dbo_f.numpy().reshape(-1, 1), + "dLdWu_b": dWu_b.numpy(), + "dLdWf_b": dWf_b.numpy(), + "dLdWc_b": dWc_b.numpy(), + "dLdWo_b": dWo_b.numpy(), + "dLdBu_b": dbu_b.numpy().reshape(-1, 1), + "dLdBf_b": dbf_b.numpy().reshape(-1, 1), + "dLdBc_b": dbc_b.numpy().reshape(-1, 1), + "dLdBo_b": dbo_b.numpy().reshape(-1, 1), + "dLdX": np.moveaxis(self.X.grad.numpy(), orig, X_swap), + } + return grads + + +class TorchPool2DLayer(nn.Module): + def __init__(self, in_channels, hparams, **kwargs): + super(TorchPool2DLayer, self).__init__() + + if hparams["mode"] == "max": + self.layer1 = nn.MaxPool2d( + kernel_size=hparams["kernel_shape"], + padding=hparams["pad"], + stride=hparams["stride"], + ) + elif hparams["mode"] == "average": + self.layer1 = nn.AvgPool2d( + kernel_size=hparams["kernel_shape"], + padding=hparams["pad"], + stride=hparams["stride"], + ) + + def forward(self, X): + # (N, H, W, C) -> (N, C, H, W) + self.X = np.moveaxis(X, [0, 1, 2, 3], [0, -2, -1, -3]) + if not isinstance(self.X, torch.Tensor): + self.X = torchify(self.X) + + self.X.retain_grad() + self.Y = self.layer1(self.X) + self.Y.retain_grad() + return self.Y + + def extract_grads(self, X): + self.forward(X) + self.loss = self.Y.sum() + self.loss.backward() + + # W (theirs): (n_out, n_in, f[0], f[1]) -> W (mine): (f[0], f[1], n_in, n_out) + # X (theirs): (N, C, H, W) -> X (mine): (N, H, W, C) + # Y (theirs): (N, C, H, W) -> Y (mine): (N, H, W, C) + orig, X_swap = [0, 1, 2, 3], [0, -1, -3, -2] + grads = { + "X": np.moveaxis(self.X.detach().numpy(), orig, X_swap), + "y": np.moveaxis(self.Y.detach().numpy(), orig, X_swap), + "dLdY": np.moveaxis(self.Y.grad.numpy(), orig, X_swap), + "dLdX": np.moveaxis(self.X.grad.numpy(), orig, X_swap), + } + return grads + + +class TorchConv2DLayer(nn.Module): + def __init__(self, in_channels, out_channels, act_fn, params, hparams, **kwargs): + super(TorchConv2DLayer, self).__init__() + + W = params["W"] + b = params["b"] + self.act_fn = act_fn + + self.layer1 = nn.Conv2d( + in_channels, + out_channels, + hparams["kernel_shape"], + padding=hparams["pad"], + stride=hparams["stride"], + dilation=hparams["dilation"] + 1, + bias=True, + ) + + # (f[0], f[1], n_in, n_out) -> (n_out, n_in, f[0], f[1]) + W = np.moveaxis(W, [0, 1, 2, 3], [-2, -1, -3, -4]) + assert self.layer1.weight.shape == W.shape + assert self.layer1.bias.shape == b.flatten().shape + + self.layer1.weight = nn.Parameter(torch.FloatTensor(W)) + self.layer1.bias = nn.Parameter(torch.FloatTensor(b.flatten())) + + def forward(self, X): + # (N, H, W, C) -> (N, C, H, W) + self.X = np.moveaxis(X, [0, 1, 2, 3], [0, -2, -1, -3]) + if not isinstance(self.X, torch.Tensor): + self.X = torchify(self.X) + + self.X.retain_grad() + + self.Z = self.layer1(self.X) + self.Z.retain_grad() + + self.Y = self.act_fn(self.Z) + self.Y.retain_grad() + return self.Y + + def extract_grads(self, X): + self.forward(X) + self.loss = self.Y.sum() + self.loss.backward() + + # W (theirs): (n_out, n_in, f[0], f[1]) -> W (mine): (f[0], f[1], n_in, n_out) + # X 
(theirs): (N, C, H, W) -> X (mine): (N, H, W, C) + # Y (theirs): (N, C, H, W) -> Y (mine): (N, H, W, C) + orig, X_swap, W_swap = [0, 1, 2, 3], [0, -1, -3, -2], [-1, -2, -4, -3] + grads = { + "X": np.moveaxis(self.X.detach().numpy(), orig, X_swap), + "W": np.moveaxis(self.layer1.weight.detach().numpy(), orig, W_swap), + "b": self.layer1.bias.detach().numpy().reshape(1, 1, 1, -1), + "y": np.moveaxis(self.Y.detach().numpy(), orig, X_swap), + "dLdY": np.moveaxis(self.Y.grad.numpy(), orig, X_swap), + "dLdZ": np.moveaxis(self.Z.grad.numpy(), orig, X_swap), + "dLdW": np.moveaxis(self.layer1.weight.grad.numpy(), orig, W_swap), + "dLdB": self.layer1.bias.grad.numpy().reshape(1, 1, 1, -1), + "dLdX": np.moveaxis(self.X.grad.numpy(), orig, X_swap), + } + return grads + + +class TorchConv1DLayer(nn.Module): + def __init__(self, in_channels, out_channels, act_fn, params, hparams, **kwargs): + super(TorchConv1DLayer, self).__init__() + + W = params["W"] + b = params["b"] + self.act_fn = act_fn + + self.layer1 = nn.Conv1d( + in_channels, + out_channels, + hparams["kernel_width"], + padding=hparams["pad"], + stride=hparams["stride"], + dilation=hparams["dilation"] + 1, + bias=True, + ) + + # (f[0], n_in, n_out) -> (n_out, n_in, f[0]) + W = np.moveaxis(W, [0, 1, 2], [-1, -2, -3]) + assert self.layer1.weight.shape == W.shape + assert self.layer1.bias.shape == b.flatten().shape + + self.layer1.weight = nn.Parameter(torch.FloatTensor(W)) + self.layer1.bias = nn.Parameter(torch.FloatTensor(b.flatten())) + + def forward(self, X): + # (N, W, C) -> (N, C, W) + self.X = np.moveaxis(X, [0, 1, 2], [0, -1, -2]) + if not isinstance(self.X, torch.Tensor): + self.X = torchify(self.X) + + self.X.retain_grad() + + self.Z = self.layer1(self.X) + self.Z.retain_grad() + + self.Y = self.act_fn(self.Z) + self.Y.retain_grad() + return self.Y + + def extract_grads(self, X): + self.forward(X) + self.loss = self.Y.sum() + self.loss.backward() + + # W (theirs): (n_out, n_in, f[0]) -> W (mine): (f[0], n_in, n_out) + # X (theirs): (N, C, W) -> X (mine): (N, W, C) + # Y (theirs): (N, C, W) -> Y (mine): (N, W, C) + orig, X_swap, W_swap = [0, 1, 2], [0, -1, -2], [-1, -2, -3] + grads = { + "X": np.moveaxis(self.X.detach().numpy(), orig, X_swap), + "W": np.moveaxis(self.layer1.weight.detach().numpy(), orig, W_swap), + "b": self.layer1.bias.detach().numpy().reshape(1, 1, -1), + "y": np.moveaxis(self.Y.detach().numpy(), orig, X_swap), + "dLdY": np.moveaxis(self.Y.grad.numpy(), orig, X_swap), + "dLdZ": np.moveaxis(self.Z.grad.numpy(), orig, X_swap), + "dLdW": np.moveaxis(self.layer1.weight.grad.numpy(), orig, W_swap), + "dLdB": self.layer1.bias.grad.numpy().reshape(1, 1, -1), + "dLdX": np.moveaxis(self.X.grad.numpy(), orig, X_swap), + } + return grads + + +class TorchDeconv2DLayer(nn.Module): + def __init__(self, in_channels, out_channels, act_fn, params, hparams, **kwargs): + super(TorchDeconv2DLayer, self).__init__() + + W = params["W"] + b = params["b"] + self.act_fn = act_fn + + self.layer1 = nn.ConvTranspose2d( + in_channels, + out_channels, + hparams["kernel_shape"], + padding=hparams["pad"], + stride=hparams["stride"], + dilation=1, + bias=True, + ) + + # (f[0], f[1], n_in, n_out) -> (n_in, n_out, f[0], f[1]) + W = np.moveaxis(W, [0, 1, 2, 3], [-2, -1, -4, -3]) + assert self.layer1.weight.shape == W.shape + assert self.layer1.bias.shape == b.flatten().shape + + self.layer1.weight = nn.Parameter(torch.FloatTensor(W)) + self.layer1.bias = nn.Parameter(torch.FloatTensor(b.flatten())) + + def forward(self, X): + # (N, H, W, C) -> (N, C, H, 
W) + self.X = np.moveaxis(X, [0, 1, 2, 3], [0, -2, -1, -3]) + if not isinstance(self.X, torch.Tensor): + self.X = torchify(self.X) + + self.X.retain_grad() + + self.Z = self.layer1(self.X) + self.Z.retain_grad() + + self.Y = self.act_fn(self.Z) + self.Y.retain_grad() + return self.Y + + def extract_grads(self, X): + self.forward(X) + self.loss = self.Y.sum() + self.loss.backward() + + # W (theirs): (n_in, n_out, f[0], f[1]) -> W (mine): (f[0], f[1], n_in, n_out) + # X (theirs): (N, C, H, W) -> X (mine): (N, H, W, C) + # Y (theirs): (N, C, H, W) -> Y (mine): (N, H, W, C) + orig, X_swap, W_swap = [0, 1, 2, 3], [0, -1, -3, -2], [-2, -1, -4, -3] + grads = { + "X": np.moveaxis(self.X.detach().numpy(), orig, X_swap), + "W": np.moveaxis(self.layer1.weight.detach().numpy(), orig, W_swap), + "b": self.layer1.bias.detach().numpy().reshape(1, 1, 1, -1), + "y": np.moveaxis(self.Y.detach().numpy(), orig, X_swap), + "dLdY": np.moveaxis(self.Y.grad.numpy(), orig, X_swap), + "dLdZ": np.moveaxis(self.Z.grad.numpy(), orig, X_swap), + "dLdW": np.moveaxis(self.layer1.weight.grad.numpy(), orig, W_swap), + "dLdB": self.layer1.bias.grad.numpy().reshape(1, 1, 1, -1), + "dLdX": np.moveaxis(self.X.grad.numpy(), orig, X_swap), + } + return grads + + +class TorchLSTMCell(nn.Module): + def __init__(self, n_in, n_out, params, **kwargs): + super(TorchLSTMCell, self).__init__() + + Wiu = params["Wu"][n_out:, :].T + Wif = params["Wf"][n_out:, :].T + Wic = params["Wc"][n_out:, :].T + Wio = params["Wo"][n_out:, :].T + W_ih = np.vstack([Wiu, Wif, Wic, Wio]) + + Whu = params["Wu"][:n_out, :].T + Whf = params["Wf"][:n_out, :].T + Whc = params["Wc"][:n_out, :].T + Who = params["Wo"][:n_out, :].T + W_hh = np.vstack([Whu, Whf, Whc, Who]) + + self.layer1 = nn.LSTMCell(input_size=n_in, hidden_size=n_out, bias=True) + assert self.layer1.weight_ih.shape == W_ih.shape + assert self.layer1.weight_hh.shape == W_hh.shape + self.layer1.weight_ih = nn.Parameter(torch.FloatTensor(W_ih)) + self.layer1.weight_hh = nn.Parameter(torch.FloatTensor(W_hh)) + + b = np.concatenate( + [params["bu"], params["bf"], params["bc"], params["bo"]], axis=-1 + ).flatten() + assert self.layer1.bias_ih.shape == b.shape + assert self.layer1.bias_hh.shape == b.shape + self.layer1.bias_ih = nn.Parameter(torch.FloatTensor(b)) + self.layer1.bias_hh = nn.Parameter(torch.FloatTensor(b)) + + def forward(self, X): + self.X = X + if not isinstance(self.X, torch.Tensor): + self.X = torchify(self.X) + + self.X.retain_grad() + + # initial hidden state is 0 + n_ex, n_in, n_timesteps = self.X.shape + n_out, n_out = self.layer1.weight_hh.shape + + # initialize hidden states + a0 = torchify(np.zeros((n_ex, n_out))) + c0 = torchify(np.zeros((n_ex, n_out))) + a0.retain_grad() + c0.retain_grad() + + # forward pass + A, C = [], [] + at = a0 + ct = c0 + for t in range(n_timesteps): + A.append(at) + C.append(ct) + at1, ct1 = self.layer1(self.X[:, :, t], (at, ct)) + at.retain_grad() + ct.retain_grad() + at = at1 + ct = ct1 + + at.retain_grad() + ct.retain_grad() + A.append(at) + C.append(ct) + + # don't inclue a0 in our outputs + self.A = A[1:] + self.C = C[1:] + return self.A, self.C + + def extract_grads(self, X): + self.forward(X) + self.loss = torch.stack(self.A).sum() + self.loss.backward() + + w_ii, w_if, w_ic, w_io = self.layer1.weight_ih.chunk(4, 0) + w_hi, w_hf, w_hc, w_ho = self.layer1.weight_hh.chunk(4, 0) + bu, bf, bc, bo = self.layer1.bias_ih.chunk(4, 0) + + Wu = torch.cat([torch.t(w_hi), torch.t(w_ii)], dim=0) + Wf = torch.cat([torch.t(w_hf), torch.t(w_if)], dim=0) + Wc 
= torch.cat([torch.t(w_hc), torch.t(w_ic)], dim=0) + Wo = torch.cat([torch.t(w_ho), torch.t(w_io)], dim=0) + + dw_ii, dw_if, dw_ic, dw_io = self.layer1.weight_ih.grad.chunk(4, 0) + dw_hi, dw_hf, dw_hc, dw_ho = self.layer1.weight_hh.grad.chunk(4, 0) + dbu, dbf, dbc, dbo = self.layer1.bias_ih.grad.chunk(4, 0) + + dWu = torch.cat([torch.t(dw_hi), torch.t(dw_ii)], dim=0) + dWf = torch.cat([torch.t(dw_hf), torch.t(dw_if)], dim=0) + dWc = torch.cat([torch.t(dw_hc), torch.t(dw_ic)], dim=0) + dWo = torch.cat([torch.t(dw_ho), torch.t(dw_io)], dim=0) + + grads = { + "X": self.X.detach().numpy(), + "Wu": Wu.detach().numpy(), + "Wf": Wf.detach().numpy(), + "Wc": Wc.detach().numpy(), + "Wo": Wo.detach().numpy(), + "bu": bu.detach().numpy().reshape(-1, 1), + "bf": bf.detach().numpy().reshape(-1, 1), + "bc": bc.detach().numpy().reshape(-1, 1), + "bo": bo.detach().numpy().reshape(-1, 1), + "C": torch.stack(self.C).detach().numpy(), + "y": np.swapaxes( + np.swapaxes(torch.stack(self.A).detach().numpy(), 1, 0), 1, 2 + ), + "dLdA": np.array([a.grad.numpy() for a in self.A]), + "dLdWu": dWu.numpy(), + "dLdWf": dWf.numpy(), + "dLdWc": dWc.numpy(), + "dLdWo": dWo.numpy(), + "dLdBu": dbu.numpy().reshape(-1, 1), + "dLdBf": dbf.numpy().reshape(-1, 1), + "dLdBc": dbc.numpy().reshape(-1, 1), + "dLdBo": dbo.numpy().reshape(-1, 1), + "dLdX": self.X.grad.numpy(), + } + return grads + + +class TorchRNNCell(nn.Module): + def __init__(self, n_in, n_hid, params, **kwargs): + super(TorchRNNCell, self).__init__() + + self.layer1 = nn.RNNCell(n_in, n_hid, bias=True, nonlinearity="tanh") + + # set weights and bias to match those of RNNCell + # NB: we pass the *transpose* of the RNNCell weights and biases to + # pytorch, meaning we need to check against the *transpose* of our + # outputs for any function of the weights + self.layer1.weight_ih = nn.Parameter(torch.FloatTensor(params["Wax"].T)) + self.layer1.weight_hh = nn.Parameter(torch.FloatTensor(params["Waa"].T)) + self.layer1.bias_ih = nn.Parameter(torch.FloatTensor(params["bx"].T)) + self.layer1.bias_hh = nn.Parameter(torch.FloatTensor(params["ba"].T)) + + def forward(self, X): + self.X = X + if not isinstance(self.X, torch.Tensor): + self.X = torchify(self.X) + + self.X.retain_grad() + + # initial hidden state is 0 + n_ex, n_in, n_timesteps = self.X.shape + n_out, n_out = self.layer1.weight_hh.shape + + # initialize hidden states + a0 = torchify(np.zeros((n_ex, n_out))) + a0.retain_grad() + + # forward pass + A = [] + at = a0 + for t in range(n_timesteps): + A += [at] + at1 = self.layer1(self.X[:, :, t], at) + at.retain_grad() + at = at1 + + at.retain_grad() + A += [at] + + # don't inclue a0 in our outputs + self.A = A[1:] + return self.A + + def extract_grads(self, X): + self.forward(X) + self.loss = torch.stack(self.A).sum() + self.loss.backward() + grads = { + "X": self.X.detach().numpy(), + "ba": self.layer1.bias_hh.detach().numpy(), + "bx": self.layer1.bias_ih.detach().numpy(), + "Wax": self.layer1.weight_ih.detach().numpy(), + "Waa": self.layer1.weight_hh.detach().numpy(), + "y": torch.stack(self.A).detach().numpy(), + "dLdA": np.array([a.grad.numpy() for a in self.A]), + "dLdWaa": self.layer1.weight_hh.grad.numpy(), + "dLdWax": self.layer1.weight_ih.grad.numpy(), + "dLdBa": self.layer1.bias_hh.grad.numpy(), + "dLdBx": self.layer1.bias_ih.grad.numpy(), + "dLdX": self.X.grad.numpy(), + } + return grads + + +class TorchFCLayer(nn.Module): + def __init__(self, n_in, n_hid, act_fn, params, **kwargs): + super(TorchFCLayer, self).__init__() + self.layer1 = nn.Linear(n_in, 
n_hid) + + # explicitly set weights and bias + # NB: we pass the *transpose* of the weights to pytorch, meaning + # we'll need to check against the *transpose* of our outputs for + # any function of the weights + self.layer1.weight = nn.Parameter(torch.FloatTensor(params["W"].T)) + self.layer1.bias = nn.Parameter(torch.FloatTensor(params["b"])) + + self.act_fn = act_fn + self.model = nn.Sequential(self.layer1, self.act_fn) + + def forward(self, X): + self.X = X + if not isinstance(X, torch.Tensor): + self.X = torchify(X) + + self.z1 = self.layer1(self.X) + self.z1.retain_grad() + + self.out1 = self.act_fn(self.z1) + self.out1.retain_grad() + + def extract_grads(self, X): + self.forward(X) + self.loss1 = self.out1.sum() + self.loss1.backward() + grads = { + "X": self.X.detach().numpy(), + "b": self.layer1.bias.detach().numpy(), + "W": self.layer1.weight.detach().numpy(), + "y": self.out1.detach().numpy(), + "dLdy": self.out1.grad.numpy(), + "dLdZ": self.z1.grad.numpy(), + "dLdB": self.layer1.bias.grad.numpy(), + "dLdW": self.layer1.weight.grad.numpy(), + "dLdX": self.X.grad.numpy(), + } + return grads + + +class TorchEmbeddingLayer(nn.Module): + def __init__(self, vocab_size, n_out, params, **kwargs): + super(TorchEmbeddingLayer, self).__init__() + self.layer1 = nn.Embedding(vocab_size, n_out) + + # explicitly set embedding weights + self.layer1.weight = nn.Parameter(torch.FloatTensor(params["W"])) + self.model = nn.Sequential(self.layer1) + + def forward(self, X): + self.X = X + if not isinstance(X, torch.Tensor): + self.X = torch.from_numpy(X) + + self.out1 = self.layer1(self.X) + self.out1.retain_grad() + + def extract_grads(self, X): + self.forward(X) + self.loss1 = self.out1.sum() + self.loss1.backward() + grads = { + "X": self.X.detach().numpy(), + "W": self.layer1.weight.detach().numpy(), + "y": self.out1.detach().numpy(), + "dLdy": self.out1.grad.numpy(), + "dLdW": self.layer1.weight.grad.numpy(), + } + return grads + + +class TorchSDPAttentionLayer(nn.Module): + def __init__(self): + super(TorchSDPAttentionLayer, self).__init__() + + def forward(self, Q, K, V, mask=None): + self.Q = Q + self.K = K + self.V = V + + if not isinstance(self.Q, torch.Tensor): + self.Q = torchify(self.Q) + if not isinstance(self.K, torch.Tensor): + self.K = torchify(self.K) + if not isinstance(self.V, torch.Tensor): + self.V = torchify(self.V) + + self.Q.retain_grad() + self.K.retain_grad() + self.V.retain_grad() + + self.d_k = self.Q.size(-1) + self.scores = torch.matmul(self.Q, self.K.transpose(-2, -1)) / np.sqrt(self.d_k) + if mask is not None: + self.scores = self.scores.masked_fill(mask == 0, -1e9) + self.scores.retain_grad() + + self.weights = F.softmax(self.scores, dim=-1) + self.weights.retain_grad() + self.Y = torch.matmul(self.weights, self.V) + self.Y.retain_grad() + return self.Y, self.weights + + def extract_grads(self, Q, K, V, mask=None): + self.forward(Q, K, V, mask=mask) + self.loss1 = self.Y.sum() + self.loss1.backward() + grads = { + "Q": self.Q.detach().numpy(), + "K": self.K.detach().numpy(), + "V": self.V.detach().numpy(), + "d_k": self.d_k, + "scores": self.scores.detach().numpy(), + "weights": self.weights.detach().numpy(), + "Y": self.Y.detach().numpy(), + "dLdV": self.V.grad.numpy(), + "dWeights": self.weights.grad.numpy(), + "dScores": self.scores.grad.numpy(), + "dLdQ": self.Q.grad.numpy(), + "dLdK": self.K.grad.numpy(), + } + return grads + + +class TorchMultiHeadedAttentionModule(nn.Module): + def __init__(self, params, hparams): + "Take in model size and number of heads." 
+ super(TorchMultiHeadedAttentionModule, self).__init__() + assert hparams["kqv_dim"] % hparams["n_heads"] == 0 + self.n_heads = hparams["n_heads"] + self.latent_dim = hparams["kqv_dim"] // hparams["n_heads"] + self.p_dropout = hparams["dropout_p"] + self.projections = { + "Q": nn.Linear(hparams["kqv_dim"], hparams["kqv_dim"]), + "K": nn.Linear(hparams["kqv_dim"], hparams["kqv_dim"]), + "V": nn.Linear(hparams["kqv_dim"], hparams["kqv_dim"]), + "O": nn.Linear(hparams["kqv_dim"], hparams["kqv_dim"]), + } + self.projections["Q"].weight = nn.Parameter( + torch.FloatTensor(params["components"]["Q"]["W"].T) + ) + self.projections["Q"].bias = nn.Parameter( + torch.FloatTensor(params["components"]["Q"]["b"]) + ) + self.projections["K"].weight = nn.Parameter( + torch.FloatTensor(params["components"]["K"]["W"].T) + ) + self.projections["K"].bias = nn.Parameter( + torch.FloatTensor(params["components"]["K"]["b"]) + ) + self.projections["V"].weight = nn.Parameter( + torch.FloatTensor(params["components"]["V"]["W"].T) + ) + self.projections["V"].bias = nn.Parameter( + torch.FloatTensor(params["components"]["V"]["b"]) + ) + self.projections["O"].weight = nn.Parameter( + torch.FloatTensor(params["components"]["O"]["W"].T) + ) + self.projections["O"].bias = nn.Parameter( + torch.FloatTensor(params["components"]["O"]["b"]) + ) + + self.attn = None + self.dropout = nn.Dropout(p=hparams["dropout_p"]) + + def forward(self, Q, K, V, mask=None): + self.Q = Q + self.K = K + self.V = V + + if not isinstance(self.Q, torch.Tensor): + self.Q = torchify(self.Q) + if not isinstance(self.K, torch.Tensor): + self.K = torchify(self.K) + if not isinstance(self.V, torch.Tensor): + self.V = torchify(self.V) + + self.Q.retain_grad() + self.K.retain_grad() + self.V.retain_grad() + + if mask is not None: + # Same mask applied to all h heads. + mask = mask.unsqueeze(1) + n_ex = self.Q.size(0) + + self.Q_proj = ( + self.projections["Q"](self.Q) + .view(n_ex, -1, self.n_heads, self.latent_dim) + .transpose(1, 2) + ) + + self.K_proj = ( + self.projections["K"](self.K) + .view(n_ex, -1, self.n_heads, self.latent_dim) + .transpose(1, 2) + ) + + self.V_proj = ( + self.projections["V"](self.V) + .view(n_ex, -1, self.n_heads, self.latent_dim) + .transpose(1, 2) + ) + + self.Q_proj.retain_grad() + self.K_proj.retain_grad() + self.V_proj.retain_grad() + + # 2) Apply attention on all the projected vectors in batch. 
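+ #    Q_proj, K_proj, and V_proj each have shape + #    (n_ex, n_heads, seq_len, latent_dim) after the view/transpose above, + #    so the call below computes softmax(Q K^T / sqrt(latent_dim)) V + #    independently for every head.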
+ self.attn_out, self.attn = TorchSDPAttentionLayer().forward( + self.Q_proj, self.K_proj, self.V_proj, mask=mask + ) + self.attn.retain_grad() + self.attn_out.retain_grad() + + # 3) "Concat" using a view and apply a final linear transformation + self.attn_out_reshaped = ( + self.attn_out.transpose(1, 2) + .contiguous() + .view(n_ex, -1, self.n_heads * self.latent_dim) + ) + self.attn_out_reshaped.retain_grad() + print(self.attn_out_reshaped.shape) + self.Y = self.projections["O"](self.attn_out_reshaped) + print(self.Y.shape) + self.Y.retain_grad() + + def extract_grads(self, Q, K, V, mask=None): + self.forward(Q, K, V, mask=mask) + self.loss1 = self.Y.sum() + self.loss1.backward() + grads = { + "Q": self.Q.detach().numpy(), + "K": self.K.detach().numpy(), + "V": self.V.detach().numpy(), + "O_W": self.projections["O"].weight.detach().numpy().T, + "V_W": self.projections["V"].weight.detach().numpy().T, + "K_W": self.projections["K"].weight.detach().numpy().T, + "Q_W": self.projections["Q"].weight.detach().numpy().T, + "O_b": self.projections["O"].bias.detach().numpy(), + "V_b": self.projections["V"].bias.detach().numpy(), + "K_b": self.projections["K"].bias.detach().numpy(), + "Q_b": self.projections["Q"].bias.detach().numpy(), + "latent_dim": self.latent_dim, + "n_heads": self.n_heads, + "Q_proj": self.Q_proj.detach().numpy(), # .reshape(self.Q_proj.shape[0], -1), + "K_proj": self.K_proj.detach().numpy(), # .reshape(self.K_proj.shape[0], -1), + "V_proj": self.V_proj.detach().numpy(), # .reshape(self.V_proj.shape[0], -1), + "weights": self.attn.detach().numpy(), + "attn_out": self.attn_out_reshaped.detach().numpy(), # .squeeze(), + # .reshape(self.attn_out_reshaped.shape[0], -1), + "Y": self.Y.detach().numpy(), + "dO_W": self.projections["O"].weight.grad.numpy().T, + "dV_W": self.projections["V"].weight.grad.numpy().T, + "dK_W": self.projections["K"].weight.grad.numpy().T, + "dQ_W": self.projections["Q"].weight.grad.numpy().T, + "dO_b": self.projections["O"].bias.grad.numpy(), + "dV_b": self.projections["V"].bias.grad.numpy(), + "dK_b": self.projections["K"].bias.grad.numpy(), + "dQ_b": self.projections["Q"].bias.grad.numpy(), + "dLdy": self.Y.grad.numpy(), + "dAttn_out": self.attn_out_reshaped.grad.numpy(), + "dWeights": self.attn.grad.numpy(), + "dQ_proj": self.Q_proj.grad.numpy(), + "dK_proj": self.K_proj.grad.numpy(), + "dV_proj": self.V_proj.grad.numpy(), + "dQ": self.Q.grad.numpy(), + "dK": self.K.grad.numpy(), + "dV": self.V.grad.numpy(), + } + return grads + + +####################################################################### +# TF WGAN GP Gold Standard Implementation # +# adapted from: https://github.com/igul222/improved_wgan_training/ # +####################################################################### + +_params = {} +_param_aliases = {} + + +def param(name, *args, **kwargs): + """ + A wrapper for `tf.Variable` which enables parameter sharing in models. + + Creates and returns theano shared variables similarly to `tf.Variable`, + except if you try to create a param with the same name as a + previously-created one, `param(...)` will just return the old one instead of + making a new one. + + This constructor also adds a `param` attribute to the shared variables it + creates, so that you can easily search a graph for all params. 
+ """ + + if name not in _params: + kwargs["name"] = name + param = tf.Variable(*args, **kwargs) + param.param = True + _params[name] = param + result = _params[name] + i = 0 + while result in _param_aliases: + i += 1 + result = _param_aliases[result] + return result + + +def params_with_name(name): + return [p for n, p in _params.items() if name in n] + + +def ReLULayer(name, n_in, n_out, inputs, w_initialization): + if isinstance(w_initialization, np.ndarray): + weight_values = w_initialization.astype("float32") + + W = param(name + ".W", weight_values) + result = tf.matmul(inputs, W) + output = tf.nn.bias_add( + result, param(name + ".b", np.zeros((n_out,), dtype="float32")) + ) + output = tf.nn.relu(output) + return output, W + + +def LinearLayer(name, n_in, n_out, inputs, w_initialization): + if isinstance(w_initialization, np.ndarray): + weight_values = w_initialization.astype("float32") + + W = param(name + ".W", weight_values) + result = tf.matmul(inputs, W) + output = tf.nn.bias_add( + result, param(name + ".b", np.zeros((n_out,), dtype="float32")) + ) + return output, W + + +def Generator(n_samples, X_real, params=None): + n_feats = 2 + W1 = W2 = W3 = W4 = "he" + noise = tf.random.normal([n_samples, 2]) + if params is not None: + noise = tf.convert_to_tensor(params["noise"], dtype="float32") + W1 = params["generator"]["FC1"]["W"] + W2 = params["generator"]["FC2"]["W"] + W3 = params["generator"]["FC3"]["W"] + W4 = params["generator"]["FC4"]["W"] + DIM = params["g_hidden"] + n_feats = params["n_in"] + + outs = {} + weights = {} + output, W = ReLULayer("Generator.1", n_feats, DIM, noise, w_initialization=W1) + outs["FC1"] = output + weights["FC1"] = W + output, W = ReLULayer("Generator.2", DIM, DIM, output, w_initialization=W2) + outs["FC2"] = output + weights["FC2"] = W + output, W = ReLULayer("Generator.3", DIM, DIM, output, w_initialization=W3) + outs["FC3"] = output + weights["FC3"] = W + output, W = LinearLayer("Generator.4", DIM, n_feats, output, w_initialization=W4) + outs["FC4"] = output + weights["FC4"] = W + return output, outs, weights + + +def Discriminator(inputs, params=None): + n_feats = 2 + W1 = W2 = W3 = W4 = "he" + if params is not None: + W1 = params["critic"]["FC1"]["W"] + W2 = params["critic"]["FC2"]["W"] + W3 = params["critic"]["FC3"]["W"] + W4 = params["critic"]["FC4"]["W"] + DIM = params["g_hidden"] + n_feats = params["n_in"] + + outs = {} + weights = {} + output, W = ReLULayer("Discriminator.1", n_feats, DIM, inputs, w_initialization=W1) + outs["FC1"] = output + weights["FC1"] = W + + output, W = ReLULayer("Discriminator.2", DIM, DIM, output, w_initialization=W2) + outs["FC2"] = output + weights["FC2"] = W + + output, W = ReLULayer("Discriminator.3", DIM, DIM, output, w_initialization=W3) + outs["FC3"] = output + weights["FC3"] = W + + output, W = LinearLayer("Discriminator.4", DIM, 1, output, w_initialization=W4) + outs["FC4"] = output + weights["FC4"] = W + + # get bias + for var in params_with_name("Discriminator"): + if "1.b:" in var.name: + weights["FC1_b"] = var + elif "2.b:" in var.name: + weights["FC2_b"] = var + elif "3.b:" in var.name: + weights["FC3_b"] = var + elif "4.b:" in var.name: + weights["FC4_b"] = var + + return tf.reshape(output, [-1]), outs, weights + + +def WGAN_GP_tf(X, lambda_, params, batch_size): + tf.compat.v1.disable_eager_execution() + + batch_size = X.shape[0] + + # get alpha value + n_steps = params["n_steps"] + c_updates_per_epoch = params["c_updates_per_epoch"] + alpha = tf.convert_to_tensor(params["alpha"], dtype="float32") 
+ + X_real = tf.compat.v1.placeholder(tf.float32, shape=[None, params["n_in"]]) + X_fake, G_out_X_fake, G_weights = Generator(batch_size, X_real, params) + + Y_real, C_out_Y_real, C_Y_real_weights = Discriminator(X_real, params) + Y_fake, C_out_Y_fake, C_Y_fake_weights = Discriminator(X_fake, params) + + # WGAN loss + mean_fake = tf.reduce_mean(Y_fake) + mean_real = tf.reduce_mean(Y_real) + + C_loss = tf.reduce_mean(Y_fake) - tf.reduce_mean(Y_real) + G_loss = -tf.reduce_mean(Y_fake) + + # WGAN gradient penalty + X_interp = alpha * X_real + ((1 - alpha) * X_fake) + Y_interp, C_out_Y_interp, C_Y_interp_weights = Discriminator(X_interp, params) + gradInterp = tf.gradients(Y_interp, [X_interp])[0] + + norm_gradInterp = tf.sqrt( + tf.compat.v1.reduce_sum(tf.square(gradInterp), reduction_indices=[1]) + ) + gradient_penalty = tf.reduce_mean((norm_gradInterp - 1) ** 2) + C_loss += lambda_ * gradient_penalty + + # extract gradient of Y_interp wrt. each layer output in critic + C_bwd_Y_interp = {} + for k, v in C_out_Y_interp.items(): + C_bwd_Y_interp[k] = tf.gradients(Y_interp, [v])[0] + + C_bwd_W = {} + for k, v in C_Y_interp_weights.items(): + C_bwd_W[k] = tf.gradients(C_loss, [v])[0] + + # get gradients + dC_Y_fake = tf.gradients(C_loss, [Y_fake])[0] + dC_Y_real = tf.gradients(C_loss, [Y_real])[0] + dC_gradInterp = tf.gradients(C_loss, [gradInterp])[0] + dG_Y_fake = tf.gradients(G_loss, [Y_fake])[0] + + with tf.compat.v1.Session() as session: + session.run(tf.compat.v1.global_variables_initializer()) + + for iteration in range(n_steps): + # Train critic + for i in range(c_updates_per_epoch): + _data = X + ( + _alpha, + _X_interp, + _Y_interp, + _gradInterp, + _norm_gradInterp, + _gradient_penalty, + _C_loss, + _X_fake, + _Y_fake, + _Y_real, + _dC_Y_fake, + _dC_Y_real, + _dC_gradInterp, + _dG_Y_fake, + _mean_fake, + _mean_real, + _G_weights_FC1, + _G_weights_FC2, + _G_weights_FC3, + _G_weights_FC4, + _G_fwd_X_fake_FC1, + _G_fwd_X_fake_FC2, + _G_fwd_X_fake_FC3, + _G_fwd_X_fake_FC4, + _C_weights_Y_fake_FC1, + _C_weights_Y_fake_FC2, + _C_weights_Y_fake_FC3, + _C_weights_Y_fake_FC4, + _C_fwd_Y_fake_FC1, + _C_fwd_Y_fake_FC2, + _C_fwd_Y_fake_FC3, + _C_fwd_Y_fake_FC4, + _C_weights_Y_real_FC1, + _C_weights_Y_real_FC2, + _C_weights_Y_real_FC3, + _C_weights_Y_real_FC4, + _C_fwd_Y_real_FC1, + _C_fwd_Y_real_FC2, + _C_fwd_Y_real_FC3, + _C_fwd_Y_real_FC4, + _C_weights_Y_interp_FC1, + _C_weights_Y_interp_FC2, + _C_weights_Y_interp_FC3, + _C_weights_Y_interp_FC4, + _C_dY_interp_wrt_FC1, + _C_dY_interp_wrt_FC2, + _C_dY_interp_wrt_FC3, + _C_dY_interp_wrt_FC4, + _C_fwd_Y_interp_FC1, + _C_fwd_Y_interp_FC2, + _C_fwd_Y_interp_FC3, + _C_fwd_Y_interp_FC4, + _C_dW_FC1, + _C_db_FC1, + _C_dW_FC2, + _C_db_FC2, + _C_dW_FC3, + _C_db_FC3, + _C_dW_FC4, + _C_db_FC4, + ) = session.run( + [ + alpha, + X_interp, + Y_interp, + gradInterp, + norm_gradInterp, + gradient_penalty, + C_loss, + X_fake, + Y_fake, + Y_real, + dC_Y_fake, + dC_Y_real, + dC_gradInterp, + dG_Y_fake, + mean_fake, + mean_real, + G_weights["FC1"], + G_weights["FC2"], + G_weights["FC3"], + G_weights["FC4"], + G_out_X_fake["FC1"], + G_out_X_fake["FC2"], + G_out_X_fake["FC3"], + G_out_X_fake["FC4"], + C_Y_fake_weights["FC1"], + C_Y_fake_weights["FC2"], + C_Y_fake_weights["FC3"], + C_Y_fake_weights["FC4"], + C_out_Y_fake["FC1"], + C_out_Y_fake["FC2"], + C_out_Y_fake["FC3"], + C_out_Y_fake["FC4"], + C_Y_real_weights["FC1"], + C_Y_real_weights["FC2"], + C_Y_real_weights["FC3"], + C_Y_real_weights["FC4"], + C_out_Y_real["FC1"], + C_out_Y_real["FC2"], + 
C_out_Y_real["FC3"], + C_out_Y_real["FC4"], + C_Y_interp_weights["FC1"], + C_Y_interp_weights["FC2"], + C_Y_interp_weights["FC3"], + C_Y_interp_weights["FC4"], + C_bwd_Y_interp["FC1"], + C_bwd_Y_interp["FC2"], + C_bwd_Y_interp["FC3"], + C_bwd_Y_interp["FC4"], + C_out_Y_interp["FC1"], + C_out_Y_interp["FC2"], + C_out_Y_interp["FC3"], + C_out_Y_interp["FC4"], + C_bwd_W["FC1"], + C_bwd_W["FC1_b"], + C_bwd_W["FC2"], + C_bwd_W["FC2_b"], + C_bwd_W["FC3"], + C_bwd_W["FC3_b"], + C_bwd_W["FC4"], + C_bwd_W["FC4_b"], + ], + feed_dict={X_real: _data}, + ) + + _G_loss = session.run(G_loss, feed_dict={X_real: _data}) + + grads = { + "X_real": _data, + "X_interp": _X_interp, + "G_weights_FC1": _G_weights_FC1, + "G_weights_FC2": _G_weights_FC2, + "G_weights_FC3": _G_weights_FC3, + "G_weights_FC4": _G_weights_FC4, + "G_fwd_X_fake_FC1": _G_fwd_X_fake_FC1, + "G_fwd_X_fake_FC2": _G_fwd_X_fake_FC2, + "G_fwd_X_fake_FC3": _G_fwd_X_fake_FC3, + "G_fwd_X_fake_FC4": _G_fwd_X_fake_FC4, + "X_fake": _X_fake, + "C_weights_Y_fake_FC1": _C_weights_Y_fake_FC1, + "C_weights_Y_fake_FC2": _C_weights_Y_fake_FC2, + "C_weights_Y_fake_FC3": _C_weights_Y_fake_FC3, + "C_weights_Y_fake_FC4": _C_weights_Y_fake_FC4, + "C_fwd_Y_fake_FC1": _C_fwd_Y_fake_FC1, + "C_fwd_Y_fake_FC2": _C_fwd_Y_fake_FC2, + "C_fwd_Y_fake_FC3": _C_fwd_Y_fake_FC3, + "C_fwd_Y_fake_FC4": _C_fwd_Y_fake_FC4, + "Y_fake": _Y_fake, + "C_weights_Y_real_FC1": _C_weights_Y_real_FC1, + "C_weights_Y_real_FC2": _C_weights_Y_real_FC2, + "C_weights_Y_real_FC3": _C_weights_Y_real_FC3, + "C_weights_Y_real_FC4": _C_weights_Y_real_FC4, + "C_fwd_Y_real_FC1": _C_fwd_Y_real_FC1, + "C_fwd_Y_real_FC2": _C_fwd_Y_real_FC2, + "C_fwd_Y_real_FC3": _C_fwd_Y_real_FC3, + "C_fwd_Y_real_FC4": _C_fwd_Y_real_FC4, + "Y_real": _Y_real, + "C_weights_Y_interp_FC1": _C_weights_Y_interp_FC1, + "C_weights_Y_interp_FC2": _C_weights_Y_interp_FC2, + "C_weights_Y_interp_FC3": _C_weights_Y_interp_FC3, + "C_weights_Y_interp_FC4": _C_weights_Y_interp_FC4, + "C_fwd_Y_interp_FC1": _C_fwd_Y_interp_FC1, + "C_fwd_Y_interp_FC2": _C_fwd_Y_interp_FC2, + "C_fwd_Y_interp_FC3": _C_fwd_Y_interp_FC3, + "C_fwd_Y_interp_FC4": _C_fwd_Y_interp_FC4, + "Y_interp": _Y_interp, + "dY_interp_wrt_FC1": _C_dY_interp_wrt_FC1, + "dY_interp_wrt_FC2": _C_dY_interp_wrt_FC2, + "dY_interp_wrt_FC3": _C_dY_interp_wrt_FC3, + "dY_interp_wrt_FC4": _C_dY_interp_wrt_FC4, + "gradInterp": _gradInterp, + "gradInterp_norm": _norm_gradInterp, + "G_loss": _G_loss, + "C_loss": _C_loss, + "dC_loss_dW_FC1": _C_dW_FC1, + "dC_loss_db_FC1": _C_db_FC1, + "dC_loss_dW_FC2": _C_dW_FC2, + "dC_loss_db_FC2": _C_db_FC2, + "dC_loss_dW_FC3": _C_dW_FC3, + "dC_loss_db_FC3": _C_db_FC3, + "dC_loss_dW_FC4": _C_dW_FC4, + "dC_loss_db_FC4": _C_db_FC4, + "dC_Y_fake": _dC_Y_fake, + "dC_Y_real": _dC_Y_real, + "dC_gradInterp": _dC_gradInterp, + "dG_Y_fake": _dG_Y_fake, + } + return grads + + +def TFNCELoss(X, target_word, L): + from tensorflow.python.ops.nn_impl import _compute_sampled_logits + from tensorflow.python.ops.nn_impl import sigmoid_cross_entropy_with_logits + + tf.compat.v1.disable_eager_execution() + + in_embed = tf.compat.v1.placeholder(tf.float32, shape=X.shape) + in_bias = tf.compat.v1.placeholder( + tf.float32, shape=L.parameters["b"].flatten().shape + ) + in_weights = tf.compat.v1.placeholder(tf.float32, shape=L.parameters["W"].shape) + in_target_word = tf.compat.v1.placeholder(tf.int64) + in_neg_samples = tf.compat.v1.placeholder(tf.int32) + in_target_prob = tf.compat.v1.placeholder(tf.float32) + in_neg_samp_prob = tf.compat.v1.placeholder(tf.float32) + + # 
in_embed = tf.keras.Input(dtype=tf.float32, shape=X.shape) + # in_bias = tf.keras.Input(dtype=tf.float32, shape=L.parameters["b"].flatten().shape) + # in_weights = tf.keras.Input(dtype=tf.float32, shape=L.parameters["W"].shape) + # in_target_word = tf.keras.Input(dtype=tf.int64, shape=()) + # in_neg_samples = tf.keras.Input(dtype=tf.int32, shape=()) + # in_target_prob = tf.keras.Input(dtype=tf.float32, shape=()) + # in_neg_samp_prob = tf.keras.Input(dtype=tf.float32, shape=()) + + feed = { + in_embed: X, + in_weights: L.parameters["W"], + in_target_word: target_word, + in_bias: L.parameters["b"].flatten(), + in_neg_samples: L.derived_variables["noise_samples"][0], + in_target_prob: L.derived_variables["noise_samples"][1], + in_neg_samp_prob: L.derived_variables["noise_samples"][2], + } + + # Compute the NCE loss, using a sample of the negative labels each time. + nce_unreduced = tf.nn.nce_loss( + weights=in_weights, + biases=in_bias, + labels=in_target_word, + inputs=in_embed, + sampled_values=(in_neg_samples, in_target_prob, in_neg_samp_prob), + num_sampled=L.num_negative_samples, + num_classes=L.n_classes, + ) + + loss = tf.reduce_sum(nce_unreduced) + dLdW = tf.gradients(loss, [in_weights])[0] + dLdb = tf.gradients(loss, [in_bias])[0] + dLdX = tf.gradients(loss, [in_embed])[0] + + sampled_logits, sampled_labels = _compute_sampled_logits( + weights=in_weights, + biases=in_bias, + labels=in_target_word, + inputs=in_embed, + sampled_values=(in_neg_samples, in_target_prob, in_neg_samp_prob), + num_sampled=L.num_negative_samples, + num_classes=L.n_classes, + num_true=1, + subtract_log_q=True, + ) + + sampled_losses = sigmoid_cross_entropy_with_logits( + labels=sampled_labels, logits=sampled_logits + ) + + with tf.compat.v1.Session() as session: + session.run(tf.compat.v1.global_variables_initializer()) + ( + _final_loss, + _nce_unreduced, + _dLdW, + _dLdb, + _dLdX, + _sampled_logits, + _sampled_labels, + _sampled_losses, + ) = session.run( + [ + loss, + nce_unreduced, + dLdW, + dLdb, + dLdX, + sampled_logits, + sampled_labels, + sampled_losses, + ], + feed_dict=feed, + ) + tf.compat.v1.reset_default_graph() + return { + "final_loss": _final_loss, + "nce_unreduced": _nce_unreduced, + "dLdW": _dLdW, + "dLdb": _dLdb, + "dLdX": _dLdX, + "out_logits": _sampled_logits, + "out_labels": _sampled_labels, + "sampled_loss": _sampled_losses, + } diff --git a/numpy_ml/tests/test_ngram.py b/numpy_ml/tests/test_ngram.py new file mode 100644 index 0000000..0fd4252 --- /dev/null +++ b/numpy_ml/tests/test_ngram.py @@ -0,0 +1,254 @@ +# flake8: noqa +import tempfile + +import nltk +import numpy as np + +from ..preprocessing.nlp import tokenize_words +from ..ngram import AdditiveNGram, MLENGram +from ..utils.testing import random_paragraph + + +class MLEGold: + def __init__( + self, N, K=1, unk=True, filter_stopwords=True, filter_punctuation=True + ): + self.N = N + self.K = K + self.unk = unk + self.filter_stopwords = filter_stopwords + self.filter_punctuation = filter_punctuation + + self.hyperparameters = { + "N": N, + "K": K, + "unk": unk, + "filter_stopwords": filter_stopwords, + "filter_punctuation": filter_punctuation, + } + + def train(self, corpus_fp, vocab=None, encoding=None): + N = self.N + H = self.hyperparameters + models, counts = {}, {} + grams = {n: [] for n in range(1, N + 1)} + gg = {n: [] for n in range(1, N + 1)} + filter_punc, filter_stop = H["filter_punctuation"], H["filter_stopwords"] + + n_words = 0 + tokens = set([]) + + with open(corpus_fp, "r", encoding=encoding) as text: + for line in 
text: + words = tokenize_words(line, filter_punc, filter_stop) + + if vocab is not None: + words = vocab.filter(words, H["unk"]) + + if len(words) == 0: + continue + + n_words += len(words) + tokens.update(words) + + # calculate n, n-1, ... 1-grams + for n in range(1, N + 1): + grams[n].append( + nltk.ngrams( + words, + n, + pad_left=True, + pad_right=True, + left_pad_symbol="", + right_pad_symbol="", + ) + ) + + gg[n].extend( + list( + nltk.ngrams( + words, + n, + pad_left=True, + pad_right=True, + left_pad_symbol="", + right_pad_symbol="", + ) + ) + ) + + for n in range(1, N + 1): + counts[n] = nltk.FreqDist(gg[n]) + models[n] = nltk.lm.MLE(order=n) + models[n].fit(grams[n], tokens) + + self.counts = counts + self.n_words = n_words + self._models = models + self.n_tokens = len(vocab) if vocab is not None else len(tokens) + + def log_prob(self, words, N): + assert N in self.counts, "You do not have counts for {}-grams".format(N) + + if N > len(words): + err = "Not enough words for a gram-size of {}: {}".format(N, len(words)) + raise ValueError(err) + + total_prob = 0 + for ngram in nltk.ngrams(words, N): + total_prob += self._log_ngram_prob(ngram) + return total_prob + + def _log_ngram_prob(self, ngram): + N = len(ngram) + return self._models[N].logscore(ngram[-1], ngram[:-1]) + + +class AdditiveGold: + def __init__( + self, N, K=1, unk=True, filter_stopwords=True, filter_punctuation=True + ): + self.N = N + self.K = K + self.unk = unk + self.filter_stopwords = filter_stopwords + self.filter_punctuation = filter_punctuation + + self.hyperparameters = { + "N": N, + "K": K, + "unk": unk, + "filter_stopwords": filter_stopwords, + "filter_punctuation": filter_punctuation, + } + + def train(self, corpus_fp, vocab=None, encoding=None): + N = self.N + H = self.hyperparameters + models, counts = {}, {} + grams = {n: [] for n in range(1, N + 1)} + gg = {n: [] for n in range(1, N + 1)} + filter_punc, filter_stop = H["filter_punctuation"], H["filter_stopwords"] + + n_words = 0 + tokens = set() + + with open(corpus_fp, "r", encoding=encoding) as text: + for line in text: + words = tokenize_words(line, filter_punc, filter_stop) + + if vocab is not None: + words = vocab.filter(words, H["unk"]) + + if len(words) == 0: + continue + + n_words += len(words) + tokens.update(words) + + # calculate n, n-1, ... 
1-grams + for n in range(1, N + 1): + grams[n].append( + nltk.ngrams( + words, + n, + pad_left=True, + pad_right=True, + left_pad_symbol="", + right_pad_symbol="", + ) + ) + + gg[n].extend( + list( + nltk.ngrams( + words, + n, + pad_left=True, + pad_right=True, + left_pad_symbol="", + right_pad_symbol="", + ) + ) + ) + + for n in range(1, N + 1): + counts[n] = nltk.FreqDist(gg[n]) + models[n] = nltk.lm.Lidstone(order=n, gamma=self.K) + models[n].fit(grams[n], tokens) + + self.counts = counts + self._models = models + self.n_words = n_words + self.n_tokens = len(vocab) if vocab is not None else len(tokens) + + def log_prob(self, words, N): + assert N in self.counts, "You do not have counts for {}-grams".format(N) + + if N > len(words): + err = "Not enough words for a gram-size of {}: {}".format(N, len(words)) + raise ValueError(err) + + total_prob = 0 + for ngram in nltk.ngrams(words, N): + total_prob += self._log_ngram_prob(ngram) + return total_prob + + def _log_ngram_prob(self, ngram): + N = len(ngram) + return self._models[N].logscore(ngram[-1], ngram[:-1]) + + +def test_mle(): + N = np.random.randint(2, 5) + gold = MLEGold(N, unk=True, filter_stopwords=False, filter_punctuation=False) + mine = MLENGram(N, unk=True, filter_stopwords=False, filter_punctuation=False) + + with tempfile.NamedTemporaryFile() as temp: + temp.write(bytes(" ".join(random_paragraph(1000)), encoding="utf-8-sig")) + gold.train(temp.name, encoding="utf-8-sig") + mine.train(temp.name, encoding="utf-8-sig") + + for k in mine.counts[N].keys(): + if k[0] == k[1] and k[0] in ("", ""): + continue + + err_str = "{}, mine: {}, gold: {}" + assert mine.counts[N][k] == gold.counts[N][k], err_str.format( + k, mine.counts[N][k], gold.counts[N][k] + ) + + M = mine.log_prob(k, N) + G = gold.log_prob(k, N) / np.log2(np.e) # convert to log base e + np.testing.assert_allclose(M, G) + print("PASSED") + + +def test_additive(): + K = np.random.rand() + N = np.random.randint(2, 5) + gold = AdditiveGold( + N, K, unk=True, filter_stopwords=False, filter_punctuation=False + ) + mine = AdditiveNGram( + N, K, unk=True, filter_stopwords=False, filter_punctuation=False + ) + + with tempfile.NamedTemporaryFile() as temp: + temp.write(bytes(" ".join(random_paragraph(1000)), encoding="utf-8-sig")) + gold.train(temp.name, encoding="utf-8-sig") + mine.train(temp.name, encoding="utf-8-sig") + + for k in mine.counts[N].keys(): + if k[0] == k[1] and k[0] in ("", ""): + continue + + err_str = "{}, mine: {}, gold: {}" + assert mine.counts[N][k] == gold.counts[N][k], err_str.format( + k, mine.counts[N][k], gold.counts[N][k] + ) + + M = mine.log_prob(k, N) + G = gold.log_prob(k, N) / np.log2(np.e) # convert to log base e + np.testing.assert_allclose(M, G) + print("PASSED") diff --git a/numpy_ml/tests/test_nn.py b/numpy_ml/tests/test_nn.py new file mode 100644 index 0000000..1f00680 --- /dev/null +++ b/numpy_ml/tests/test_nn.py @@ -0,0 +1,2447 @@ +# flake8: noqa +import time +from copy import deepcopy + +import numpy as np +from numpy.testing import assert_almost_equal + +from sklearn.metrics import log_loss, mean_squared_error + +# for testing sigmoid +from scipy.special import expit + +import torch +import torch.nn as nn +import torch.nn.functional as F + +import tensorflow.keras.datasets.mnist as mnist + +from numpy_ml.neural_nets.utils import ( + calc_pad_dims_2D, + conv2D_naive, + conv2D, + pad2D, + pad1D, +) +from numpy_ml.utils.testing import ( + random_one_hot_matrix, + random_stochastic_matrix, + random_tensor, +) + +from .nn_torch_models import ( 
+ TFNCELoss, + WGAN_GP_tf, + torch_xe_grad, + torch_mse_grad, + TorchVAELoss, + TorchFCLayer, + TorchRNNCell, + TorchLSTMCell, + TorchAddLayer, + TorchWGANGPLoss, + TorchConv1DLayer, + TorchConv2DLayer, + TorchPool2DLayer, + TorchWavenetModule, + TorchMultiplyLayer, + TorchDeconv2DLayer, + TorchLayerNormLayer, + TorchBatchNormLayer, + TorchEmbeddingLayer, + TorchLinearActivation, + TorchSDPAttentionLayer, + TorchBidirectionalLSTM, + torch_gradient_generator, + TorchSkipConnectionConv, + TorchSkipConnectionIdentity, + TorchMultiHeadedAttentionModule, +) + +####################################################################### +# Debug Formatter # +####################################################################### + + +def err_fmt(params, golds, ix, warn_str=""): + mine, label = params[ix] + err_msg = "-" * 25 + " DEBUG " + "-" * 25 + "\n" + prev_mine, prev_label = params[max(ix - 1, 0)] + err_msg += "Mine (prev) [{}]:\n{}\n\nTheirs (prev) [{}]:\n{}".format( + prev_label, prev_mine, prev_label, golds[prev_label] + ) + err_msg += "\n\nMine [{}]:\n{}\n\nTheirs [{}]:\n{}".format( + label, mine, label, golds[label] + ) + err_msg += warn_str + err_msg += "\n" + "-" * 23 + " END DEBUG " + "-" * 23 + return err_msg + + +####################################################################### +# Loss Functions # +####################################################################### + + +def test_squared_error(N=15): + from numpy_ml.neural_nets.losses import SquaredError + + np.random.seed(12345) + + N = np.inf if N is None else N + + mine = SquaredError() + gold = ( + lambda y, y_pred: mean_squared_error(y, y_pred) + * y_pred.shape[0] + * y_pred.shape[1] + * 0.5 + ) + + # ensure we get 0 when the two arrays are equal + n_dims = np.random.randint(2, 100) + n_examples = np.random.randint(1, 1000) + y = y_pred = random_tensor((n_examples, n_dims)) + assert_almost_equal(mine.loss(y, y_pred), gold(y, y_pred)) + print("PASSED") + + i = 1 + while i < N: + n_dims = np.random.randint(2, 100) + n_examples = np.random.randint(1, 1000) + y = random_tensor((n_examples, n_dims)) + y_pred = random_tensor((n_examples, n_dims)) + assert_almost_equal(mine.loss(y, y_pred), gold(y, y_pred), decimal=5) + print("PASSED") + i += 1 + + +def test_cross_entropy(N=15): + from numpy_ml.neural_nets.losses import CrossEntropy + + np.random.seed(12345) + + N = np.inf if N is None else N + + mine = CrossEntropy() + gold = log_loss + + # ensure we get 0 when the two arrays are equal + n_classes = np.random.randint(2, 100) + n_examples = np.random.randint(1, 1000) + y = y_pred = random_one_hot_matrix(n_examples, n_classes) + assert_almost_equal(mine.loss(y, y_pred), gold(y, y_pred)) + print("PASSED") + + # test on random inputs + i = 1 + while i < N: + n_classes = np.random.randint(2, 100) + n_examples = np.random.randint(1, 1000) + y = random_one_hot_matrix(n_examples, n_classes) + y_pred = random_stochastic_matrix(n_examples, n_classes) + + assert_almost_equal(mine.loss(y, y_pred), gold(y, y_pred, normalize=False)) + print("PASSED") + i += 1 + + +def test_VAE_loss(N=15): + from numpy_ml.neural_nets.losses import VAELoss + + np.random.seed(12345) + + N = np.inf if N is None else N + eps = np.finfo(float).eps + + i = 1 + while i < N: + n_ex = np.random.randint(1, 10) + t_dim = np.random.randint(2, 10) + t_mean = random_tensor([n_ex, t_dim], standardize=True) + t_log_var = np.log(np.abs(random_tensor([n_ex, t_dim], standardize=True) + eps)) + im_cols, im_rows = np.random.randint(2, 40), np.random.randint(2, 40) + X = 
np.random.rand(n_ex, im_rows * im_cols) + X_recon = np.random.rand(n_ex, im_rows * im_cols) + + mine = VAELoss() + mine_loss = mine(X, X_recon, t_mean, t_log_var) + dX_recon, dLogVar, dMean = mine.grad(X, X_recon, t_mean, t_log_var) + golds = TorchVAELoss().extract_grads(X, X_recon, t_mean, t_log_var) + + params = [ + (mine_loss, "loss"), + (dX_recon, "dX_recon"), + (dLogVar, "dt_log_var"), + (dMean, "dt_mean"), + ] + print("\nTrial {}".format(i)) + for ix, (mine, label) in enumerate(params): + np.testing.assert_allclose( + mine, + golds[label], + err_msg=err_fmt(params, golds, ix), + rtol=0.1, + atol=1e-2, + ) + print("\tPASSED {}".format(label)) + i += 1 + + +def test_WGAN_GP_loss(N=5): + from numpy_ml.neural_nets.losses import WGAN_GPLoss + + np.random.seed(12345) + + N = np.inf if N is None else N + + i = 1 + while i < N: + lambda_ = np.random.randint(0, 10) + n_ex = np.random.randint(1, 10) + n_feats = np.random.randint(2, 10) + Y_real = random_tensor([n_ex], standardize=True) + Y_fake = random_tensor([n_ex], standardize=True) + gradInterp = random_tensor([n_ex, n_feats], standardize=True) + + mine = WGAN_GPLoss(lambda_=lambda_) + C_loss = mine(Y_fake, "C", Y_real, gradInterp) + G_loss = mine(Y_fake, "G") + + C_dY_fake, dY_real, dGradInterp = mine.grad(Y_fake, "C", Y_real, gradInterp) + G_dY_fake = mine.grad(Y_fake, "G") + + golds = TorchWGANGPLoss(lambda_).extract_grads(Y_real, Y_fake, gradInterp) + if np.isnan(golds["C_dGradInterp"]).any(): + continue + + params = [ + (Y_real, "Y_real"), + (Y_fake, "Y_fake"), + (gradInterp, "gradInterp"), + (C_loss, "C_loss"), + (G_loss, "G_loss"), + (-dY_real, "C_dY_real"), + (-C_dY_fake, "C_dY_fake"), + (dGradInterp, "C_dGradInterp"), + (G_dY_fake, "G_dY_fake"), + ] + + print("\nTrial {}".format(i)) + for ix, (mine, label) in enumerate(params): + np.testing.assert_allclose( + mine, + golds[label], + err_msg=err_fmt(params, golds, ix), + rtol=0.1, + atol=1e-2, + ) + print("\tPASSED {}".format(label)) + i += 1 + + +def test_NCELoss(N=1): + from numpy_ml.neural_nets.losses import NCELoss + from numpy_ml.utils.data_structures import DiscreteSampler + + np.random.seed(12345) + + N = np.inf if N is None else N + + i = 1 + while i < N + 1: + n_ex = np.random.randint(1, 10) + n_c = np.random.randint(1, 10) + n_out = np.random.randint(1, 300) + vocab_size = np.random.randint(200, 1000) + num_negative_samples = np.random.randint(1, 10) + + embeddings = random_tensor((n_ex, n_c, n_out), standardize=True) + target = np.random.randint(0, vocab_size, (n_ex, 1)) + + probs = np.random.rand(vocab_size) + probs /= probs.sum() + + D = DiscreteSampler(probs, log=False, with_replacement=False) + NCE = NCELoss(vocab_size, D, num_negative_samples) + my_loss, _ = NCE(embeddings, target.flatten()) + + my_dLdX = NCE.grad(update_params=False) + my_dLdW = NCE.gradients["W"] + my_dLdb = NCE.gradients["b"] + + NCE.gradients["W"] = np.zeros_like(NCE.parameters["W"]) + NCE.gradients["b"] = np.zeros_like(NCE.parameters["b"]) + + MY_final_loss, TF_final_loss = 0, 0 + MY_dLdX, TF_dLdX = np.zeros_like(embeddings), np.zeros_like(embeddings) + TF_dLdW, TF_dLdb = ( + np.zeros_like(NCE.parameters["W"]), + np.zeros_like(NCE.parameters["b"]), + ) + + # XXX: instead of calculating the tf NCE on the entire batch, we + # calculate it per-example and then sum. this is really lame and should + # be changed to operate on batches. 
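+ # the loop below therefore re-runs the numpy-ml NCE loss one example at a + # time with a fixed set of noise samples, accumulates the per-example losses + # and gradients, and compares the totals against tf.nn.nce_loss (via + # TFNCELoss) evaluated on the same examples.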
+ nv = NCE.derived_variables["noise_samples"][0] + for ix, emb in enumerate(embeddings): + sv = (nv[0], np.array([nv[1][0, ix]]), nv[2]) + + NCE.X = [] + for k, v in NCE.derived_variables.items(): + NCE.derived_variables[k] = [] + + for k, v in NCE.gradients.items(): + NCE.gradients[k] = np.zeros_like(v) + + my, _ = NCE(emb[None, :, :], target[ix], neg_samples=sv[0]) + + NCE.derived_variables["noise_samples"] = [sv] + dldx = NCE.grad(update_params=False) + NCE.derived_variables["noise_samples"] = sv + + MY_final_loss += my + MY_dLdX[ix, ...] += np.squeeze(dldx, axis=0) + + TF_dict = TFNCELoss(emb, np.array([target[ix]]), NCE) + + TF_loss = TF_dict["final_loss"] + TF_final_loss += TF_loss + TF_dLdX[ix, ...] += TF_dict["dLdX"] + TF_dLdW[TF_dict["dLdW"].indices, :] += TF_dict["dLdW"].values + TF_dLdb[:, TF_dict["dLdb"].indices] += TF_dict["dLdb"].values + + tf_dw = np.zeros_like(NCE.gradients["W"]) + tf_dw[TF_dict["dLdW"].indices, :] += TF_dict["dLdW"].values + + tf_db = np.zeros_like(NCE.gradients["b"]) + tf_db[:, TF_dict["dLdb"].indices] += TF_dict["dLdb"].values + + print("\nTrial {}".format(i)) + np.testing.assert_almost_equal(my_loss, TF_final_loss, decimal=3) + print("PASSED: final loss") + + maps = [ + ("dLdW", my_dLdW, TF_dLdW), + ("dLdb", my_dLdb, TF_dLdb), + ("dLdX", my_dLdX, TF_dLdX), + ] + for (ll, k1, k2) in maps: + np.testing.assert_almost_equal(k1, k2, decimal=2, err_msg=ll) + print("PASSED: {}".format(ll)) + + i += 1 + + +####################################################################### +# Loss Function Gradients # +####################################################################### + + +def test_squared_error_grad(N=15): + from numpy_ml.neural_nets.losses import SquaredError + from numpy_ml.neural_nets.activations import Tanh + + np.random.seed(12345) + + N = np.inf if N is None else N + + mine = SquaredError() + gold = torch_mse_grad + act = Tanh() + + i = 1 + while i < N: + n_dims = np.random.randint(2, 100) + n_examples = np.random.randint(1, 1000) + y = random_tensor((n_examples, n_dims)) + + # raw inputs + z = random_tensor((n_examples, n_dims)) + y_pred = act.fn(z) + + assert_almost_equal( + mine.grad(y, y_pred, z, act), 0.5 * gold(y, z, torch.tanh), decimal=4 + ) + print("PASSED") + i += 1 + + +def test_cross_entropy_grad(N=15): + from numpy_ml.neural_nets.losses import CrossEntropy + from numpy_ml.neural_nets.layers import Softmax + + np.random.seed(12345) + + N = np.inf if N is None else N + + mine = CrossEntropy() + gold = torch_xe_grad + sm = Softmax() + + i = 1 + while i < N: + n_classes = np.random.randint(2, 100) + n_examples = np.random.randint(1, 1000) + + y = random_one_hot_matrix(n_examples, n_classes) + + # the cross_entropy_gradient returns the gradient wrt. 
z (NOT softmax(z)) + z = random_tensor((n_examples, n_classes)) + y_pred = sm.forward(z) + + assert_almost_equal(mine.grad(y, y_pred), gold(y, z), decimal=5) + print("PASSED") + i += 1 + + +####################################################################### +# Activations # +####################################################################### + + +def test_sigmoid_activation(N=15): + from numpy_ml.neural_nets.activations import Sigmoid + + np.random.seed(12345) + + N = np.inf if N is None else N + + mine = Sigmoid() + gold = expit + + i = 0 + while i < N: + n_dims = np.random.randint(1, 100) + z = random_tensor((1, n_dims)) + assert_almost_equal(mine.fn(z), gold(z)) + print("PASSED") + i += 1 + + +def test_elu_activation(N=15): + from numpy_ml.neural_nets.activations import ELU + + np.random.seed(12345) + + N = np.inf if N is None else N + + i = 0 + while i < N: + n_dims = np.random.randint(1, 10) + z = random_tensor((1, n_dims)) + + alpha = np.random.uniform(0, 10) + + mine = ELU(alpha) + gold = lambda z, a: F.elu(torch.from_numpy(z), alpha).numpy() + + assert_almost_equal(mine.fn(z), gold(z, alpha)) + print("PASSED") + i += 1 + + +def test_softmax_activation(N=15): + from numpy_ml.neural_nets.layers import Softmax + + np.random.seed(12345) + + N = np.inf if N is None else N + + mine = Softmax() + gold = lambda z: F.softmax(torch.FloatTensor(z), dim=1).numpy() + + i = 0 + while i < N: + n_dims = np.random.randint(1, 100) + z = random_stochastic_matrix(1, n_dims) + assert_almost_equal(mine.forward(z), gold(z)) + print("PASSED") + i += 1 + + +def test_relu_activation(N=15): + from numpy_ml.neural_nets.activations import ReLU + + np.random.seed(12345) + + N = np.inf if N is None else N + + mine = ReLU() + gold = lambda z: F.relu(torch.FloatTensor(z)).numpy() + + i = 0 + while i < N: + n_dims = np.random.randint(1, 100) + z = random_stochastic_matrix(1, n_dims) + assert_almost_equal(mine.fn(z), gold(z)) + print("PASSED") + i += 1 + + +def test_softplus_activation(N=15): + from numpy_ml.neural_nets.activations import SoftPlus + + np.random.seed(12345) + + N = np.inf if N is None else N + + mine = SoftPlus() + gold = lambda z: F.softplus(torch.FloatTensor(z)).numpy() + + i = 0 + while i < N: + n_dims = np.random.randint(1, 100) + z = random_stochastic_matrix(1, n_dims) + assert_almost_equal(mine.fn(z), gold(z)) + print("PASSED") + i += 1 + + +####################################################################### +# Activation Gradients # +####################################################################### + + +def test_sigmoid_grad(N=15): + from numpy_ml.neural_nets.activations import Sigmoid + + np.random.seed(12345) + + N = np.inf if N is None else N + + mine = Sigmoid() + gold = torch_gradient_generator(torch.sigmoid) + + i = 0 + while i < N: + n_ex = np.random.randint(1, 100) + n_dims = np.random.randint(1, 100) + z = random_tensor((n_ex, n_dims)) + assert_almost_equal(mine.grad(z), gold(z)) + print("PASSED") + i += 1 + + +def test_elu_grad(N=15): + from numpy_ml.neural_nets.activations import ELU + + np.random.seed(12345) + + N = np.inf if N is None else N + + i = 0 + while i < N: + n_ex = np.random.randint(1, 10) + n_dims = np.random.randint(1, 10) + alpha = np.random.uniform(0, 10) + z = random_tensor((n_ex, n_dims)) + + mine = ELU(alpha) + gold = torch_gradient_generator(F.elu, alpha=alpha) + assert_almost_equal(mine.grad(z), gold(z), decimal=5) + print("PASSED") + i += 1 + + +def test_tanh_grad(N=15): + from numpy_ml.neural_nets.activations import Tanh + + 
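+ # the gold standard backprops through torch.tanh; analytically, this checks + # that Tanh().grad(z) matches d/dz tanh(z) = 1 - tanh(z)^2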
np.random.seed(12345) + + N = np.inf if N is None else N + + mine = Tanh() + gold = torch_gradient_generator(torch.tanh) + + i = 0 + while i < N: + n_ex = np.random.randint(1, 100) + n_dims = np.random.randint(1, 100) + z = random_tensor((n_ex, n_dims)) + assert_almost_equal(mine.grad(z), gold(z)) + print("PASSED") + i += 1 + + +def test_relu_grad(N=15): + from numpy_ml.neural_nets.activations import ReLU + + np.random.seed(12345) + + N = np.inf if N is None else N + + mine = ReLU() + gold = torch_gradient_generator(F.relu) + + i = 0 + while i < N: + n_ex = np.random.randint(1, 100) + n_dims = np.random.randint(1, 100) + z = random_tensor((n_ex, n_dims)) + assert_almost_equal(mine.grad(z), gold(z)) + print("PASSED") + i += 1 + + +def test_softmax_grad(N=15): + from numpy_ml.neural_nets.layers import Softmax + from functools import partial + + np.random.seed(12345) + + N = np.inf if N is None else N + p_soft = partial(F.softmax, dim=1) + gold = torch_gradient_generator(p_soft) + + i = 0 + while i < N: + mine = Softmax() + n_ex = np.random.randint(1, 3) + n_dims = np.random.randint(1, 50) + z = random_tensor((n_ex, n_dims), standardize=True) + out = mine.forward(z) + + assert_almost_equal( + gold(z), + mine.backward(np.ones_like(out)), + err_msg="Theirs:\n{}\n\nMine:\n{}\n".format( + gold(z), mine.backward(np.ones_like(out)) + ), + decimal=3, + ) + print("PASSED") + i += 1 + + +def test_softplus_grad(N=15): + from numpy_ml.neural_nets.activations import SoftPlus + + np.random.seed(12345) + + N = np.inf if N is None else N + + mine = SoftPlus() + gold = torch_gradient_generator(F.softplus) + + i = 0 + while i < N: + n_ex = np.random.randint(1, 100) + n_dims = np.random.randint(1, 100) + z = random_tensor((n_ex, n_dims), standardize=True) + assert_almost_equal(mine.grad(z), gold(z)) + print("PASSED") + i += 1 + + +####################################################################### +# Layers # +####################################################################### + + +def test_FullyConnected(N=15): + from numpy_ml.neural_nets.layers import FullyConnected + from numpy_ml.neural_nets.activations import Tanh, ReLU, Sigmoid, Affine + + np.random.seed(12345) + + N = np.inf if N is None else N + + acts = [ + (Tanh(), nn.Tanh(), "Tanh"), + (Sigmoid(), nn.Sigmoid(), "Sigmoid"), + (ReLU(), nn.ReLU(), "ReLU"), + (Affine(), TorchLinearActivation(), "Affine"), + ] + + i = 1 + while i < N + 1: + n_ex = np.random.randint(1, 100) + n_in = np.random.randint(1, 100) + n_out = np.random.randint(1, 100) + X = random_tensor((n_ex, n_in), standardize=True) + + # randomly select an activation function + act_fn, torch_fn, act_fn_name = acts[np.random.randint(0, len(acts))] + + # initialize FC layer + L1 = FullyConnected(n_out=n_out, act_fn=act_fn) + + # forward prop + y_pred = L1.forward(X) + + # backprop + dLdy = np.ones_like(y_pred) + dLdX = L1.backward(dLdy) + + # get gold standard gradients + gold_mod = TorchFCLayer(n_in, n_out, torch_fn, L1.parameters) + golds = gold_mod.extract_grads(X) + + params = [ + (L1.X[0], "X"), + (y_pred, "y"), + (L1.parameters["W"].T, "W"), + (L1.parameters["b"], "b"), + (dLdy, "dLdy"), + (L1.gradients["W"].T, "dLdW"), + (L1.gradients["b"], "dLdB"), + (dLdX, "dLdX"), + ] + + print("\nTrial {}\nact_fn={}".format(i, act_fn_name)) + for ix, (mine, label) in enumerate(params): + assert_almost_equal( + mine, golds[label], err_msg=err_fmt(params, golds, ix), decimal=3 + ) + print("\tPASSED {}".format(label)) + i += 1 + + +def test_Embedding(N=15): + from numpy_ml.neural_nets.layers 
import Embedding + + np.random.seed(12345) + + N = np.inf if N is None else N + + i = 1 + while i < N + 1: + vocab_size = np.random.randint(1, 2000) + n_ex = np.random.randint(1, 100) + n_in = np.random.randint(1, 100) + emb_dim = np.random.randint(1, 100) + + X = np.random.randint(0, vocab_size, (n_ex, n_in)) + + # initialize Embedding layer + L1 = Embedding(n_out=emb_dim, vocab_size=vocab_size) + + # forward prop + y_pred = L1.forward(X) + + # backprop + dLdy = np.ones_like(y_pred) + # dLdX = L1.backward(dLdy) + L1.backward(dLdy) + + # get gold standard gradients + gold_mod = TorchEmbeddingLayer(vocab_size, emb_dim, L1.parameters) + golds = gold_mod.extract_grads(X) + + params = [ + (L1.X[0], "X"), + (y_pred, "y"), + (L1.parameters["W"], "W"), + (dLdy, "dLdy"), + (L1.gradients["W"], "dLdW"), + # (dLdX, "dLdX"), + ] + + print("\nTrial {}".format(i)) + for ix, (mine, label) in enumerate(params): + assert_almost_equal( + mine, golds[label], err_msg=err_fmt(params, golds, ix), decimal=3 + ) + print("\tPASSED {}".format(label)) + i += 1 + + +def test_BatchNorm1D(N=15): + from numpy_ml.neural_nets.layers import BatchNorm1D + + np.random.seed(12345) + + N = np.inf if N is None else N + + np.random.seed(12345) + + i = 1 + while i < N + 1: + n_ex = np.random.randint(2, 1000) + n_in = np.random.randint(1, 1000) + X = random_tensor((n_ex, n_in), standardize=True) + + # initialize BatchNorm1D layer + L1 = BatchNorm1D() + + # forward prop + y_pred = L1.forward(X) + + # backprop + dLdy = np.ones_like(y_pred) + dLdX = L1.backward(dLdy) + + # get gold standard gradients + gold_mod = TorchBatchNormLayer( + n_in, L1.parameters, "1D", epsilon=L1.epsilon, momentum=L1.momentum + ) + golds = gold_mod.extract_grads(X) + + params = [ + (L1.X[0], "X"), + (y_pred, "y"), + (L1.parameters["scaler"].T, "scaler"), + (L1.parameters["intercept"], "intercept"), + (L1.parameters["running_mean"], "running_mean"), + # (L1.parameters["running_var"], "running_var"), + (L1.gradients["scaler"], "dLdScaler"), + (L1.gradients["intercept"], "dLdIntercept"), + (dLdX, "dLdX"), + ] + + print("Trial {}".format(i)) + for ix, (mine, label) in enumerate(params): + assert_almost_equal( + mine, golds[label], err_msg=err_fmt(params, golds, ix), decimal=1 + ) + print("\tPASSED {}".format(label)) + i += 1 + + +def test_LayerNorm1D(N=15): + from numpy_ml.neural_nets.layers import LayerNorm1D + + N = np.inf if N is None else N + + np.random.seed(12345) + + i = 1 + while i < N + 1: + n_ex = np.random.randint(2, 1000) + n_in = np.random.randint(1, 1000) + X = random_tensor((n_ex, n_in), standardize=True) + + # initialize BatchNorm1D layer + L1 = LayerNorm1D() + + # forward prop + y_pred = L1.forward(X) + + # backprop + dLdy = np.ones_like(y_pred) + dLdX = L1.backward(dLdy) + + # get gold standard gradients + gold_mod = TorchLayerNormLayer(n_in, L1.parameters, "1D", epsilon=L1.epsilon) + golds = gold_mod.extract_grads(X) + + params = [ + (L1.X[0], "X"), + (y_pred, "y"), + (L1.parameters["scaler"].T, "scaler"), + (L1.parameters["intercept"], "intercept"), + (L1.gradients["scaler"], "dLdScaler"), + (L1.gradients["intercept"], "dLdIntercept"), + (dLdX, "dLdX"), + ] + + print("Trial {}".format(i)) + for ix, (mine, label) in enumerate(params): + assert_almost_equal( + mine, golds[label], err_msg=err_fmt(params, golds, ix), decimal=3 + ) + print("\tPASSED {}".format(label)) + i += 1 + + +def test_LayerNorm2D(N=15): + from numpy_ml.neural_nets.layers import LayerNorm2D + + N = np.inf if N is None else N + + np.random.seed(12345) + + i = 1 + while i < N 
+ 1: + n_ex = np.random.randint(2, 10) + in_rows = np.random.randint(1, 10) + in_cols = np.random.randint(1, 10) + n_in = np.random.randint(1, 3) + + # initialize LayerNorm2D layer + X = random_tensor((n_ex, in_rows, in_cols, n_in), standardize=True) + L1 = LayerNorm2D() + + # forward prop + y_pred = L1.forward(X) + + # standard sum loss + dLdy = np.ones_like(X) + dLdX = L1.backward(dLdy) + + # get gold standard gradients + gold_mod = TorchLayerNormLayer( + [n_in, in_rows, in_cols], L1.parameters, mode="2D", epsilon=L1.epsilon + ) + golds = gold_mod.extract_grads(X, Y_true=None) + + params = [ + (L1.X[0], "X"), + (L1.hyperparameters["epsilon"], "epsilon"), + (L1.parameters["scaler"], "scaler"), + (L1.parameters["intercept"], "intercept"), + (y_pred, "y"), + (L1.gradients["scaler"], "dLdScaler"), + (L1.gradients["intercept"], "dLdIntercept"), + (dLdX, "dLdX"), + ] + + print("Trial {}".format(i)) + for ix, (mine, label) in enumerate(params): + assert_almost_equal( + mine, golds[label], err_msg=err_fmt(params, golds, ix), decimal=3 + ) + + print("\tPASSED {}".format(label)) + + i += 1 + + +def test_MultiplyLayer(N=15): + from numpy_ml.neural_nets.layers import Multiply + from numpy_ml.neural_nets.activations import Tanh, ReLU, Sigmoid, Affine + + N = np.inf if N is None else N + + np.random.seed(12345) + + acts = [ + (Tanh(), nn.Tanh(), "Tanh"), + (Sigmoid(), nn.Sigmoid(), "Sigmoid"), + (ReLU(), nn.ReLU(), "ReLU"), + (Affine(), TorchLinearActivation(), "Affine"), + ] + + i = 1 + while i < N + 1: + Xs = [] + n_ex = np.random.randint(1, 100) + n_in = np.random.randint(1, 100) + n_entries = np.random.randint(2, 5) + for _ in range(n_entries): + Xs.append(random_tensor((n_ex, n_in), standardize=True)) + + act_fn, torch_fn, act_fn_name = acts[np.random.randint(0, len(acts))] + + # initialize Add layer + L1 = Multiply(act_fn) + + # forward prop + y_pred = L1.forward(Xs) + + # backprop + dLdy = np.ones_like(y_pred) + dLdXs = L1.backward(dLdy) + + # get gold standard gradients + gold_mod = TorchMultiplyLayer(torch_fn) + golds = gold_mod.extract_grads(Xs) + + params = [(Xs, "Xs"), (y_pred, "Y")] + params.extend( + [(dldxi, "dLdX{}".format(i + 1)) for i, dldxi in enumerate(dLdXs)] + ) + + print("\nTrial {}".format(i)) + print("n_ex={}, n_in={}".format(n_ex, n_in)) + print("n_entries={}, act_fn={}".format(n_entries, str(act_fn))) + for ix, (mine, label) in enumerate(params): + assert_almost_equal( + mine, golds[label], err_msg=err_fmt(params, golds, ix), decimal=1 + ) + print("\tPASSED {}".format(label)) + i += 1 + + +def test_AddLayer(N=15): + from numpy_ml.neural_nets.layers import Add + from numpy_ml.neural_nets.activations import Tanh, ReLU, Sigmoid, Affine + + N = np.inf if N is None else N + + np.random.seed(12345) + + acts = [ + (Tanh(), nn.Tanh(), "Tanh"), + (Sigmoid(), nn.Sigmoid(), "Sigmoid"), + (ReLU(), nn.ReLU(), "ReLU"), + (Affine(), TorchLinearActivation(), "Affine"), + ] + + i = 1 + while i < N + 1: + Xs = [] + n_ex = np.random.randint(1, 100) + n_in = np.random.randint(1, 100) + n_entries = np.random.randint(2, 5) + for _ in range(n_entries): + Xs.append(random_tensor((n_ex, n_in), standardize=True)) + + act_fn, torch_fn, act_fn_name = acts[np.random.randint(0, len(acts))] + + # initialize Add layer + L1 = Add(act_fn) + + # forward prop + y_pred = L1.forward(Xs) + + # backprop + dLdy = np.ones_like(y_pred) + dLdXs = L1.backward(dLdy) + + # get gold standard gradients + gold_mod = TorchAddLayer(torch_fn) + golds = gold_mod.extract_grads(Xs) + + params = [(Xs, "Xs"), (y_pred, "Y")] + 
params.extend( + [(dldxi, "dLdX{}".format(i + 1)) for i, dldxi in enumerate(dLdXs)] + ) + + print("\nTrial {}".format(i)) + print("n_ex={}, n_in={}".format(n_ex, n_in)) + print("n_entries={}, act_fn={}".format(n_entries, str(act_fn))) + for ix, (mine, label) in enumerate(params): + assert_almost_equal( + mine, golds[label], err_msg=err_fmt(params, golds, ix), decimal=1 + ) + print("\tPASSED {}".format(label)) + i += 1 + + +def test_BatchNorm2D(N=15): + from numpy_ml.neural_nets.layers import BatchNorm2D + + N = np.inf if N is None else N + + np.random.seed(12345) + + i = 1 + while i < N + 1: + n_ex = np.random.randint(2, 10) + in_rows = np.random.randint(1, 10) + in_cols = np.random.randint(1, 10) + n_in = np.random.randint(1, 3) + + # initialize BatchNorm2D layer + X = random_tensor((n_ex, in_rows, in_cols, n_in), standardize=True) + L1 = BatchNorm2D() + + # forward prop + y_pred = L1.forward(X) + + # standard sum loss + dLdy = np.ones_like(X) + dLdX = L1.backward(dLdy) + + # get gold standard gradients + gold_mod = TorchBatchNormLayer( + n_in, L1.parameters, mode="2D", epsilon=L1.epsilon, momentum=L1.momentum + ) + golds = gold_mod.extract_grads(X, Y_true=None) + + params = [ + (L1.X[0], "X"), + (L1.hyperparameters["momentum"], "momentum"), + (L1.hyperparameters["epsilon"], "epsilon"), + (L1.parameters["scaler"].T, "scaler"), + (L1.parameters["intercept"], "intercept"), + (L1.parameters["running_mean"], "running_mean"), + # (L1.parameters["running_var"], "running_var"), + (y_pred, "y"), + (L1.gradients["scaler"], "dLdScaler"), + (L1.gradients["intercept"], "dLdIntercept"), + (dLdX, "dLdX"), + ] + + print("Trial {}".format(i)) + for ix, (mine, label) in enumerate(params): + assert_almost_equal( + mine, golds[label], err_msg=err_fmt(params, golds, ix), decimal=3 + ) + + print("\tPASSED {}".format(label)) + + i += 1 + + +def test_RNNCell(N=15): + from numpy_ml.neural_nets.layers import RNNCell + + N = np.inf if N is None else N + + np.random.seed(12345) + + i = 1 + while i < N + 1: + n_ex = np.random.randint(1, 10) + n_in = np.random.randint(1, 10) + n_out = np.random.randint(1, 10) + n_t = np.random.randint(1, 10) + X = random_tensor((n_ex, n_in, n_t), standardize=True) + + # initialize RNN layer + L1 = RNNCell(n_out=n_out) + + # forward prop + y_preds = [] + for t in range(n_t): + y_pred = L1.forward(X[:, :, t]) + y_preds += [y_pred] + + # backprop + dLdX = [] + dLdAt = np.ones_like(y_preds[t]) + for t in reversed(range(n_t)): + dLdXt = L1.backward(dLdAt) + dLdX.insert(0, dLdXt) + dLdX = np.dstack(dLdX) + + # get gold standard gradients + gold_mod = TorchRNNCell(n_in, n_out, L1.parameters) + golds = gold_mod.extract_grads(X) + + params = [ + (X, "X"), + (np.array(y_preds), "y"), + (L1.parameters["ba"].T, "ba"), + (L1.parameters["bx"].T, "bx"), + (L1.parameters["Wax"].T, "Wax"), + (L1.parameters["Waa"].T, "Waa"), + (L1.gradients["ba"].T, "dLdBa"), + (L1.gradients["bx"].T, "dLdBx"), + (L1.gradients["Wax"].T, "dLdWax"), + (L1.gradients["Waa"].T, "dLdWaa"), + (dLdX, "dLdX"), + ] + + print("Trial {}".format(i)) + for ix, (mine, label) in enumerate(params): + np.testing.assert_allclose( + mine, + golds[label], + err_msg=err_fmt(params, golds, ix), + atol=1e-3, + rtol=1e-3, + ) + print("\tPASSED {}".format(label)) + i += 1 + + +def test_Conv2D(N=15): + from numpy_ml.neural_nets.layers import Conv2D + from numpy_ml.neural_nets.activations import Tanh, ReLU, Sigmoid, Affine + + N = np.inf if N is None else N + + np.random.seed(12345) + + acts = [ + (Tanh(), nn.Tanh(), "Tanh"), + (Sigmoid(), 
nn.Sigmoid(), "Sigmoid"), + (ReLU(), nn.ReLU(), "ReLU"), + (Affine(), TorchLinearActivation(), "Affine"), + ] + + i = 1 + while i < N + 1: + n_ex = np.random.randint(1, 10) + in_rows = np.random.randint(1, 10) + in_cols = np.random.randint(1, 10) + n_in, n_out = np.random.randint(1, 3), np.random.randint(1, 3) + f_shape = ( + min(in_rows, np.random.randint(1, 5)), + min(in_cols, np.random.randint(1, 5)), + ) + p, s = np.random.randint(0, 5), np.random.randint(1, 3) + d = np.random.randint(0, 5) + + fr, fc = f_shape[0] * (d + 1) - d, f_shape[1] * (d + 1) - d + out_rows = int(1 + (in_rows + 2 * p - fr) / s) + out_cols = int(1 + (in_cols + 2 * p - fc) / s) + + if out_rows <= 0 or out_cols <= 0: + continue + + X = random_tensor((n_ex, in_rows, in_cols, n_in), standardize=True) + + # randomly select an activation function + act_fn, torch_fn, act_fn_name = acts[np.random.randint(0, len(acts))] + + # initialize Conv2D layer + L1 = Conv2D( + out_ch=n_out, + kernel_shape=f_shape, + act_fn=act_fn, + pad=p, + stride=s, + dilation=d, + ) + + # forward prop + y_pred = L1.forward(X) + + # backprop + dLdy = np.ones_like(y_pred) + dLdX = L1.backward(dLdy) + + # get gold standard gradients + gold_mod = TorchConv2DLayer( + n_in, n_out, torch_fn, L1.parameters, L1.hyperparameters + ) + golds = gold_mod.extract_grads(X) + + params = [ + (L1.X[0], "X"), + (y_pred, "y"), + (L1.parameters["W"], "W"), + (L1.parameters["b"], "b"), + (L1.gradients["W"], "dLdW"), + (L1.gradients["b"], "dLdB"), + (dLdX, "dLdX"), + ] + + print("\nTrial {}".format(i)) + print("pad={}, stride={}, f_shape={}, n_ex={}".format(p, s, f_shape, n_ex)) + print("in_rows={}, in_cols={}, n_in={}".format(in_rows, in_cols, n_in)) + print("out_rows={}, out_cols={}, n_out={}".format(out_rows, out_cols, n_out)) + print("dilation={}".format(d)) + for ix, (mine, label) in enumerate(params): + assert_almost_equal( + mine, golds[label], err_msg=err_fmt(params, golds, ix), decimal=4 + ) + print("\tPASSED {}".format(label)) + i += 1 + + +def test_DPAttention(N=15): + from numpy_ml.neural_nets.layers import DotProductAttention + + N = np.inf if N is None else N + + np.random.seed(12345) + + i = 1 + while i < N + 1: + n_ex = np.random.randint(1, 10) + d_k = np.random.randint(1, 100) + d_v = np.random.randint(1, 100) + + Q = random_tensor((n_ex, d_k), standardize=True) + K = random_tensor((n_ex, d_k), standardize=True) + V = random_tensor((n_ex, d_v), standardize=True) + + # initialize DotProductAttention layer + mine = DotProductAttention(scale=True, dropout_p=0) + + # forward prop + y_pred = mine.forward(Q, K, V) + + # backprop + dLdy = np.ones_like(y_pred) + dLdQ, dLdK, dLdV = mine.backward(dLdy) + + # get gold standard gradients + gold_mod = TorchSDPAttentionLayer() + golds = gold_mod.extract_grads(Q, K, V) + + params = [ + (mine.X[0][0], "Q"), + (mine.X[0][1], "K"), + (mine.X[0][2], "V"), + (y_pred, "Y"), + (dLdV, "dLdV"), + (dLdK, "dLdK"), + (dLdQ, "dLdQ"), + ] + + print("\nTrial {}".format(i)) + print("n_ex={} d_k={} d_v={}".format(n_ex, d_k, d_v)) + for ix, (mine, label) in enumerate(params): + assert_almost_equal( + mine, golds[label], err_msg=err_fmt(params, golds, ix), decimal=4 + ) + print("\tPASSED {}".format(label)) + i += 1 + + +def test_Conv1D(N=15): + from numpy_ml.neural_nets.layers import Conv1D + from numpy_ml.neural_nets.activations import Tanh, ReLU, Sigmoid, Affine + + N = np.inf if N is None else N + + np.random.seed(12345) + + acts = [ + (Tanh(), nn.Tanh(), "Tanh"), + (Sigmoid(), nn.Sigmoid(), "Sigmoid"), + (ReLU(), nn.ReLU(), "ReLU"), 
+ (Affine(), TorchLinearActivation(), "Affine"), + ] + + i = 1 + while i < N + 1: + n_ex = np.random.randint(1, 10) + l_in = np.random.randint(1, 10) + n_in, n_out = np.random.randint(1, 3), np.random.randint(1, 3) + f_width = min(l_in, np.random.randint(1, 5)) + p, s = np.random.randint(0, 5), np.random.randint(1, 3) + d = np.random.randint(0, 5) + + fc = f_width * (d + 1) - d + l_out = int(1 + (l_in + 2 * p - fc) / s) + + if l_out <= 0: + continue + + X = random_tensor((n_ex, l_in, n_in), standardize=True) + + # randomly select an activation function + act_fn, torch_fn, act_fn_name = acts[np.random.randint(0, len(acts))] + + # initialize Conv2D layer + L1 = Conv1D( + out_ch=n_out, + kernel_width=f_width, + act_fn=act_fn, + pad=p, + stride=s, + dilation=d, + ) + + # forward prop + y_pred = L1.forward(X) + + # backprop + dLdy = np.ones_like(y_pred) + dLdX = L1.backward(dLdy) + + # get gold standard gradients + gold_mod = TorchConv1DLayer( + n_in, n_out, torch_fn, L1.parameters, L1.hyperparameters + ) + golds = gold_mod.extract_grads(X) + + params = [ + (L1.X[0], "X"), + (y_pred, "y"), + (L1.parameters["W"], "W"), + (L1.parameters["b"], "b"), + (L1.gradients["W"], "dLdW"), + (L1.gradients["b"], "dLdB"), + (dLdX, "dLdX"), + ] + + print("\nTrial {}".format(i)) + print("pad={}, stride={}, f_width={}, n_ex={}".format(p, s, f_width, n_ex)) + print("l_in={}, n_in={}".format(l_in, n_in)) + print("l_out={}, n_out={}".format(l_out, n_out)) + print("dilation={}".format(d)) + for ix, (mine, label) in enumerate(params): + assert_almost_equal( + mine, golds[label], err_msg=err_fmt(params, golds, ix), decimal=4 + ) + print("\tPASSED {}".format(label)) + i += 1 + + +def test_Deconv2D(N=15): + from numpy_ml.neural_nets.layers import Deconv2D + from numpy_ml.neural_nets.activations import Tanh, ReLU, Sigmoid, Affine + + N = np.inf if N is None else N + + np.random.seed(12345) + + acts = [ + (Tanh(), nn.Tanh(), "Tanh"), + (Sigmoid(), nn.Sigmoid(), "Sigmoid"), + (ReLU(), nn.ReLU(), "ReLU"), + (Affine(), TorchLinearActivation(), "Affine"), + ] + + i = 1 + while i < N + 1: + n_ex = np.random.randint(1, 10) + in_rows = np.random.randint(1, 10) + in_cols = np.random.randint(1, 10) + n_in, n_out = np.random.randint(1, 3), np.random.randint(1, 3) + f_shape = ( + min(in_rows, np.random.randint(1, 5)), + min(in_cols, np.random.randint(1, 5)), + ) + p, s = np.random.randint(0, 5), np.random.randint(1, 3) + + out_rows = s * (in_rows - 1) - 2 * p + f_shape[0] + out_cols = s * (in_cols - 1) - 2 * p + f_shape[1] + + if out_rows <= 0 or out_cols <= 0: + continue + + X = random_tensor((n_ex, in_rows, in_cols, n_in), standardize=True) + + # randomly select an activation function + act_fn, torch_fn, act_fn_name = acts[np.random.randint(0, len(acts))] + + # initialize Deconv2D layer + L1 = Deconv2D( + out_ch=n_out, kernel_shape=f_shape, act_fn=act_fn, pad=p, stride=s + ) + + # forward prop + try: + y_pred = L1.forward(X) + except ValueError: + print("Improper dimensions; retrying") + continue + + # backprop + dLdy = np.ones_like(y_pred) + dLdX = L1.backward(dLdy) + + # get gold standard gradients + gold_mod = TorchDeconv2DLayer( + n_in, n_out, torch_fn, L1.parameters, L1.hyperparameters + ) + golds = gold_mod.extract_grads(X) + + params = [ + (L1.X[0], "X"), + (L1.parameters["W"], "W"), + (L1.parameters["b"], "b"), + (y_pred, "y"), + (L1.gradients["W"], "dLdW"), + (L1.gradients["b"], "dLdB"), + (dLdX, "dLdX"), + ] + + print("\nTrial {}".format(i)) + print("pad={}, stride={}, f_shape={}, n_ex={}".format(p, s, f_shape, n_ex)) + 
print("in_rows={}, in_cols={}, n_in={}".format(in_rows, in_cols, n_in)) + print("out_rows={}, out_cols={}, n_out={}".format(out_rows, out_cols, n_out)) + for ix, (mine, label) in enumerate(params): + assert_almost_equal( + mine, golds[label], err_msg=err_fmt(params, golds, ix), decimal=4 + ) + print("\tPASSED {}".format(label)) + i += 1 + + +def test_Pool2D(N=15): + from numpy_ml.neural_nets.layers import Pool2D + + N = np.inf if N is None else N + + np.random.seed(12345) + + i = 1 + while i < N + 1: + n_ex = np.random.randint(1, 10) + in_rows = np.random.randint(1, 10) + in_cols = np.random.randint(1, 10) + n_in = np.random.randint(1, 3) + f_shape = ( + min(in_rows, np.random.randint(1, 5)), + min(in_cols, np.random.randint(1, 5)), + ) + p, s = np.random.randint(0, max(1, min(f_shape) // 2)), np.random.randint(1, 3) + # mode = ["max", "average"][np.random.randint(0, 2)] + mode = "average" + out_rows = int(1 + (in_rows + 2 * p - f_shape[0]) / s) + out_cols = int(1 + (in_cols + 2 * p - f_shape[1]) / s) + + X = random_tensor((n_ex, in_rows, in_cols, n_in), standardize=True) + print("\nmode: {}".format(mode)) + print("pad={}, stride={}, f_shape={}, n_ex={}".format(p, s, f_shape, n_ex)) + print("in_rows={}, in_cols={}, n_in={}".format(in_rows, in_cols, n_in)) + print("out_rows={}, out_cols={}, n_out={}".format(out_rows, out_cols, n_in)) + + # initialize Pool2D layer + L1 = Pool2D(kernel_shape=f_shape, pad=p, stride=s, mode=mode) + + # forward prop + y_pred = L1.forward(X) + + # backprop + dLdy = np.ones_like(y_pred) + dLdX = L1.backward(dLdy) + + # get gold standard gradients + gold_mod = TorchPool2DLayer(n_in, L1.hyperparameters) + golds = gold_mod.extract_grads(X) + + params = [(L1.X[0], "X"), (y_pred, "y"), (dLdX, "dLdX")] + for ix, (mine, label) in enumerate(params): + assert_almost_equal( + mine, golds[label], err_msg=err_fmt(params, golds, ix), decimal=4 + ) + print("\tPASSED {}".format(label)) + i += 1 + + +def test_LSTMCell(N=15): + from numpy_ml.neural_nets.layers import LSTMCell + + N = np.inf if N is None else N + + np.random.seed(12345) + + i = 1 + while i < N + 1: + n_ex = np.random.randint(1, 10) + n_in = np.random.randint(1, 10) + n_out = np.random.randint(1, 10) + n_t = np.random.randint(1, 10) + X = random_tensor((n_ex, n_in, n_t), standardize=True) + + # initialize LSTM layer + L1 = LSTMCell(n_out=n_out) + + # forward prop + Cs = [] + y_preds = [] + for t in range(n_t): + y_pred, Ct = L1.forward(X[:, :, t]) + y_preds.append(y_pred) + Cs.append(Ct) + + # backprop + dLdX = [] + dLdAt = np.ones_like(y_preds[t]) + for t in reversed(range(n_t)): + dLdXt = L1.backward(dLdAt) + dLdX.insert(0, dLdXt) + dLdX = np.dstack(dLdX) + y_preds = np.dstack(y_preds) + Cs = np.array(Cs) + + # get gold standard gradients + gold_mod = TorchLSTMCell(n_in, n_out, L1.parameters) + golds = gold_mod.extract_grads(X) + + params = [ + (X, "X"), + (np.array(Cs), "C"), + (y_preds, "y"), + (L1.parameters["bo"].T, "bo"), + (L1.parameters["bu"].T, "bu"), + (L1.parameters["bf"].T, "bf"), + (L1.parameters["bc"].T, "bc"), + (L1.parameters["Wo"], "Wo"), + (L1.parameters["Wu"], "Wu"), + (L1.parameters["Wf"], "Wf"), + (L1.parameters["Wc"], "Wc"), + (L1.gradients["bo"].T, "dLdBo"), + (L1.gradients["bu"].T, "dLdBu"), + (L1.gradients["bf"].T, "dLdBf"), + (L1.gradients["bc"].T, "dLdBc"), + (L1.gradients["Wo"], "dLdWo"), + (L1.gradients["Wu"], "dLdWu"), + (L1.gradients["Wf"], "dLdWf"), + (L1.gradients["Wc"], "dLdWc"), + (dLdX, "dLdX"), + ] + + print("Case {}".format(i)) + for ix, (mine, label) in enumerate(params): + 
np.testing.assert_allclose( + mine, + golds[label], + err_msg=err_fmt(params, golds, ix), + atol=1e-4, + rtol=1e-4, + ) + + print("\tPASSED {}".format(label)) + i += 1 + + +def grad_check_RNN(model, loss_func, param_name, n_t, X, epsilon=1e-7): + """ + Manual gradient calc for vanilla RNN parameters + """ + if param_name in ["Ba", "Bx"]: + param_name = param_name.lower() + elif param_name in ["X", "y"]: + return None + + param_orig = model.parameters[param_name] + model.flush_gradients() + grads = np.zeros_like(param_orig) + + for flat_ix, val in enumerate(param_orig.flat): + param = deepcopy(param_orig) + md_ix = np.unravel_index(flat_ix, param.shape) + + # plus + y_preds_plus = [] + param[md_ix] = val + epsilon + model.parameters[param_name] = param + for t in range(n_t): + y_pred_plus = model.forward(X[:, :, t]) + y_preds_plus += [y_pred_plus] + loss_plus = loss_func(y_preds_plus) + model.flush_gradients() + + # minus + y_preds_minus = [] + param[md_ix] = val - epsilon + model.parameters[param_name] = param + for t in range(n_t): + y_pred_minus = model.forward(X[:, :, t]) + y_preds_minus += [y_pred_minus] + loss_minus = loss_func(y_preds_minus) + model.flush_gradients() + + grad = (loss_plus - loss_minus) / (2 * epsilon) + grads[md_ix] = grad + return grads.T + + +####################################################################### +# Modules # +####################################################################### + + +def test_MultiHeadedAttentionModule(N=15): + from numpy_ml.neural_nets.modules import MultiHeadedAttentionModule + + N = np.inf if N is None else N + np.random.seed(12345) + + i = 1 + while i < N + 1: + n_ex = np.random.randint(1, 10) + latent_dim = np.random.randint(1, 20) + n_heads = np.random.randint(2, 10) + d_k = d_v = n_heads * latent_dim + + Q = random_tensor((n_ex, d_k), standardize=True) + K = random_tensor((n_ex, d_k), standardize=True) + V = random_tensor((n_ex, d_v), standardize=True) + + mine = MultiHeadedAttentionModule(n_heads=n_heads, dropout_p=0) + + # forward prop + y_pred = mine.forward(Q, K, V) + + # backprop + dLdy = np.ones_like(y_pred) + dLdQ, dLdK, dLdV = mine.backward(dLdy) + + # get gold standard gradients + params = mine.parameters + hparams = mine.hyperparameters + gold_mod = TorchMultiHeadedAttentionModule(params, hparams) + golds = gold_mod.extract_grads(Q, K, V) + + dv = mine.derived_variables + params = mine.parameters["components"] + grads = mine.gradients["components"] + params = [ + (Q, "Q"), + (K, "K"), + (V, "V"), + (mine.n_heads, "n_heads"), + (mine.latent_dim, "latent_dim"), + (params["O"]["W"], "O_W"), + (params["K"]["W"], "K_W"), + (params["V"]["W"], "V_W"), + (params["Q"]["W"], "Q_W"), + (params["O"]["b"], "O_b"), + (params["K"]["b"], "K_b"), + (params["V"]["b"], "V_b"), + (params["Q"]["b"], "Q_b"), + (dv["Q_proj"], "Q_proj"), + (dv["K_proj"], "K_proj"), + (dv["V_proj"], "V_proj"), + (dv["attention_weights"][0], "weights"), + (dv["attention_out"], "attn_out"), + (y_pred, "Y"), + (dLdy, "dLdy"), + (dv["dQ_proj"], "dQ_proj"), + (dv["dK_proj"], "dK_proj"), + (dv["dV_proj"], "dV_proj"), + (grads["O"]["W"], "dO_W"), + (grads["V"]["W"], "dV_W"), + (grads["K"]["W"], "dK_W"), + (grads["Q"]["W"], "dQ_W"), + (grads["O"]["b"], "dO_b"), + (grads["V"]["b"], "dV_b"), + (grads["K"]["b"], "dK_b"), + (grads["Q"]["b"], "dQ_b"), + (dLdQ, "dQ"), + (dLdK, "dK"), + (dLdV, "dV"), + ] + + print("\nTrial {}".format(i)) + print( + "n_ex={} d_k=d_v={} latent_dim={} n_heads={}".format( + n_ex, d_k, latent_dim, n_heads + ) + ) + for ix, (mine, label) in 
enumerate(params): + assert_almost_equal( + mine, golds[label], err_msg=err_fmt(params, golds, ix), decimal=4 + ) + print("\tPASSED {}".format(label)) + i += 1 + + +def test_SkipConnectionIdentityModule(N=15): + from numpy_ml.neural_nets.modules import SkipConnectionIdentityModule + from numpy_ml.neural_nets.activations import Tanh, ReLU, Sigmoid, Affine + + N = np.inf if N is None else N + + np.random.seed(12345) + + acts = [ + (Tanh(), nn.Tanh(), "Tanh"), + (Sigmoid(), nn.Sigmoid(), "Sigmoid"), + (ReLU(), nn.ReLU(), "ReLU"), + (Affine(), TorchLinearActivation(), "Affine"), + ] + + i = 1 + while i < N + 1: + n_ex = np.random.randint(2, 10) + in_rows = np.random.randint(2, 25) + in_cols = np.random.randint(2, 25) + n_in = np.random.randint(2, 5) + n_out = n_in + f_shape1 = ( + min(in_rows, np.random.randint(1, 5)), + min(in_cols, np.random.randint(1, 5)), + ) + f_shape2 = ( + min(in_rows, np.random.randint(1, 5)), + min(in_cols, np.random.randint(1, 5)), + ) + s1 = np.random.randint(1, 5) + s2 = np.random.randint(1, 5) + + # randomly select an activation function + act_fn, torch_fn, act_fn_name = acts[np.random.randint(0, len(acts))] + + X = random_tensor((n_ex, in_rows, in_cols, n_in), standardize=True) + + p1 = calc_pad_dims_2D(X.shape, X.shape[1:3], f_shape1, s1) + if p1[0] != p1[1] or p1[2] != p1[3]: + continue + + p2 = calc_pad_dims_2D(X.shape, X.shape[1:3], f_shape2, s2) + if p2[0] != p2[1] or p2[2] != p2[3]: + continue + + p1 = (p1[0], p1[2]) + p2 = (p2[0], p2[2]) + + # initialize SkipConnectionIdentity module + L1 = SkipConnectionIdentityModule( + out_ch=n_out, + kernel_shape1=f_shape1, + kernel_shape2=f_shape2, + stride1=s1, + stride2=s2, + act_fn=act_fn, + epsilon=1e-5, + momentum=0.9, + ) + + # forward prop + y_pred = L1.forward(X) + + # backprop + dLdy = np.ones_like(y_pred) + dLdX = L1.backward(dLdy) + + # get gold standard gradients + gold_mod = TorchSkipConnectionIdentity( + torch_fn, + p1, + p2, + L1.parameters, + L1.hyperparameters, + momentum=L1.momentum, + epsilon=L1.epsilon, + ) + golds = gold_mod.extract_grads(X) + + params = L1.parameters["components"] + grads = L1.gradients["components"] + params = [ + (X, "X"), + (params["conv1"]["W"], "conv1_W"), + (params["conv1"]["b"], "conv1_b"), + (params["batchnorm1"]["scaler"].T, "bn1_scaler"), + (params["batchnorm1"]["intercept"], "bn1_intercept"), + (params["batchnorm1"]["running_mean"], "bn1_running_mean"), + # (params["batchnorm1"]["running_var"], "bn1_running_var"), + (params["conv2"]["W"], "conv2_W"), + (params["conv2"]["b"], "conv2_b"), + (params["batchnorm2"]["scaler"].T, "bn2_scaler"), + (params["batchnorm2"]["intercept"], "bn2_intercept"), + (params["batchnorm2"]["running_mean"], "bn2_running_mean"), + # (params["batchnorm2"]["running_var"], "bn2_running_var"), + (L1._dv["conv1_out"], "act1_out"), + (L1._dv["batchnorm1_out"], "bn1_out"), + (L1._dv["conv2_out"], "conv2_out"), + (L1._dv["batchnorm2_out"], "bn2_out"), + (y_pred, "Y"), + (dLdy, "dLdY"), + (L1.derived_variables["dLdBn2"], "dLdBn2_out"), + (L1.derived_variables["dLdConv2"], "dLdConv2_out"), + (L1.derived_variables["dLdBn1"], "dLdBn1_out"), + (L1.derived_variables["dLdConv1"], "dLdActFn1_out"), + (dLdX, "dLdX"), + (grads["batchnorm2"]["scaler"].T, "dLdBn2_scaler"), + (grads["batchnorm2"]["intercept"], "dLdBn2_intercept"), + (grads["conv2"]["W"], "dLdConv2_W"), + (grads["conv2"]["b"], "dLdConv2_b"), + (grads["batchnorm1"]["scaler"].T, "dLdBn1_scaler"), + (grads["batchnorm1"]["intercept"], "dLdBn1_intercept"), + (grads["conv1"]["W"], "dLdConv1_W"), + 
(grads["conv1"]["b"], "dLdConv1_b"), + ] + + print("\nTrial {}".format(i)) + print("act_fn={}, n_ex={}".format(act_fn, n_ex)) + print("in_rows={}, in_cols={}, n_in={}".format(in_rows, in_cols, n_in)) + print("pad1={}, stride1={}, f_shape1={}".format(p1, s1, f_shape1)) + print("pad2={}, stride2={}, f_shape2={}".format(p2, s2, f_shape2)) + for ix, (mine, label) in enumerate(params): + assert_almost_equal( + mine, golds[label], err_msg=err_fmt(params, golds, ix), decimal=2 + ) + print("\tPASSED {}".format(label)) + i += 1 + + +def test_SkipConnectionConvModule(N=15): + from numpy_ml.neural_nets.modules import SkipConnectionConvModule + from numpy_ml.neural_nets.activations import Tanh, ReLU, Sigmoid, Affine + + N = np.inf if N is None else N + + np.random.seed(12345) + + acts = [ + (Tanh(), nn.Tanh(), "Tanh"), + (Sigmoid(), nn.Sigmoid(), "Sigmoid"), + (ReLU(), nn.ReLU(), "ReLU"), + (Affine(), TorchLinearActivation(), "Affine"), + ] + + i = 1 + while i < N + 1: + n_ex = np.random.randint(2, 10) + in_rows = np.random.randint(2, 10) + in_cols = np.random.randint(2, 10) + n_in = np.random.randint(2, 5) + n_out1 = np.random.randint(2, 5) + n_out2 = np.random.randint(2, 5) + f_shape1 = ( + min(in_rows, np.random.randint(1, 5)), + min(in_cols, np.random.randint(1, 5)), + ) + f_shape2 = ( + min(in_rows, np.random.randint(1, 5)), + min(in_cols, np.random.randint(1, 5)), + ) + f_shape_skip = ( + min(in_rows, np.random.randint(1, 5)), + min(in_cols, np.random.randint(1, 5)), + ) + + s1 = np.random.randint(1, 5) + s2 = np.random.randint(1, 5) + s_skip = np.random.randint(1, 5) + + # randomly select an activation function + act_fn, torch_fn, act_fn_name = acts[np.random.randint(0, len(acts))] + + X = random_tensor((n_ex, in_rows, in_cols, n_in), standardize=True) + + p1 = (np.random.randint(1, 5), np.random.randint(1, 5)) + p2 = (np.random.randint(1, 5), np.random.randint(1, 5)) + + # initialize SkipConnectionConv module + L1 = SkipConnectionConvModule( + out_ch1=n_out1, + out_ch2=n_out2, + kernel_shape1=f_shape1, + kernel_shape2=f_shape2, + kernel_shape_skip=f_shape_skip, + stride1=s1, + stride2=s2, + stride_skip=s_skip, + pad1=p1, + pad2=p2, + act_fn=act_fn, + epsilon=1e-5, + momentum=0.9, + ) + + # forward prop + try: + y_pred = L1.forward(X) + except (ValueError, AssertionError): + print("Invalid padding; Retrying") + continue + + ps = L1.hyperparameters["pad_skip"] + if ps[0] != ps[1] or ps[2] != ps[3]: + continue + pad_skip = (ps[0], ps[2]) + + # backprop + dLdy = np.ones_like(y_pred) + dLdX = L1.backward(dLdy) + + # get gold standard gradients + gold_mod = TorchSkipConnectionConv( + torch_fn, + p1, + p2, + pad_skip, + L1.parameters, + L1.hyperparameters, + momentum=L1.momentum, + epsilon=L1.epsilon, + ) + golds = gold_mod.extract_grads(X) + + params = L1.parameters["components"] + grads = L1.gradients["components"] + params = [ + (X, "X"), + (params["conv1"]["W"], "conv1_W"), + (params["conv1"]["b"], "conv1_b"), + (params["batchnorm1"]["scaler"].T, "bn1_scaler"), + (params["batchnorm1"]["intercept"], "bn1_intercept"), + (params["batchnorm1"]["running_mean"], "bn1_running_mean"), + # (params["batchnorm1"]["running_var"], "bn1_running_var"), + (params["conv2"]["W"], "conv2_W"), + (params["conv2"]["b"], "conv2_b"), + (params["batchnorm2"]["scaler"].T, "bn2_scaler"), + (params["batchnorm2"]["intercept"], "bn2_intercept"), + (params["batchnorm2"]["running_mean"], "bn2_running_mean"), + # (params["batchnorm2"]["running_var"], "bn2_running_var"), + (params["conv_skip"]["W"], "conv_skip_W"), + 
(params["conv_skip"]["b"], "conv_skip_b"), + (params["batchnorm_skip"]["scaler"].T, "bn_skip_scaler"), + (params["batchnorm_skip"]["intercept"], "bn_skip_intercept"), + (params["batchnorm_skip"]["running_mean"], "bn_skip_running_mean"), + # (params["batchnorm_skip"]["running_var"], "bn_skip_running_var"), + (L1._dv["conv1_out"], "act1_out"), + (L1._dv["batchnorm1_out"], "bn1_out"), + (L1._dv["conv2_out"], "conv2_out"), + (L1._dv["batchnorm2_out"], "bn2_out"), + (L1._dv["conv_skip_out"], "conv_skip_out"), + (L1._dv["batchnorm_skip_out"], "bn_skip_out"), + (y_pred, "Y"), + (dLdy, "dLdY"), + (L1.derived_variables["dLdBn2"], "dLdBn2_out"), + (L1.derived_variables["dLdConv2"], "dLdConv2_out"), + (L1.derived_variables["dLdBnSkip"], "dLdBnSkip_out"), + (L1.derived_variables["dLdConvSkip"], "dLdConvSkip_out"), + (L1.derived_variables["dLdBn1"], "dLdBn1_out"), + (L1.derived_variables["dLdConv1"], "dLdActFn1_out"), + (dLdX, "dLdX"), + (grads["batchnorm_skip"]["scaler"].T, "dLdBnSkip_scaler"), + (grads["batchnorm_skip"]["intercept"], "dLdBnSkip_intercept"), + (grads["conv_skip"]["W"], "dLdConvSkip_W"), + (grads["conv_skip"]["b"], "dLdConvSkip_b"), + (grads["batchnorm2"]["scaler"].T, "dLdBn2_scaler"), + (grads["batchnorm2"]["intercept"], "dLdBn2_intercept"), + (grads["conv2"]["W"], "dLdConv2_W"), + (grads["conv2"]["b"], "dLdConv2_b"), + (grads["batchnorm1"]["scaler"].T, "dLdBn1_scaler"), + (grads["batchnorm1"]["intercept"], "dLdBn1_intercept"), + (grads["conv1"]["W"], "dLdConv1_W"), + (grads["conv1"]["b"], "dLdConv1_b"), + ] + + print("\nTrial {}".format(i)) + print("act_fn={}, n_ex={}".format(act_fn, n_ex)) + print("in_rows={}, in_cols={}, n_in={}".format(in_rows, in_cols, n_in)) + print("pad1={}, stride1={}, f_shape1={}".format(p1, s1, f_shape1)) + print("pad2={}, stride2={}, f_shape2={}".format(p2, s2, f_shape2)) + print("stride_skip={}, f_shape_skip={}".format(s_skip, f_shape_skip)) + warn_str = ( + "\n[NOTE] The tests in this module can fail sometimes during " + "backprop due to the ReLU issue: while the difference in the forward pass " + "between z=-1e-9 and z=1e-9 is miniscule, the difference during the backward " + "pass is significant due to ReLU's kink about 0." 
+ ) + for ix, (mine, label) in enumerate(params): + assert_almost_equal( + mine, + golds[label], + err_msg=err_fmt(params, golds, ix, warn_str), + decimal=2, + ) + print("\tPASSED {}".format(label)) + i += 1 + + +def test_BidirectionalLSTM(N=15): + from numpy_ml.neural_nets.modules import BidirectionalLSTM + + N = np.inf if N is None else N + + np.random.seed(12345) + + i = 1 + while i < N + 1: + n_ex = np.random.randint(1, 10) + n_in = np.random.randint(1, 10) + n_out = np.random.randint(1, 10) + n_t = np.random.randint(1, 10) + X = random_tensor((n_ex, n_in, n_t), standardize=True) + + # initialize LSTM layer + L1 = BidirectionalLSTM(n_out=n_out) + + # forward prop + y_pred = L1.forward(X) + + # backprop + dLdA = np.ones_like(y_pred) + dLdX = L1.backward(dLdA) + + # get gold standard gradients + gold_mod = TorchBidirectionalLSTM(n_in, n_out, L1.parameters) + golds = gold_mod.extract_grads(X) + + pms, grads = L1.parameters["components"], L1.gradients["components"] + params = [ + (X, "X"), + (y_pred, "y"), + (pms["cell_fwd"]["bo"].T, "bo_f"), + (pms["cell_fwd"]["bu"].T, "bu_f"), + (pms["cell_fwd"]["bf"].T, "bf_f"), + (pms["cell_fwd"]["bc"].T, "bc_f"), + (pms["cell_fwd"]["Wo"], "Wo_f"), + (pms["cell_fwd"]["Wu"], "Wu_f"), + (pms["cell_fwd"]["Wf"], "Wf_f"), + (pms["cell_fwd"]["Wc"], "Wc_f"), + (pms["cell_bwd"]["bo"].T, "bo_b"), + (pms["cell_bwd"]["bu"].T, "bu_b"), + (pms["cell_bwd"]["bf"].T, "bf_b"), + (pms["cell_bwd"]["bc"].T, "bc_b"), + (pms["cell_bwd"]["Wo"], "Wo_b"), + (pms["cell_bwd"]["Wu"], "Wu_b"), + (pms["cell_bwd"]["Wf"], "Wf_b"), + (pms["cell_bwd"]["Wc"], "Wc_b"), + (grads["cell_fwd"]["bo"].T, "dLdBo_f"), + (grads["cell_fwd"]["bu"].T, "dLdBu_f"), + (grads["cell_fwd"]["bf"].T, "dLdBf_f"), + (grads["cell_fwd"]["bc"].T, "dLdBc_f"), + (grads["cell_fwd"]["Wo"], "dLdWo_f"), + (grads["cell_fwd"]["Wu"], "dLdWu_f"), + (grads["cell_fwd"]["Wf"], "dLdWf_f"), + (grads["cell_fwd"]["Wc"], "dLdWc_f"), + (grads["cell_bwd"]["bo"].T, "dLdBo_b"), + (grads["cell_bwd"]["bu"].T, "dLdBu_b"), + (grads["cell_bwd"]["bf"].T, "dLdBf_b"), + (grads["cell_bwd"]["bc"].T, "dLdBc_b"), + (grads["cell_bwd"]["Wo"], "dLdWo_b"), + (grads["cell_bwd"]["Wu"], "dLdWu_b"), + (grads["cell_bwd"]["Wf"], "dLdWf_b"), + (grads["cell_bwd"]["Wc"], "dLdWc_b"), + (dLdX, "dLdX"), + ] + + print("Case {}".format(i)) + for ix, (mine, label) in enumerate(params): + np.testing.assert_allclose( + mine, + golds[label], + err_msg=err_fmt(params, golds, ix), + atol=1e-4, + rtol=1e-4, + ) + + print("\tPASSED {}".format(label)) + i += 1 + + +def test_WaveNetModule(N=10): + from numpy_ml.neural_nets.modules import WavenetResidualModule + + N = np.inf if N is None else N + + np.random.seed(12345) + + i = 1 + while i < N + 1: + n_ex = np.random.randint(1, 10) + l_in = np.random.randint(1, 10) + ch_residual, ch_dilation = np.random.randint(1, 5), np.random.randint(1, 5) + f_width = min(l_in, np.random.randint(1, 5)) + d = np.random.randint(0, 5) + + X_main = np.zeros_like( + random_tensor((n_ex, l_in, ch_residual), standardize=True) + ) + X_main[0][0][0] = 1.0 + X_skip = np.zeros_like( + random_tensor((n_ex, l_in, ch_residual), standardize=True) + ) + + # initialize Conv2D layer + L1 = WavenetResidualModule( + ch_residual=ch_residual, + ch_dilation=ch_dilation, + kernel_width=f_width, + dilation=d, + ) + + # forward prop + Y_main, Y_skip = L1.forward(X_main, X_skip) + + # backprop + dLdY_skip = np.ones_like(Y_skip) + dLdY_main = np.ones_like(Y_main) + dLdX_main, dLdX_skip = L1.backward(dLdY_skip, dLdY_main) + + _, conv_1x1_pad = pad1D( + 
L1._dv["multiply_gate_out"], "same", kernel_width=1, stride=1, dilation=0 + ) + if conv_1x1_pad[0] != conv_1x1_pad[1]: + print("Skipping") + continue + + conv_1x1_pad = conv_1x1_pad[0] + + # get gold standard gradients + gold_mod = TorchWavenetModule(L1.parameters, L1.hyperparameters, conv_1x1_pad) + golds = gold_mod.extract_grads(X_main, X_skip) + + dv = L1.derived_variables + pc = L1.parameters["components"] + gr = L1.gradients["components"] + + params = [ + (L1.X_main, "X_main"), + (L1.X_skip, "X_skip"), + (pc["conv_dilation"]["W"], "conv_dilation_W"), + (pc["conv_dilation"]["b"], "conv_dilation_b"), + (pc["conv_1x1"]["W"], "conv_1x1_W"), + (pc["conv_1x1"]["b"], "conv_1x1_b"), + (dv["conv_dilation_out"], "conv_dilation_out"), + (dv["tanh_out"], "tanh_out"), + (dv["sigm_out"], "sigm_out"), + (dv["multiply_gate_out"], "multiply_gate_out"), + (dv["conv_1x1_out"], "conv_1x1_out"), + (Y_main, "Y_main"), + (Y_skip, "Y_skip"), + (dLdY_skip, "dLdY_skip"), + (dLdY_main, "dLdY_main"), + (dv["dLdConv_1x1"], "dLdConv_1x1_out"), + (gr["conv_1x1"]["W"], "dLdConv_1x1_W"), + (gr["conv_1x1"]["b"], "dLdConv_1x1_b"), + (dv["dLdMultiply"], "dLdMultiply_out"), + (dv["dLdTanh"], "dLdTanh_out"), + (dv["dLdSigmoid"], "dLdSigm_out"), + (dv["dLdConv_dilation"], "dLdConv_dilation_out"), + (gr["conv_dilation"]["W"], "dLdConv_dilation_W"), + (gr["conv_dilation"]["b"], "dLdConv_dilation_b"), + (dLdX_main, "dLdX_main"), + (dLdX_skip, "dLdX_skip"), + ] + + print("\nTrial {}".format(i)) + print("f_width={}, n_ex={}".format(f_width, n_ex)) + print("l_in={}, ch_residual={}".format(l_in, ch_residual)) + print("ch_dilation={} dilation={}".format(ch_dilation, d)) + for ix, (mine, label) in enumerate(params): + assert_almost_equal( + mine, golds[label], err_msg=err_fmt(params, golds, ix), decimal=4 + ) + print("\tPASSED {}".format(label)) + i += 1 + + +####################################################################### +# Utils # +####################################################################### + + +def test_pad1D(N=15): + from numpy_ml.neural_nets.layers import Conv1D + from .nn_torch_models import TorchCausalConv1d, torchify + + np.random.seed(12345) + + N = np.inf if N is None else N + + i = 1 + while i < N + 1: + p = np.random.choice(["same", "causal"]) + n_ex = np.random.randint(1, 10) + l_in = np.random.randint(1, 10) + n_in, n_out = np.random.randint(1, 3), np.random.randint(1, 3) + f_width = min(l_in, np.random.randint(1, 5)) + s = np.random.randint(1, 3) + d = np.random.randint(0, 5) + + X = random_tensor((n_ex, l_in, n_in), standardize=True) + X_pad, _ = pad1D(X, p, kernel_width=f_width, stride=s, dilation=d) + + # initialize Conv2D layer + L1 = Conv1D(out_ch=n_out, kernel_width=f_width, pad=0, stride=s, dilation=d) + + # forward prop + try: + y_pred = L1.forward(X_pad) + except ValueError: + continue + + # ignore n. output channels + print("Trial {}".format(i)) + print("p={} d={} s={} l_in={} f_width={}".format(p, d, s, l_in, f_width)) + print("n_ex={} n_in={} n_out={}".format(n_ex, n_in, n_out)) + assert y_pred.shape[:2] == X.shape[:2], print( + "y_pred.shape={} X.shape={}".format(y_pred.shape, X.shape) + ) + + if p == "causal": + gold = TorchCausalConv1d( + in_channels=n_in, + out_channels=n_out, + kernel_size=f_width, + stride=s, + dilation=d + 1, + bias=True, + ) + if s != 1: + print( + "TorchCausalConv1D does not do `same` padding for stride > 1. 
Skipping" + ) + continue + + XT = torchify(np.moveaxis(X, [0, 1, 2], [0, -1, -2])) + else: + gold = nn.Conv1d( + in_channels=n_in, + out_channels=n_out, + kernel_size=f_width, + padding=0, + stride=s, + dilation=d + 1, + bias=True, + ) + XT = torchify(np.moveaxis(X_pad, [0, 1, 2], [0, -1, -2])) + + # import weights and biases + # (f[0], n_in, n_out) -> (n_out, n_in, f[0]) + b = L1.parameters["b"] + W = np.moveaxis(L1.parameters["W"], [0, 1, 2], [-1, -2, -3]) + assert gold.weight.shape == W.shape + assert gold.bias.shape == b.flatten().shape + + gold.weight = nn.Parameter(torch.FloatTensor(W)) + gold.bias = nn.Parameter(torch.FloatTensor(b.flatten())) + + outT = gold(XT) + if outT.ndimension() == 2: + import ipdb + + ipdb.set_trace() + + gold_out = np.moveaxis(outT.detach().numpy(), [0, 1, 2], [0, -1, -2]) + assert gold_out.shape[:2] == X.shape[:2] + + np.testing.assert_almost_equal( + y_pred, + gold_out, + err_msg=err_fmt( + [(y_pred.shape, "out.shape"), (y_pred, "out")], + {"out.shape": gold_out.shape, "out": gold_out}, + 1, + ), + decimal=4, + ) + print("PASSED\n") + i += 1 + + +def test_conv(N=15): + np.random.seed(12345) + N = np.inf if N is None else N + i = 0 + while i < N: + n_ex = np.random.randint(2, 15) + in_rows = np.random.randint(2, 15) + in_cols = np.random.randint(2, 15) + in_ch = np.random.randint(2, 15) + out_ch = np.random.randint(2, 15) + f_shape = ( + min(in_rows, np.random.randint(2, 10)), + min(in_cols, np.random.randint(2, 10)), + ) + s = np.random.randint(1, 3) + p = np.random.randint(0, 5) + + X = np.random.rand(n_ex, in_rows, in_cols, in_ch) + X_pad, p = pad2D(X, p) + W = np.random.randn(f_shape[0], f_shape[1], in_ch, out_ch) + + gold = conv2D_naive(X, W, s, p) + mine = conv2D(X, W, s, p) + + np.testing.assert_almost_equal(mine, gold) + print("PASSED") + i += 1 + + +####################################################################### +# Models # +####################################################################### + + +def fit_VAE(): + # for testing + from numpy_ml.neural_nets.models.vae import BernoulliVAE + + np.random.seed(12345) + + (X_train, y_train), (X_test, y_test) = mnist.load_data() + + # scale pixel intensities to [0, 1] + X_train = np.expand_dims(X_train.astype("float32") / 255.0, 3) + X_test = np.expand_dims(X_test.astype("float32") / 255.0, 3) + + X_train = X_train[: 128 * 1] # 1 batch + + BV = BernoulliVAE() + BV.fit(X_train, n_epochs=1, verbose=False) + + +def test_WGAN_GP(N=1): + from numpy_ml.neural_nets.models.wgan_gp import WGAN_GP + + np.random.seed(12345) + + ss = np.random.randint(0, 1000) + np.random.seed(ss) + + N = np.inf if N is None else N + + i = 1 + while i < N + 1: + c_updates_per_epoch, n_steps = 1, 1 + n_ex = np.random.randint(1, 500) + n_in = np.random.randint(1, 100) + lambda_ = np.random.randint(0, 20) + g_hidden = np.random.randint(2, 500) + X = random_tensor((n_ex, n_in), standardize=True) + + # initialize WGAN_GP model + L1 = WGAN_GP(g_hidden=g_hidden, debug=True) + + # forward prop + batchsize = n_ex + L1.fit( + X, + lambda_=lambda_, + c_updates_per_epoch=c_updates_per_epoch, + n_steps=n_steps, + batchsize=batchsize, + ) + + # backprop + dv = L1.derived_variables + params = L1.parameters["components"] + grads = L1.gradients["components"] + params["noise"] = dv["noise"] + params["alpha"] = dv["alpha"] + params["n_in"] = n_in + params["g_hidden"] = g_hidden + params["c_updates_per_epoch"] = c_updates_per_epoch + params["n_steps"] = n_steps + + # get gold standard gradients + golds = WGAN_GP_tf(X, lambda_=lambda_, 
batch_size=batchsize, params=params) + + params = [ + (dv["X_real"], "X_real"), + (params["generator"]["FC1"]["W"], "G_weights_FC1"), + (params["generator"]["FC2"]["W"], "G_weights_FC2"), + (params["generator"]["FC3"]["W"], "G_weights_FC3"), + (params["generator"]["FC4"]["W"], "G_weights_FC4"), + (dv["G_fwd_X_fake"]["FC1"], "G_fwd_X_fake_FC1"), + (dv["G_fwd_X_fake"]["FC2"], "G_fwd_X_fake_FC2"), + (dv["G_fwd_X_fake"]["FC3"], "G_fwd_X_fake_FC3"), + (dv["G_fwd_X_fake"]["FC4"], "G_fwd_X_fake_FC4"), + (dv["X_fake"], "X_fake"), + (dv["X_interp"], "X_interp"), + (params["critic"]["FC1"]["W"], "C_weights_Y_real_FC1"), + (params["critic"]["FC2"]["W"], "C_weights_Y_real_FC2"), + (params["critic"]["FC3"]["W"], "C_weights_Y_real_FC3"), + (params["critic"]["FC4"]["W"], "C_weights_Y_real_FC4"), + (dv["C_fwd_Y_real"]["FC1"], "C_fwd_Y_real_FC1"), + (dv["C_fwd_Y_real"]["FC2"], "C_fwd_Y_real_FC2"), + (dv["C_fwd_Y_real"]["FC3"], "C_fwd_Y_real_FC3"), + (dv["C_fwd_Y_real"]["FC4"], "C_fwd_Y_real_FC4"), + (dv["Y_real"].flatten(), "Y_real"), + (params["critic"]["FC1"]["W"], "C_weights_Y_fake_FC1"), + (params["critic"]["FC2"]["W"], "C_weights_Y_fake_FC2"), + (params["critic"]["FC3"]["W"], "C_weights_Y_fake_FC3"), + (params["critic"]["FC4"]["W"], "C_weights_Y_fake_FC4"), + (dv["C_fwd_Y_fake"]["FC1"], "C_fwd_Y_fake_FC1"), + (dv["C_fwd_Y_fake"]["FC2"], "C_fwd_Y_fake_FC2"), + (dv["C_fwd_Y_fake"]["FC3"], "C_fwd_Y_fake_FC3"), + (dv["C_fwd_Y_fake"]["FC4"], "C_fwd_Y_fake_FC4"), + (dv["Y_fake"].flatten(), "Y_fake"), + (params["critic"]["FC1"]["W"], "C_weights_Y_interp_FC1"), + (params["critic"]["FC2"]["W"], "C_weights_Y_interp_FC2"), + (params["critic"]["FC3"]["W"], "C_weights_Y_interp_FC3"), + (params["critic"]["FC4"]["W"], "C_weights_Y_interp_FC4"), + (dv["C_fwd_Y_interp"]["FC1"], "C_fwd_Y_interp_FC1"), + (dv["C_fwd_Y_interp"]["FC2"], "C_fwd_Y_interp_FC2"), + (dv["C_fwd_Y_interp"]["FC3"], "C_fwd_Y_interp_FC3"), + (dv["C_fwd_Y_interp"]["FC4"], "C_fwd_Y_interp_FC4"), + (dv["Y_interp"].flatten(), "Y_interp"), + (dv["C_dY_interp_wrt"]["FC4"], "dY_interp_wrt_FC4"), + (dv["C_dY_interp_wrt"]["FC3"], "dY_interp_wrt_FC3"), + (dv["C_dY_interp_wrt"]["FC2"], "dY_interp_wrt_FC2"), + (dv["C_dY_interp_wrt"]["FC1"], "dY_interp_wrt_FC1"), + (dv["gradInterp"], "gradInterp"), + (dv["C_loss"], "C_loss"), + (dv["G_loss"], "G_loss"), + (grads["critic"]["FC1"]["W"], "dC_loss_dW_FC1"), + (grads["critic"]["FC1"]["b"].flatten(), "dC_loss_db_FC1"), + (grads["critic"]["FC2"]["W"], "dC_loss_dW_FC2"), + (grads["critic"]["FC2"]["b"].flatten(), "dC_loss_db_FC2"), + (grads["critic"]["FC3"]["W"], "dC_loss_dW_FC3"), + (grads["critic"]["FC3"]["b"].flatten(), "dC_loss_db_FC3"), + (grads["critic"]["FC4"]["W"], "dC_loss_dW_FC4"), + (grads["critic"]["FC4"]["b"].flatten(), "dC_loss_db_FC4"), + (dv["dG_Y_fake"].flatten(), "dG_Y_fake"), + (dv["dY_real"].flatten(), "dC_Y_real"), + (dv["dC_Y_fake"].flatten(), "dC_Y_fake"), + (dv["dGrad_interp"], "dC_gradInterp"), + ] + + print("\nTrial {}".format(i)) + print("Seed: {} g_hidden={}".format(ss, g_hidden)) + print("lambda={} n_ex={} n_in={}".format(lambda_, n_ex, n_in)) + print( + "c_updates_per_epoch={}, n_steps={} batchsize={}".format( + c_updates_per_epoch, n_steps, batchsize + ) + ) + + for ix, (mine, label) in enumerate(params): + np.testing.assert_almost_equal( + mine, golds[label], err_msg=err_fmt(params, golds, ix), decimal=3 + ) + print("\tPASSED {}".format(label)) + i += 1 diff --git a/numpy_ml/tests/test_nn_activations.py b/numpy_ml/tests/test_nn_activations.py new file mode 100644 index 0000000..99bb294 --- 
/dev/null +++ b/numpy_ml/tests/test_nn_activations.py @@ -0,0 +1,337 @@ +# flake8: noqa +import time +import numpy as np + +from numpy.testing import assert_almost_equal +from scipy.special import expit + +import torch +import torch.nn.functional as F + +from numpy_ml.utils.testing import random_stochastic_matrix, random_tensor + + +def torch_gradient_generator(fn, **kwargs): + def get_grad(z): + z1 = torch.autograd.Variable(torch.from_numpy(z), requires_grad=True) + z2 = fn(z1, **kwargs).sum() + z2.backward() + grad = z1.grad.numpy() + return grad + + return get_grad + + +####################################################################### +# Debug Formatter # +####################################################################### + + +def err_fmt(params, golds, ix, warn_str=""): + mine, label = params[ix] + err_msg = "-" * 25 + " DEBUG " + "-" * 25 + "\n" + prev_mine, prev_label = params[max(ix - 1, 0)] + err_msg += "Mine (prev) [{}]:\n{}\n\nTheirs (prev) [{}]:\n{}".format( + prev_label, prev_mine, prev_label, golds[prev_label] + ) + err_msg += "\n\nMine [{}]:\n{}\n\nTheirs [{}]:\n{}".format( + label, mine, label, golds[label] + ) + err_msg += warn_str + err_msg += "\n" + "-" * 23 + " END DEBUG " + "-" * 23 + return err_msg + + +####################################################################### +# Test Suite # +####################################################################### +# +# +# def test_activations(N=50): +# print("Testing Sigmoid activation") +# time.sleep(1) +# test_sigmoid_activation(N) +# test_sigmoid_grad(N) +# +# # print("Testing Softmax activation") +# # time.sleep(1) +# # test_softmax_activation(N) +# # test_softmax_grad(N) +# +# print("Testing Tanh activation") +# time.sleep(1) +# test_tanh_grad(N) +# +# print("Testing ReLU activation") +# time.sleep(1) +# test_relu_activation(N) +# test_relu_grad(N) +# +# print("Testing ELU activation") +# time.sleep(1) +# test_elu_activation(N) +# test_elu_grad(N) +# +# print("Testing SELU activation") +# time.sleep(1) +# test_selu_activation(N) +# test_selu_grad(N) +# +# print("Testing LeakyRelu activation") +# time.sleep(1) +# test_leakyrelu_activation(N) +# test_leakyrelu_grad(N) +# +# print("Testing SoftPlus activation") +# time.sleep(1) +# test_softplus_activation(N) +# test_softplus_grad(N) +# + +####################################################################### +# Activations # +####################################################################### + + +def test_sigmoid_activation(N=50): + from numpy_ml.neural_nets.activations import Sigmoid + + N = np.inf if N is None else N + + mine = Sigmoid() + gold = expit + + i = 0 + while i < N: + n_dims = np.random.randint(1, 100) + z = random_tensor((1, n_dims)) + assert_almost_equal(mine.fn(z), gold(z)) + print("PASSED") + i += 1 + + +def test_softplus_activation(N=50): + from numpy_ml.neural_nets.activations import SoftPlus + + N = np.inf if N is None else N + + mine = SoftPlus() + gold = lambda z: F.softplus(torch.FloatTensor(z)).numpy() + + i = 0 + while i < N: + n_dims = np.random.randint(1, 100) + z = random_stochastic_matrix(1, n_dims) + assert_almost_equal(mine.fn(z), gold(z)) + print("PASSED") + i += 1 + + +def test_elu_activation(N=50): + from numpy_ml.neural_nets.activations import ELU + + N = np.inf if N is None else N + + i = 0 + while i < N: + n_dims = np.random.randint(1, 10) + z = random_tensor((1, n_dims)) + + alpha = np.random.uniform(0, 10) + + mine = ELU(alpha) + gold = lambda z, a: F.elu(torch.from_numpy(z), alpha).numpy() + + 
assert_almost_equal(mine.fn(z), gold(z, alpha)) + print("PASSED") + i += 1 + + +def test_relu_activation(N=50): + from numpy_ml.neural_nets.activations import ReLU + + N = np.inf if N is None else N + + mine = ReLU() + gold = lambda z: F.relu(torch.FloatTensor(z)).numpy() + + i = 0 + while i < N: + n_dims = np.random.randint(1, 100) + z = random_stochastic_matrix(1, n_dims) + assert_almost_equal(mine.fn(z), gold(z)) + print("PASSED") + i += 1 + + +def test_selu_activation(N=50): + from numpy_ml.neural_nets.activations import SELU + + N = np.inf if N is None else N + + mine = SELU() + gold = lambda z: F.selu(torch.FloatTensor(z)).numpy() + + i = 0 + while i < N: + n_dims = np.random.randint(1, 100) + z = random_stochastic_matrix(1, n_dims) + assert_almost_equal(mine.fn(z), gold(z)) + print("PASSED") + i += 1 + + +def test_leakyrelu_activation(N=50): + from numpy_ml.neural_nets.activations import LeakyReLU + + N = np.inf if N is None else N + + i = 0 + while i < N: + n_dims = np.random.randint(1, 100) + z = random_stochastic_matrix(1, n_dims) + alpha = np.random.uniform(0, 10) + + mine = LeakyReLU(alpha=alpha) + gold = lambda z: F.leaky_relu(torch.FloatTensor(z), alpha).numpy() + assert_almost_equal(mine.fn(z), gold(z)) + + print("PASSED") + i += 1 + + +####################################################################### +# Activation Gradients # +####################################################################### + + +def test_sigmoid_grad(N=50): + from numpy_ml.neural_nets.activations import Sigmoid + + N = np.inf if N is None else N + + mine = Sigmoid() + gold = torch_gradient_generator(torch.sigmoid) + + i = 0 + while i < N: + n_ex = np.random.randint(1, 100) + n_dims = np.random.randint(1, 100) + z = random_tensor((n_ex, n_dims)) + assert_almost_equal(mine.grad(z), gold(z)) + print("PASSED") + i += 1 + + +def test_elu_grad(N=50): + from numpy_ml.neural_nets.activations import ELU + + N = np.inf if N is None else N + + i = 0 + while i < N: + n_ex = np.random.randint(1, 10) + n_dims = np.random.randint(1, 10) + alpha = np.random.uniform(0, 10) + z = random_tensor((n_ex, n_dims)) + + mine = ELU(alpha) + gold = torch_gradient_generator(F.elu, alpha=alpha) + assert_almost_equal(mine.grad(z), gold(z), decimal=6) + print("PASSED") + i += 1 + + +def test_tanh_grad(N=50): + from numpy_ml.neural_nets.activations import Tanh + + N = np.inf if N is None else N + + mine = Tanh() + gold = torch_gradient_generator(torch.tanh) + + i = 0 + while i < N: + n_ex = np.random.randint(1, 100) + n_dims = np.random.randint(1, 100) + z = random_tensor((n_ex, n_dims)) + assert_almost_equal(mine.grad(z), gold(z)) + print("PASSED") + i += 1 + + +def test_relu_grad(N=50): + from numpy_ml.neural_nets.activations import ReLU + + N = np.inf if N is None else N + + mine = ReLU() + gold = torch_gradient_generator(F.relu) + + i = 0 + while i < N: + n_ex = np.random.randint(1, 100) + n_dims = np.random.randint(1, 100) + z = random_tensor((n_ex, n_dims)) + assert_almost_equal(mine.grad(z), gold(z)) + print("PASSED") + i += 1 + + +def test_selu_grad(N=50): + from numpy_ml.neural_nets.activations import SELU + + N = np.inf if N is None else N + + mine = SELU() + gold = torch_gradient_generator(F.selu) + + i = 0 + while i < N: + n_ex = np.random.randint(1, 100) + n_dims = np.random.randint(1, 100) + z = random_tensor((n_ex, n_dims)) + assert_almost_equal(mine.grad(z), gold(z), decimal=6) + print("PASSED") + i += 1 + + +def test_leakyrelu_grad(N=50): + from numpy_ml.neural_nets.activations import LeakyReLU + + N = np.inf 
if N is None else N + + i = 0 + while i < N: + n_ex = np.random.randint(1, 10) + n_dims = np.random.randint(1, 10) + alpha = np.random.uniform(0, 10) + z = random_tensor((n_ex, n_dims)) + + mine = LeakyReLU(alpha) + gold = torch_gradient_generator(F.leaky_relu, negative_slope=alpha) + assert_almost_equal(mine.grad(z), gold(z), decimal=6) + print("PASSED") + i += 1 + + +def test_softplus_grad(N=50): + from numpy_ml.neural_nets.activations import SoftPlus + + N = np.inf if N is None else N + + mine = SoftPlus() + gold = torch_gradient_generator(F.softplus) + + i = 0 + while i < N: + n_ex = np.random.randint(1, 100) + n_dims = np.random.randint(1, 100) + z = random_tensor((n_ex, n_dims), standardize=True) + assert_almost_equal(mine.grad(z), gold(z)) + print("PASSED") + i += 1 + + +if __name__ == "__main__": + test_activations(N=50) diff --git a/numpy_ml/tests/test_nonparametric.py b/numpy_ml/tests/test_nonparametric.py new file mode 100644 index 0000000..9e2ec7e --- /dev/null +++ b/numpy_ml/tests/test_nonparametric.py @@ -0,0 +1,119 @@ +# flake8: noqa +import numpy as np + +from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier +from sklearn.gaussian_process import GaussianProcessRegressor + +from numpy_ml.nonparametric.knn import KNN +from numpy_ml.nonparametric.gp import GPRegression +from numpy_ml.utils.distance_metrics import euclidean + + +def test_knn_regression(N=15): + np.random.seed(12345) + + i = 0 + while i < N: + N = np.random.randint(2, 100) + M = np.random.randint(2, 100) + k = np.random.randint(1, N) + ls = np.min([np.random.randint(1, 10), N - 1]) + weights = np.random.choice(["uniform", "distance"]) + + X = np.random.rand(N, M) + X_test = np.random.rand(N, M) + y = np.random.rand(N) + + knn = KNN( + k=k, leaf_size=ls, metric=euclidean, classifier=False, weights=weights + ) + knn.fit(X, y) + preds = knn.predict(X_test) + + gold = KNeighborsRegressor( + p=2, + leaf_size=ls, + n_neighbors=k, + weights=weights, + metric="minkowski", + algorithm="ball_tree", + ) + gold.fit(X, y) + gold_preds = gold.predict(X_test) + + for mine, theirs in zip(preds, gold_preds): + np.testing.assert_almost_equal(mine, theirs) + print("PASSED") + i += 1 + + +def test_knn_clf(N=15): + np.random.seed(12345) + + i = 0 + while i < N: + N = np.random.randint(2, 100) + M = np.random.randint(2, 100) + k = np.random.randint(1, N) + n_classes = np.random.randint(2, 10) + ls = np.min([np.random.randint(1, 10), N - 1]) + weights = "uniform" + + X = np.random.rand(N, M) + X_test = np.random.rand(N, M) + y = np.random.randint(0, n_classes, size=N) + + knn = KNN(k=k, leaf_size=ls, metric=euclidean, classifier=True, weights=weights) + knn.fit(X, y) + preds = knn.predict(X_test) + + gold = KNeighborsClassifier( + p=2, + metric="minkowski", + leaf_size=ls, + n_neighbors=k, + weights=weights, + algorithm="ball_tree", + ) + gold.fit(X, y) + gold_preds = gold.predict(X_test) + + for mine, theirs in zip(preds, gold_preds): + np.testing.assert_almost_equal(mine, theirs) + print("PASSED") + i += 1 + + +def test_gp_regression(N=15): + np.random.seed(12345) + + i = 0 + while i < N: + alpha = np.random.rand() + N = np.random.randint(2, 100) + M = np.random.randint(2, 100) + K = np.random.randint(1, N) + J = np.random.randint(1, 3) + + X = np.random.rand(N, M) + y = np.random.rand(N, J) + X_test = np.random.rand(K, M) + + gp = GPRegression(kernel="RBFKernel(sigma=1)", alpha=alpha) + gold = GaussianProcessRegressor( + kernel=None, alpha=alpha, optimizer=None, normalize_y=False + ) + + gp.fit(X, y) + gold.fit(X, 
y) + + preds, _ = gp.predict(X_test) + gold_preds = gold.predict(X_test) + np.testing.assert_almost_equal(preds, gold_preds) + + mll = gp.marginal_log_likelihood() + gold_mll = gold.log_marginal_likelihood() + np.testing.assert_almost_equal(mll, gold_mll) + + print("PASSED") + i += 1 diff --git a/numpy_ml/tests/test_preprocessing.py b/numpy_ml/tests/test_preprocessing.py new file mode 100644 index 0000000..793e31c --- /dev/null +++ b/numpy_ml/tests/test_preprocessing.py @@ -0,0 +1,252 @@ +# flake8: noqa +from collections import Counter + +# gold-standard imports +import huffman +import numpy as np + +from scipy.fftpack import dct + +from sklearn.preprocessing import StandardScaler +from sklearn.feature_extraction.text import TfidfVectorizer + +from librosa.core.time_frequency import fft_frequencies +from librosa.feature import mfcc as lr_mfcc +from librosa.util import frame +from librosa.filters import mel + +# numpy-ml implementations +from numpy_ml.preprocessing.general import Standardizer +from numpy_ml.preprocessing.nlp import HuffmanEncoder, TFIDFEncoder +from numpy_ml.preprocessing.dsp import ( + DCT, + DFT, + mfcc, + to_frames, + mel_filterbank, + dft_bins, +) +from numpy_ml.utils.testing import random_paragraph + + +def test_huffman(N=15): + np.random.seed(12345) + + i = 0 + while i < N: + n_words = np.random.randint(1, 100) + para = random_paragraph(n_words) + HT = HuffmanEncoder() + HT.fit(para) + my_dict = HT._item2code + their_dict = huffman.codebook(Counter(para).items()) + + for k, v in their_dict.items(): + fstr = "their_dict['{}'] = {}, but my_dict['{}'] = {}" + assert k in my_dict, "key `{}` not in my_dict".format(k) + assert my_dict[k] == v, fstr.format(k, v, k, my_dict[k]) + print("PASSED") + i += 1 + + +def test_standardizer(N=15): + np.random.seed(12345) + + i = 0 + while i < N: + mean = bool(np.random.randint(2)) + std = bool(np.random.randint(2)) + N = np.random.randint(2, 100) + M = np.random.randint(2, 100) + X = np.random.rand(N, M) + + S = Standardizer(with_mean=mean, with_std=std) + S.fit(X) + mine = S.transform(X) + + theirs = StandardScaler(with_mean=mean, with_std=std) + gold = theirs.fit_transform(X) + + np.testing.assert_almost_equal(mine, gold) + print("PASSED") + i += 1 + + +def test_tfidf(N=15): + np.random.seed(12345) + + i = 0 + while i < N: + docs = [] + n_docs = np.random.randint(1, 10) + for d in range(n_docs): + n_lines = np.random.randint(1, 1000) + lines = [random_paragraph(np.random.randint(1, 10)) for _ in range(n_lines)] + docs.append("\n".join([" ".join(l) for l in lines])) + + smooth = bool(np.random.randint(2)) + + tfidf = TFIDFEncoder( + lowercase=True, + min_count=0, + smooth_idf=smooth, + max_tokens=None, + input_type="strings", + filter_stopwords=False, + ) + gold = TfidfVectorizer( + input="content", + norm=None, + use_idf=True, + lowercase=True, + smooth_idf=smooth, + sublinear_tf=False, + ) + + tfidf.fit(docs) + mine = tfidf.transform(ignore_special_chars=True) + theirs = gold.fit_transform(docs).toarray() + + np.testing.assert_almost_equal(mine, theirs) + print("PASSED") + i += 1 + + +def test_dct(N=15): + np.random.seed(12345) + + i = 0 + while i < N: + N = np.random.randint(2, 100) + signal = np.random.rand(N) + ortho = bool(np.random.randint(2)) + mine = DCT(signal, orthonormal=ortho) + theirs = dct(signal, norm="ortho" if ortho else None) + + np.testing.assert_almost_equal(mine, theirs) + print("PASSED") + i += 1 + + +def test_dft(N=15): + np.random.seed(12345) + + i = 0 + while i < N: + N = np.random.randint(2, 100) + signal = 
np.random.rand(N) + mine = DFT(signal) + theirs = np.fft.rfft(signal) + + np.testing.assert_almost_equal(mine.real, theirs.real) + print("PASSED") + i += 1 + + +def test_mfcc(N=1): + """Broken""" + np.random.seed(12345) + + i = 0 + while i < N: + N = np.random.randint(500, 1000) + fs = np.random.randint(50, 100) + n_mfcc = 12 + window_len = 100 + stride_len = 50 + n_filters = 20 + window_dur = window_len / fs + stride_dur = stride_len / fs + signal = np.random.rand(N) + + mine = mfcc( + signal, + fs=fs, + window="hann", + window_duration=window_dur, + stride_duration=stride_dur, + lifter_coef=0, + alpha=0, + n_mfccs=n_mfcc, + normalize=False, + center=True, + n_filters=n_filters, + replace_intercept=False, + ) + + theirs = lr_mfcc( + signal, + sr=fs, + n_mels=n_filters, + n_mfcc=n_mfcc, + n_fft=window_len, + hop_length=stride_len, + htk=True, + ).T + + np.testing.assert_almost_equal(mine, theirs, decimal=4) + print("PASSED") + i += 1 + + +def test_framing(N=15): + np.random.seed(12345) + + i = 0 + while i < N: + N = np.random.randint(500, 100000) + window_len = np.random.randint(10, 100) + stride_len = np.random.randint(1, 50) + signal = np.random.rand(N) + + mine = to_frames(signal, window_len, stride_len, writeable=False) + theirs = frame(signal, frame_length=window_len, hop_length=stride_len).T + + assert len(mine) == len(theirs), "len(mine) = {}, len(theirs) = {}".format( + len(mine), len(theirs) + ) + np.testing.assert_almost_equal(mine, theirs) + print("PASSED") + i += 1 + + +def test_dft_bins(N=15): + np.random.seed(12345) + + i = 0 + while i < N: + N = np.random.randint(500, 100000) + fs = np.random.randint(50, 1000) + + mine = dft_bins(N, fs=fs, positive_only=True) + theirs = fft_frequencies(fs, N) + np.testing.assert_almost_equal(mine, theirs) + print("PASSED") + i += 1 + + +def test_mel_filterbank(N=15): + np.random.seed(12345) + + i = 0 + while i < N: + fs = np.random.randint(50, 10000) + n_filters = np.random.randint(2, 20) + window_len = np.random.randint(10, 100) + norm = np.random.randint(2) + + mine = mel_filterbank( + window_len, n_filters, fs, min_freq=0, max_freq=None, normalize=bool(norm) + ) + + theirs = mel( + fs, + n_fft=window_len, + n_mels=n_filters, + htk=True, + norm=norm if norm == 1 else None, + ) + + np.testing.assert_almost_equal(mine, theirs) + print("PASSED") + i += 1 diff --git a/numpy_ml/tests/test_trees.py b/numpy_ml/tests/test_trees.py new file mode 100644 index 0000000..4a90fb5 --- /dev/null +++ b/numpy_ml/tests/test_trees.py @@ -0,0 +1,355 @@ +# flake8: noqa +import numpy as np + +from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor +from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor +from sklearn.metrics import accuracy_score, mean_squared_error +from sklearn.datasets import make_regression, make_blobs +from sklearn.model_selection import train_test_split + +from numpy_ml.trees.gbdt import GradientBoostedDecisionTree +from numpy_ml.trees.dt import DecisionTree, Node, Leaf +from numpy_ml.trees.rf import RandomForest +from numpy_ml.utils.testing import random_tensor + + +def clone_tree(dtree): + children_left = dtree.tree_.children_left + children_right = dtree.tree_.children_right + feature = dtree.tree_.feature + threshold = dtree.tree_.threshold + values = dtree.tree_.value + + def grow(node_id): + l, r = children_left[node_id], children_right[node_id] + if l == r: + return Leaf(values[node_id].argmax()) + n = Node(None, None, (feature[node_id], threshold[node_id])) + n.left = grow(l) + n.right = grow(r) + 
return n
+
+    node_id = 0
+    root = Node(None, None, (feature[node_id], threshold[node_id]))
+    root.left = grow(children_left[node_id])
+    root.right = grow(children_right[node_id])
+    return root
+
+
+def compare_trees(mine, gold):
+    """Recursively check that `mine` matches the structure of the fit sklearn tree."""
+    clone = clone_tree(gold)
+    mine = mine.root
+
+    def test(mine, clone, depth=0):
+        if isinstance(clone, Node) and isinstance(mine, Node):
+            assert mine.feature == clone.feature, "Node {} not equal".format(depth)
+            np.testing.assert_allclose(mine.threshold, clone.threshold)
+            test(mine.left, clone.left, depth + 1)
+            test(mine.right, clone.right, depth + 1)
+        elif isinstance(clone, Leaf) and isinstance(mine, Leaf):
+            np.testing.assert_allclose(mine.value, clone.value)
+            return
+        else:
+            raise ValueError("Nodes at depth {} are not equal".format(depth))
+
+    # compare the two trees starting from the root
+    test(mine, clone, depth=0)
+
+
+def test_DecisionTree(N=1):
+    i = 1
+    np.random.seed(12345)
+    while i <= N:
+        n_ex = np.random.randint(2, 100)
+        n_feats = np.random.randint(2, 100)
+        max_depth = np.random.randint(1, 5)
+
+        classifier = np.random.choice([True, False])
+        if classifier:
+            # create classification problem
+            n_classes = np.random.randint(2, 10)
+            X, Y = make_blobs(
+                n_samples=n_ex, centers=n_classes, n_features=n_feats, random_state=i
+            )
+            X, X_test, Y, Y_test = train_test_split(X, Y, test_size=0.3, random_state=i)
+
+            # initialize model
+            def loss(yp, y):
+                return 1 - accuracy_score(yp, y)
+
+            criterion = np.random.choice(["entropy", "gini"])
+            mine = DecisionTree(
+                classifier=classifier, max_depth=max_depth, criterion=criterion
+            )
+            gold = DecisionTreeClassifier(
+                criterion=criterion,
+                max_depth=max_depth,
+                splitter="best",
+                random_state=i,
+            )
+        else:
+            # create regression problem
+            X, Y = make_regression(n_samples=n_ex, n_features=n_feats, random_state=i)
+            X, X_test, Y, Y_test = train_test_split(X, Y, test_size=0.3, random_state=i)
+
+            # initialize model
+            criterion = "mse"
+            loss = mean_squared_error
+            mine = DecisionTree(
+                criterion=criterion, max_depth=max_depth, classifier=classifier
+            )
+            gold = DecisionTreeRegressor(
+                criterion=criterion, max_depth=max_depth, splitter="best"
+            )
+
+        print("Trial {}".format(i))
+        print("\tClassifier={}, criterion={}".format(classifier, criterion))
+        print("\tmax_depth={}, n_feats={}, n_ex={}".format(max_depth, n_feats, n_ex))
+        if classifier:
+            print("\tn_classes: {}".format(n_classes))
+
+        # fit 'em
+        mine.fit(X, Y)
+        gold.fit(X, Y)
+
+        # get preds on training set
+        y_pred_mine = mine.predict(X)
+        y_pred_gold = gold.predict(X)
+
+        loss_mine = loss(y_pred_mine, Y)
+        loss_gold = loss(y_pred_gold, Y)
+
+        # get preds on test set
+        y_pred_mine_test = mine.predict(X_test)
+        y_pred_gold_test = gold.predict(X_test)
+
+        loss_mine_test = loss(y_pred_mine_test, Y_test)
+        loss_gold_test = loss(y_pred_gold_test, Y_test)
+
+        try:
+            np.testing.assert_almost_equal(loss_mine, loss_gold)
+            print("\tLoss on training: {}".format(loss_mine))
+        except AssertionError as e:
+            print("\tTraining losses not equal:\n{}".format(e))
+
+        try:
+            np.testing.assert_almost_equal(loss_mine_test, loss_gold_test)
+            print("\tLoss on test: 
{}".format(loss_mine_test)) + except AssertionError as e: + print("\tTest losses not equal:\n{}".format(e)) + i += 1 + + +def test_RandomForest(N=1): + np.random.seed(12345) + i = 1 + while i <= N: + n_ex = np.random.randint(2, 100) + n_feats = np.random.randint(2, 100) + n_trees = np.random.randint(2, 100) + max_depth = np.random.randint(1, 5) + + classifier = np.random.choice([True, False]) + if classifier: + # create classification problem + n_classes = np.random.randint(2, 10) + X, Y = make_blobs( + n_samples=n_ex, centers=n_classes, n_features=n_feats, random_state=i + ) + X, X_test, Y, Y_test = train_test_split(X, Y, test_size=0.3, random_state=i) + + # initialize model + def loss(yp, y): + return 1 - accuracy_score(yp, y) + + # initialize model + criterion = np.random.choice(["entropy", "gini"]) + mine = RandomForest( + classifier=classifier, + n_feats=n_feats, + n_trees=n_trees, + criterion=criterion, + max_depth=max_depth, + ) + gold = RandomForestClassifier( + n_estimators=n_trees, + max_features=n_feats, + criterion=criterion, + max_depth=max_depth, + bootstrap=True, + ) + else: + # create regeression problem + X, Y = make_regression(n_samples=n_ex, n_features=n_feats, random_state=i) + X, X_test, Y, Y_test = train_test_split(X, Y, test_size=0.3, random_state=i) + + # initialize model + criterion = "mse" + loss = mean_squared_error + mine = RandomForest( + criterion=criterion, + n_feats=n_feats, + n_trees=n_trees, + max_depth=max_depth, + classifier=classifier, + ) + gold = RandomForestRegressor( + n_estimators=n_trees, + max_features=n_feats, + criterion=criterion, + max_depth=max_depth, + bootstrap=True, + ) + + print("Trial {}".format(i)) + print("\tClassifier={}, criterion={}".format(classifier, criterion)) + print("\tmax_depth={}, n_feats={}, n_ex={}".format(max_depth, n_feats, n_ex)) + if classifier: + print("\tn_classes: {}".format(n_classes)) + + # fit 'em + mine.fit(X, Y) + gold.fit(X, Y) + + # get preds + y_pred_mine = mine.predict(X) + y_pred_gold = gold.predict(X) + + loss_mine = loss(y_pred_mine, Y) + loss_gold = loss(y_pred_gold, Y) + + # get preds on test set + y_pred_mine_test = mine.predict(X_test) + y_pred_gold_test = gold.predict(X_test) + + loss_mine_test = loss(y_pred_mine_test, Y_test) + loss_gold_test = loss(y_pred_gold_test, Y_test) + + try: + np.testing.assert_almost_equal(loss_mine, loss_gold) + print("\tLoss on training: {}".format(loss_mine)) + except AssertionError as e: + print("\tTraining losses not equal:\n{}".format(e)) + + try: + np.testing.assert_almost_equal(loss_mine_test, loss_gold_test) + print("\tLoss on test: {}".format(loss_mine_test)) + except AssertionError as e: + print("\tTest losses not equal:\n{}".format(e)) + + print("PASSED") + i += 1 + + +def test_gbdt(N=1): + np.random.seed(12345) + i = 1 + while i <= N: + n_ex = np.random.randint(2, 100) + n_feats = np.random.randint(2, 100) + n_trees = np.random.randint(2, 100) + max_depth = np.random.randint(1, 5) + + classifier = np.random.choice([True, False]) + if classifier: + # create classification problem + n_classes = np.random.randint(2, 10) + X, Y = make_blobs( + n_samples=n_ex, centers=n_classes, n_features=n_feats, random_state=i + ) + X, X_test, Y, Y_test = train_test_split(X, Y, test_size=0.3, random_state=i) + + # initialize model + def loss(yp, y): + return 1 - accuracy_score(yp, y) + + # initialize model + criterion = np.random.choice(["entropy", "gini"]) + mine = GradientBoostedDecisionTree( + n_iter=n_trees, + classifier=classifier, + max_depth=max_depth, + 
learning_rate=0.1, + loss="crossentropy", + step_size="constant", + ) + gold = RandomForestClassifier( + n_estimators=n_trees, + max_features=n_feats, + criterion=criterion, + max_depth=max_depth, + bootstrap=True, + ) + else: + # create regeression problem + X, Y = make_regression(n_samples=n_ex, n_features=n_feats, random_state=i) + X, X_test, Y, Y_test = train_test_split(X, Y, test_size=0.3, random_state=i) + + # initialize model + criterion = "mse" + loss = mean_squared_error + mine = GradientBoostedDecisionTree( + n_iter=n_trees, + max_depth=max_depth, + classifier=classifier, + learning_rate=0.1, + loss="mse", + step_size="constant", + ) + gold = RandomForestRegressor( + n_estimators=n_trees, + max_features=n_feats, + criterion=criterion, + max_depth=max_depth, + bootstrap=True, + ) + + print("Trial {}".format(i)) + print("\tClassifier={}, criterion={}".format(classifier, criterion)) + print("\tmax_depth={}, n_feats={}, n_ex={}".format(max_depth, n_feats, n_ex)) + if classifier: + print("\tn_classes: {}".format(n_classes)) + + # fit 'em + mine.fit(X, Y) + gold.fit(X, Y) + + # get preds + y_pred_mine = mine.predict(X) + y_pred_gold = gold.predict(X) + + loss_mine = loss(y_pred_mine, Y) + loss_gold = loss(y_pred_gold, Y) + + # get preds on test set + y_pred_mine_test = mine.predict(X_test) + y_pred_gold_test = gold.predict(X_test) + + loss_mine_test = loss(y_pred_mine_test, Y_test) + loss_gold_test = loss(y_pred_gold_test, Y_test) + + try: + np.testing.assert_almost_equal(loss_mine, loss_gold) + print("\tLoss on training: {}".format(loss_mine)) + except AssertionError as e: + print("\tTraining losses not equal:\n{}".format(e)) + + try: + np.testing.assert_almost_equal(loss_mine_test, loss_gold_test) + print("\tLoss on test: {}".format(loss_mine_test)) + except AssertionError as e: + print("\tTest losses not equal:\n{}".format(e)) + + print("PASSED") + i += 1 diff --git a/numpy_ml/tests/test_utils.py b/numpy_ml/tests/test_utils.py new file mode 100644 index 0000000..7721c99 --- /dev/null +++ b/numpy_ml/tests/test_utils.py @@ -0,0 +1,274 @@ +# flake8: noqa +import numpy as np + +import scipy +import networkx as nx + +from sklearn.neighbors import BallTree as sk_BallTree +from sklearn.metrics.pairwise import rbf_kernel as sk_rbf +from sklearn.metrics.pairwise import linear_kernel as sk_linear +from sklearn.metrics.pairwise import polynomial_kernel as sk_poly + + +from numpy_ml.utils.distance_metrics import euclidean +from numpy_ml.utils.kernels import LinearKernel, PolynomialKernel, RBFKernel +from numpy_ml.utils.data_structures import BallTree +from numpy_ml.utils.graphs import ( + DiGraph, + UndirectedGraph, + Edge, + random_unweighted_graph, + random_DAG, +) + +####################################################################### +# Kernels # +####################################################################### + + +def test_linear_kernel(N=1): + np.random.seed(12345) + i = 0 + while i < N: + N = np.random.randint(1, 100) + M = np.random.randint(1, 100) + C = np.random.randint(1, 1000) + + X = np.random.rand(N, C) + Y = np.random.rand(M, C) + + mine = LinearKernel()(X, Y) + gold = sk_linear(X, Y) + + np.testing.assert_almost_equal(mine, gold) + print("PASSED") + i += 1 + + +def test_polynomial_kernel(N=1): + np.random.seed(12345) + i = 0 + while i < N: + N = np.random.randint(1, 100) + M = np.random.randint(1, 100) + C = np.random.randint(1, 1000) + gamma = np.random.rand() + d = np.random.randint(1, 5) + c0 = np.random.rand() + + X = np.random.rand(N, C) + Y = np.random.rand(M, 
C) + + mine = PolynomialKernel(gamma=gamma, d=d, c0=c0)(X, Y) + gold = sk_poly(X, Y, gamma=gamma, degree=d, coef0=c0) + + np.testing.assert_almost_equal(mine, gold) + print("PASSED") + i += 1 + + +def test_radial_basis_kernel(N=1): + np.random.seed(12345) + i = 0 + while i < N: + N = np.random.randint(1, 100) + M = np.random.randint(1, 100) + C = np.random.randint(1, 1000) + gamma = np.random.rand() + + X = np.random.rand(N, C) + Y = np.random.rand(M, C) + + # sklearn (gamma) <-> mine (sigma) conversion: + # gamma = 1 / (2 * sigma^2) + # sigma = np.sqrt(1 / 2 * gamma) + + mine = RBFKernel(sigma=np.sqrt(1 / (2 * gamma)))(X, Y) + gold = sk_rbf(X, Y, gamma=gamma) + + np.testing.assert_almost_equal(mine, gold) + print("PASSED") + i += 1 + + +####################################################################### +# Distance Metrics # +####################################################################### + + +def test_euclidean(N=1): + np.random.seed(12345) + i = 0 + while i < N: + N = np.random.randint(1, 100) + x = np.random.rand(N) + y = np.random.rand(N) + mine = euclidean(x, y) + theirs = scipy.spatial.distance.euclidean(x, y) + np.testing.assert_almost_equal(mine, theirs) + print("PASSED") + i += 1 + + +####################################################################### +# Data Structures # +####################################################################### + + +def test_ball_tree(N=1): + np.random.seed(12345) + i = 0 + while i < N: + N = np.random.randint(2, 100) + M = np.random.randint(2, 100) + k = np.random.randint(1, N) + ls = np.min([np.random.randint(1, 10), N - 1]) + + X = np.random.rand(N, M) + BT = BallTree(leaf_size=ls, metric=euclidean) + BT.fit(X) + + x = np.random.rand(M) + mine = BT.nearest_neighbors(k, x) + assert len(mine) == k + + mine_neighb = np.array([n.key for n in mine]) + mine_dist = np.array([n.distance for n in mine]) + + sort_ix = np.argsort(mine_dist) + mine_dist = mine_dist[sort_ix] + mine_neighb = mine_neighb[sort_ix] + + sk = sk_BallTree(X, leaf_size=ls) + theirs_dist, ind = sk.query(x.reshape(1, -1), k=k) + sort_ix = np.argsort(theirs_dist.flatten()) + + theirs_dist = theirs_dist.flatten()[sort_ix] + theirs_neighb = X[ind.flatten()[sort_ix]] + + for j in range(len(theirs_dist)): + np.testing.assert_almost_equal(mine_neighb[j], theirs_neighb[j]) + np.testing.assert_almost_equal(mine_dist[j], theirs_dist[j]) + + print("PASSED") + i += 1 + + +####################################################################### +# Graphs # +####################################################################### + + +def from_networkx(G_nx): + """Convert a networkx graph to my graph representation""" + V = list(G_nx.nodes) + edges = list(G_nx.edges) + is_weighted = "weight" in G_nx[edges[0][0]][edges[0][1]] + + E = [] + for e in edges: + if is_weighted: + E.append(Edge(e[0], e[1], G_nx[e[0]][e[1]]["weight"])) + else: + E.append(Edge(e[0], e[1])) + + return DiGraph(V, E) if nx.is_directed(G_nx) else UndirectedGraph(V, E) + + +def to_networkx(G): + """Convert my graph representation to a networkx graph""" + G_nx = nx.DiGraph() if G.is_directed else nx.Graph() + V = list(G._V2I.keys()) + G_nx.add_nodes_from(V) + + for v in V: + fr_i = G._V2I[v] + edges = G._G[fr_i] + + for edge in edges: + G_nx.add_edge(edge.fr, edge.to, weight=edge._w) + return G_nx + + +def test_all_paths(N=1): + np.random.seed(12345) + i = 0 + while i < N: + p = np.random.rand() + directed = np.random.rand() < 0.5 + G = random_unweighted_graph(n_vertices=5, edge_prob=p, directed=directed) + + nodes 
= G._I2V.keys() + G_nx = to_networkx(G) + + # for each graph, test all_paths for all pairs of start and end + # vertices. note that graph is not guaranteed to be connected, so many + # paths will be empty + for s_i in nodes: + for e_i in nodes: + if s_i == e_i: + continue + + paths = G.all_paths(s_i, e_i) + paths_nx = nx.all_simple_paths(G_nx, source=s_i, target=e_i, cutoff=10) + + paths = sorted(paths) + paths_nx = sorted(list(paths_nx)) + + for p1, p2 in zip(paths, paths_nx): + np.testing.assert_array_equal(p1, p2) + + print("PASSED") + i += 1 + + +def test_random_DAG(N=1): + np.random.seed(12345) + i = 0 + while i < N: + p = np.random.uniform(0.25, 1) + n_v = np.random.randint(5, 50) + + G = random_DAG(n_v, p) + G_nx = to_networkx(G) + + assert nx.is_directed_acyclic_graph(G_nx) + print("PASSED") + i += 1 + + +def test_topological_ordering(N=1): + np.random.seed(12345) + i = 0 + while i < N: + p = np.random.uniform(0.25, 1) + n_v = np.random.randint(5, 10) + + G = random_DAG(n_v, p) + G_nx = to_networkx(G) + + if nx.is_directed_acyclic_graph(G_nx): + topo_order = G.topological_ordering() + + # test topological order + seen_it = set() + for n_i in topo_order: + seen_it.add(n_i) + assert any([c_i in seen_it for c_i in G.get_neighbors(n_i)]) == False + + print("PASSED") + i += 1 + + +def test_is_acyclic(N=1): + np.random.seed(12345) + i = 0 + while i < N: + p = np.random.rand() + directed = np.random.rand() < 0.5 + G = random_unweighted_graph(n_vertices=10, edge_prob=p, directed=True) + G_nx = to_networkx(G) + + assert G.is_acyclic() == nx.is_directed_acyclic_graph(G_nx) + print("PASSED") + i += 1 From d06964127ed6f6161dca7f4ab09eb880c9450e1c Mon Sep 17 00:00:00 2001 From: ddbourgin Date: Sun, 7 Jun 2020 19:09:48 -0400 Subject: [PATCH 04/18] separate out plots from other tests --- numpy_ml/neural_nets/schedulers/plots.py | 172 ++++++ numpy_ml/preprocessing/nlp.py | 682 ++++++++++++----------- numpy_ml/trees/plots.py | 163 ++++++ 3 files changed, 682 insertions(+), 335 deletions(-) create mode 100644 numpy_ml/neural_nets/schedulers/plots.py create mode 100644 numpy_ml/trees/plots.py diff --git a/numpy_ml/neural_nets/schedulers/plots.py b/numpy_ml/neural_nets/schedulers/plots.py new file mode 100644 index 0000000..de11150 --- /dev/null +++ b/numpy_ml/neural_nets/schedulers/plots.py @@ -0,0 +1,172 @@ +# flake8: noqa + +import time +import numpy as np +import matplotlib + +matplotlib.use("TkAgg") +import matplotlib.pyplot as plt +import seaborn as sns + +# https://seaborn.pydata.org/generated/seaborn.set_context.html +# https://seaborn.pydata.org/generated/seaborn.set_style.html +sns.set_style("white") +sns.set_context("notebook", font_scale=0.7) + +from .schedulers import ( + ConstantScheduler, + ExponentialScheduler, + NoamScheduler, + KingScheduler, +) + + +def king_loss_fn(x): + if x <= 250: + return -0.25 * x + 82.50372665317208 + elif 250 < x <= 600: + return 20.00372665317208 + elif 600 < x <= 700: + return -0.2 * x + 140.00372665317207 + else: + return 0.003726653172066108 + + +def plot_schedulers(): + fig, axes = plt.subplots(2, 2) + schedulers = [ + ( + [ConstantScheduler(lr=0.01), "lr=1e-2"], + [ConstantScheduler(lr=0.008), "lr=8e-3"], + [ConstantScheduler(lr=0.006), "lr=6e-3"], + [ConstantScheduler(lr=0.004), "lr=4e-3"], + [ConstantScheduler(lr=0.002), "lr=2e-3"], + ), + ( + [ + ExponentialScheduler( + lr=0.01, stage_length=250, staircase=False, decay=0.4 + ), + "lr=0.01, stage=250, stair=False, decay=0.4", + ], + [ + ExponentialScheduler( + lr=0.01, stage_length=250, 
staircase=True, decay=0.4 + ), + "lr=0.01, stage=250, stair=True, decay=0.4", + ], + [ + ExponentialScheduler( + lr=0.01, stage_length=125, staircase=True, decay=0.1 + ), + "lr=0.01, stage=125, stair=True, decay=0.1", + ], + [ + ExponentialScheduler( + lr=0.001, stage_length=250, staircase=False, decay=0.1 + ), + "lr=0.001, stage=250, stair=False, decay=0.1", + ], + [ + ExponentialScheduler( + lr=0.001, stage_length=125, staircase=False, decay=0.8 + ), + "lr=0.001, stage=125, stair=False, decay=0.8", + ], + [ + ExponentialScheduler( + lr=0.01, stage_length=250, staircase=False, decay=0.01 + ), + "lr=0.01, stage=250, stair=False, decay=0.01", + ], + ), + ( + [ + NoamScheduler(model_dim=512, scale_factor=1, warmup_steps=250), + "dim=512, scale=1, warmup=250", + ], + [ + NoamScheduler(model_dim=256, scale_factor=1, warmup_steps=250), + "dim=256, scale=1, warmup=250", + ], + [ + NoamScheduler(model_dim=512, scale_factor=1, warmup_steps=500), + "dim=512, scale=1, warmup=500", + ], + [ + NoamScheduler(model_dim=256, scale_factor=1, warmup_steps=500), + "dim=512, scale=1, warmup=500", + ], + [ + NoamScheduler(model_dim=512, scale_factor=2, warmup_steps=500), + "dim=512, scale=2, warmup=500", + ], + [ + NoamScheduler(model_dim=512, scale_factor=0.5, warmup_steps=500), + "dim=512, scale=0.5, warmup=500", + ], + ), + ( + # [ + # KingScheduler(initial_lr=0.01, patience=100, decay=0.1), + # "lr=0.01, patience=100, decay=0.8", + # ], + # [ + # KingScheduler(initial_lr=0.01, patience=300, decay=0.999), + # "lr=0.01, patience=300, decay=0.999", + # ], + [ + KingScheduler(initial_lr=0.009, patience=150, decay=0.995), + "lr=0.009, patience=150, decay=0.9999", + ], + [ + KingScheduler(initial_lr=0.008, patience=100, decay=0.995), + "lr=0.008, patience=100, decay=0.995", + ], + [ + KingScheduler(initial_lr=0.007, patience=50, decay=0.995), + "lr=0.007, patience=50, decay=0.995", + ], + [ + KingScheduler(initial_lr=0.005, patience=25, decay=0.9), + "lr=0.005, patience=25, decay=0.99", + ], + ), + ] + + for ax, schs, title in zip( + axes.flatten(), schedulers, ["Constant", "Exponential", "Noam", "King"] + ): + t0 = time.time() + print("Running {} scheduler".format(title)) + X = np.arange(1, 1000) + loss = np.array([king_loss_fn(x) for x in X]) + + # scale loss to fit on same axis as lr + scale = 0.01 / loss[0] + loss *= scale + + if title == "King": + ax.plot(X, loss, ls=":", label="Loss") + + for sc, lg in schs: + Y = np.array([sc(x, ll) for x, ll in zip(X, loss)]) + ax.plot(X, Y, label=lg, alpha=0.6) + + ax.legend(fontsize=5) + ax.set_xlabel("Steps") + ax.set_ylabel("Learning rate") + ax.set_title("{} scheduler".format(title)) + print( + "Finished plotting {} runs of {} in {:.2f}s".format( + len(schs), title, time.time() - t0 + ) + ) + + plt.tight_layout() + plt.savefig("plot.png", dpi=300) + plt.close("all") + + +if __name__ == "__main__": + plot_schedulers() diff --git a/numpy_ml/preprocessing/nlp.py b/numpy_ml/preprocessing/nlp.py index 4533e36..68fc28e 100644 --- a/numpy_ml/preprocessing/nlp.py +++ b/numpy_ml/preprocessing/nlp.py @@ -1,3 +1,4 @@ +"""Common preprocessing utilities for working with text data""" import re import heapq import os.path as op @@ -9,328 +10,327 @@ # This list of English stop words is taken from the "Glasgow Information # Retrieval Group". 
The original list can be found at # http://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words -_STOP_WORDS = set( - [ - "a", - "about", - "above", - "across", - "after", - "afterwards", - "again", - "against", - "all", - "almost", - "alone", - "along", - "already", - "also", - "although", - "always", - "am", - "among", - "amongst", - "amoungst", - "amount", - "an", - "and", - "another", - "any", - "anyhow", - "anyone", - "anything", - "anyway", - "anywhere", - "are", - "around", - "as", - "at", - "back", - "be", - "became", - "because", - "become", - "becomes", - "becoming", - "been", - "before", - "beforehand", - "behind", - "being", - "below", - "beside", - "besides", - "between", - "beyond", - "bill", - "both", - "bottom", - "but", - "by", - "call", - "can", - "cannot", - "cant", - "co", - "con", - "could", - "couldnt", - "cry", - "de", - "describe", - "detail", - "do", - "done", - "down", - "due", - "during", - "each", - "eg", - "eight", - "either", - "eleven", - "else", - "elsewhere", - "empty", - "enough", - "etc", - "even", - "ever", - "every", - "everyone", - "everything", - "everywhere", - "except", - "few", - "fifteen", - "fifty", - "fill", - "find", - "fire", - "first", - "five", - "for", - "former", - "formerly", - "forty", - "found", - "four", - "from", - "front", - "full", - "further", - "get", - "give", - "go", - "had", - "has", - "hasnt", - "have", - "he", - "hence", - "her", - "here", - "hereafter", - "hereby", - "herein", - "hereupon", - "hers", - "herself", - "him", - "himself", - "his", - "how", - "however", - "hundred", - "i", - "ie", - "if", - "in", - "inc", - "indeed", - "interest", - "into", - "is", - "it", - "its", - "itself", - "keep", - "last", - "latter", - "latterly", - "least", - "less", - "ltd", - "made", - "many", - "may", - "me", - "meanwhile", - "might", - "mill", - "mine", - "more", - "moreover", - "most", - "mostly", - "move", - "much", - "must", - "my", - "myself", - "name", - "namely", - "neither", - "never", - "nevertheless", - "next", - "nine", - "no", - "nobody", - "none", - "noone", - "nor", - "not", - "nothing", - "now", - "nowhere", - "of", - "off", - "often", - "on", - "once", - "one", - "only", - "onto", - "or", - "other", - "others", - "otherwise", - "our", - "ours", - "ourselves", - "out", - "over", - "own", - "part", - "per", - "perhaps", - "please", - "put", - "rather", - "re", - "same", - "see", - "seem", - "seemed", - "seeming", - "seems", - "serious", - "several", - "she", - "should", - "show", - "side", - "since", - "sincere", - "six", - "sixty", - "so", - "some", - "somehow", - "someone", - "something", - "sometime", - "sometimes", - "somewhere", - "still", - "such", - "system", - "take", - "ten", - "than", - "that", - "the", - "their", - "them", - "themselves", - "then", - "thence", - "there", - "thereafter", - "thereby", - "therefore", - "therein", - "thereupon", - "these", - "they", - "thick", - "thin", - "third", - "this", - "those", - "though", - "three", - "through", - "throughout", - "thru", - "thus", - "to", - "together", - "too", - "top", - "toward", - "towards", - "twelve", - "twenty", - "two", - "un", - "under", - "until", - "up", - "upon", - "us", - "very", - "via", - "was", - "we", - "well", - "were", - "what", - "whatever", - "when", - "whence", - "whenever", - "where", - "whereafter", - "whereas", - "whereby", - "wherein", - "whereupon", - "wherever", - "whether", - "which", - "while", - "whither", - "who", - "whoever", - "whole", - "whom", - "whose", - "why", - "will", - "with", - "within", - "without", - "would", - 
"yet", - "you", - "your", - "yours", - "yourself", - "yourselves", - ] -) +_STOP_WORDS = { + "a", + "about", + "above", + "across", + "after", + "afterwards", + "again", + "against", + "all", + "almost", + "alone", + "along", + "already", + "also", + "although", + "always", + "am", + "among", + "amongst", + "amoungst", + "amount", + "an", + "and", + "another", + "any", + "anyhow", + "anyone", + "anything", + "anyway", + "anywhere", + "are", + "around", + "as", + "at", + "back", + "be", + "became", + "because", + "become", + "becomes", + "becoming", + "been", + "before", + "beforehand", + "behind", + "being", + "below", + "beside", + "besides", + "between", + "beyond", + "bill", + "both", + "bottom", + "but", + "by", + "call", + "can", + "cannot", + "cant", + "co", + "con", + "could", + "couldnt", + "cry", + "de", + "describe", + "detail", + "do", + "done", + "down", + "due", + "during", + "each", + "eg", + "eight", + "either", + "eleven", + "else", + "elsewhere", + "empty", + "enough", + "etc", + "even", + "ever", + "every", + "everyone", + "everything", + "everywhere", + "except", + "few", + "fifteen", + "fifty", + "fill", + "find", + "fire", + "first", + "five", + "for", + "former", + "formerly", + "forty", + "found", + "four", + "from", + "front", + "full", + "further", + "get", + "give", + "go", + "had", + "has", + "hasnt", + "have", + "he", + "hence", + "her", + "here", + "hereafter", + "hereby", + "herein", + "hereupon", + "hers", + "herself", + "him", + "himself", + "his", + "how", + "however", + "hundred", + "i", + "ie", + "if", + "in", + "inc", + "indeed", + "interest", + "into", + "is", + "it", + "its", + "itself", + "keep", + "last", + "latter", + "latterly", + "least", + "less", + "ltd", + "made", + "many", + "may", + "me", + "meanwhile", + "might", + "mill", + "mine", + "more", + "moreover", + "most", + "mostly", + "move", + "much", + "must", + "my", + "myself", + "name", + "namely", + "neither", + "never", + "nevertheless", + "next", + "nine", + "no", + "nobody", + "none", + "noone", + "nor", + "not", + "nothing", + "now", + "nowhere", + "of", + "off", + "often", + "on", + "once", + "one", + "only", + "onto", + "or", + "other", + "others", + "otherwise", + "our", + "ours", + "ourselves", + "out", + "over", + "own", + "part", + "per", + "perhaps", + "please", + "put", + "rather", + "re", + "same", + "see", + "seem", + "seemed", + "seeming", + "seems", + "serious", + "several", + "she", + "should", + "show", + "side", + "since", + "sincere", + "six", + "sixty", + "so", + "some", + "somehow", + "someone", + "something", + "sometime", + "sometimes", + "somewhere", + "still", + "such", + "system", + "take", + "ten", + "than", + "that", + "the", + "their", + "them", + "themselves", + "then", + "thence", + "there", + "thereafter", + "thereby", + "therefore", + "therein", + "thereupon", + "these", + "they", + "thick", + "thin", + "third", + "this", + "those", + "though", + "three", + "through", + "throughout", + "thru", + "thus", + "to", + "together", + "too", + "top", + "toward", + "towards", + "twelve", + "twenty", + "two", + "un", + "under", + "until", + "up", + "upon", + "us", + "very", + "via", + "was", + "we", + "well", + "were", + "what", + "whatever", + "when", + "whence", + "whenever", + "where", + "whereafter", + "whereas", + "whereby", + "wherein", + "whereupon", + "wherever", + "whether", + "which", + "while", + "whither", + "who", + "whoever", + "whole", + "whom", + "whose", + "why", + "will", + "with", + "within", + "without", + "would", + "yet", + "you", + "your", + 
"yours", + "yourself", + "yourselves", +} + _PUNCTUATION = "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" _WORD_REGEX = re.compile(r"(?u)\b\w\w+\b") # sklearn default @@ -386,21 +386,25 @@ def __init__(self, key, val): self.right = None def __gt__(self, other): + """Greater than""" if not isinstance(other, Node): return -1 return self.val > other.val def __ge__(self, other): + """Greater than or equal to""" if not isinstance(other, Node): return -1 return self.val >= other.val def __lt__(self, other): + """Less than""" if not isinstance(other, Node): return -1 return self.val < other.val def __le__(self, other): + """Less than or equal to""" if not isinstance(other, Node): return -1 return self.val <= other.val @@ -482,10 +486,12 @@ def inverse_transform(self, codes): @property def tokens(self): + """A list the unique tokens in `text`""" return list(self._item2code.keys()) @property def codes(self): + """A list with the Huffman code for each unique token in `text`""" return list(self._code2item.keys()) def _counter(self, text): @@ -552,6 +558,7 @@ def __init__(self, word): self.word = word def __repr__(self): + """A string representation of the token""" return "Token(word='{}', count={})".format(self.word, self.count) @@ -566,7 +573,7 @@ def __init__( input_type="filename", filter_stopwords=True, ): - """ + r""" An object for compiling and encoding the term-frequency inverse-document-frequency (TF-IDF) representation of the tokens in a text corpus. @@ -578,8 +585,8 @@ def __init__( corpus, :math:`D = \{d_1, \ldots, d_N\}`, we have: .. math:: - \\text{TF}(w, d) &= \\text{num. occurences of }`w`\\text{ in document }`d` \\\\ - \\text{IDF}(w, D) &= \log \\frac{|D|}{|\{ d \in D: t \in d \}|} + \text{TF}(w, d) &= \text{num. occurences of }w \text{ in document }d \\ + \text{IDF}(w, D) &= \log \frac{|D|}{|\{ d \in D: t \in d \}|} Parameters ---------- @@ -694,7 +701,7 @@ def fit(self, corpus_seq, encoding="utf-8-sig"): doc_count = {} idx2doc[d_ix] = doc if H["input_type"] == "files" else None token2idx, idx2token, tokens, doc_count = self._encode_document( - doc, token2idx, idx2token, tokens, doc_count, bol_ix, eol_ix + doc, token2idx, idx2token, tokens, doc_count, bol_ix, eol_ix, ) term_freq[d_ix] = doc_count @@ -720,11 +727,9 @@ def fit(self, corpus_seq, encoding="utf-8-sig"): self._calc_idf() def _encode_document( - self, doc, word2idx, idx2word, tokens, doc_count, bol_ix, eol_ix + self, doc, word2idx, idx2word, tokens, doc_count, bol_ix, eol_ix, ): - """ - Perform tokenization and compute token counts for a single document - """ + """Perform tokenization and compute token counts for a single document""" H = self.hyperparameters lowercase = H["lowercase"] filter_stop = H["filter_stopwords"] @@ -816,7 +821,7 @@ def _drop_low_freq_tokens(self): unk_idx = 0 word2idx = {"": 0, "": 1, "": 2} idx2word = {0: "", 1: "", 2: ""} - special = set(["", "", ""]) + special = {"", "", ""} for tt in self._tokens: if tt.word not in special: @@ -895,7 +900,7 @@ def _calc_idf(self): for word, w_ix in self.token2idx.items(): d_count = int(smooth_idf) d_count += np.sum([1 if w_ix in tf[d_ix] else 0 for d_ix in doc_idxs]) - inv_doc_freq[w_ix] = np.log(D / d_count) + 1 + inv_doc_freq[w_ix] = 1 if d_count == 0 else np.log(D / d_count) + 1 self.inv_doc_freq = inv_doc_freq def transform(self, ignore_special_chars=True): @@ -944,7 +949,7 @@ def transform(self, ignore_special_chars=True): class Vocabulary: def __init__( - self, lowercase=True, min_count=None, max_tokens=None, filter_stopwords=True + self, lowercase=True, 
min_count=None, max_tokens=None, filter_stopwords=True, ): """ An object for compiling and encoding the unique tokens in a text corpus. @@ -977,15 +982,22 @@ def __init__( } def __len__(self): + """Return the number of tokens in the vocabulary""" return len(self._tokens) def __iter__(self): + """Return an iterator over the tokens in the vocabulary""" return iter(self._tokens) def __contains__(self, word): + """Assert whether `word` is a token in the vocabulary""" return word in self.token2idx def __getitem__(self, key): + """ + Return the token (if key is an integer) or the index (if key is a string) + for the key in the vocabulary, if it exists. + """ if isinstance(key, str): return self._tokens[self.token2idx[key]] if isinstance(key, int): @@ -1014,7 +1026,7 @@ def words_with_count(self, k): """Return all tokens that occur `k` times in the corpus""" return [w for w, c in self.counts.items() if c == k] - def filter(self, words, unk=True): + def filter(self, words, unk=True): # noqa: A003 """ Filter or replace any word in `words` that does not occur in `Vocabulary` @@ -1201,7 +1213,7 @@ def _drop_low_freq_tokens(self): tokens = [unk_token, eol_token, bol_token] word2idx = {"": 0, "": 1, "": 2} idx2word = {0: "", 1: "", 2: ""} - special = set(["", "", ""]) + special = {"", "", ""} for tt in self._tokens: if tt.word not in special: diff --git a/numpy_ml/trees/plots.py b/numpy_ml/trees/plots.py new file mode 100644 index 0000000..4ede83a --- /dev/null +++ b/numpy_ml/trees/plots.py @@ -0,0 +1,163 @@ +# flake8: noqa +import numpy as np + +from sklearn.metrics import accuracy_score, mean_squared_error +from sklearn.datasets import make_blobs, make_regression +from sklearn.model_selection import train_test_split + +import matplotlib.pyplot as plt + +# https://seaborn.pydata.org/generated/seaborn.set_context.html +# https://seaborn.pydata.org/generated/seaborn.set_style.html +import seaborn as sns + +sns.set_style("white") +sns.set_context("paper", font_scale=0.9) + +from .gbdt import GradientBoostedDecisionTree +from .dt import DecisionTree +from .rf import RandomForest + + +def plot(): + fig, axes = plt.subplots(4, 4) + fig.set_size_inches(10, 10) + for ax in axes.flatten(): + n_ex = 100 + n_trees = 50 + n_feats = np.random.randint(2, 100) + max_depth_d = np.random.randint(1, 100) + max_depth_r = np.random.randint(1, 10) + + classifier = np.random.choice([True, False]) + if classifier: + # create classification problem + n_classes = np.random.randint(2, 10) + X, Y = make_blobs(n_samples=n_ex, centers=n_classes, n_features=2) + X, X_test, Y, Y_test = train_test_split(X, Y, test_size=0.3) + n_feats = min(n_feats, X.shape[1]) + + # initialize model + def loss(yp, y): + return accuracy_score(yp, y) + + # initialize model + criterion = np.random.choice(["entropy", "gini"]) + mine = RandomForest( + classifier=classifier, + n_feats=n_feats, + n_trees=n_trees, + criterion=criterion, + max_depth=max_depth_r, + ) + mine_d = DecisionTree( + criterion=criterion, max_depth=max_depth_d, classifier=classifier + ) + mine_g = GradientBoostedDecisionTree( + n_trees=n_trees, + max_depth=max_depth_d, + classifier=classifier, + learning_rate=1, + loss="crossentropy", + step_size="constant", + split_criterion=criterion, + ) + + else: + # create regeression problem + X, Y = make_regression(n_samples=n_ex, n_features=1) + X, X_test, Y, Y_test = train_test_split(X, Y, test_size=0.3) + n_feats = min(n_feats, X.shape[1]) + + # initialize model + criterion = "mse" + loss = mean_squared_error + mine = RandomForest( + 
criterion=criterion, + n_feats=n_feats, + n_trees=n_trees, + max_depth=max_depth_r, + classifier=classifier, + ) + mine_d = DecisionTree( + criterion=criterion, max_depth=max_depth_d, classifier=classifier + ) + mine_g = GradientBoostedDecisionTree( + n_trees=n_trees, + max_depth=max_depth_d, + classifier=classifier, + learning_rate=1, + loss="mse", + step_size="adaptive", + split_criterion=criterion, + ) + + # fit 'em + mine.fit(X, Y) + mine_d.fit(X, Y) + mine_g.fit(X, Y) + + # get preds on test set + y_pred_mine_test = mine.predict(X_test) + y_pred_mine_test_d = mine_d.predict(X_test) + y_pred_mine_test_g = mine_g.predict(X_test) + + loss_mine_test = loss(y_pred_mine_test, Y_test) + loss_mine_test_d = loss(y_pred_mine_test_d, Y_test) + loss_mine_test_g = loss(y_pred_mine_test_g, Y_test) + + if classifier: + entries = [ + ("RF", loss_mine_test, y_pred_mine_test), + ("DT", loss_mine_test_d, y_pred_mine_test_d), + ("GB", loss_mine_test_g, y_pred_mine_test_g), + ] + (lbl, test_loss, preds) = entries[np.random.randint(3)] + ax.set_title("{} Accuracy: {:.2f}%".format(lbl, test_loss * 100)) + for i in np.unique(Y_test): + ax.scatter( + X_test[preds == i, 0].flatten(), + X_test[preds == i, 1].flatten(), + # s=0.5, + ) + else: + X_ax = np.linspace( + np.min(X_test.flatten()) - 1, np.max(X_test.flatten()) + 1, 100 + ).reshape(-1, 1) + y_pred_mine_test = mine.predict(X_ax) + y_pred_mine_test_d = mine_d.predict(X_ax) + y_pred_mine_test_g = mine_g.predict(X_ax) + + ax.scatter(X_test.flatten(), Y_test.flatten(), c="b", alpha=0.5) + # s=0.5) + ax.plot( + X_ax.flatten(), + y_pred_mine_test_g.flatten(), + # linewidth=0.5, + label="GB".format(n_trees, n_feats, max_depth_d), + color="red", + ) + ax.plot( + X_ax.flatten(), + y_pred_mine_test.flatten(), + # linewidth=0.5, + label="RF".format(n_trees, n_feats, max_depth_r), + color="cornflowerblue", + ) + ax.plot( + X_ax.flatten(), + y_pred_mine_test_d.flatten(), + # linewidth=0.5, + label="DT".format(max_depth_d), + color="yellowgreen", + ) + ax.set_title( + "GB: {:.1f} / RF: {:.1f} / DT: {:.1f} ".format( + loss_mine_test_g, loss_mine_test, loss_mine_test_d + ) + ) + ax.legend() + ax.xaxis.set_ticklabels([]) + ax.yaxis.set_ticklabels([]) + plt.savefig("plot.png", dpi=300) + plt.close("all") From 730907840655a9d6a2def2cac8543e517d426de1 Mon Sep 17 00:00:00 2001 From: ddbourgin Date: Sun, 7 Jun 2020 19:11:22 -0400 Subject: [PATCH 05/18] consistent handling of ties during KNN classification --- numpy_ml/neural_nets/schedulers/tests.py | 170 ----------------------- 1 file changed, 170 deletions(-) delete mode 100644 numpy_ml/neural_nets/schedulers/tests.py diff --git a/numpy_ml/neural_nets/schedulers/tests.py b/numpy_ml/neural_nets/schedulers/tests.py deleted file mode 100644 index b013e81..0000000 --- a/numpy_ml/neural_nets/schedulers/tests.py +++ /dev/null @@ -1,170 +0,0 @@ -import time -import numpy as np -import matplotlib - -matplotlib.use("TkAgg") -import matplotlib.pyplot as plt -import seaborn as sns - -# https://seaborn.pydata.org/generated/seaborn.set_context.html -# https://seaborn.pydata.org/generated/seaborn.set_style.html -sns.set_style("white") -sns.set_context("notebook", font_scale=0.7) - -from .schedulers import ( - ConstantScheduler, - ExponentialScheduler, - NoamScheduler, - KingScheduler, -) - - -def king_loss_fn(x): - if x <= 250: - return -0.25 * x + 82.50372665317208 - elif 250 < x <= 600: - return 20.00372665317208 - elif 600 < x <= 700: - return -0.2 * x + 140.00372665317207 - else: - return 0.003726653172066108 - - -def 
plot_schedulers(): - fig, axes = plt.subplots(2, 2) - schedulers = [ - ( - [ConstantScheduler(lr=0.01), "lr=1e-2"], - [ConstantScheduler(lr=0.008), "lr=8e-3"], - [ConstantScheduler(lr=0.006), "lr=6e-3"], - [ConstantScheduler(lr=0.004), "lr=4e-3"], - [ConstantScheduler(lr=0.002), "lr=2e-3"], - ), - ( - [ - ExponentialScheduler( - lr=0.01, stage_length=250, staircase=False, decay=0.4 - ), - "lr=0.01, stage=250, stair=False, decay=0.4", - ], - [ - ExponentialScheduler( - lr=0.01, stage_length=250, staircase=True, decay=0.4 - ), - "lr=0.01, stage=250, stair=True, decay=0.4", - ], - [ - ExponentialScheduler( - lr=0.01, stage_length=125, staircase=True, decay=0.1 - ), - "lr=0.01, stage=125, stair=True, decay=0.1", - ], - [ - ExponentialScheduler( - lr=0.001, stage_length=250, staircase=False, decay=0.1 - ), - "lr=0.001, stage=250, stair=False, decay=0.1", - ], - [ - ExponentialScheduler( - lr=0.001, stage_length=125, staircase=False, decay=0.8 - ), - "lr=0.001, stage=125, stair=False, decay=0.8", - ], - [ - ExponentialScheduler( - lr=0.01, stage_length=250, staircase=False, decay=0.01 - ), - "lr=0.01, stage=250, stair=False, decay=0.01", - ], - ), - ( - [ - NoamScheduler(model_dim=512, scale_factor=1, warmup_steps=250), - "dim=512, scale=1, warmup=250", - ], - [ - NoamScheduler(model_dim=256, scale_factor=1, warmup_steps=250), - "dim=256, scale=1, warmup=250", - ], - [ - NoamScheduler(model_dim=512, scale_factor=1, warmup_steps=500), - "dim=512, scale=1, warmup=500", - ], - [ - NoamScheduler(model_dim=256, scale_factor=1, warmup_steps=500), - "dim=512, scale=1, warmup=500", - ], - [ - NoamScheduler(model_dim=512, scale_factor=2, warmup_steps=500), - "dim=512, scale=2, warmup=500", - ], - [ - NoamScheduler(model_dim=512, scale_factor=0.5, warmup_steps=500), - "dim=512, scale=0.5, warmup=500", - ], - ), - ( - # [ - # KingScheduler(initial_lr=0.01, patience=100, decay=0.1), - # "lr=0.01, patience=100, decay=0.8", - # ], - # [ - # KingScheduler(initial_lr=0.01, patience=300, decay=0.999), - # "lr=0.01, patience=300, decay=0.999", - # ], - [ - KingScheduler(initial_lr=0.009, patience=150, decay=0.995), - "lr=0.009, patience=150, decay=0.9999", - ], - [ - KingScheduler(initial_lr=0.008, patience=100, decay=0.995), - "lr=0.008, patience=100, decay=0.995", - ], - [ - KingScheduler(initial_lr=0.007, patience=50, decay=0.995), - "lr=0.007, patience=50, decay=0.995", - ], - [ - KingScheduler(initial_lr=0.005, patience=25, decay=0.9), - "lr=0.005, patience=25, decay=0.99", - ], - ), - ] - - for ax, schs, title in zip( - axes.flatten(), schedulers, ["Constant", "Exponential", "Noam", "King"] - ): - t0 = time.time() - print("Running {} scheduler".format(title)) - X = np.arange(1, 1000) - loss = np.array([king_loss_fn(x) for x in X]) - - # scale loss to fit on same axis as lr - scale = 0.01 / loss[0] - loss *= scale - - if title == "King": - ax.plot(X, loss, ls=":", label="Loss") - - for sc, lg in schs: - Y = np.array([sc(x, ll) for x, ll in zip(X, loss)]) - ax.plot(X, Y, label=lg, alpha=0.6) - - ax.legend(fontsize=5) - ax.set_xlabel("Steps") - ax.set_ylabel("Learning rate") - ax.set_title("{} scheduler".format(title)) - print( - "Finished plotting {} runs of {} in {:.2f}s".format( - len(schs), title, time.time() - t0 - ) - ) - - plt.tight_layout() - plt.savefig("plot.png", dpi=300) - plt.close("all") - - -if __name__ == "__main__": - plot_schedulers() From c75ed36dd3d8c9b8b273f9a9f2400e29e97d3248 Mon Sep 17 00:00:00 2001 From: ddbourgin Date: Sun, 7 Jun 2020 19:11:39 -0400 Subject: [PATCH 06/18] consistent 
handling of ties during KNN classification --- numpy_ml/nonparametric/knn.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/numpy_ml/nonparametric/knn.py b/numpy_ml/nonparametric/knn.py index bf29089..8825229 100644 --- a/numpy_ml/nonparametric/knn.py +++ b/numpy_ml/nonparametric/knn.py @@ -82,7 +82,10 @@ def predict(self, X): if H["classifier"]: if H["weights"] == "uniform": - pred = Counter(targets).most_common(1)[0][0] + # for consistency with sklearn / scipy.stats.mode, return + # the smallest class ID in the event of a tie + counts = Counter(targets).most_common() + pred, _ = sorted(counts, key=lambda x: (-x[1], x[0]))[0] elif H["weights"] == "distance": best_score = -np.inf for label in set(targets): From 6544eee6d87fa0a0c2847a123e69dd4ac22b8145 Mon Sep 17 00:00:00 2001 From: ddbourgin Date: Sun, 7 Jun 2020 19:18:21 -0400 Subject: [PATCH 07/18] add filter_punctuation flag --- numpy_ml/ngram/ngram.py | 66 +++++++++++++++++++++++++---------------- 1 file changed, 40 insertions(+), 26 deletions(-) diff --git a/numpy_ml/ngram/ngram.py b/numpy_ml/ngram/ngram.py index ada15f1..29dd336 100644 --- a/numpy_ml/ngram/ngram.py +++ b/numpy_ml/ngram/ngram.py @@ -1,3 +1,4 @@ +"""A module for different N-gram smoothing models""" import textwrap from abc import ABC, abstractmethod from collections import Counter @@ -5,11 +6,11 @@ import numpy as np from ..linear_models.lm import LinearRegression -from ..preprocessing.nlp import tokenize_words, ngrams +from ..preprocessing.nlp import tokenize_words, ngrams, strip_punctuation class NGramBase(ABC): - def __init__(self, N, unk=True, filter_stopwords=True): + def __init__(self, N, unk=True, filter_stopwords=True, filter_punctuation=True): """ A simple word-level N-gram language model. @@ -23,11 +24,13 @@ def __init__(self, N, unk=True, filter_stopwords=True): self.N = N self.unk = unk self.filter_stopwords = filter_stopwords + self.filter_punctuation = filter_punctuation self.hyperparameters = { "N": N, "unk": unk, "filter_stopwords": filter_stopwords, + "filter_punctuation": filter_punctuation, } super().__init__() @@ -57,20 +60,19 @@ def train(self, corpus_fp, vocab=None, encoding=None): return self._train(corpus_fp, vocab=vocab, encoding=encoding) def _train(self, corpus_fp, vocab=None, encoding=None): - """ - Actual N-gram training logic - """ + """Actual N-gram training logic""" H = self.hyperparameters grams = {N: [] for N in range(1, self.N + 1)} counts = {N: Counter() for N in range(1, self.N + 1)} - filter_stop = H["filter_stopwords"] + filter_stop, filter_punc = H["filter_stopwords"], H["filter_punctuation"] _n_words = 0 - tokens = set([""]) + tokens = {""} bol, eol = [""], [""] with open(corpus_fp, "r", encoding=encoding) as text: for line in text: + line = strip_punctuation(line) if filter_punc else line words = tokenize_words(line, filter_stopwords=filter_stop) if vocab is not None: @@ -174,7 +176,7 @@ def generate(self, N, seed_words=[""], n_sentences=5): return sentences def perplexity(self, words, N): - """ + r""" Calculate the model perplexity on a sequence of words. Notes @@ -183,13 +185,13 @@ def perplexity(self, words, N): .. math:: - PP(W) = \\left( \\frac{1}{p(W)} \\right)^{1 / n} + PP(W) = \left( \frac{1}{p(W)} \right)^{1 / n} or simply .. 
math:: - PP(W) &= \exp(-\log p(W) / n) \\\\ + PP(W) &= \exp(-\log p(W) / n) \\ &= \exp(H(W)) where :math:`W = [w_1, \ldots, w_k]` is a sequence of words, `H(w)` is @@ -216,7 +218,7 @@ def perplexity(self, words, N): return np.exp(self.cross_entropy(words, N)) def cross_entropy(self, words, N): - """ + r""" Calculate the model cross-entropy on a sequence of words against the empirical distribution of words in a sample. @@ -226,7 +228,7 @@ def cross_entropy(self, words, N): .. math:: - H(W) = -\\frac{\log p(W)}{n} + H(W) = -\frac{\log p(W)}{n} where :math:`W = [w_1, \ldots, w_k]` is a sequence of words, and `n` is the number of `N`-grams in `W`. @@ -251,7 +253,10 @@ def cross_entropy(self, words, N): return -(1 / n_ngrams) * self.log_prob(words, N) def _log_prob(self, words, N): - """Calculate the log probability of a sequence of words under the `N`-gram model""" + """ + Calculate the log probability of a sequence of words under the + `N`-gram model + """ assert N in self.counts, "You do not have counts for {}-grams".format(N) if N > len(words): @@ -293,10 +298,15 @@ def _num_grams_with_count(self, C, N): @abstractmethod def log_prob(self, words, N): + """ + Compute the log probability of a sequence of words under the + unsmoothed, maximum-likelihood `N`-gram language model. + """ raise NotImplementedError @abstractmethod def _log_ngram_prob(self, ngram): + """Return the unsmoothed log probability of the ngram""" raise NotImplementedError @@ -319,6 +329,7 @@ def __init__(self, N, unk=True, filter_stopwords=True, filter_punctuation=True): Whether to remove punctuation before training. Default is True. """ super().__init__(N, unk, filter_stopwords, filter_punctuation) + self.hyperparameters["id"] = "MLENGram" def log_prob(self, words, N): @@ -352,7 +363,7 @@ def _log_ngram_prob(self, ngram): class AdditiveNGram(NGramBase): def __init__( - self, N, K=1, unk=True, filter_stopwords=True, filter_punctuation=True + self, N, K=1, unk=True, filter_stopwords=True, filter_punctuation=True, ): """ An N-Gram model with smoothed probabilities calculated via additive / @@ -384,11 +395,12 @@ def __init__( Whether to remove punctuation before training. Default is True. """ super().__init__(N, unk, filter_stopwords, filter_punctuation) + self.hyperparameters["id"] = "AdditiveNGram" self.hyperparameters["K"] = K def log_prob(self, words, N): - """ + r""" Compute the smoothed log probability of a sequence of words under the `N`-gram language model with additive smoothing. @@ -398,15 +410,15 @@ def log_prob(self, words, N): .. math:: - P(w_i \mid w_{i-1}) = \\frac{A + K}{B + KV} + P(w_i \mid w_{i-1}) = \frac{A + K}{B + KV} where .. math:: - A &= \\text{Count}(w_{i-1}, w_i) \\\\ - B &= \sum_j \\text{Count}(w_{i-1}, w_j) \\\\ - V &= |\{ w_j \ : \ \\text{Count}(w_{i-1}, w_j) > 0 \}| + A &= \text{Count}(w_{i-1}, w_i) \\ + B &= \sum_j \text{Count}(w_{i-1}, w_j) \\ + V &= |\{ w_j \ : \ \text{Count}(w_{i-1}, w_j) > 0 \}| This is equivalent to pretending we've seen every possible `N`-gram sequence at least `K` times. @@ -446,7 +458,7 @@ def _log_ngram_prob(self, ngram): class GoodTuringNGram(NGramBase): def __init__( - self, N, conf=1.96, unk=True, filter_stopwords=True, filter_punctuation=True + self, N, conf=1.96, unk=True, filter_stopwords=True, filter_punctuation=True, ): """ An N-Gram model with smoothed probabilities calculated with the simple @@ -471,6 +483,7 @@ def __init__( Whether to remove punctuation before training. Default is True. 
""" super().__init__(N, unk, filter_stopwords, filter_punctuation) + self.hyperparameters["id"] = "GoodTuringNGram" self.hyperparameters["conf"] = conf @@ -497,7 +510,7 @@ def train(self, corpus_fp, vocab=None, encoding=None): self._calc_smoothed_counts() def log_prob(self, words, N): - """ + r""" Compute the smoothed log probability of a sequence of words under the `N`-gram language model with Good-Turing smoothing. @@ -507,21 +520,22 @@ def log_prob(self, words, N): .. math:: - P(w_i \mid w_{i-1}) = \\frac{C^*}{\\text{Count}(w_{i-1})} + P(w_i \mid w_{i-1}) = \frac{C^*}{\text{Count}(w_{i-1})} where :math:`C^*` is the Good-Turing smoothed estimate of the bigram count: .. math:: - C^* = \\frac{(c + 1) \\text{NumCounts}(c + 1, 2)}{\\text{NumCounts}(c, 2)} + C^* = \frac{(c + 1) \text{NumCounts}(c + 1, 2)}{\text{NumCounts}(c, 2)} where .. math:: - c &= \\text{Count}(w_{i-1}, w_i) \\\\ - \\text{NumCounts}(r, k) &= |\{ k\\text{-gram} : \\text{Count}(k\\text{-gram}) = r \}| + c &= \text{Count}(w_{i-1}, w_i) \\ + \text{NumCounts}(r, k) &= + |\{ k\text{-gram} : \text{Count}(k\text{-gram}) = r \}| In words, the probability of an `N`-gram that occurs `r` times in the corpus is estimated by dividing up the probability mass occupied by @@ -532,7 +546,7 @@ def log_prob(self, words, N): .. math:: - \log \\text{NumCounts}(r) = b + a \log r + \log \text{NumCounts}(r) = b + a \log r Under the Good-Turing estimator, the total probability assigned to unseen `N`-grams is equal to the relative occurrence of `N`-grams that From cf27808d5e9c4fa48ceb5d369c0649a591cd15b6 Mon Sep 17 00:00:00 2001 From: ddbourgin Date: Sun, 7 Jun 2020 23:32:46 -0400 Subject: [PATCH 08/18] move plots to a separate directory --- numpy_ml/hmm/hmm.py | 134 +- numpy_ml/neural_nets/activations/tests.py | 338 --- numpy_ml/neural_nets/tests/__init__.py | 8 - numpy_ml/neural_nets/tests/tests.py | 2552 ----------------- numpy_ml/neural_nets/tests/torch_models.py | 2276 --------------- numpy_ml/ngram/tests.py | 250 -- numpy_ml/nonparametric/tests.py | 106 - .../plots.py => plots/bandit_plots.py} | 13 +- numpy_ml/{gmm/plots.py => plots/gmm_plots.py} | 5 +- numpy_ml/{hmm/plots.py => plots/hmm_plots.py} | 5 +- numpy_ml/{lda/plots.py => plots/lda_plots.py} | 6 +- .../plots.py => plots/lm_plots.py} | 6 +- .../{ngram/plots.py => plots/ngram_plots.py} | 3 +- .../nn_activations_plots.py} | 6 +- .../plots.py => plots/nn_schedulers_plots.py} | 5 +- .../plots.py => plots/nonparametric_plots.py} | 7 +- .../{rl_models/plots.py => plots/rl_plots.py} | 4 +- .../{trees/plots.py => plots/trees_plots.py} | 4 +- numpy_ml/preprocessing/tests.py | 210 -- numpy_ml/trees/tests.py | 522 ---- numpy_ml/utils/tests.py | 258 -- 21 files changed, 105 insertions(+), 6613 deletions(-) delete mode 100644 numpy_ml/neural_nets/activations/tests.py delete mode 100644 numpy_ml/neural_nets/tests/__init__.py delete mode 100644 numpy_ml/neural_nets/tests/tests.py delete mode 100644 numpy_ml/neural_nets/tests/torch_models.py delete mode 100644 numpy_ml/ngram/tests.py delete mode 100644 numpy_ml/nonparametric/tests.py rename numpy_ml/{bandits/plots.py => plots/bandit_plots.py} (94%) rename numpy_ml/{gmm/plots.py => plots/gmm_plots.py} (98%) rename numpy_ml/{hmm/plots.py => plots/hmm_plots.py} (98%) rename numpy_ml/{lda/plots.py => plots/lda_plots.py} (97%) rename numpy_ml/{linear_models/plots.py => plots/lm_plots.py} (99%) rename numpy_ml/{ngram/plots.py => plots/ngram_plots.py} (97%) rename numpy_ml/{neural_nets/activations/plots.py => plots/nn_activations_plots.py} (95%) rename 
numpy_ml/{neural_nets/schedulers/plots.py => plots/nn_schedulers_plots.py} (98%) rename numpy_ml/{nonparametric/plots.py => plots/nonparametric_plots.py} (98%) rename numpy_ml/{rl_models/plots.py => plots/rl_plots.py} (96%) rename numpy_ml/{trees/plots.py => plots/trees_plots.py} (98%) delete mode 100644 numpy_ml/preprocessing/tests.py delete mode 100644 numpy_ml/trees/tests.py delete mode 100644 numpy_ml/utils/tests.py diff --git a/numpy_ml/hmm/hmm.py b/numpy_ml/hmm/hmm.py index a7e84db..51e8ec7 100644 --- a/numpy_ml/hmm/hmm.py +++ b/numpy_ml/hmm/hmm.py @@ -1,9 +1,11 @@ +"""Hidden Markov model module""" + import numpy as np class MultinomialHMM: def __init__(self, A=None, B=None, pi=None, eps=None): - """ + r""" A simple hidden Markov model with multinomial emission distribution. Parameters @@ -68,10 +70,10 @@ def __init__(self, A=None, B=None, pi=None, eps=None): self.B[self.B == 0] = self.eps # set of training sequences - self.O = None + self.O = None # noqa: E741 # number of sequences in O - self.I = None + self.I = None # noqa: E741 # number of observations in each sequence self.T = None @@ -115,10 +117,10 @@ def generate(self, n_steps, latent_state_types, obs_types): return np.array(states), np.array(emissions) def log_likelihood(self, O): - """ + r""" Given the HMM parameterized by :math:`(A`, B, \pi)` and an observation sequence `O`, compute the marginal likelihood of the observations: - :math:`P(O|A,B,\pi)`, summing over latent states. + :math:`P(O \mid A,B,\pi)`, summing over latent states. Notes ----- @@ -128,7 +130,9 @@ def log_likelihood(self, O): probability under the HMM of being in latent state `i` after seeing the first `j` observations: - .. math:: \mathtt{forward[i,j]} = P(o_1,\ldots,o_j,q_j=i \mid A,B,\pi) + .. math:: + + \mathtt{forward[i,j]} = P(o_1, \ldots, o_j, q_j=i \mid A, B, \pi) Here :math:`q_j = i` indicates that the hidden state at time `j` is of type `i`. @@ -137,12 +141,11 @@ def log_likelihood(self, O): .. math:: - \mathtt{forward[i,j]} &= \sum_{s'=1}^N \mathtt{forward[s',j-1]} - \cdot \mathtt{A[s',i]} \cdot \mathtt{B[i,o_j]} \\ - - &= \sum_{s'=1}^N - P(o_1,\ldots,o_{j-1},q_{j-1}=s' \mid A,B,\pi) - P(q_j=i|q_{j-1}=s') P(o_j \mid q_j=i) + \mathtt{forward[i,j]} + &= \sum_{s'=1}^N \mathtt{forward[s',j-1]} \cdot + \mathtt{A[s',i]} \cdot \mathtt{B[i,o_j]} \\ + &= \sum_{s'=1}^N P(o_1, \ldots, o_{j-1}, q_{j-1}=s' \mid A, B, \pi) + P(q_j=i \mid q_{j-1}=s') P(o_j \mid q_j=i) In words, ``forward[i,j]`` is the weighted sum of the values computed on the previous timestep. The weight on each previous state value is the @@ -160,11 +163,11 @@ def log_likelihood(self, O): The likelihood of the observations `O` under the HMM. """ if O.ndim == 1: - O = O.reshape(1, -1) + O = O.reshape(1, -1) # noqa: E741 - I, T = O.shape + I, T = O.shape # noqa: E741 - if I != 1: + if I != 1: # noqa: E741 raise ValueError("Likelihood only accepts a single sequence") forward = self._forward(O[0]) @@ -172,7 +175,7 @@ def log_likelihood(self, O): return log_likelihood def decode(self, O): - """ + r""" Given the HMM parameterized by :math:`(A, B, \pi)` and an observation sequence :math:`O = o_1, \ldots, o_T`, compute the most probable sequence of latent states, :math:`Q = q_1, \ldots, q_T`. @@ -187,7 +190,8 @@ def decode(self, O): .. 
math:: \mathtt{viterbi[i,j]} = - \max_{q_1,\ldots,q_{j-1}} P(o_1,\ldots,o_j,q_1,\ldots,q_{j-1},q_j=i \mid A,B,\pi) + \max_{q_1, \ldots, q_{j-1}} + P(o_1, \ldots, o_j, q_1, \ldots, q_{j-1}, q_j=i \mid A, B, \pi) Here :math:`q_j = i` indicates that the hidden state at time `j` is of type `i`, and :math:`\max_{q_1,\ldots,q_{j-1}}` represents the maximum over @@ -197,12 +201,12 @@ def decode(self, O): .. math:: - \mathtt{viterbi[i,j]} &= \max_{s'=1}^N \mathtt{viterbi[s',j-1]} \cdot - \mathtt{A[s',i]} \cdot \mathtt{B[i,o_j]} \\ - - &= \max_{s'=1}^N - P(o_1,\ldots,o_j,q_1,\ldots,q_{j-1},q_j=i \mid A,B,\pi) - P(q_j=i \mid q_{j-1}=s') P(o_j \mid q_j=i) + \mathtt{viterbi[i,j]} &= + \max_{s'=1}^N \mathtt{viterbi[s',j-1]} \cdot + \mathtt{A[s',i]} \cdot \mathtt{B[i,o_j]} \\ + &= \max_{s'=1}^N + P(o_1,\ldots, o_j, q_1, \ldots, q_{j-1}, q_j=i \mid A, B, \pi) + P(q_j=i \mid q_{j-1}=s') P(o_j \mid q_j=i) In words, ``viterbi[i,j]`` is the weighted sum of the values computed on the previous timestep. The weight on each value is the product of @@ -235,14 +239,14 @@ def decode(self, O): eps = self.eps if O.ndim == 1: - O = O.reshape(1, -1) + O = O.reshape(1, -1) # noqa: E741 # number of observations in each sequence T = O.shape[1] # number of training sequences - I = O.shape[0] - if I != 1: + I = O.shape[0] # noqa: E741 + if I != 1: # noqa: E741 raise ValueError("Can only decode a single sequence (O.shape[0] must be 1)") # initialize the viterbi and back_pointer matrices @@ -280,7 +284,7 @@ def decode(self, O): return best_path, best_path_log_prob def _forward(self, Obs): - """ + r""" Computes the forward probability trellis for an HMM parameterized by :math:`(A, B, \pi)`. @@ -291,16 +295,22 @@ def _forward(self, Obs): under the HMM of being in latent state `i` after seeing the first `j` observations: - .. math:: \mathtt{forward[i,j]} = P(o_1,\ldots,o_j,q_j=i|A,B,\pi) + .. math:: + + \mathtt{forward[i,j]} = + P(o_1, \ldots, o_j, q_j=i \mid A, B, \pi) Here :math:`q_j = i` indicates that the hidden state at time `j` is of type `i`. The DP step is:: - forward[i,j] = sum_{s'=1}^N forward[s',j-1] * A[s',i] * B[i,o_j] - = sum_{s'=1}^N P(o_1,\ldots,o_{j-1},q_{j-1}=s'|A,B,pi) * - P(q_j=i|q_{j-1}=s') * P(o_j|q_j=i) + .. math:: + + forward[i,j] &= + \sum_{s'=1}^N forward[s',j-1] \times A[s',i] \times B[i,o_j] \\ + &= \sum_{s'=1}^N P(o_1, \ldots, o_{j-1}, q_{j-1}=s' \mid A, B, \pi) + \times P(q_j=i \mid q_{j-1}=s') \times P(o_j \mid q_j=i) In words, ``forward[i,j]`` is the weighted sum of the values computed on the previous timestep. The weight on each previous state value is @@ -336,12 +346,12 @@ def _forward(self, Obs): + np.log(self.A[s_, s] + eps) + np.log(self.B[s, ot] + eps) for s_ in range(self.N) - ] + ] # noqa: C812 ) return forward def _backward(self, Obs): - """ + r""" Compute the backward probability trellis for an HMM parameterized by :math:`(A, B, \pi)`. @@ -352,15 +362,18 @@ def _backward(self, Obs): of seeing the observations from time `j+1` onward given that the HMM is in state `i` at time `j` - .. math:: \mathtt{backward[i,j]} = P(o_{j+1},o_{j+2},...,o_T|q_j=i,A,B,\pi) + .. math:: + + \mathtt{backward[i,j]} = P(o_{j+1},o_{j+2},\ldots,o_T \mid q_j=i,A,B,\pi) Here :math:`q_j = i` indicates that the hidden state at time `j` is of type `i`. 
The DP step is:: - backward[i,j] = sum_{s'=1}^N backward[s',j+1] * A[i, s'] * B[s',o_{j+1}] - = sum_{s'=1}^N P(o_{j+1},o_{j+2},...,o_T|q_j=i,A,B,pi) * - P(q_{j+1}=s'|q_{j}=i) * P(o_{j+1}|q_{j+1}=s') + backward[i,j] &= + \sum_{s'=1}^N backward[s',j+1] \times A[i, s'] \times B[s',o_{j+1}] \\ + &= \sum_{s'=1}^N P(o_{j+1}, o_{j+2}, \ldots, o_T \mid q_j=i, A, B, pi) + \times P(q_{j+1}=s' \mid q_{j}=i) \times P(o_{j+1} \mid q_{j+1}=s') In words, ``backward[i,j]`` is the weighted sum of the values computed on the following timestep. The weight on each state value from the @@ -396,12 +409,18 @@ def _backward(self, Obs): + np.log(self.B[s_, ot1] + eps) + backward[s_, t + 1] for s_ in range(self.N) - ] + ] # noqa: C812 ) return backward def fit( - self, O, latent_state_types, observation_types, pi=None, tol=1e-5, verbose=False + self, + O, + latent_state_types, + observation_types, + pi=None, + tol=1e-5, + verbose=False, ): """ Given an observation sequence `O` and the set of possible latent states, @@ -446,10 +465,10 @@ def fit( The estimated prior probabilities of each latent state. """ if O.ndim == 1: - O = O.reshape(1, -1) + O = O.reshape(1, -1) # noqa: E741 # observations - self.O = O + self.O = O # noqa: E741 # number of training examples (I) and their lengths (T) self.I, self.T = self.O.shape @@ -492,7 +511,7 @@ def fit( return self.A, self.B, self.pi def _Estep(self): - """ + r""" Run a single E-step update for the Baum-Welch/Forward-Backward algorithm. This step estimates ``xi`` and ``gamma``, the excepted state-state transition counts and the expected state-occupancy counts, @@ -502,17 +521,22 @@ def _Estep(self): and state `j` at time `k+1` given the observed sequence `O` and the current estimates for transition (`A`) and emission (`B`) matrices:: - xi[i,j,k] = P(q_k=i,q_{k+1}=j|O,A,B,pi) - = P(q_k=i,q_{k+1}=j,O|A,B,pi) / P(O|A,B,pi) - = [ - P(o_1,o_2,...,o_k,q_k=i|A,B,pi) * - P(q_{k+1}=j|q_k=i) * P(o_{k+1}|q_{k+1}=j) * - P(o_{k+2},o_{k+3},...,o_T|q_{k+1}=j,A,B,pi) - ] / P(O|A,B,pi) - = [ - fwd[j, k] * self.A[j, i] * - self.B[i, o_{k+1}] * bwd[i, k + 1] - ] / fwd[:, T].sum() + .. math:: + + xi[i,j,k] &= P(q_k=i,q_{k+1}=j \mid O,A,B,pi) \\ + &= \frac{ + P(q_k=i,q_{k+1}=j,O \mid A,B,pi) + }{P(O \mid A,B,pi)} \\ + &= \frac{ + P(o_1,o_2,\ldots,o_k,q_k=i \mid A,B,pi) \times + P(q_{k+1}=j \mid q_k=i) \times + P(o_{k+1} \mid q_{k+1}=j) \times + P(o_{k+2},o_{k+3},\ldots,o_T \mid q_{k+1}=j,A,B,pi) + }{P(O \mid A,B,pi)} \\ + &= \frac{ + \mathtt{fwd[j, k] * self.A[j, i] * + self.B[i, o_{k+1}] * bwd[i, k + 1]} + }{\mathtt{fwd[:, T].sum()}} The expected number of transitions from state `i` to state `j` across the entire sequence is then the sum over all timesteps: ``xi[i,j,:].sum()``. 
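
As a quick sanity check of the forward recursion documented in the hunks above, here is a minimal log-space sketch for a toy two-state HMM. The transition/emission matrices, the observation sequence, and the `fwd` array below are illustrative assumptions rather than part of `numpy_ml.hmm`, but the indexing follows the same ``forward[i, j]`` convention described in the docstrings:

    import numpy as np
    from scipy.special import logsumexp

    # Toy parameters (assumed for illustration): 2 latent states, 2 observation types
    A = np.array([[0.7, 0.3], [0.4, 0.6]])   # A[i, j] = P(q_{t+1} = j | q_t = i)
    B = np.array([[0.9, 0.1], [0.2, 0.8]])   # B[i, k] = P(o_t = k | q_t = i)
    pi = np.array([0.5, 0.5])                # prior over the initial latent state
    obs = [0, 1, 0]                          # an example observation sequence

    N, T = A.shape[0], len(obs)
    fwd = np.zeros((N, T))
    fwd[:, 0] = np.log(pi) + np.log(B[:, obs[0]])
    for t in range(1, T):
        for s in range(N):
            # forward[s, t] = sum_{s'} forward[s', t-1] * A[s', s] * B[s, o_t], in log space
            fwd[s, t] = np.log(B[s, obs[t]]) + logsumexp(fwd[:, t - 1] + np.log(A[:, s]))

    log_likelihood = logsumexp(fwd[:, -1])   # log P(O | A, B, pi)

Summing (via log-sum-exp) over the final column of the trellis gives the marginal log likelihood of the observation sequence, which is the quantity the module's `log_likelihood` method is described as returning.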
@@ -614,12 +638,12 @@ def _Mstep(self, gamma, xi, phi): for si in range(self.N): for vk in range(self.V): B[si, vk] = logsumexp(count_gamma[:, si, vk]) - logsumexp( - count_gamma[:, si, :] + count_gamma[:, si, :] # noqa: C812 ) for sj in range(self.N): A[si, sj] = logsumexp(count_xi[:, si, sj]) - logsumexp( - count_xi[:, si, :] + count_xi[:, si, :] # noqa: C812 ) np.testing.assert_almost_equal(np.exp(A[si, :]).sum(), 1) diff --git a/numpy_ml/neural_nets/activations/tests.py b/numpy_ml/neural_nets/activations/tests.py deleted file mode 100644 index 18d799d..0000000 --- a/numpy_ml/neural_nets/activations/tests.py +++ /dev/null @@ -1,338 +0,0 @@ -import sys -import time -import numpy as np - -from numpy.testing import assert_almost_equal -from scipy.special import expit - -import torch -import torch.nn.functional as F - -sys.path.append("../..") -from utils.testing import random_stochastic_matrix, random_tensor - - -def torch_gradient_generator(fn, **kwargs): - def get_grad(z): - z1 = torch.autograd.Variable(torch.from_numpy(z), requires_grad=True) - z2 = fn(z1, **kwargs).sum() - z2.backward() - grad = z1.grad.numpy() - return grad - - return get_grad - - -####################################################################### -# Debug Formatter # -####################################################################### - - -def err_fmt(params, golds, ix, warn_str=""): - mine, label = params[ix] - err_msg = "-" * 25 + " DEBUG " + "-" * 25 + "\n" - prev_mine, prev_label = params[max(ix - 1, 0)] - err_msg += "Mine (prev) [{}]:\n{}\n\nTheirs (prev) [{}]:\n{}".format( - prev_label, prev_mine, prev_label, golds[prev_label] - ) - err_msg += "\n\nMine [{}]:\n{}\n\nTheirs [{}]:\n{}".format( - label, mine, label, golds[label] - ) - err_msg += warn_str - err_msg += "\n" + "-" * 23 + " END DEBUG " + "-" * 23 - return err_msg - - -####################################################################### -# Test Suite # -####################################################################### - - -def test_activations(N=50): - print("Testing Sigmoid activation") - time.sleep(1) - test_sigmoid_activation(N) - test_sigmoid_grad(N) - - # print("Testing Softmax activation") - # time.sleep(1) - # test_softmax_activation(N) - # test_softmax_grad(N) - - print("Testing Tanh activation") - time.sleep(1) - test_tanh_grad(N) - - print("Testing ReLU activation") - time.sleep(1) - test_relu_activation(N) - test_relu_grad(N) - - print("Testing ELU activation") - time.sleep(1) - test_elu_activation(N) - test_elu_grad(N) - - print("Testing SELU activation") - time.sleep(1) - test_selu_activation(N) - test_selu_grad(N) - - print("Testing LeakyRelu activation") - time.sleep(1) - test_leakyrelu_activation(N) - test_leakyrelu_grad(N) - - print("Testing SoftPlus activation") - time.sleep(1) - test_softplus_activation(N) - test_softplus_grad(N) - - -####################################################################### -# Activations # -####################################################################### - - -def test_sigmoid_activation(N=None): - from activations import Sigmoid - - N = np.inf if N is None else N - - mine = Sigmoid() - gold = expit - - i = 0 - while i < N: - n_dims = np.random.randint(1, 100) - z = random_tensor((1, n_dims)) - assert_almost_equal(mine.fn(z), gold(z)) - print("PASSED") - i += 1 - - -def test_softplus_activation(N=None): - from activations import SoftPlus - - N = np.inf if N is None else N - - mine = SoftPlus() - gold = lambda z: F.softplus(torch.FloatTensor(z)).numpy() - - i = 0 - while i < N: 
- n_dims = np.random.randint(1, 100) - z = random_stochastic_matrix(1, n_dims) - assert_almost_equal(mine.fn(z), gold(z)) - print("PASSED") - i += 1 - - -def test_elu_activation(N=None): - from activations import ELU - - N = np.inf if N is None else N - - i = 0 - while i < N: - n_dims = np.random.randint(1, 10) - z = random_tensor((1, n_dims)) - - alpha = np.random.uniform(0, 10) - - mine = ELU(alpha) - gold = lambda z, a: F.elu(torch.from_numpy(z), alpha).numpy() - - assert_almost_equal(mine.fn(z), gold(z, alpha)) - print("PASSED") - i += 1 - - -def test_relu_activation(N=None): - from activations import ReLU - - N = np.inf if N is None else N - - mine = ReLU() - gold = lambda z: F.relu(torch.FloatTensor(z)).numpy() - - i = 0 - while i < N: - n_dims = np.random.randint(1, 100) - z = random_stochastic_matrix(1, n_dims) - assert_almost_equal(mine.fn(z), gold(z)) - print("PASSED") - i += 1 - - -def test_selu_activation(N=None): - from activations import SELU - - N = np.inf if N is None else N - - mine = SELU() - gold = lambda z: F.selu(torch.FloatTensor(z)).numpy() - - i = 0 - while i < N: - n_dims = np.random.randint(1, 100) - z = random_stochastic_matrix(1, n_dims) - assert_almost_equal(mine.fn(z), gold(z)) - print("PASSED") - i += 1 - - -def test_leakyrelu_activation(N=None): - from activations import LeakyReLU - - N = np.inf if N is None else N - - i = 0 - while i < N: - n_dims = np.random.randint(1, 100) - z = random_stochastic_matrix(1, n_dims) - alpha = np.random.uniform(0, 10) - - mine = LeakyReLU(alpha=alpha) - gold = lambda z: F.leaky_relu(torch.FloatTensor(z), alpha).numpy() - assert_almost_equal(mine.fn(z), gold(z)) - - print("PASSED") - i += 1 - - -####################################################################### -# Activation Gradients # -####################################################################### - - -def test_sigmoid_grad(N=None): - from activations import Sigmoid - - N = np.inf if N is None else N - - mine = Sigmoid() - gold = torch_gradient_generator(torch.sigmoid) - - i = 0 - while i < N: - n_ex = np.random.randint(1, 100) - n_dims = np.random.randint(1, 100) - z = random_tensor((n_ex, n_dims)) - assert_almost_equal(mine.grad(z), gold(z)) - print("PASSED") - i += 1 - - -def test_elu_grad(N=None): - from activations import ELU - - N = np.inf if N is None else N - - i = 0 - while i < N: - n_ex = np.random.randint(1, 10) - n_dims = np.random.randint(1, 10) - alpha = np.random.uniform(0, 10) - z = random_tensor((n_ex, n_dims)) - - mine = ELU(alpha) - gold = torch_gradient_generator(F.elu, alpha=alpha) - assert_almost_equal(mine.grad(z), gold(z), decimal=6) - print("PASSED") - i += 1 - - -def test_tanh_grad(N=None): - from activations import Tanh - - N = np.inf if N is None else N - - mine = Tanh() - gold = torch_gradient_generator(torch.tanh) - - i = 0 - while i < N: - n_ex = np.random.randint(1, 100) - n_dims = np.random.randint(1, 100) - z = random_tensor((n_ex, n_dims)) - assert_almost_equal(mine.grad(z), gold(z)) - print("PASSED") - i += 1 - - -def test_relu_grad(N=None): - from activations import ReLU - - N = np.inf if N is None else N - - mine = ReLU() - gold = torch_gradient_generator(F.relu) - - i = 0 - while i < N: - n_ex = np.random.randint(1, 100) - n_dims = np.random.randint(1, 100) - z = random_tensor((n_ex, n_dims)) - assert_almost_equal(mine.grad(z), gold(z)) - print("PASSED") - i += 1 - - -def test_selu_grad(N=None): - from activations import SELU - - N = np.inf if N is None else N - - mine = SELU() - gold = torch_gradient_generator(F.selu) - 
- i = 0 - while i < N: - n_ex = np.random.randint(1, 100) - n_dims = np.random.randint(1, 100) - z = random_tensor((n_ex, n_dims)) - assert_almost_equal(mine.grad(z), gold(z), decimal=6) - print("PASSED") - i += 1 - - -def test_leakyrelu_grad(N=None): - from activations import LeakyReLU - - N = np.inf if N is None else N - - i = 0 - while i < N: - n_ex = np.random.randint(1, 10) - n_dims = np.random.randint(1, 10) - alpha = np.random.uniform(0, 10) - z = random_tensor((n_ex, n_dims)) - - mine = LeakyReLU(alpha) - gold = torch_gradient_generator(F.leaky_relu, negative_slope=alpha) - assert_almost_equal(mine.grad(z), gold(z), decimal=6) - print("PASSED") - i += 1 - - -def test_softplus_grad(N=None): - from activations import SoftPlus - - N = np.inf if N is None else N - - mine = SoftPlus() - gold = torch_gradient_generator(F.softplus) - - i = 0 - while i < N: - n_ex = np.random.randint(1, 100) - n_dims = np.random.randint(1, 100) - z = random_tensor((n_ex, n_dims), standardize=True) - assert_almost_equal(mine.grad(z), gold(z)) - print("PASSED") - i += 1 - - -if __name__ == "__main__": - test_activations(N=50) diff --git a/numpy_ml/neural_nets/tests/__init__.py b/numpy_ml/neural_nets/tests/__init__.py deleted file mode 100644 index 73f8158..0000000 --- a/numpy_ml/neural_nets/tests/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -""" -A module of tests for many of the components in the neural_nets package. - -Note that many of the tests in this module rely on external packages like -PyTorch and Tensorflow for gold-standard implementations. -""" - -from .tests import * diff --git a/numpy_ml/neural_nets/tests/tests.py b/numpy_ml/neural_nets/tests/tests.py deleted file mode 100644 index bdfe950..0000000 --- a/numpy_ml/neural_nets/tests/tests.py +++ /dev/null @@ -1,2552 +0,0 @@ -# flake8: noqa -import time -from copy import deepcopy - -import numpy as np -from numpy.testing import assert_almost_equal - -from sklearn.metrics import log_loss, mean_squared_error - -# for testing sigmoid -from scipy.special import expit - -import torch -import torch.nn as nn -import torch.nn.functional as F - -from ..utils import calc_pad_dims_2D, conv2D_naive, conv2D, pad2D, pad1D -from ...utils.testing import ( - random_one_hot_matrix, - random_stochastic_matrix, - random_tensor, -) - -from .torch_models import ( - TFNCELoss, - WGAN_GP_tf, - torch_xe_grad, - torch_mse_grad, - TorchVAELoss, - TorchFCLayer, - TorchRNNCell, - TorchLSTMCell, - TorchAddLayer, - TorchWGANGPLoss, - TorchConv1DLayer, - TorchConv2DLayer, - TorchPool2DLayer, - TorchWavenetModule, - TorchMultiplyLayer, - TorchDeconv2DLayer, - TorchLayerNormLayer, - TorchBatchNormLayer, - TorchEmbeddingLayer, - TorchLinearActivation, - TorchSDPAttentionLayer, - TorchBidirectionalLSTM, - torch_gradient_generator, - TorchSkipConnectionConv, - TorchSkipConnectionIdentity, - TorchMultiHeadedAttentionModule, -) - -####################################################################### -# Debug Formatter # -####################################################################### - - -def err_fmt(params, golds, ix, warn_str=""): - mine, label = params[ix] - err_msg = "-" * 25 + " DEBUG " + "-" * 25 + "\n" - prev_mine, prev_label = params[max(ix - 1, 0)] - err_msg += "Mine (prev) [{}]:\n{}\n\nTheirs (prev) [{}]:\n{}".format( - prev_label, prev_mine, prev_label, golds[prev_label] - ) - err_msg += "\n\nMine [{}]:\n{}\n\nTheirs [{}]:\n{}".format( - label, mine, label, golds[label] - ) - err_msg += warn_str - err_msg += "\n" + "-" * 23 + " END DEBUG " + "-" * 23 - return err_msg 
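# A minimal, self-contained sketch of the gold-standard pattern the deleted
# tests below rely on: compute a gradient analytically in NumPy and compare it
# against a torch.autograd reference. The function name, shapes, and tolerance
# here are illustrative assumptions, not values taken from the test suite.
import numpy as np
import torch


def _sigmoid_grad_check(n_ex=5, n_dims=3):
    # shared random input for both the NumPy and PyTorch computations
    z = np.random.randn(n_ex, n_dims)

    # reference gradient via torch.autograd: d/dz sum(sigmoid(z))
    zt = torch.tensor(z, requires_grad=True)
    torch.sigmoid(zt).sum().backward()
    torch_grad = zt.grad.numpy()

    # analytic NumPy gradient: sigmoid'(z) = s * (1 - s)
    s = 1.0 / (1.0 + np.exp(-z))
    numpy_grad = s * (1.0 - s)

    # elementwise agreement (default is ~7 decimal places)
    np.testing.assert_almost_equal(numpy_grad, torch_grad)


# usage: _sigmoid_grad_check() raises AssertionError on disagreement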
- - -####################################################################### -# Test Suite # -####################################################################### - - -def test_everything(N=50): - test_losses(N=N) - test_activations(N=N) - test_layers(N=N) - test_utils(N=N) - test_modules(N=N) - - -def test_losses(N=50): - print("Testing SquaredError loss") - time.sleep(1) - test_squared_error(N) - test_squared_error_grad(N) - - print("Testing CrossEntropy loss") - time.sleep(1) - test_cross_entropy(N) - test_cross_entropy_grad(N) - - print("Testing VAELoss") - time.sleep(1) - test_VAE_loss(N) - - print("Testing WGAN_GPLoss") - time.sleep(1) - test_WGAN_GP_loss(N) - - print("Testing NCELoss") - time.sleep(1) - test_NCELoss(N) - - -def test_activations(N=50): - print("Testing Sigmoid activation") - time.sleep(1) - test_sigmoid_activation(N) - test_sigmoid_grad(N) - - print("Testing Softmax activation") - time.sleep(1) - test_softmax_activation(N) - test_softmax_grad(N) - - print("Testing Tanh activation") - time.sleep(1) - test_tanh_grad(N) - - print("Testing ReLU activation") - time.sleep(1) - test_relu_activation(N) - test_relu_grad(N) - - print("Testing ELU activation") - time.sleep(1) - test_elu_activation(N) - test_elu_grad(N) - - print("Testing SoftPlus activation") - time.sleep(1) - test_softplus_activation(N) - test_softplus_grad(N) - - -def test_layers(N=50): - print("Testing FullyConnected layer") - time.sleep(1) - test_FullyConnected(N) - - print("Testing Conv1D layer") - time.sleep(1) - test_Conv1D(N) - - print("Testing Conv2D layer") - time.sleep(1) - test_Conv2D(N) - - print("Testing Pool2D layer") - time.sleep(1) - test_Pool2D(N) - - print("Testing BatchNorm1D layer") - time.sleep(1) - test_BatchNorm1D(N) - - print("Testing BatchNorm2D layer") - time.sleep(1) - test_BatchNorm2D(N) - - print("Testing LayerNorm1D layer") - time.sleep(1) - test_LayerNorm1D(N) - - print("Testing LayerNorm2D layer") - time.sleep(1) - test_LayerNorm2D(N) - - print("Testing Deconv2D layer") - time.sleep(1) - test_Deconv2D(N) - - print("Testing Add layer") - time.sleep(1) - test_AddLayer(N) - - print("Testing Multiply layer") - time.sleep(1) - test_MultiplyLayer(N) - - print("Testing LSTMCell layer") - time.sleep(1) - test_LSTMCell(N) - - print("Testing RNNCell layer") - time.sleep(1) - test_RNNCell(N) - - print("Testing DotProductAttention layer") - time.sleep(1) - test_DPAttention(N) - - -def test_utils(N=50): - print("Testing pad1D util") - time.sleep(1) - test_pad1D(N) - - print("Testing conv2D util") - time.sleep(1) - test_conv(N) - - -def test_modules(N=50): - print("Testing MultiHeadedAttentionModule") - time.sleep(1) - test_MultiHeadedAttentionModule(N) - - print("Testing BidirectionalLSTM module") - time.sleep(1) - test_BidirectionalLSTM(N) - - print("Testing WaveNet module") - time.sleep(1) - test_WaveNetModule(N) - - print("Testing SkipConnectionIdentity module") - time.sleep(1) - test_SkipConnectionIdentityModule(N) - - print("Testing SkipConnectionConv module") - time.sleep(1) - test_SkipConnectionConvModule(N) - - -####################################################################### -# Loss Functions # -####################################################################### - - -def test_squared_error(N=None): - from ..losses import SquaredError - - N = np.inf if N is None else N - - mine = SquaredError() - gold = ( - lambda y, y_pred: mean_squared_error(y, y_pred) - * y_pred.shape[0] - * y_pred.shape[1] - * 0.5 - ) - - # ensure we get 0 when the two arrays are equal - n_dims = 
np.random.randint(2, 100) - n_examples = np.random.randint(1, 1000) - y = y_pred = random_tensor((n_examples, n_dims)) - assert_almost_equal(mine.loss(y, y_pred), gold(y, y_pred)) - print("PASSED") - - i = 1 - while i < N: - n_dims = np.random.randint(2, 100) - n_examples = np.random.randint(1, 1000) - y = random_tensor((n_examples, n_dims)) - y_pred = random_tensor((n_examples, n_dims)) - assert_almost_equal(mine.loss(y, y_pred), gold(y, y_pred), decimal=5) - print("PASSED") - i += 1 - - -def test_cross_entropy(N=None): - from ..losses import CrossEntropy - - N = np.inf if N is None else N - - mine = CrossEntropy() - gold = log_loss - - # ensure we get 0 when the two arrays are equal - n_classes = np.random.randint(2, 100) - n_examples = np.random.randint(1, 1000) - y = y_pred = random_one_hot_matrix(n_examples, n_classes) - assert_almost_equal(mine.loss(y, y_pred), gold(y, y_pred)) - print("PASSED") - - # test on random inputs - i = 1 - while i < N: - n_classes = np.random.randint(2, 100) - n_examples = np.random.randint(1, 1000) - y = random_one_hot_matrix(n_examples, n_classes) - y_pred = random_stochastic_matrix(n_examples, n_classes) - - assert_almost_equal(mine.loss(y, y_pred), gold(y, y_pred, normalize=False)) - print("PASSED") - i += 1 - - -def test_VAE_loss(N=None): - from ..losses import VAELoss - - N = np.inf if N is None else N - - i = 1 - while i < N: - n_ex = np.random.randint(1, 10) - t_dim = np.random.randint(2, 10) - t_mean = random_tensor([n_ex, t_dim], standardize=True) - t_log_var = np.log(np.abs(random_tensor([n_ex, t_dim], standardize=True))) - im_cols, im_rows = np.random.randint(2, 40), np.random.randint(2, 40) - X = np.random.rand(n_ex, im_rows * im_cols) - X_recon = np.random.rand(n_ex, im_rows * im_cols) - - mine = VAELoss() - mine_loss = mine(X, X_recon, t_mean, t_log_var) - dX_recon, dLogVar, dMean = mine.grad(X, X_recon, t_mean, t_log_var) - golds = TorchVAELoss().extract_grads(X, X_recon, t_mean, t_log_var) - - params = [ - (mine_loss, "loss"), - (dX_recon, "dX_recon"), - (dLogVar, "dt_log_var"), - (dMean, "dt_mean"), - ] - print("\nTrial {}".format(i)) - for ix, (mine, label) in enumerate(params): - np.testing.assert_allclose( - mine, - golds[label], - err_msg=err_fmt(params, golds, ix), - rtol=0.1, - atol=1e-2, - ) - print("\tPASSED {}".format(label)) - i += 1 - - -def test_WGAN_GP_loss(N=None): - from ..losses import WGAN_GPLoss - - N = np.inf if N is None else N - - i = 1 - while i < N: - lambda_ = np.random.randint(0, 10) - n_ex = np.random.randint(1, 10) - n_feats = np.random.randint(2, 10) - Y_real = random_tensor([n_ex], standardize=True) - Y_fake = random_tensor([n_ex], standardize=True) - gradInterp = random_tensor([n_ex, n_feats], standardize=True) - - mine = WGAN_GPLoss(lambda_=lambda_) - C_loss = mine(Y_fake, "C", Y_real, gradInterp) - G_loss = mine(Y_fake, "G") - - C_dY_fake, dY_real, dGradInterp = mine.grad(Y_fake, "C", Y_real, gradInterp) - G_dY_fake = mine.grad(Y_fake, "G") - - golds = TorchWGANGPLoss(lambda_).extract_grads(Y_real, Y_fake, gradInterp) - if np.isnan(golds["C_dGradInterp"]).any(): - continue - - params = [ - (Y_real, "Y_real"), - (Y_fake, "Y_fake"), - (gradInterp, "gradInterp"), - (C_loss, "C_loss"), - (G_loss, "G_loss"), - (-dY_real, "C_dY_real"), - (-C_dY_fake, "C_dY_fake"), - (dGradInterp, "C_dGradInterp"), - (G_dY_fake, "G_dY_fake"), - ] - - print("\nTrial {}".format(i)) - for ix, (mine, label) in enumerate(params): - np.testing.assert_allclose( - mine, - golds[label], - err_msg=err_fmt(params, golds, ix), - rtol=0.1, - 
atol=1e-2, - ) - print("\tPASSED {}".format(label)) - i += 1 - - -def test_NCELoss(N=None): - from ..losses import NCELoss - from numpy_ml.utils.data_structures import DiscreteSampler - - np.random.seed(12345) - - N = np.inf if N is None else N - - i = 1 - while i < N + 1: - n_ex = np.random.randint(1, 10) - n_c = np.random.randint(1, 10) - n_out = np.random.randint(1, 300) - vocab_size = np.random.randint(200, 1000) - num_negative_samples = np.random.randint(1, 10) - - embeddings = random_tensor((n_ex, n_c, n_out), standardize=True) - target = np.random.randint(0, vocab_size, (n_ex, 1)) - - probs = np.random.rand(vocab_size) - probs /= probs.sum() - - D = DiscreteSampler(probs, log=False, with_replacement=False) - NCE = NCELoss(vocab_size, D, num_negative_samples) - my_loss, _ = NCE(embeddings, target.flatten()) - - my_dLdX = NCE.grad(update_params=False) - my_dLdW = NCE.gradients["W"] - my_dLdb = NCE.gradients["b"] - - NCE.gradients["W"] = np.zeros_like(NCE.parameters["W"]) - NCE.gradients["b"] = np.zeros_like(NCE.parameters["b"]) - - MY_final_loss, TF_final_loss = 0, 0 - MY_dLdX, TF_dLdX = np.zeros_like(embeddings), np.zeros_like(embeddings) - TF_dLdW, TF_dLdb = ( - np.zeros_like(NCE.parameters["W"]), - np.zeros_like(NCE.parameters["b"]), - ) - - # XXX: instead of calculating the tf NCE on the entire batch, we - # calculate it per-example and then sum. this is really lame and should - # be changed to operate on batches. - nv = NCE.derived_variables["noise_samples"][0] - for ix, emb in enumerate(embeddings): - sv = (nv[0], np.array([nv[1][0, ix]]), nv[2]) - - NCE.X = [] - for k, v in NCE.derived_variables.items(): - NCE.derived_variables[k] = [] - - for k, v in NCE.gradients.items(): - NCE.gradients[k] = np.zeros_like(v) - - my, _ = NCE(emb[None, :, :], target[ix], neg_samples=sv[0]) - - NCE.derived_variables["noise_samples"] = [sv] - dldx = NCE.grad(update_params=False) - NCE.derived_variables["noise_samples"] = sv - - MY_final_loss += my - MY_dLdX[ix, ...] += np.squeeze(dldx, axis=0) - - TF_dict = TFNCELoss(emb, np.array([target[ix]]), NCE) - - TF_loss = TF_dict["final_loss"] - TF_final_loss += TF_loss - TF_dLdX[ix, ...] 
+= TF_dict["dLdX"] - TF_dLdW[TF_dict["dLdW"].indices, :] += TF_dict["dLdW"].values - TF_dLdb[:, TF_dict["dLdb"].indices] += TF_dict["dLdb"].values - - tf_dw = np.zeros_like(NCE.gradients["W"]) - tf_dw[TF_dict["dLdW"].indices, :] += TF_dict["dLdW"].values - - tf_db = np.zeros_like(NCE.gradients["b"]) - tf_db[:, TF_dict["dLdb"].indices] += TF_dict["dLdb"].values - - print("\nTrial {}".format(i)) - np.testing.assert_almost_equal(my_loss, TF_final_loss, decimal=3) - print("PASSED: final loss") - - maps = [ - ("dLdW", my_dLdW, TF_dLdW), - ("dLdb", my_dLdb, TF_dLdb), - ("dLdX", my_dLdX, TF_dLdX), - ] - for (ll, k1, k2) in maps: - np.testing.assert_almost_equal(k1, k2, decimal=2, err_msg=ll) - print("PASSED: {}".format(ll)) - - i += 1 - - -####################################################################### -# Loss Function Gradients # -####################################################################### - - -def test_squared_error_grad(N=None): - from ..losses import SquaredError - from ..activations import Tanh - - N = np.inf if N is None else N - - mine = SquaredError() - gold = torch_mse_grad - act = Tanh() - - i = 1 - while i < N: - n_dims = np.random.randint(2, 100) - n_examples = np.random.randint(1, 1000) - y = random_tensor((n_examples, n_dims)) - - # raw inputs - z = random_tensor((n_examples, n_dims)) - y_pred = act.fn(z) - - assert_almost_equal( - mine.grad(y, y_pred, z, act), 0.5 * gold(y, z, F.tanh), decimal=4 - ) - print("PASSED") - i += 1 - - -def test_cross_entropy_grad(N=None): - from ..losses import CrossEntropy - from ..layers import Softmax - - N = np.inf if N is None else N - - mine = CrossEntropy() - gold = torch_xe_grad - sm = Softmax() - - i = 1 - while i < N: - n_classes = np.random.randint(2, 100) - n_examples = np.random.randint(1, 1000) - - y = random_one_hot_matrix(n_examples, n_classes) - - # the cross_entropy_gradient returns the gradient wrt. 
z (NOT softmax(z)) - z = random_tensor((n_examples, n_classes)) - y_pred = sm.forward(z) - - assert_almost_equal(mine.grad(y, y_pred), gold(y, z), decimal=5) - print("PASSED") - i += 1 - - -####################################################################### -# Activations # -####################################################################### - - -def test_sigmoid_activation(N=None): - from ..activations import Sigmoid - - N = np.inf if N is None else N - - mine = Sigmoid() - gold = expit - - i = 0 - while i < N: - n_dims = np.random.randint(1, 100) - z = random_tensor((1, n_dims)) - assert_almost_equal(mine.fn(z), gold(z)) - print("PASSED") - i += 1 - - -def test_elu_activation(N=None): - from ..activations import ELU - - N = np.inf if N is None else N - - i = 0 - while i < N: - n_dims = np.random.randint(1, 10) - z = random_tensor((1, n_dims)) - - alpha = np.random.uniform(0, 10) - - mine = ELU(alpha) - gold = lambda z, a: F.elu(torch.from_numpy(z), alpha).numpy() - - assert_almost_equal(mine.fn(z), gold(z, alpha)) - print("PASSED") - i += 1 - - -def test_softmax_activation(N=None): - from ..layers import Softmax - - N = np.inf if N is None else N - - mine = Softmax() - gold = lambda z: F.softmax(torch.FloatTensor(z), dim=1).numpy() - - i = 0 - while i < N: - n_dims = np.random.randint(1, 100) - z = random_stochastic_matrix(1, n_dims) - assert_almost_equal(mine.forward(z), gold(z)) - print("PASSED") - i += 1 - - -def test_relu_activation(N=None): - from ..activations import ReLU - - N = np.inf if N is None else N - - mine = ReLU() - gold = lambda z: F.relu(torch.FloatTensor(z)).numpy() - - i = 0 - while i < N: - n_dims = np.random.randint(1, 100) - z = random_stochastic_matrix(1, n_dims) - assert_almost_equal(mine.fn(z), gold(z)) - print("PASSED") - i += 1 - - -def test_softplus_activation(N=None): - from ..activations import SoftPlus - - N = np.inf if N is None else N - - mine = SoftPlus() - gold = lambda z: F.softplus(torch.FloatTensor(z)).numpy() - - i = 0 - while i < N: - n_dims = np.random.randint(1, 100) - z = random_stochastic_matrix(1, n_dims) - assert_almost_equal(mine.fn(z), gold(z)) - print("PASSED") - i += 1 - - -####################################################################### -# Activation Gradients # -####################################################################### - - -def test_sigmoid_grad(N=None): - from ..activations import Sigmoid - - N = np.inf if N is None else N - - mine = Sigmoid() - gold = torch_gradient_generator(F.sigmoid) - - i = 0 - while i < N: - n_ex = np.random.randint(1, 100) - n_dims = np.random.randint(1, 100) - z = random_tensor((n_ex, n_dims)) - assert_almost_equal(mine.grad(z), gold(z)) - print("PASSED") - i += 1 - - -def test_elu_grad(N=None): - from ..activations import ELU - - N = np.inf if N is None else N - - i = 0 - while i < N: - n_ex = np.random.randint(1, 10) - n_dims = np.random.randint(1, 10) - alpha = np.random.uniform(0, 10) - z = random_tensor((n_ex, n_dims)) - - mine = ELU(alpha) - gold = torch_gradient_generator(F.elu, alpha=alpha) - assert_almost_equal(mine.grad(z), gold(z)) - print("PASSED") - i += 1 - - -def test_tanh_grad(N=None): - from ..activations import Tanh - - N = np.inf if N is None else N - - mine = Tanh() - gold = torch_gradient_generator(F.tanh) - - i = 0 - while i < N: - n_ex = np.random.randint(1, 100) - n_dims = np.random.randint(1, 100) - z = random_tensor((n_ex, n_dims)) - assert_almost_equal(mine.grad(z), gold(z)) - print("PASSED") - i += 1 - - -def test_relu_grad(N=None): - from ..activations 
import ReLU - - N = np.inf if N is None else N - - mine = ReLU() - gold = torch_gradient_generator(F.relu) - - i = 0 - while i < N: - n_ex = np.random.randint(1, 100) - n_dims = np.random.randint(1, 100) - z = random_tensor((n_ex, n_dims)) - assert_almost_equal(mine.grad(z), gold(z)) - print("PASSED") - i += 1 - - -def test_softmax_grad(N=None): - from ..layers import Softmax - from functools import partial - - np.random.seed(12345) - - N = np.inf if N is None else N - p_soft = partial(F.softmax, dim=1) - gold = torch_gradient_generator(p_soft) - - i = 0 - while i < N: - mine = Softmax() - n_ex = np.random.randint(1, 3) - n_dims = np.random.randint(1, 50) - z = random_tensor((n_ex, n_dims), standardize=True) - out = mine.forward(z) - - assert_almost_equal( - gold(z), - mine.backward(np.ones_like(out)), - err_msg="Theirs:\n{}\n\nMine:\n{}\n".format( - gold(z), mine.backward(np.ones_like(out)) - ), - decimal=3, - ) - print("PASSED") - i += 1 - - -def test_softplus_grad(N=None): - from ..activations import SoftPlus - - N = np.inf if N is None else N - - mine = SoftPlus() - gold = torch_gradient_generator(F.softplus) - - i = 0 - while i < N: - n_ex = np.random.randint(1, 100) - n_dims = np.random.randint(1, 100) - z = random_tensor((n_ex, n_dims), standardize=True) - assert_almost_equal(mine.grad(z), gold(z)) - print("PASSED") - i += 1 - - -####################################################################### -# Layers # -####################################################################### - - -def test_FullyConnected(N=None): - from ..layers import FullyConnected - from ..activations import Tanh, ReLU, Sigmoid, Affine - - N = np.inf if N is None else N - - acts = [ - (Tanh(), nn.Tanh(), "Tanh"), - (Sigmoid(), nn.Sigmoid(), "Sigmoid"), - (ReLU(), nn.ReLU(), "ReLU"), - (Affine(), TorchLinearActivation(), "Affine"), - ] - - i = 1 - while i < N + 1: - n_ex = np.random.randint(1, 100) - n_in = np.random.randint(1, 100) - n_out = np.random.randint(1, 100) - X = random_tensor((n_ex, n_in), standardize=True) - - # randomly select an activation function - act_fn, torch_fn, act_fn_name = acts[np.random.randint(0, len(acts))] - - # initialize FC layer - L1 = FullyConnected(n_out=n_out, act_fn=act_fn) - - # forward prop - y_pred = L1.forward(X) - - # backprop - dLdy = np.ones_like(y_pred) - dLdX = L1.backward(dLdy) - - # get gold standard gradients - gold_mod = TorchFCLayer(n_in, n_out, torch_fn, L1.parameters) - golds = gold_mod.extract_grads(X) - - params = [ - (L1.X[0], "X"), - (y_pred, "y"), - (L1.parameters["W"].T, "W"), - (L1.parameters["b"], "b"), - (dLdy, "dLdy"), - (L1.gradients["W"].T, "dLdW"), - (L1.gradients["b"], "dLdB"), - (dLdX, "dLdX"), - ] - - print("\nTrial {}\nact_fn={}".format(i, act_fn_name)) - for ix, (mine, label) in enumerate(params): - assert_almost_equal( - mine, golds[label], err_msg=err_fmt(params, golds, ix), decimal=3 - ) - print("\tPASSED {}".format(label)) - i += 1 - - -def test_Embedding(N=None): - from ..layers import Embedding - - N = np.inf if N is None else N - - i = 1 - while i < N + 1: - vocab_size = np.random.randint(1, 2000) - n_ex = np.random.randint(1, 100) - n_in = np.random.randint(1, 100) - emb_dim = np.random.randint(1, 100) - - X = np.random.randint(0, vocab_size, (n_ex, n_in)) - - # initialize Embedding layer - L1 = Embedding(n_out=emb_dim, vocab_size=vocab_size) - - # forward prop - y_pred = L1.forward(X) - - # backprop - dLdy = np.ones_like(y_pred) - # dLdX = L1.backward(dLdy) - L1.backward(dLdy) - - # get gold standard gradients - gold_mod = 
TorchEmbeddingLayer(vocab_size, emb_dim, L1.parameters) - golds = gold_mod.extract_grads(X) - - params = [ - (L1.X[0], "X"), - (y_pred, "y"), - (L1.parameters["W"], "W"), - (dLdy, "dLdy"), - (L1.gradients["W"], "dLdW"), - # (dLdX, "dLdX"), - ] - - print("\nTrial {}".format(i)) - for ix, (mine, label) in enumerate(params): - assert_almost_equal( - mine, golds[label], err_msg=err_fmt(params, golds, ix), decimal=3 - ) - print("\tPASSED {}".format(label)) - i += 1 - - -def test_BatchNorm1D(N=None): - from ..layers import BatchNorm1D - - N = np.inf if N is None else N - - np.random.seed(12345) - - i = 1 - while i < N + 1: - n_ex = np.random.randint(2, 1000) - n_in = np.random.randint(1, 1000) - X = random_tensor((n_ex, n_in), standardize=True) - - # initialize BatchNorm1D layer - L1 = BatchNorm1D() - - # forward prop - y_pred = L1.forward(X) - - # backprop - dLdy = np.ones_like(y_pred) - dLdX = L1.backward(dLdy) - - # get gold standard gradients - gold_mod = TorchBatchNormLayer( - n_in, L1.parameters, "1D", epsilon=L1.epsilon, momentum=L1.momentum - ) - golds = gold_mod.extract_grads(X) - - params = [ - (L1.X[0], "X"), - (y_pred, "y"), - (L1.parameters["scaler"].T, "scaler"), - (L1.parameters["intercept"], "intercept"), - (L1.parameters["running_mean"], "running_mean"), - # (L1.parameters["running_var"], "running_var"), - (L1.gradients["scaler"], "dLdScaler"), - (L1.gradients["intercept"], "dLdIntercept"), - (dLdX, "dLdX"), - ] - - print("Trial {}".format(i)) - for ix, (mine, label) in enumerate(params): - assert_almost_equal( - mine, golds[label], err_msg=err_fmt(params, golds, ix), decimal=1 - ) - print("\tPASSED {}".format(label)) - i += 1 - - -def test_LayerNorm1D(N=None): - from ..layers import LayerNorm1D - - N = np.inf if N is None else N - - np.random.seed(12345) - - i = 1 - while i < N + 1: - n_ex = np.random.randint(2, 1000) - n_in = np.random.randint(1, 1000) - X = random_tensor((n_ex, n_in), standardize=True) - - # initialize BatchNorm1D layer - L1 = LayerNorm1D() - - # forward prop - y_pred = L1.forward(X) - - # backprop - dLdy = np.ones_like(y_pred) - dLdX = L1.backward(dLdy) - - # get gold standard gradients - gold_mod = TorchLayerNormLayer(n_in, L1.parameters, "1D", epsilon=L1.epsilon) - golds = gold_mod.extract_grads(X) - - params = [ - (L1.X[0], "X"), - (y_pred, "y"), - (L1.parameters["scaler"].T, "scaler"), - (L1.parameters["intercept"], "intercept"), - (L1.gradients["scaler"], "dLdScaler"), - (L1.gradients["intercept"], "dLdIntercept"), - (dLdX, "dLdX"), - ] - - print("Trial {}".format(i)) - for ix, (mine, label) in enumerate(params): - assert_almost_equal( - mine, golds[label], err_msg=err_fmt(params, golds, ix), decimal=3 - ) - print("\tPASSED {}".format(label)) - i += 1 - - -def test_LayerNorm2D(N=None): - from ..layers import LayerNorm2D - - N = np.inf if N is None else N - - np.random.seed(12345) - - i = 1 - while i < N + 1: - n_ex = np.random.randint(2, 10) - in_rows = np.random.randint(1, 10) - in_cols = np.random.randint(1, 10) - n_in = np.random.randint(1, 3) - - # initialize LayerNorm2D layer - X = random_tensor((n_ex, in_rows, in_cols, n_in), standardize=True) - L1 = LayerNorm2D() - - # forward prop - y_pred = L1.forward(X) - - # standard sum loss - dLdy = np.ones_like(X) - dLdX = L1.backward(dLdy) - - # get gold standard gradients - gold_mod = TorchLayerNormLayer( - [n_in, in_rows, in_cols], L1.parameters, mode="2D", epsilon=L1.epsilon - ) - golds = gold_mod.extract_grads(X, Y_true=None) - - params = [ - (L1.X[0], "X"), - (L1.hyperparameters["epsilon"], 
"epsilon"), - (L1.parameters["scaler"], "scaler"), - (L1.parameters["intercept"], "intercept"), - (y_pred, "y"), - (L1.gradients["scaler"], "dLdScaler"), - (L1.gradients["intercept"], "dLdIntercept"), - (dLdX, "dLdX"), - ] - - print("Trial {}".format(i)) - for ix, (mine, label) in enumerate(params): - assert_almost_equal( - mine, golds[label], err_msg=err_fmt(params, golds, ix), decimal=3 - ) - - print("\tPASSED {}".format(label)) - - i += 1 - - -def test_MultiplyLayer(N=None): - from ..layers import Multiply - from ..activations import Tanh, ReLU, Sigmoid, Affine - - N = np.inf if N is None else N - - np.random.seed(12345) - - acts = [ - (Tanh(), nn.Tanh(), "Tanh"), - (Sigmoid(), nn.Sigmoid(), "Sigmoid"), - (ReLU(), nn.ReLU(), "ReLU"), - (Affine(), TorchLinearActivation(), "Affine"), - ] - - i = 1 - while i < N + 1: - Xs = [] - n_ex = np.random.randint(1, 100) - n_in = np.random.randint(1, 100) - n_entries = np.random.randint(2, 5) - for _ in range(n_entries): - Xs.append(random_tensor((n_ex, n_in), standardize=True)) - - act_fn, torch_fn, act_fn_name = acts[np.random.randint(0, len(acts))] - - # initialize Add layer - L1 = Multiply(act_fn) - - # forward prop - y_pred = L1.forward(Xs) - - # backprop - dLdy = np.ones_like(y_pred) - dLdXs = L1.backward(dLdy) - - # get gold standard gradients - gold_mod = TorchMultiplyLayer(torch_fn) - golds = gold_mod.extract_grads(Xs) - - params = [(Xs, "Xs"), (y_pred, "Y")] - params.extend( - [(dldxi, "dLdX{}".format(i + 1)) for i, dldxi in enumerate(dLdXs)] - ) - - print("\nTrial {}".format(i)) - print("n_ex={}, n_in={}".format(n_ex, n_in)) - print("n_entries={}, act_fn={}".format(n_entries, str(act_fn))) - for ix, (mine, label) in enumerate(params): - assert_almost_equal( - mine, golds[label], err_msg=err_fmt(params, golds, ix), decimal=1 - ) - print("\tPASSED {}".format(label)) - i += 1 - - -def test_AddLayer(N=None): - from ..layers import Add - from ..activations import Tanh, ReLU, Sigmoid, Affine - - N = np.inf if N is None else N - - np.random.seed(12345) - - acts = [ - (Tanh(), nn.Tanh(), "Tanh"), - (Sigmoid(), nn.Sigmoid(), "Sigmoid"), - (ReLU(), nn.ReLU(), "ReLU"), - (Affine(), TorchLinearActivation(), "Affine"), - ] - - i = 1 - while i < N + 1: - Xs = [] - n_ex = np.random.randint(1, 100) - n_in = np.random.randint(1, 100) - n_entries = np.random.randint(2, 5) - for _ in range(n_entries): - Xs.append(random_tensor((n_ex, n_in), standardize=True)) - - act_fn, torch_fn, act_fn_name = acts[np.random.randint(0, len(acts))] - - # initialize Add layer - L1 = Add(act_fn) - - # forward prop - y_pred = L1.forward(Xs) - - # backprop - dLdy = np.ones_like(y_pred) - dLdXs = L1.backward(dLdy) - - # get gold standard gradients - gold_mod = TorchAddLayer(torch_fn) - golds = gold_mod.extract_grads(Xs) - - params = [(Xs, "Xs"), (y_pred, "Y")] - params.extend( - [(dldxi, "dLdX{}".format(i + 1)) for i, dldxi in enumerate(dLdXs)] - ) - - print("\nTrial {}".format(i)) - print("n_ex={}, n_in={}".format(n_ex, n_in)) - print("n_entries={}, act_fn={}".format(n_entries, str(act_fn))) - for ix, (mine, label) in enumerate(params): - assert_almost_equal( - mine, golds[label], err_msg=err_fmt(params, golds, ix), decimal=1 - ) - print("\tPASSED {}".format(label)) - i += 1 - - -def test_BatchNorm2D(N=None): - from ..layers import BatchNorm2D - - N = np.inf if N is None else N - - np.random.seed(12345) - - i = 1 - while i < N + 1: - n_ex = np.random.randint(2, 10) - in_rows = np.random.randint(1, 10) - in_cols = np.random.randint(1, 10) - n_in = np.random.randint(1, 3) - - # 
initialize BatchNorm2D layer - X = random_tensor((n_ex, in_rows, in_cols, n_in), standardize=True) - L1 = BatchNorm2D() - - # forward prop - y_pred = L1.forward(X) - - # standard sum loss - dLdy = np.ones_like(X) - dLdX = L1.backward(dLdy) - - # get gold standard gradients - gold_mod = TorchBatchNormLayer( - n_in, L1.parameters, mode="2D", epsilon=L1.epsilon, momentum=L1.momentum - ) - golds = gold_mod.extract_grads(X, Y_true=None) - - params = [ - (L1.X[0], "X"), - (L1.hyperparameters["momentum"], "momentum"), - (L1.hyperparameters["epsilon"], "epsilon"), - (L1.parameters["scaler"].T, "scaler"), - (L1.parameters["intercept"], "intercept"), - (L1.parameters["running_mean"], "running_mean"), - # (L1.parameters["running_var"], "running_var"), - (y_pred, "y"), - (L1.gradients["scaler"], "dLdScaler"), - (L1.gradients["intercept"], "dLdIntercept"), - (dLdX, "dLdX"), - ] - - print("Trial {}".format(i)) - for ix, (mine, label) in enumerate(params): - assert_almost_equal( - mine, golds[label], err_msg=err_fmt(params, golds, ix), decimal=3 - ) - - print("\tPASSED {}".format(label)) - - i += 1 - - -def test_RNNCell(N=None): - from ..layers import RNNCell - - N = np.inf if N is None else N - - np.random.seed(12345) - - i = 1 - while i < N + 1: - n_ex = np.random.randint(1, 10) - n_in = np.random.randint(1, 10) - n_out = np.random.randint(1, 10) - n_t = np.random.randint(1, 10) - X = random_tensor((n_ex, n_in, n_t), standardize=True) - - # initialize RNN layer - L1 = RNNCell(n_out=n_out) - - # forward prop - y_preds = [] - for t in range(n_t): - y_pred = L1.forward(X[:, :, t]) - y_preds += [y_pred] - - # backprop - dLdX = [] - dLdAt = np.ones_like(y_preds[t]) - for t in reversed(range(n_t)): - dLdXt = L1.backward(dLdAt) - dLdX.insert(0, dLdXt) - dLdX = np.dstack(dLdX) - - # get gold standard gradients - gold_mod = TorchRNNCell(n_in, n_out, L1.parameters) - golds = gold_mod.extract_grads(X) - - params = [ - (X, "X"), - (np.array(y_preds), "y"), - (L1.parameters["ba"].T, "ba"), - (L1.parameters["bx"].T, "bx"), - (L1.parameters["Wax"].T, "Wax"), - (L1.parameters["Waa"].T, "Waa"), - (L1.gradients["ba"].T, "dLdBa"), - (L1.gradients["bx"].T, "dLdBx"), - (L1.gradients["Wax"].T, "dLdWax"), - (L1.gradients["Waa"].T, "dLdWaa"), - (dLdX, "dLdX"), - ] - - print("Trial {}".format(i)) - for ix, (mine, label) in enumerate(params): - np.testing.assert_allclose( - mine, - golds[label], - err_msg=err_fmt(params, golds, ix), - atol=1e-3, - rtol=1e-3, - ) - print("\tPASSED {}".format(label)) - i += 1 - - -def test_Conv2D(N=None): - from ..layers import Conv2D - from ..activations import Tanh, ReLU, Sigmoid, Affine - - N = np.inf if N is None else N - - np.random.seed(12345) - - acts = [ - (Tanh(), nn.Tanh(), "Tanh"), - (Sigmoid(), nn.Sigmoid(), "Sigmoid"), - (ReLU(), nn.ReLU(), "ReLU"), - (Affine(), TorchLinearActivation(), "Affine"), - ] - - i = 1 - while i < N + 1: - n_ex = np.random.randint(1, 10) - in_rows = np.random.randint(1, 10) - in_cols = np.random.randint(1, 10) - n_in, n_out = np.random.randint(1, 3), np.random.randint(1, 3) - f_shape = ( - min(in_rows, np.random.randint(1, 5)), - min(in_cols, np.random.randint(1, 5)), - ) - p, s = np.random.randint(0, 5), np.random.randint(1, 3) - d = np.random.randint(0, 5) - - fr, fc = f_shape[0] * (d + 1) - d, f_shape[1] * (d + 1) - d - out_rows = int(1 + (in_rows + 2 * p - fr) / s) - out_cols = int(1 + (in_cols + 2 * p - fc) / s) - - if out_rows <= 0 or out_cols <= 0: - continue - - X = random_tensor((n_ex, in_rows, in_cols, n_in), standardize=True) - - # randomly select 
an activation function - act_fn, torch_fn, act_fn_name = acts[np.random.randint(0, len(acts))] - - # initialize Conv2D layer - L1 = Conv2D( - out_ch=n_out, - kernel_shape=f_shape, - act_fn=act_fn, - pad=p, - stride=s, - dilation=d, - ) - - # forward prop - y_pred = L1.forward(X) - - # backprop - dLdy = np.ones_like(y_pred) - dLdX = L1.backward(dLdy) - - # get gold standard gradients - gold_mod = TorchConv2DLayer( - n_in, n_out, torch_fn, L1.parameters, L1.hyperparameters - ) - golds = gold_mod.extract_grads(X) - - params = [ - (L1.X[0], "X"), - (y_pred, "y"), - (L1.parameters["W"], "W"), - (L1.parameters["b"], "b"), - (L1.gradients["W"], "dLdW"), - (L1.gradients["b"], "dLdB"), - (dLdX, "dLdX"), - ] - - print("\nTrial {}".format(i)) - print("pad={}, stride={}, f_shape={}, n_ex={}".format(p, s, f_shape, n_ex)) - print("in_rows={}, in_cols={}, n_in={}".format(in_rows, in_cols, n_in)) - print("out_rows={}, out_cols={}, n_out={}".format(out_rows, out_cols, n_out)) - print("dilation={}".format(d)) - for ix, (mine, label) in enumerate(params): - assert_almost_equal( - mine, golds[label], err_msg=err_fmt(params, golds, ix), decimal=4 - ) - print("\tPASSED {}".format(label)) - i += 1 - - -def test_DPAttention(N=None): - from ..layers import DotProductAttention - - N = np.inf if N is None else N - - np.random.seed(12345) - - i = 1 - while i < N + 1: - n_ex = np.random.randint(1, 10) - d_k = np.random.randint(1, 100) - d_v = np.random.randint(1, 100) - - Q = random_tensor((n_ex, d_k), standardize=True) - K = random_tensor((n_ex, d_k), standardize=True) - V = random_tensor((n_ex, d_v), standardize=True) - - # initialize DotProductAttention layer - mine = DotProductAttention(scale=True, dropout_p=0) - - # forward prop - y_pred = mine.forward(Q, K, V) - - # backprop - dLdy = np.ones_like(y_pred) - dLdQ, dLdK, dLdV = mine.backward(dLdy) - - # get gold standard gradients - gold_mod = TorchSDPAttentionLayer() - golds = gold_mod.extract_grads(Q, K, V) - - params = [ - (mine.X[0][0], "Q"), - (mine.X[0][1], "K"), - (mine.X[0][2], "V"), - (y_pred, "Y"), - (dLdV, "dLdV"), - (dLdK, "dLdK"), - (dLdQ, "dLdQ"), - ] - - print("\nTrial {}".format(i)) - print("n_ex={} d_k={} d_v={}".format(n_ex, d_k, d_v)) - for ix, (mine, label) in enumerate(params): - assert_almost_equal( - mine, golds[label], err_msg=err_fmt(params, golds, ix), decimal=4 - ) - print("\tPASSED {}".format(label)) - i += 1 - - -def test_Conv1D(N=None): - from ..layers import Conv1D - from ..activations import Tanh, ReLU, Sigmoid, Affine - - N = np.inf if N is None else N - - np.random.seed(12345) - - acts = [ - (Tanh(), nn.Tanh(), "Tanh"), - (Sigmoid(), nn.Sigmoid(), "Sigmoid"), - (ReLU(), nn.ReLU(), "ReLU"), - (Affine(), TorchLinearActivation(), "Affine"), - ] - - i = 1 - while i < N + 1: - n_ex = np.random.randint(1, 10) - l_in = np.random.randint(1, 10) - n_in, n_out = np.random.randint(1, 3), np.random.randint(1, 3) - f_width = min(l_in, np.random.randint(1, 5)) - p, s = np.random.randint(0, 5), np.random.randint(1, 3) - d = np.random.randint(0, 5) - - fc = f_width * (d + 1) - d - l_out = int(1 + (l_in + 2 * p - fc) / s) - - if l_out <= 0: - continue - - X = random_tensor((n_ex, l_in, n_in), standardize=True) - - # randomly select an activation function - act_fn, torch_fn, act_fn_name = acts[np.random.randint(0, len(acts))] - - # initialize Conv2D layer - L1 = Conv1D( - out_ch=n_out, - kernel_width=f_width, - act_fn=act_fn, - pad=p, - stride=s, - dilation=d, - ) - - # forward prop - y_pred = L1.forward(X) - - # backprop - dLdy = 
np.ones_like(y_pred) - dLdX = L1.backward(dLdy) - - # get gold standard gradients - gold_mod = TorchConv1DLayer( - n_in, n_out, torch_fn, L1.parameters, L1.hyperparameters - ) - golds = gold_mod.extract_grads(X) - - params = [ - (L1.X[0], "X"), - (y_pred, "y"), - (L1.parameters["W"], "W"), - (L1.parameters["b"], "b"), - (L1.gradients["W"], "dLdW"), - (L1.gradients["b"], "dLdB"), - (dLdX, "dLdX"), - ] - - print("\nTrial {}".format(i)) - print("pad={}, stride={}, f_width={}, n_ex={}".format(p, s, f_width, n_ex)) - print("l_in={}, n_in={}".format(l_in, n_in)) - print("l_out={}, n_out={}".format(l_out, n_out)) - print("dilation={}".format(d)) - for ix, (mine, label) in enumerate(params): - assert_almost_equal( - mine, golds[label], err_msg=err_fmt(params, golds, ix), decimal=4 - ) - print("\tPASSED {}".format(label)) - i += 1 - - -def test_Deconv2D(N=None): - from ..layers import Deconv2D - from ..activations import Tanh, ReLU, Sigmoid, Affine - - N = np.inf if N is None else N - - np.random.seed(12345) - - acts = [ - (Tanh(), nn.Tanh(), "Tanh"), - (Sigmoid(), nn.Sigmoid(), "Sigmoid"), - (ReLU(), nn.ReLU(), "ReLU"), - (Affine(), TorchLinearActivation(), "Affine"), - ] - - i = 1 - while i < N + 1: - n_ex = np.random.randint(1, 10) - in_rows = np.random.randint(1, 10) - in_cols = np.random.randint(1, 10) - n_in, n_out = np.random.randint(1, 3), np.random.randint(1, 3) - f_shape = ( - min(in_rows, np.random.randint(1, 5)), - min(in_cols, np.random.randint(1, 5)), - ) - p, s = np.random.randint(0, 5), np.random.randint(1, 3) - - out_rows = s * (in_rows - 1) - 2 * p + f_shape[0] - out_cols = s * (in_cols - 1) - 2 * p + f_shape[1] - - if out_rows <= 0 or out_cols <= 0: - continue - - X = random_tensor((n_ex, in_rows, in_cols, n_in), standardize=True) - - # randomly select an activation function - act_fn, torch_fn, act_fn_name = acts[np.random.randint(0, len(acts))] - - # initialize Deconv2D layer - L1 = Deconv2D( - out_ch=n_out, kernel_shape=f_shape, act_fn=act_fn, pad=p, stride=s - ) - - # forward prop - try: - y_pred = L1.forward(X) - except ValueError: - print("Improper dimensions; retrying") - continue - - # backprop - dLdy = np.ones_like(y_pred) - dLdX = L1.backward(dLdy) - - # get gold standard gradients - gold_mod = TorchDeconv2DLayer( - n_in, n_out, torch_fn, L1.parameters, L1.hyperparameters - ) - golds = gold_mod.extract_grads(X) - - params = [ - (L1.X[0], "X"), - (L1.parameters["W"], "W"), - (L1.parameters["b"], "b"), - (y_pred, "y"), - (L1.gradients["W"], "dLdW"), - (L1.gradients["b"], "dLdB"), - (dLdX, "dLdX"), - ] - - print("\nTrial {}".format(i)) - print("pad={}, stride={}, f_shape={}, n_ex={}".format(p, s, f_shape, n_ex)) - print("in_rows={}, in_cols={}, n_in={}".format(in_rows, in_cols, n_in)) - print("out_rows={}, out_cols={}, n_out={}".format(out_rows, out_cols, n_out)) - for ix, (mine, label) in enumerate(params): - assert_almost_equal( - mine, golds[label], err_msg=err_fmt(params, golds, ix), decimal=4 - ) - print("\tPASSED {}".format(label)) - i += 1 - - -def test_Pool2D(N=None): - from ..layers import Pool2D - - N = np.inf if N is None else N - - np.random.seed(12345) - - i = 1 - while i < N + 1: - n_ex = np.random.randint(1, 10) - in_rows = np.random.randint(1, 10) - in_cols = np.random.randint(1, 10) - n_in = np.random.randint(1, 3) - f_shape = ( - min(in_rows, np.random.randint(1, 5)), - min(in_cols, np.random.randint(1, 5)), - ) - p, s = np.random.randint(0, max(1, min(f_shape) // 2)), np.random.randint(1, 3) - # mode = ["max", "average"][np.random.randint(0, 2)] - mode = 
"average" - out_rows = int(1 + (in_rows + 2 * p - f_shape[0]) / s) - out_cols = int(1 + (in_cols + 2 * p - f_shape[1]) / s) - - X = random_tensor((n_ex, in_rows, in_cols, n_in), standardize=True) - print("\nmode: {}".format(mode)) - print("pad={}, stride={}, f_shape={}, n_ex={}".format(p, s, f_shape, n_ex)) - print("in_rows={}, in_cols={}, n_in={}".format(in_rows, in_cols, n_in)) - print("out_rows={}, out_cols={}, n_out={}".format(out_rows, out_cols, n_in)) - - # initialize Pool2D layer - L1 = Pool2D(kernel_shape=f_shape, pad=p, stride=s, mode=mode) - - # forward prop - y_pred = L1.forward(X) - - # backprop - dLdy = np.ones_like(y_pred) - dLdX = L1.backward(dLdy) - - # get gold standard gradients - gold_mod = TorchPool2DLayer(n_in, L1.hyperparameters) - golds = gold_mod.extract_grads(X) - - params = [(L1.X[0], "X"), (y_pred, "y"), (dLdX, "dLdX")] - for ix, (mine, label) in enumerate(params): - assert_almost_equal( - mine, golds[label], err_msg=err_fmt(params, golds, ix), decimal=4 - ) - print("\tPASSED {}".format(label)) - i += 1 - - -def test_LSTMCell(N=None): - from ..layers import LSTMCell - - N = np.inf if N is None else N - - np.random.seed(12345) - - i = 1 - while i < N + 1: - n_ex = np.random.randint(1, 10) - n_in = np.random.randint(1, 10) - n_out = np.random.randint(1, 10) - n_t = np.random.randint(1, 10) - X = random_tensor((n_ex, n_in, n_t), standardize=True) - - # initialize LSTM layer - L1 = LSTMCell(n_out=n_out) - - # forward prop - Cs = [] - y_preds = [] - for t in range(n_t): - y_pred, Ct = L1.forward(X[:, :, t]) - y_preds.append(y_pred) - Cs.append(Ct) - - # backprop - dLdX = [] - dLdAt = np.ones_like(y_preds[t]) - for t in reversed(range(n_t)): - dLdXt = L1.backward(dLdAt) - dLdX.insert(0, dLdXt) - dLdX = np.dstack(dLdX) - y_preds = np.dstack(y_preds) - Cs = np.array(Cs) - - # get gold standard gradients - gold_mod = TorchLSTMCell(n_in, n_out, L1.parameters) - golds = gold_mod.extract_grads(X) - - params = [ - (X, "X"), - (np.array(Cs), "C"), - (y_preds, "y"), - (L1.parameters["bo"].T, "bo"), - (L1.parameters["bu"].T, "bu"), - (L1.parameters["bf"].T, "bf"), - (L1.parameters["bc"].T, "bc"), - (L1.parameters["Wo"], "Wo"), - (L1.parameters["Wu"], "Wu"), - (L1.parameters["Wf"], "Wf"), - (L1.parameters["Wc"], "Wc"), - (L1.gradients["bo"].T, "dLdBo"), - (L1.gradients["bu"].T, "dLdBu"), - (L1.gradients["bf"].T, "dLdBf"), - (L1.gradients["bc"].T, "dLdBc"), - (L1.gradients["Wo"], "dLdWo"), - (L1.gradients["Wu"], "dLdWu"), - (L1.gradients["Wf"], "dLdWf"), - (L1.gradients["Wc"], "dLdWc"), - (dLdX, "dLdX"), - ] - - print("Case {}".format(i)) - for ix, (mine, label) in enumerate(params): - np.testing.assert_allclose( - mine, - golds[label], - err_msg=err_fmt(params, golds, ix), - atol=1e-4, - rtol=1e-4, - ) - - print("\tPASSED {}".format(label)) - i += 1 - - -def grad_check_RNN(model, loss_func, param_name, n_t, X, epsilon=1e-7): - """ - Manual gradient calc for vanilla RNN parameters - """ - if param_name in ["Ba", "Bx"]: - param_name = param_name.lower() - elif param_name in ["X", "y"]: - return None - - param_orig = model.parameters[param_name] - model.flush_gradients() - grads = np.zeros_like(param_orig) - - for flat_ix, val in enumerate(param_orig.flat): - param = deepcopy(param_orig) - md_ix = np.unravel_index(flat_ix, param.shape) - - # plus - y_preds_plus = [] - param[md_ix] = val + epsilon - model.parameters[param_name] = param - for t in range(n_t): - y_pred_plus = model.forward(X[:, :, t]) - y_preds_plus += [y_pred_plus] - loss_plus = loss_func(y_preds_plus) - 
model.flush_gradients() - - # minus - y_preds_minus = [] - param[md_ix] = val - epsilon - model.parameters[param_name] = param - for t in range(n_t): - y_pred_minus = model.forward(X[:, :, t]) - y_preds_minus += [y_pred_minus] - loss_minus = loss_func(y_preds_minus) - model.flush_gradients() - - grad = (loss_plus - loss_minus) / (2 * epsilon) - grads[md_ix] = grad - return grads.T - - -####################################################################### -# Modules # -####################################################################### - - -def test_MultiHeadedAttentionModule(N=None): - from ..modules import MultiHeadedAttentionModule - - N = np.inf if N is None else N - np.random.seed(12345) - - i = 1 - while i < N + 1: - n_ex = np.random.randint(1, 10) - latent_dim = np.random.randint(1, 20) - n_heads = np.random.randint(2, 10) - d_k = d_v = n_heads * latent_dim - - Q = random_tensor((n_ex, d_k), standardize=True) - K = random_tensor((n_ex, d_k), standardize=True) - V = random_tensor((n_ex, d_v), standardize=True) - - mine = MultiHeadedAttentionModule(n_heads=n_heads, dropout_p=0) - - # forward prop - y_pred = mine.forward(Q, K, V) - - # backprop - dLdy = np.ones_like(y_pred) - dLdQ, dLdK, dLdV = mine.backward(dLdy) - - # get gold standard gradients - params = mine.parameters - hparams = mine.hyperparameters - gold_mod = TorchMultiHeadedAttentionModule(params, hparams) - golds = gold_mod.extract_grads(Q, K, V) - - dv = mine.derived_variables - params = mine.parameters["components"] - grads = mine.gradients["components"] - params = [ - (Q, "Q"), - (K, "K"), - (V, "V"), - (mine.n_heads, "n_heads"), - (mine.latent_dim, "latent_dim"), - (params["O"]["W"], "O_W"), - (params["K"]["W"], "K_W"), - (params["V"]["W"], "V_W"), - (params["Q"]["W"], "Q_W"), - (params["O"]["b"], "O_b"), - (params["K"]["b"], "K_b"), - (params["V"]["b"], "V_b"), - (params["Q"]["b"], "Q_b"), - (dv["Q_proj"], "Q_proj"), - (dv["K_proj"], "K_proj"), - (dv["V_proj"], "V_proj"), - (dv["attention_weights"][0], "weights"), - (dv["attention_out"], "attn_out"), - (y_pred, "Y"), - (dLdy, "dLdy"), - (dv["dQ_proj"], "dQ_proj"), - (dv["dK_proj"], "dK_proj"), - (dv["dV_proj"], "dV_proj"), - (grads["O"]["W"], "dO_W"), - (grads["V"]["W"], "dV_W"), - (grads["K"]["W"], "dK_W"), - (grads["Q"]["W"], "dQ_W"), - (grads["O"]["b"], "dO_b"), - (grads["V"]["b"], "dV_b"), - (grads["K"]["b"], "dK_b"), - (grads["Q"]["b"], "dQ_b"), - (dLdQ, "dQ"), - (dLdK, "dK"), - (dLdV, "dV"), - ] - - print("\nTrial {}".format(i)) - print( - "n_ex={} d_k=d_v={} latent_dim={} n_heads={}".format( - n_ex, d_k, latent_dim, n_heads - ) - ) - for ix, (mine, label) in enumerate(params): - assert_almost_equal( - mine, golds[label], err_msg=err_fmt(params, golds, ix), decimal=4 - ) - print("\tPASSED {}".format(label)) - i += 1 - - -def test_SkipConnectionIdentityModule(N=None): - from ..modules import SkipConnectionIdentityModule - from ..activations import Tanh, ReLU, Sigmoid, Affine - - N = np.inf if N is None else N - - np.random.seed(12345) - - acts = [ - (Tanh(), nn.Tanh(), "Tanh"), - (Sigmoid(), nn.Sigmoid(), "Sigmoid"), - (ReLU(), nn.ReLU(), "ReLU"), - (Affine(), TorchLinearActivation(), "Affine"), - ] - - i = 1 - while i < N + 1: - n_ex = np.random.randint(2, 10) - in_rows = np.random.randint(2, 25) - in_cols = np.random.randint(2, 25) - n_in = np.random.randint(2, 5) - n_out = n_in - f_shape1 = ( - min(in_rows, np.random.randint(1, 5)), - min(in_cols, np.random.randint(1, 5)), - ) - f_shape2 = ( - min(in_rows, np.random.randint(1, 5)), - min(in_cols, 
np.random.randint(1, 5)), - ) - s1 = np.random.randint(1, 5) - s2 = np.random.randint(1, 5) - - # randomly select an activation function - act_fn, torch_fn, act_fn_name = acts[np.random.randint(0, len(acts))] - - X = random_tensor((n_ex, in_rows, in_cols, n_in), standardize=True) - - p1 = calc_pad_dims_2D(X.shape, X.shape[1:3], f_shape1, s1) - if p1[0] != p1[1] or p1[2] != p1[3]: - continue - - p2 = calc_pad_dims_2D(X.shape, X.shape[1:3], f_shape2, s2) - if p2[0] != p2[1] or p2[2] != p2[3]: - continue - - p1 = (p1[0], p1[2]) - p2 = (p2[0], p2[2]) - - # initialize SkipConnectionIdentity module - L1 = SkipConnectionIdentityModule( - out_ch=n_out, - kernel_shape1=f_shape1, - kernel_shape2=f_shape2, - stride1=s1, - stride2=s2, - act_fn=act_fn, - epsilon=1e-5, - momentum=0.9, - ) - - # forward prop - y_pred = L1.forward(X) - - # backprop - dLdy = np.ones_like(y_pred) - dLdX = L1.backward(dLdy) - - # get gold standard gradients - gold_mod = TorchSkipConnectionIdentity( - torch_fn, - p1, - p2, - L1.parameters, - L1.hyperparameters, - momentum=L1.momentum, - epsilon=L1.epsilon, - ) - golds = gold_mod.extract_grads(X) - - params = L1.parameters["components"] - grads = L1.gradients["components"] - params = [ - (X, "X"), - (params["conv1"]["W"], "conv1_W"), - (params["conv1"]["b"], "conv1_b"), - (params["batchnorm1"]["scaler"].T, "bn1_scaler"), - (params["batchnorm1"]["intercept"], "bn1_intercept"), - (params["batchnorm1"]["running_mean"], "bn1_running_mean"), - # (params["batchnorm1"]["running_var"], "bn1_running_var"), - (params["conv2"]["W"], "conv2_W"), - (params["conv2"]["b"], "conv2_b"), - (params["batchnorm2"]["scaler"].T, "bn2_scaler"), - (params["batchnorm2"]["intercept"], "bn2_intercept"), - (params["batchnorm2"]["running_mean"], "bn2_running_mean"), - # (params["batchnorm2"]["running_var"], "bn2_running_var"), - (L1._dv["conv1_out"], "act1_out"), - (L1._dv["batchnorm1_out"], "bn1_out"), - (L1._dv["conv2_out"], "conv2_out"), - (L1._dv["batchnorm2_out"], "bn2_out"), - (y_pred, "Y"), - (dLdy, "dLdY"), - (L1.derived_variables["dLdBn2"], "dLdBn2_out"), - (L1.derived_variables["dLdConv2"], "dLdConv2_out"), - (L1.derived_variables["dLdBn1"], "dLdBn1_out"), - (L1.derived_variables["dLdConv1"], "dLdActFn1_out"), - (dLdX, "dLdX"), - (grads["batchnorm2"]["scaler"].T, "dLdBn2_scaler"), - (grads["batchnorm2"]["intercept"], "dLdBn2_intercept"), - (grads["conv2"]["W"], "dLdConv2_W"), - (grads["conv2"]["b"], "dLdConv2_b"), - (grads["batchnorm1"]["scaler"].T, "dLdBn1_scaler"), - (grads["batchnorm1"]["intercept"], "dLdBn1_intercept"), - (grads["conv1"]["W"], "dLdConv1_W"), - (grads["conv1"]["b"], "dLdConv1_b"), - ] - - print("\nTrial {}".format(i)) - print("act_fn={}, n_ex={}".format(act_fn, n_ex)) - print("in_rows={}, in_cols={}, n_in={}".format(in_rows, in_cols, n_in)) - print("pad1={}, stride1={}, f_shape1={}".format(p1, s1, f_shape1)) - print("pad2={}, stride2={}, f_shape2={}".format(p2, s2, f_shape2)) - for ix, (mine, label) in enumerate(params): - assert_almost_equal( - mine, golds[label], err_msg=err_fmt(params, golds, ix), decimal=2 - ) - print("\tPASSED {}".format(label)) - i += 1 - - -def test_SkipConnectionConvModule(N=None): - from ..modules import SkipConnectionConvModule - from ..activations import Tanh, ReLU, Sigmoid, Affine - - N = np.inf if N is None else N - - np.random.seed(12345) - - acts = [ - (Tanh(), nn.Tanh(), "Tanh"), - (Sigmoid(), nn.Sigmoid(), "Sigmoid"), - (ReLU(), nn.ReLU(), "ReLU"), - (Affine(), TorchLinearActivation(), "Affine"), - ] - - i = 1 - while i < N + 1: - n_ex = 
np.random.randint(2, 10) - in_rows = np.random.randint(2, 10) - in_cols = np.random.randint(2, 10) - n_in = np.random.randint(2, 5) - n_out1 = np.random.randint(2, 5) - n_out2 = np.random.randint(2, 5) - f_shape1 = ( - min(in_rows, np.random.randint(1, 5)), - min(in_cols, np.random.randint(1, 5)), - ) - f_shape2 = ( - min(in_rows, np.random.randint(1, 5)), - min(in_cols, np.random.randint(1, 5)), - ) - f_shape_skip = ( - min(in_rows, np.random.randint(1, 5)), - min(in_cols, np.random.randint(1, 5)), - ) - - s1 = np.random.randint(1, 5) - s2 = np.random.randint(1, 5) - s_skip = np.random.randint(1, 5) - - # randomly select an activation function - act_fn, torch_fn, act_fn_name = acts[np.random.randint(0, len(acts))] - - X = random_tensor((n_ex, in_rows, in_cols, n_in), standardize=True) - - p1 = (np.random.randint(1, 5), np.random.randint(1, 5)) - p2 = (np.random.randint(1, 5), np.random.randint(1, 5)) - - # initialize SkipConnectionConv module - L1 = SkipConnectionConvModule( - out_ch1=n_out1, - out_ch2=n_out2, - kernel_shape1=f_shape1, - kernel_shape2=f_shape2, - kernel_shape_skip=f_shape_skip, - stride1=s1, - stride2=s2, - stride_skip=s_skip, - pad1=p1, - pad2=p2, - act_fn=act_fn, - epsilon=1e-5, - momentum=0.9, - ) - - # forward prop - try: - y_pred = L1.forward(X) - except (ValueError, AssertionError): - print("Invalid padding; Retrying") - continue - - ps = L1.hyperparameters["pad_skip"] - if ps[0] != ps[1] or ps[2] != ps[3]: - continue - pad_skip = (ps[0], ps[2]) - - # backprop - dLdy = np.ones_like(y_pred) - dLdX = L1.backward(dLdy) - - # get gold standard gradients - gold_mod = TorchSkipConnectionConv( - torch_fn, - p1, - p2, - pad_skip, - L1.parameters, - L1.hyperparameters, - momentum=L1.momentum, - epsilon=L1.epsilon, - ) - golds = gold_mod.extract_grads(X) - - params = L1.parameters["components"] - grads = L1.gradients["components"] - params = [ - (X, "X"), - (params["conv1"]["W"], "conv1_W"), - (params["conv1"]["b"], "conv1_b"), - (params["batchnorm1"]["scaler"].T, "bn1_scaler"), - (params["batchnorm1"]["intercept"], "bn1_intercept"), - (params["batchnorm1"]["running_mean"], "bn1_running_mean"), - # (params["batchnorm1"]["running_var"], "bn1_running_var"), - (params["conv2"]["W"], "conv2_W"), - (params["conv2"]["b"], "conv2_b"), - (params["batchnorm2"]["scaler"].T, "bn2_scaler"), - (params["batchnorm2"]["intercept"], "bn2_intercept"), - (params["batchnorm2"]["running_mean"], "bn2_running_mean"), - # (params["batchnorm2"]["running_var"], "bn2_running_var"), - (params["conv_skip"]["W"], "conv_skip_W"), - (params["conv_skip"]["b"], "conv_skip_b"), - (params["batchnorm_skip"]["scaler"].T, "bn_skip_scaler"), - (params["batchnorm_skip"]["intercept"], "bn_skip_intercept"), - (params["batchnorm_skip"]["running_mean"], "bn_skip_running_mean"), - # (params["batchnorm_skip"]["running_var"], "bn_skip_running_var"), - (L1._dv["conv1_out"], "act1_out"), - (L1._dv["batchnorm1_out"], "bn1_out"), - (L1._dv["conv2_out"], "conv2_out"), - (L1._dv["batchnorm2_out"], "bn2_out"), - (L1._dv["conv_skip_out"], "conv_skip_out"), - (L1._dv["batchnorm_skip_out"], "bn_skip_out"), - (y_pred, "Y"), - (dLdy, "dLdY"), - (L1.derived_variables["dLdBn2"], "dLdBn2_out"), - (L1.derived_variables["dLdConv2"], "dLdConv2_out"), - (L1.derived_variables["dLdBnSkip"], "dLdBnSkip_out"), - (L1.derived_variables["dLdConvSkip"], "dLdConvSkip_out"), - (L1.derived_variables["dLdBn1"], "dLdBn1_out"), - (L1.derived_variables["dLdConv1"], "dLdActFn1_out"), - (dLdX, "dLdX"), - (grads["batchnorm_skip"]["scaler"].T, 
"dLdBnSkip_scaler"), - (grads["batchnorm_skip"]["intercept"], "dLdBnSkip_intercept"), - (grads["conv_skip"]["W"], "dLdConvSkip_W"), - (grads["conv_skip"]["b"], "dLdConvSkip_b"), - (grads["batchnorm2"]["scaler"].T, "dLdBn2_scaler"), - (grads["batchnorm2"]["intercept"], "dLdBn2_intercept"), - (grads["conv2"]["W"], "dLdConv2_W"), - (grads["conv2"]["b"], "dLdConv2_b"), - (grads["batchnorm1"]["scaler"].T, "dLdBn1_scaler"), - (grads["batchnorm1"]["intercept"], "dLdBn1_intercept"), - (grads["conv1"]["W"], "dLdConv1_W"), - (grads["conv1"]["b"], "dLdConv1_b"), - ] - - print("\nTrial {}".format(i)) - print("act_fn={}, n_ex={}".format(act_fn, n_ex)) - print("in_rows={}, in_cols={}, n_in={}".format(in_rows, in_cols, n_in)) - print("pad1={}, stride1={}, f_shape1={}".format(p1, s1, f_shape1)) - print("pad2={}, stride2={}, f_shape2={}".format(p2, s2, f_shape2)) - print("stride_skip={}, f_shape_skip={}".format(s_skip, f_shape_skip)) - warn_str = ( - "\n[NOTE] The tests in this module can fail sometimes during " - "backprop due to the ReLU issue: while the difference in the forward pass " - "between z=-1e-9 and z=1e-9 is miniscule, the difference during the backward " - "pass is significant due to ReLU's kink about 0." - ) - for ix, (mine, label) in enumerate(params): - assert_almost_equal( - mine, - golds[label], - err_msg=err_fmt(params, golds, ix, warn_str), - decimal=2, - ) - print("\tPASSED {}".format(label)) - i += 1 - - -def test_BidirectionalLSTM(N=None): - from ..modules import BidirectionalLSTM - - N = np.inf if N is None else N - - np.random.seed(12345) - - i = 1 - while i < N + 1: - n_ex = np.random.randint(1, 10) - n_in = np.random.randint(1, 10) - n_out = np.random.randint(1, 10) - n_t = np.random.randint(1, 10) - X = random_tensor((n_ex, n_in, n_t), standardize=True) - - # initialize LSTM layer - L1 = BidirectionalLSTM(n_out=n_out) - - # forward prop - y_pred = L1.forward(X) - - # backprop - dLdA = np.ones_like(y_pred) - dLdX = L1.backward(dLdA) - - # get gold standard gradients - gold_mod = TorchBidirectionalLSTM(n_in, n_out, L1.parameters) - golds = gold_mod.extract_grads(X) - - pms, grads = L1.parameters["components"], L1.gradients["components"] - params = [ - (X, "X"), - (y_pred, "y"), - (pms["cell_fwd"]["bo"].T, "bo_f"), - (pms["cell_fwd"]["bu"].T, "bu_f"), - (pms["cell_fwd"]["bf"].T, "bf_f"), - (pms["cell_fwd"]["bc"].T, "bc_f"), - (pms["cell_fwd"]["Wo"], "Wo_f"), - (pms["cell_fwd"]["Wu"], "Wu_f"), - (pms["cell_fwd"]["Wf"], "Wf_f"), - (pms["cell_fwd"]["Wc"], "Wc_f"), - (pms["cell_bwd"]["bo"].T, "bo_b"), - (pms["cell_bwd"]["bu"].T, "bu_b"), - (pms["cell_bwd"]["bf"].T, "bf_b"), - (pms["cell_bwd"]["bc"].T, "bc_b"), - (pms["cell_bwd"]["Wo"], "Wo_b"), - (pms["cell_bwd"]["Wu"], "Wu_b"), - (pms["cell_bwd"]["Wf"], "Wf_b"), - (pms["cell_bwd"]["Wc"], "Wc_b"), - (grads["cell_fwd"]["bo"].T, "dLdBo_f"), - (grads["cell_fwd"]["bu"].T, "dLdBu_f"), - (grads["cell_fwd"]["bf"].T, "dLdBf_f"), - (grads["cell_fwd"]["bc"].T, "dLdBc_f"), - (grads["cell_fwd"]["Wo"], "dLdWo_f"), - (grads["cell_fwd"]["Wu"], "dLdWu_f"), - (grads["cell_fwd"]["Wf"], "dLdWf_f"), - (grads["cell_fwd"]["Wc"], "dLdWc_f"), - (grads["cell_bwd"]["bo"].T, "dLdBo_b"), - (grads["cell_bwd"]["bu"].T, "dLdBu_b"), - (grads["cell_bwd"]["bf"].T, "dLdBf_b"), - (grads["cell_bwd"]["bc"].T, "dLdBc_b"), - (grads["cell_bwd"]["Wo"], "dLdWo_b"), - (grads["cell_bwd"]["Wu"], "dLdWu_b"), - (grads["cell_bwd"]["Wf"], "dLdWf_b"), - (grads["cell_bwd"]["Wc"], "dLdWc_b"), - (dLdX, "dLdX"), - ] - - print("Case {}".format(i)) - for ix, (mine, label) in 
enumerate(params): - np.testing.assert_allclose( - mine, - golds[label], - err_msg=err_fmt(params, golds, ix), - atol=1e-4, - rtol=1e-4, - ) - - print("\tPASSED {}".format(label)) - i += 1 - - -def test_WaveNetModule(N=None): - from ..modules import WavenetResidualModule - - N = np.inf if N is None else N - - np.random.seed(12345) - - i = 1 - while i < N + 1: - n_ex = np.random.randint(1, 10) - l_in = np.random.randint(1, 10) - ch_residual, ch_dilation = np.random.randint(1, 5), np.random.randint(1, 5) - f_width = min(l_in, np.random.randint(1, 5)) - d = np.random.randint(0, 5) - - X_main = np.zeros_like( - random_tensor((n_ex, l_in, ch_residual), standardize=True) - ) - X_main[0][0][0] = 1.0 - X_skip = np.zeros_like( - random_tensor((n_ex, l_in, ch_residual), standardize=True) - ) - - # initialize Conv2D layer - L1 = WavenetResidualModule( - ch_residual=ch_residual, - ch_dilation=ch_dilation, - kernel_width=f_width, - dilation=d, - ) - - # forward prop - Y_main, Y_skip = L1.forward(X_main, X_skip) - - # backprop - dLdY_skip = np.ones_like(Y_skip) - dLdY_main = np.ones_like(Y_main) - dLdX_main, dLdX_skip = L1.backward(dLdY_skip, dLdY_main) - - _, conv_1x1_pad = pad1D( - L1._dv["multiply_gate_out"], "same", kernel_width=1, stride=1, dilation=0 - ) - if conv_1x1_pad[0] != conv_1x1_pad[1]: - print("Skipping") - continue - - conv_1x1_pad = conv_1x1_pad[0] - - # get gold standard gradients - gold_mod = TorchWavenetModule(L1.parameters, L1.hyperparameters, conv_1x1_pad) - golds = gold_mod.extract_grads(X_main, X_skip) - - dv = L1.derived_variables - pc = L1.parameters["components"] - gr = L1.gradients["components"] - - params = [ - (L1.X_main, "X_main"), - (L1.X_skip, "X_skip"), - (pc["conv_dilation"]["W"], "conv_dilation_W"), - (pc["conv_dilation"]["b"], "conv_dilation_b"), - (pc["conv_1x1"]["W"], "conv_1x1_W"), - (pc["conv_1x1"]["b"], "conv_1x1_b"), - (dv["conv_dilation_out"], "conv_dilation_out"), - (dv["tanh_out"], "tanh_out"), - (dv["sigm_out"], "sigm_out"), - (dv["multiply_gate_out"], "multiply_gate_out"), - (dv["conv_1x1_out"], "conv_1x1_out"), - (Y_main, "Y_main"), - (Y_skip, "Y_skip"), - (dLdY_skip, "dLdY_skip"), - (dLdY_main, "dLdY_main"), - (dv["dLdConv_1x1"], "dLdConv_1x1_out"), - (gr["conv_1x1"]["W"], "dLdConv_1x1_W"), - (gr["conv_1x1"]["b"], "dLdConv_1x1_b"), - (dv["dLdMultiply"], "dLdMultiply_out"), - (dv["dLdTanh"], "dLdTanh_out"), - (dv["dLdSigmoid"], "dLdSigm_out"), - (dv["dLdConv_dilation"], "dLdConv_dilation_out"), - (gr["conv_dilation"]["W"], "dLdConv_dilation_W"), - (gr["conv_dilation"]["b"], "dLdConv_dilation_b"), - (dLdX_main, "dLdX_main"), - (dLdX_skip, "dLdX_skip"), - ] - - print("\nTrial {}".format(i)) - print("f_width={}, n_ex={}".format(f_width, n_ex)) - print("l_in={}, ch_residual={}".format(l_in, ch_residual)) - print("ch_dilation={} dilation={}".format(ch_dilation, d)) - for ix, (mine, label) in enumerate(params): - assert_almost_equal( - mine, golds[label], err_msg=err_fmt(params, golds, ix), decimal=4 - ) - print("\tPASSED {}".format(label)) - i += 1 - - -####################################################################### -# Utils # -####################################################################### - - -def test_pad1D(N=None): - from ..layers import Conv1D - from .torch_models import TorchCausalConv1d, torchify - - N = np.inf if N is None else N - - i = 1 - while i < N + 1: - p = np.random.choice(["same", "causal"]) - n_ex = np.random.randint(1, 10) - l_in = np.random.randint(1, 10) - n_in, n_out = np.random.randint(1, 3), np.random.randint(1, 3) - 
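# A minimal standalone sketch of the "causal" padding scheme exercised in this
# test (assumes stride == 1; the layer and shapes below are illustrative):
# pad by (kernel_width - 1) * dilation via symmetric Conv1d padding, then trim
# the same amount from the right so each output step only sees current and
# past inputs while the output length matches the input length.
import torch
import torch.nn as nn

k, d = 3, 2
pad = (k - 1) * d
conv = nn.Conv1d(in_channels=1, out_channels=1, kernel_size=k, padding=pad, dilation=d)
x = torch.randn(1, 1, 10)           # (N, C, W)
y = conv(x)[:, :, :-pad]            # drop the trailing padding
assert y.shape == x.shape           # length preserved when stride == 1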
f_width = min(l_in, np.random.randint(1, 5)) - s = np.random.randint(1, 3) - d = np.random.randint(0, 5) - - X = random_tensor((n_ex, l_in, n_in), standardize=True) - X_pad, _ = pad1D(X, p, kernel_width=f_width, stride=s, dilation=d) - - # initialize Conv2D layer - L1 = Conv1D(out_ch=n_out, kernel_width=f_width, pad=0, stride=s, dilation=d) - - # forward prop - try: - y_pred = L1.forward(X_pad) - except ValueError: - continue - - # ignore n. output channels - print("Trial {}".format(i)) - print("p={} d={} s={} l_in={} f_width={}".format(p, d, s, l_in, f_width)) - print("n_ex={} n_in={} n_out={}".format(n_ex, n_in, n_out)) - assert y_pred.shape[:2] == X.shape[:2], print( - "y_pred.shape={} X.shape={}".format(y_pred.shape, X.shape) - ) - - if p == "causal": - gold = TorchCausalConv1d( - in_channels=n_in, - out_channels=n_out, - kernel_size=f_width, - stride=s, - dilation=d + 1, - bias=True, - ) - if s != 1: - print( - "TorchCausalConv1D does not do `same` padding for stride > 1. Skipping" - ) - continue - - XT = torchify(np.moveaxis(X, [0, 1, 2], [0, -1, -2])) - else: - gold = nn.Conv1d( - in_channels=n_in, - out_channels=n_out, - kernel_size=f_width, - padding=0, - stride=s, - dilation=d + 1, - bias=True, - ) - XT = torchify(np.moveaxis(X_pad, [0, 1, 2], [0, -1, -2])) - - # import weights and biases - # (f[0], n_in, n_out) -> (n_out, n_in, f[0]) - b = L1.parameters["b"] - W = np.moveaxis(L1.parameters["W"], [0, 1, 2], [-1, -2, -3]) - assert gold.weight.shape == W.shape - assert gold.bias.shape == b.flatten().shape - - gold.weight = nn.Parameter(torch.FloatTensor(W)) - gold.bias = nn.Parameter(torch.FloatTensor(b.flatten())) - - outT = gold(XT) - if outT.ndimension() == 2: - import ipdb - - ipdb.set_trace() - - gold_out = np.moveaxis(outT.detach().numpy(), [0, 1, 2], [0, -1, -2]) - assert gold_out.shape[:2] == X.shape[:2] - - np.testing.assert_almost_equal( - y_pred, - gold_out, - err_msg=err_fmt( - [(y_pred.shape, "out.shape"), (y_pred, "out")], - {"out.shape": gold_out.shape, "out": gold_out}, - 1, - ), - decimal=4, - ) - print("PASSED\n") - i += 1 - - -def test_conv(N=None): - N = np.inf if N is None else N - i = 0 - while i < N: - n_ex = np.random.randint(2, 15) - in_rows = np.random.randint(2, 15) - in_cols = np.random.randint(2, 15) - in_ch = np.random.randint(2, 15) - out_ch = np.random.randint(2, 15) - f_shape = ( - min(in_rows, np.random.randint(2, 10)), - min(in_cols, np.random.randint(2, 10)), - ) - s = np.random.randint(1, 3) - p = np.random.randint(0, 5) - - X = np.random.rand(n_ex, in_rows, in_cols, in_ch) - X_pad, p = pad2D(X, p) - W = np.random.randn(f_shape[0], f_shape[1], in_ch, out_ch) - - gold = conv2D_naive(X, W, s, p) - mine = conv2D(X, W, s, p) - - np.testing.assert_almost_equal(mine, gold) - print("PASSED") - i += 1 - - -####################################################################### -# Models # -####################################################################### - - -def test_VAE(): - # for testing - from keras.datasets import mnist - from ..models.vae import BernoulliVAE - - (X_train, y_train), (X_test, y_test) = mnist.load_data() - - # scale pixel intensities to [0, 1] - X_train = np.expand_dims(X_train.astype("float32") / 255.0, 3) - X_test = np.expand_dims(X_test.astype("float32") / 255.0, 3) - - X_train = X_train[: 128 * 10] - - BV = BernoulliVAE() - BV.fit(X_train, verbose=True) - - -def test_WGAN_GP(N=1): - from ..models.wgan_gp import WGAN_GP - - ss = np.random.randint(0, 1000) - np.random.seed(ss) - - N = np.inf if N is None else N - - i = 1 - 
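# A minimal sketch of the quantities this test compares against the TensorFlow
# reference (tensor shapes here are illustrative): the critic loss is
# E[C(x_fake)] - E[C(x_real)] plus a gradient penalty of
# lambda * E[(||grad_interp||_2 - 1)^2] computed per example, and the
# generator loss is -E[C(x_fake)].
import torch

lambda_ = 10.0
Y_real, Y_fake = torch.randn(8), torch.randn(8)     # critic outputs
grad_interp = torch.randn(8, 16)                    # dC/dx at interpolated points
norm = grad_interp.norm(2, dim=1)                   # per-example L2 norm
gp = lambda_ * ((norm - 1) ** 2).mean()
C_loss = Y_fake.mean() - Y_real.mean() + gp
G_loss = -Y_fake.mean()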
while i < N + 1: - c_updates_per_epoch, n_steps = 1, 1 - n_ex = np.random.randint(1, 500) - n_in = np.random.randint(1, 100) - lambda_ = np.random.randint(0, 20) - g_hidden = np.random.randint(2, 500) - X = random_tensor((n_ex, n_in), standardize=True) - - # initialize WGAN_GP model - L1 = WGAN_GP(g_hidden=g_hidden, debug=True) - - # forward prop - batchsize = n_ex - L1.fit( - X, - lambda_=lambda_, - c_updates_per_epoch=c_updates_per_epoch, - n_steps=n_steps, - batchsize=batchsize, - ) - - # backprop - dv = L1.derived_variables - params = L1.parameters["components"] - grads = L1.gradients["components"] - params["noise"] = dv["noise"] - params["alpha"] = dv["alpha"] - params["n_in"] = n_in - params["g_hidden"] = g_hidden - params["c_updates_per_epoch"] = c_updates_per_epoch - params["n_steps"] = n_steps - - # get gold standard gradients - golds = WGAN_GP_tf(X, lambda_=lambda_, batch_size=batchsize, params=params) - - params = [ - (dv["X_real"], "X_real"), - (params["generator"]["FC1"]["W"], "G_weights_FC1"), - (params["generator"]["FC2"]["W"], "G_weights_FC2"), - (params["generator"]["FC3"]["W"], "G_weights_FC3"), - (params["generator"]["FC4"]["W"], "G_weights_FC4"), - (dv["G_fwd_X_fake"]["FC1"], "G_fwd_X_fake_FC1"), - (dv["G_fwd_X_fake"]["FC2"], "G_fwd_X_fake_FC2"), - (dv["G_fwd_X_fake"]["FC3"], "G_fwd_X_fake_FC3"), - (dv["G_fwd_X_fake"]["FC4"], "G_fwd_X_fake_FC4"), - (dv["X_fake"], "X_fake"), - (dv["X_interp"], "X_interp"), - (params["critic"]["FC1"]["W"], "C_weights_Y_real_FC1"), - (params["critic"]["FC2"]["W"], "C_weights_Y_real_FC2"), - (params["critic"]["FC3"]["W"], "C_weights_Y_real_FC3"), - (params["critic"]["FC4"]["W"], "C_weights_Y_real_FC4"), - (dv["C_fwd_Y_real"]["FC1"], "C_fwd_Y_real_FC1"), - (dv["C_fwd_Y_real"]["FC2"], "C_fwd_Y_real_FC2"), - (dv["C_fwd_Y_real"]["FC3"], "C_fwd_Y_real_FC3"), - (dv["C_fwd_Y_real"]["FC4"], "C_fwd_Y_real_FC4"), - (dv["Y_real"].flatten(), "Y_real"), - (params["critic"]["FC1"]["W"], "C_weights_Y_fake_FC1"), - (params["critic"]["FC2"]["W"], "C_weights_Y_fake_FC2"), - (params["critic"]["FC3"]["W"], "C_weights_Y_fake_FC3"), - (params["critic"]["FC4"]["W"], "C_weights_Y_fake_FC4"), - (dv["C_fwd_Y_fake"]["FC1"], "C_fwd_Y_fake_FC1"), - (dv["C_fwd_Y_fake"]["FC2"], "C_fwd_Y_fake_FC2"), - (dv["C_fwd_Y_fake"]["FC3"], "C_fwd_Y_fake_FC3"), - (dv["C_fwd_Y_fake"]["FC4"], "C_fwd_Y_fake_FC4"), - (dv["Y_fake"].flatten(), "Y_fake"), - (params["critic"]["FC1"]["W"], "C_weights_Y_interp_FC1"), - (params["critic"]["FC2"]["W"], "C_weights_Y_interp_FC2"), - (params["critic"]["FC3"]["W"], "C_weights_Y_interp_FC3"), - (params["critic"]["FC4"]["W"], "C_weights_Y_interp_FC4"), - (dv["C_fwd_Y_interp"]["FC1"], "C_fwd_Y_interp_FC1"), - (dv["C_fwd_Y_interp"]["FC2"], "C_fwd_Y_interp_FC2"), - (dv["C_fwd_Y_interp"]["FC3"], "C_fwd_Y_interp_FC3"), - (dv["C_fwd_Y_interp"]["FC4"], "C_fwd_Y_interp_FC4"), - (dv["Y_interp"].flatten(), "Y_interp"), - (dv["C_dY_interp_wrt"]["FC4"], "dY_interp_wrt_FC4"), - (dv["C_dY_interp_wrt"]["FC3"], "dY_interp_wrt_FC3"), - (dv["C_dY_interp_wrt"]["FC2"], "dY_interp_wrt_FC2"), - (dv["C_dY_interp_wrt"]["FC1"], "dY_interp_wrt_FC1"), - (dv["gradInterp"], "gradInterp"), - (dv["C_loss"], "C_loss"), - (dv["G_loss"], "G_loss"), - (grads["critic"]["FC1"]["W"], "dC_loss_dW_FC1"), - (grads["critic"]["FC1"]["b"].flatten(), "dC_loss_db_FC1"), - (grads["critic"]["FC2"]["W"], "dC_loss_dW_FC2"), - (grads["critic"]["FC2"]["b"].flatten(), "dC_loss_db_FC2"), - (grads["critic"]["FC3"]["W"], "dC_loss_dW_FC3"), - (grads["critic"]["FC3"]["b"].flatten(), "dC_loss_db_FC3"), - 
(grads["critic"]["FC4"]["W"], "dC_loss_dW_FC4"), - (grads["critic"]["FC4"]["b"].flatten(), "dC_loss_db_FC4"), - (dv["dG_Y_fake"].flatten(), "dG_Y_fake"), - (dv["dY_real"].flatten(), "dC_Y_real"), - (dv["dC_Y_fake"].flatten(), "dC_Y_fake"), - (dv["dGrad_interp"], "dC_gradInterp"), - ] - - print("\nTrial {}".format(i)) - print("Seed: {} g_hidden={}".format(ss, g_hidden)) - print("lambda={} n_ex={} n_in={}".format(lambda_, n_ex, n_in)) - print( - "c_updates_per_epoch={}, n_steps={} batchsize={}".format( - c_updates_per_epoch, n_steps, batchsize - ) - ) - - for ix, (mine, label) in enumerate(params): - np.testing.assert_almost_equal( - mine, golds[label], err_msg=err_fmt(params, golds, ix), decimal=3 - ) - print("\tPASSED {}".format(label)) - i += 1 diff --git a/numpy_ml/neural_nets/tests/torch_models.py b/numpy_ml/neural_nets/tests/torch_models.py deleted file mode 100644 index 8de9212..0000000 --- a/numpy_ml/neural_nets/tests/torch_models.py +++ /dev/null @@ -1,2276 +0,0 @@ -import torch -import torch.nn as nn -import torch.nn.functional as F - -import tensorflow as tf - -import numpy as np - -####################################################################### -# Gold-standard implementations for testing custom layers # -# (Requires Pytorch) # -####################################################################### - - -def torchify(var, requires_grad=True): - return torch.autograd.Variable(torch.FloatTensor(var), requires_grad=requires_grad) - - -def torch_gradient_generator(fn, **kwargs): - def get_grad(z): - z1 = torch.autograd.Variable(torch.FloatTensor(z), requires_grad=True) - z2 = fn(z1, **kwargs).sum() - z2.backward() - grad = z1.grad.numpy() - return grad - - return get_grad - - -def torch_xe_grad(y, z): - z = torch.autograd.Variable(torch.FloatTensor(z), requires_grad=True) - y = torch.LongTensor(y.argmax(axis=1)) - loss = F.cross_entropy(z, y, size_average=False).sum() - loss.backward() - grad = z.grad.numpy() - return grad - - -def torch_mse_grad(y, z, act_fn): - y = torch.FloatTensor(y) - z = torch.autograd.Variable(torch.FloatTensor(z), requires_grad=True) - y_pred = act_fn(z) - loss = F.mse_loss(y_pred, y, size_average=False).sum() - loss.backward() - grad = z.grad.numpy() - return grad - - -class TorchVAELoss(nn.Module): - def __init__(self): - super(TorchVAELoss, self).__init__() - - def extract_grads(self, X, X_recon, t_mean, t_log_var): - eps = np.finfo(float).eps - X = torchify(X, requires_grad=False) - X_recon = torchify(np.clip(X_recon, eps, 1 - eps)) - t_mean = torchify(t_mean) - t_log_var = torchify(t_log_var) - - BCE = torch.sum(F.binary_cross_entropy(X_recon, X, reduce=False), dim=1) - - # see Appendix B from VAE paper: - # Kingma and Welling. Auto-Encoding Variational Bayes. 
ICLR, 2014 - # https://arxiv.org/abs/1312.6114 - # 0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2) - KLD = -0.5 * torch.sum(1 + t_log_var - t_mean.pow(2) - t_log_var.exp(), dim=1) - - loss = torch.mean(BCE + KLD) - loss.backward() - - grads = { - "loss": loss.detach().numpy(), - "dX_recon": X_recon.grad.numpy(), - "dt_mean": t_mean.grad.numpy(), - "dt_log_var": t_log_var.grad.numpy(), - } - return grads - - -class TorchWGANGPLoss(nn.Module): - def __init__(self, lambda_=10): - self.lambda_ = torchify([lambda_]) - super(TorchWGANGPLoss, self).__init__() - - def forward(self, Y_real, Y_fake, gradInterp): - GY_fake = Y_fake.copy() - self.Y_real = torchify(Y_real) - self.Y_fake = torchify(Y_fake) - self.GY_fake = torchify(GY_fake) - self.gradInterp = torchify(gradInterp) - - # calc grad penalty - norm = self.gradInterp.norm(2, dim=1) - self.norm1 = torch.sqrt(torch.sum(self.gradInterp.pow(2), dim=1)) - assert torch.allclose(norm, self.norm1) - - self.gpenalty = self.lambda_ * ((self.norm1 - 1).pow(2)).mean() - self.C_loss = self.Y_fake.mean() - self.Y_real.mean() + self.gpenalty - self.G_loss = -self.GY_fake.mean() - - def extract_grads(self, Y_real, Y_fake, gradInterp): - self.forward(Y_real, Y_fake, gradInterp) - - self.C_loss.backward() - self.G_loss.backward() - - grads = { - "Y_real": self.Y_real.detach().numpy(), - "Y_fake": self.Y_fake.detach().numpy(), - "gradInterp": self.gradInterp.detach().numpy(), - "GP": self.gpenalty.detach().numpy(), - "C_loss": self.C_loss.detach().numpy(), - "G_loss": self.G_loss.detach().numpy(), - "C_dY_real": self.Y_real.grad.numpy(), - "C_dGradInterp": self.gradInterp.grad.numpy(), - "C_dY_fake": self.Y_fake.grad.numpy(), - "G_dY_fake": self.GY_fake.grad.numpy(), - } - return grads - - -class TorchLinearActivation(nn.Module): - def __init__(self): - super(TorchLinearActivation, self).__init__() - pass - - @staticmethod - def forward(input): - return input - - @staticmethod - def backward(grad_output): - return torch.ones_like(grad_output) - - -class TorchBatchNormLayer(nn.Module): - def __init__(self, n_in, params, mode, momentum=0.9, epsilon=1e-5): - super(TorchBatchNormLayer, self).__init__() - - scaler = params["scaler"] - intercept = params["intercept"] - - if mode == "1D": - self.layer1 = nn.BatchNorm1d( - num_features=n_in, momentum=1 - momentum, eps=epsilon, affine=True - ) - elif mode == "2D": - self.layer1 = nn.BatchNorm2d( - num_features=n_in, momentum=1 - momentum, eps=epsilon, affine=True - ) - - self.layer1.weight = nn.Parameter(torch.FloatTensor(scaler)) - self.layer1.bias = nn.Parameter(torch.FloatTensor(intercept)) - - def forward(self, X): - # (N, H, W, C) -> (N, C, H, W) - if X.ndim == 4: - X = np.moveaxis(X, [0, 1, 2, 3], [0, -2, -1, -3]) - - if not isinstance(X, torch.Tensor): - X = torchify(X) - - self.X = X - self.Y = self.layer1(self.X) - self.Y.retain_grad() - - def extract_grads(self, X, Y_true=None): - self.forward(X) - - if isinstance(Y_true, np.ndarray): - Y_true = np.moveaxis(Y_true, [0, 1, 2, 3], [0, -2, -1, -3]) - self.loss1 = ( - 0.5 * F.mse_loss(self.Y, torchify(Y_true), size_average=False).sum() - ) - else: - self.loss1 = self.Y.sum() - - self.loss1.backward() - - X_np = self.X.detach().numpy() - Y_np = self.Y.detach().numpy() - dX_np = self.X.grad.numpy() - dY_np = self.Y.grad.numpy() - - if self.X.dim() == 4: - orig, X_swap = [0, 1, 2, 3], [0, -1, -3, -2] - if isinstance(Y_true, np.ndarray): - Y_true = np.moveaxis(Y_true, orig, X_swap) - X_np = np.moveaxis(X_np, orig, X_swap) - Y_np = np.moveaxis(Y_np, orig, X_swap) - dX_np 
= np.moveaxis(dX_np, orig, X_swap) - dY_np = np.moveaxis(dY_np, orig, X_swap) - - grads = { - "loss": self.loss1.detach().numpy(), - "X": X_np, - "momentum": 1 - self.layer1.momentum, - "epsilon": self.layer1.eps, - "intercept": self.layer1.bias.detach().numpy(), - "scaler": self.layer1.weight.detach().numpy(), - "running_mean": self.layer1.running_mean.detach().numpy(), - "running_var": self.layer1.running_var.detach().numpy(), - "y": Y_np, - "dLdy": dY_np, - "dLdIntercept": self.layer1.bias.grad.numpy(), - "dLdScaler": self.layer1.weight.grad.numpy(), - "dLdX": dX_np, - } - if isinstance(Y_true, np.ndarray): - grads["Y_true"] = Y_true - return grads - - -class TorchLayerNormLayer(nn.Module): - def __init__(self, feat_dims, params, mode, epsilon=1e-5): - super(TorchLayerNormLayer, self).__init__() - - self.layer1 = nn.LayerNorm( - normalized_shape=feat_dims, eps=epsilon, elementwise_affine=True - ) - - scaler = params["scaler"] - intercept = params["intercept"] - - if mode == "2D": - scaler = np.moveaxis(scaler, [0, 1, 2], [-2, -1, -3]) - intercept = np.moveaxis(intercept, [0, 1, 2], [-2, -1, -3]) - - assert scaler.shape == self.layer1.weight.shape - assert intercept.shape == self.layer1.bias.shape - self.layer1.weight = nn.Parameter(torch.FloatTensor(scaler)) - self.layer1.bias = nn.Parameter(torch.FloatTensor(intercept)) - - def forward(self, X): - # (N, H, W, C) -> (N, C, H, W) - if X.ndim == 4: - X = np.moveaxis(X, [0, 1, 2, 3], [0, -2, -1, -3]) - - if not isinstance(X, torch.Tensor): - X = torchify(X) - - self.X = X - self.Y = self.layer1(self.X) - self.Y.retain_grad() - - def extract_grads(self, X, Y_true=None): - self.forward(X) - - if isinstance(Y_true, np.ndarray): - Y_true = np.moveaxis(Y_true, [0, 1, 2, 3], [0, -2, -1, -3]) - self.loss1 = ( - 0.5 * F.mse_loss(self.Y, torchify(Y_true), size_average=False).sum() - ) - else: - self.loss1 = self.Y.sum() - - self.loss1.backward() - - X_np = self.X.detach().numpy() - Y_np = self.Y.detach().numpy() - dX_np = self.X.grad.numpy() - dY_np = self.Y.grad.numpy() - intercept_np = self.layer1.bias.detach().numpy() - scaler_np = self.layer1.weight.detach().numpy() - dIntercept_np = self.layer1.bias.grad.numpy() - dScaler_np = self.layer1.weight.grad.numpy() - - if self.X.dim() == 4: - orig, X_swap = [0, 1, 2, 3], [0, -1, -3, -2] - orig_p, p_swap = [0, 1, 2], [-1, -3, -2] - if isinstance(Y_true, np.ndarray): - Y_true = np.moveaxis(Y_true, orig, X_swap) - X_np = np.moveaxis(X_np, orig, X_swap) - Y_np = np.moveaxis(Y_np, orig, X_swap) - dX_np = np.moveaxis(dX_np, orig, X_swap) - dY_np = np.moveaxis(dY_np, orig, X_swap) - scaler_np = np.moveaxis(scaler_np, orig_p, p_swap) - intercept_np = np.moveaxis(intercept_np, orig_p, p_swap) - dScaler_np = np.moveaxis(dScaler_np, orig_p, p_swap) - dIntercept_np = np.moveaxis(dIntercept_np, orig_p, p_swap) - - grads = { - "loss": self.loss1.detach().numpy(), - "X": X_np, - "epsilon": self.layer1.eps, - "intercept": intercept_np, - "scaler": scaler_np, - "y": Y_np, - "dLdy": dY_np, - "dLdIntercept": dIntercept_np, - "dLdScaler": dScaler_np, - "dLdX": dX_np, - } - if isinstance(Y_true, np.ndarray): - grads["Y_true"] = Y_true - return grads - - -class TorchAddLayer(nn.Module): - def __init__(self, act_fn, **kwargs): - super(TorchAddLayer, self).__init__() - self.act_fn = act_fn - - def forward(self, Xs): - self.Xs = [] - x = Xs[0].copy() - if not isinstance(x, torch.Tensor): - x = torchify(x) - - self.sum = x.clone() - x.retain_grad() - self.Xs.append(x) - - for i in range(1, len(Xs)): - x = Xs[i] - if not 
isinstance(x, torch.Tensor): - x = torchify(x) - - x.retain_grad() - self.Xs.append(x) - self.sum += x - - self.sum.retain_grad() - self.Y = self.act_fn(self.sum) - self.Y.retain_grad() - return self.Y - - def extract_grads(self, X): - self.forward(X) - self.loss = self.Y.sum() - self.loss.backward() - grads = { - "Xs": X, - "Sum": self.sum.detach().numpy(), - "Y": self.Y.detach().numpy(), - "dLdY": self.Y.grad.numpy(), - "dLdSum": self.sum.grad.numpy(), - } - grads.update( - {"dLdX{}".format(i + 1): xi.grad.numpy() for i, xi in enumerate(self.Xs)} - ) - return grads - - -class TorchMultiplyLayer(nn.Module): - def __init__(self, act_fn, **kwargs): - super(TorchMultiplyLayer, self).__init__() - self.act_fn = act_fn - - def forward(self, Xs): - self.Xs = [] - x = Xs[0].copy() - if not isinstance(x, torch.Tensor): - x = torchify(x) - - self.prod = x.clone() - x.retain_grad() - self.Xs.append(x) - - for i in range(1, len(Xs)): - x = Xs[i] - if not isinstance(x, torch.Tensor): - x = torchify(x) - - x.retain_grad() - self.Xs.append(x) - self.prod *= x - - self.prod.retain_grad() - self.Y = self.act_fn(self.prod) - self.Y.retain_grad() - return self.Y - - def extract_grads(self, X): - self.forward(X) - self.loss = self.Y.sum() - self.loss.backward() - grads = { - "Xs": X, - "Prod": self.prod.detach().numpy(), - "Y": self.Y.detach().numpy(), - "dLdY": self.Y.grad.numpy(), - "dLdProd": self.prod.grad.numpy(), - } - grads.update( - {"dLdX{}".format(i + 1): xi.grad.numpy() for i, xi in enumerate(self.Xs)} - ) - return grads - - -class TorchSkipConnectionIdentity(nn.Module): - def __init__(self, act_fn, pad1, pad2, params, hparams, momentum=0.9, epsilon=1e-5): - super(TorchSkipConnectionIdentity, self).__init__() - - self.conv1 = nn.Conv2d( - hparams["in_ch"], - hparams["out_ch"], - hparams["kernel_shape1"], - padding=pad1, - stride=hparams["stride1"], - bias=True, - ) - - self.act_fn = act_fn - - self.batchnorm1 = nn.BatchNorm2d( - num_features=hparams["out_ch"], - momentum=1 - momentum, - eps=epsilon, - affine=True, - ) - - self.conv2 = nn.Conv2d( - hparams["out_ch"], - hparams["out_ch"], - hparams["kernel_shape2"], - padding=pad2, - stride=hparams["stride2"], - bias=True, - ) - - self.batchnorm2 = nn.BatchNorm2d( - num_features=hparams["out_ch"], - momentum=1 - momentum, - eps=epsilon, - affine=True, - ) - - orig, W_swap = [0, 1, 2, 3], [-2, -1, -3, -4] - # (f[0], f[1], n_in, n_out) -> (n_out, n_in, f[0], f[1]) - W = params["components"]["conv1"]["W"] - b = params["components"]["conv1"]["b"] - W = np.moveaxis(W, orig, W_swap) - assert self.conv1.weight.shape == W.shape - assert self.conv1.bias.shape == b.flatten().shape - self.conv1.weight = nn.Parameter(torch.FloatTensor(W)) - self.conv1.bias = nn.Parameter(torch.FloatTensor(b.flatten())) - - scaler = params["components"]["batchnorm1"]["scaler"] - intercept = params["components"]["batchnorm1"]["intercept"] - self.batchnorm1.weight = nn.Parameter(torch.FloatTensor(scaler)) - self.batchnorm1.bias = nn.Parameter(torch.FloatTensor(intercept)) - - # (f[0], f[1], n_in, n_out) -> (n_out, n_in, f[0], f[1]) - W = params["components"]["conv2"]["W"] - b = params["components"]["conv2"]["b"] - W = np.moveaxis(W, orig, W_swap) - assert self.conv2.weight.shape == W.shape - assert self.conv2.bias.shape == b.flatten().shape - self.conv2.weight = nn.Parameter(torch.FloatTensor(W)) - self.conv2.bias = nn.Parameter(torch.FloatTensor(b.flatten())) - - scaler = params["components"]["batchnorm2"]["scaler"] - intercept = params["components"]["batchnorm2"]["intercept"] - 
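# Quick standalone check of the kernel-layout conversion used throughout these
# reference modules (shapes are illustrative): numpy-ml stores conv weights as
# (f[0], f[1], n_in, n_out) while PyTorch expects (n_out, n_in, f[0], f[1]),
# and np.moveaxis with W_swap = [-2, -1, -3, -4] performs the conversion.
import numpy as np

W = np.random.randn(3, 5, 2, 4)                     # (f[0], f[1], n_in, n_out)
orig, W_swap = [0, 1, 2, 3], [-2, -1, -3, -4]
W_torch = np.moveaxis(W, orig, W_swap)
assert W_torch.shape == (4, 2, 3, 5)                # (n_out, n_in, f[0], f[1])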
self.batchnorm2.weight = nn.Parameter(torch.FloatTensor(scaler)) - self.batchnorm2.bias = nn.Parameter(torch.FloatTensor(intercept)) - - def forward(self, X): - if not isinstance(X, torch.Tensor): - # (N, H, W, C) -> (N, C, H, W) - X = np.moveaxis(X, [0, 1, 2, 3], [0, -2, -1, -3]) - X = torchify(X) - - self.X = X - self.X.retain_grad() - - self.conv1_out = self.conv1(self.X) - self.conv1_out.retain_grad() - - self.act_fn1_out = self.act_fn(self.conv1_out) - self.act_fn1_out.retain_grad() - - self.batchnorm1_out = self.batchnorm1(self.act_fn1_out) - self.batchnorm1_out.retain_grad() - - self.conv2_out = self.conv2(self.batchnorm1_out) - self.conv2_out.retain_grad() - - self.batchnorm2_out = self.batchnorm2(self.conv2_out) - self.batchnorm2_out.retain_grad() - - self.layer3_in = self.batchnorm2_out + self.X - self.layer3_in.retain_grad() - - self.Y = self.act_fn(self.layer3_in) - self.Y.retain_grad() - - def extract_grads(self, X): - self.forward(X) - self.loss = self.Y.sum() - self.loss.backward() - - orig, X_swap, W_swap = [0, 1, 2, 3], [0, -1, -3, -2], [-1, -2, -4, -3] - grads = { - # layer parameters - "conv1_W": np.moveaxis(self.conv1.weight.detach().numpy(), orig, W_swap), - "conv1_b": self.conv1.bias.detach().numpy().reshape(1, 1, 1, -1), - "bn1_intercept": self.batchnorm1.bias.detach().numpy(), - "bn1_scaler": self.batchnorm1.weight.detach().numpy(), - "bn1_running_mean": self.batchnorm1.running_mean.detach().numpy(), - "bn1_running_var": self.batchnorm1.running_var.detach().numpy(), - "conv2_W": np.moveaxis(self.conv2.weight.detach().numpy(), orig, W_swap), - "conv2_b": self.conv2.bias.detach().numpy().reshape(1, 1, 1, -1), - "bn2_intercept": self.batchnorm2.bias.detach().numpy(), - "bn2_scaler": self.batchnorm2.weight.detach().numpy(), - "bn2_running_mean": self.batchnorm2.running_mean.detach().numpy(), - "bn2_running_var": self.batchnorm2.running_var.detach().numpy(), - # layer inputs/outputs (forward step) - "X": np.moveaxis(self.X.detach().numpy(), orig, X_swap), - "conv1_out": np.moveaxis(self.conv1_out.detach().numpy(), orig, X_swap), - "act1_out": np.moveaxis(self.act_fn1_out.detach().numpy(), orig, X_swap), - "bn1_out": np.moveaxis(self.batchnorm1_out.detach().numpy(), orig, X_swap), - "conv2_out": np.moveaxis(self.conv2_out.detach().numpy(), orig, X_swap), - "bn2_out": np.moveaxis(self.batchnorm2_out.detach().numpy(), orig, X_swap), - "add_out": np.moveaxis(self.layer3_in.detach().numpy(), orig, X_swap), - "Y": np.moveaxis(self.Y.detach().numpy(), orig, X_swap), - # layer gradients (backward step) - "dLdY": np.moveaxis(self.Y.grad.numpy(), orig, X_swap), - "dLdAdd": np.moveaxis(self.layer3_in.grad.numpy(), orig, X_swap), - "dLdBn2_out": np.moveaxis(self.batchnorm2_out.grad.numpy(), orig, X_swap), - "dLdConv2_out": np.moveaxis(self.conv2_out.grad.numpy(), orig, X_swap), - "dLdBn1_out": np.moveaxis(self.batchnorm1_out.grad.numpy(), orig, X_swap), - "dLdActFn1_out": np.moveaxis(self.act_fn1_out.grad.numpy(), orig, X_swap), - "dLdConv1_out": np.moveaxis(self.act_fn1_out.grad.numpy(), orig, X_swap), - "dLdX": np.moveaxis(self.X.grad.numpy(), orig, X_swap), - # layer parameter gradients (backward step) - "dLdBn2_intercept": self.batchnorm2.bias.grad.numpy(), - "dLdBn2_scaler": self.batchnorm2.weight.grad.numpy(), - "dLdConv2_W": np.moveaxis(self.conv2.weight.grad.numpy(), orig, W_swap), - "dLdConv2_b": self.conv2.bias.grad.numpy().reshape(1, 1, 1, -1), - "dLdBn1_intercept": self.batchnorm1.bias.grad.numpy(), - "dLdBn1_scaler": self.batchnorm1.weight.grad.numpy(), - "dLdConv1_W": 
np.moveaxis(self.conv1.weight.grad.numpy(), orig, W_swap), - "dLdConv1_b": self.conv1.bias.grad.numpy().reshape(1, 1, 1, -1), - } - return grads - - -class TorchCausalConv1d(torch.nn.Conv1d): - """https://github.com/pytorch/pytorch/issues/1333 - - NB: this is only ensures that the convolution out length is the same as - the input length IFF stride = 1. Otherwise, in/out lengths will differ. - """ - - def __init__( - self, - in_channels, - out_channels, - kernel_size, - stride=1, - dilation=1, - groups=1, - bias=True, - ): - self.__padding = (kernel_size - 1) * dilation - - super(TorchCausalConv1d, self).__init__( - in_channels, - out_channels, - kernel_size=kernel_size, - stride=stride, - padding=self.__padding, - dilation=dilation, - groups=groups, - bias=bias, - ) - - def forward(self, input): - result = super(TorchCausalConv1d, self).forward(input) - if self.__padding != 0: - return result[:, :, : -self.__padding] - return result - - -class TorchWavenetModule(nn.Module): - def __init__(self, params, hparams, conv_1x1_pad): - super(TorchWavenetModule, self).__init__() - self.conv_dilation = TorchCausalConv1d( - in_channels=hparams["components"]["conv_dilation"]["in_ch"], - out_channels=hparams["components"]["conv_dilation"]["out_ch"], - kernel_size=hparams["components"]["conv_dilation"]["kernel_width"], - stride=hparams["components"]["conv_dilation"]["stride"], - dilation=hparams["components"]["conv_dilation"]["dilation"] + 1, - bias=True, - ) - - self.conv_1x1 = nn.Conv1d( - in_channels=hparams["components"]["conv_1x1"]["in_ch"], - out_channels=hparams["components"]["conv_1x1"]["out_ch"], - kernel_size=hparams["components"]["conv_1x1"]["kernel_width"], - stride=hparams["components"]["conv_1x1"]["stride"], - padding=conv_1x1_pad, - dilation=hparams["components"]["conv_1x1"]["dilation"] + 1, - bias=True, - ) - - W = params["components"]["conv_dilation"]["W"] - b = params["components"]["conv_dilation"]["b"] - # (f[0], n_in, n_out) -> (n_out, n_in, f[0]) - W = np.moveaxis(W, [0, 1, 2], [-1, -2, -3]) - self.conv_dilation.weight = nn.Parameter(torch.FloatTensor(W)) - self.conv_dilation.bias = nn.Parameter(torch.FloatTensor(b.flatten())) - assert self.conv_dilation.weight.shape == W.shape - assert self.conv_dilation.bias.shape == b.flatten().shape - - W = params["components"]["conv_1x1"]["W"] - b = params["components"]["conv_1x1"]["b"] - # (f[0], n_in, n_out) -> (n_out, n_in, f[0]) - W = np.moveaxis(W, [0, 1, 2], [-1, -2, -3]) - self.conv_1x1.weight = nn.Parameter(torch.FloatTensor(W)) - self.conv_1x1.bias = nn.Parameter(torch.FloatTensor(b.flatten())) - assert self.conv_1x1.weight.shape == W.shape - assert self.conv_1x1.bias.shape == b.flatten().shape - - def forward(self, X_main, X_skip): - # (N, W, C) -> (N, C, W) - self.X_main = np.moveaxis(X_main, [0, 1, 2], [0, -1, -2]) - self.X_main = torchify(self.X_main) - self.X_main.retain_grad() - - self.conv_dilation_out = self.conv_dilation(self.X_main) - self.conv_dilation_out.retain_grad() - - self.tanh_out = F.tanh(self.conv_dilation_out) - self.sigm_out = F.sigmoid(self.conv_dilation_out) - - self.tanh_out.retain_grad() - self.sigm_out.retain_grad() - - self.multiply_gate_out = self.tanh_out * self.sigm_out - self.multiply_gate_out.retain_grad() - - self.conv_1x1_out = self.conv_1x1(self.multiply_gate_out) - self.conv_1x1_out.retain_grad() - - self.X_skip = torch.zeros_like(self.conv_1x1_out) - if X_skip is not None: - self.X_skip = torchify(np.moveaxis(X_skip, [0, 1, 2], [0, -1, -2])) - self.X_skip.retain_grad() - - self.Y_skip = self.X_skip 
+ self.conv_1x1_out - self.Y_main = self.X_main + self.conv_1x1_out - - self.Y_skip.retain_grad() - self.Y_main.retain_grad() - - def extract_grads(self, X_main, X_skip): - self.forward(X_main, X_skip) - self.loss = (self.Y_skip + self.Y_main).sum() - self.loss.backward() - - # W (theirs): (n_out, n_in, f[0]) -> W (mine): (f[0], n_in, n_out) - # X (theirs): (N, C, W) -> X (mine): (N, W, C) - # Y (theirs): (N, C, W) -> Y (mine): (N, W, C) - orig, X_swap, W_swap = [0, 1, 2], [0, -1, -2], [-1, -2, -3] - grads = { - "X_main": np.moveaxis(self.X_main.detach().numpy(), orig, X_swap), - "X_skip": np.moveaxis(self.X_skip.detach().numpy(), orig, X_swap), - "conv_dilation_W": np.moveaxis( - self.conv_dilation.weight.detach().numpy(), orig, W_swap - ), - "conv_dilation_b": self.conv_dilation.bias.detach() - .numpy() - .reshape(1, 1, -1), - "conv_1x1_W": np.moveaxis( - self.conv_1x1.weight.detach().numpy(), orig, W_swap - ), - "conv_1x1_b": self.conv_1x1.bias.detach().numpy().reshape(1, 1, -1), - "conv_dilation_out": np.moveaxis( - self.conv_dilation_out.detach().numpy(), orig, X_swap - ), - "tanh_out": np.moveaxis(self.tanh_out.detach().numpy(), orig, X_swap), - "sigm_out": np.moveaxis(self.sigm_out.detach().numpy(), orig, X_swap), - "multiply_gate_out": np.moveaxis( - self.multiply_gate_out.detach().numpy(), orig, X_swap - ), - "conv_1x1_out": np.moveaxis( - self.conv_1x1_out.detach().numpy(), orig, X_swap - ), - "Y_main": np.moveaxis(self.Y_main.detach().numpy(), orig, X_swap), - "Y_skip": np.moveaxis(self.Y_skip.detach().numpy(), orig, X_swap), - "dLdY_skip": np.moveaxis(self.Y_skip.grad.numpy(), orig, X_swap), - "dLdY_main": np.moveaxis(self.Y_main.grad.numpy(), orig, X_swap), - "dLdConv_1x1_out": np.moveaxis( - self.conv_1x1_out.grad.numpy(), orig, X_swap - ), - "dLdConv_1x1_W": np.moveaxis( - self.conv_1x1.weight.grad.numpy(), orig, W_swap - ), - "dLdConv_1x1_b": self.conv_1x1.bias.grad.numpy().reshape(1, 1, -1), - "dLdMultiply_out": np.moveaxis( - self.multiply_gate_out.grad.numpy(), orig, X_swap - ), - "dLdTanh_out": np.moveaxis(self.tanh_out.grad.numpy(), orig, X_swap), - "dLdSigm_out": np.moveaxis(self.sigm_out.grad.numpy(), orig, X_swap), - "dLdConv_dilation_out": np.moveaxis( - self.conv_dilation_out.grad.numpy(), orig, X_swap - ), - "dLdConv_dilation_W": np.moveaxis( - self.conv_dilation.weight.grad.numpy(), orig, W_swap - ), - "dLdConv_dilation_b": self.conv_dilation.bias.grad.numpy().reshape( - 1, 1, -1 - ), - "dLdX_main": np.moveaxis(self.X_main.grad.numpy(), orig, X_swap), - "dLdX_skip": np.moveaxis(self.X_skip.grad.numpy(), orig, X_swap), - } - - return grads - - -class TorchSkipConnectionConv(nn.Module): - def __init__( - self, act_fn, pad1, pad2, pad_skip, params, hparams, momentum=0.9, epsilon=1e-5 - ): - super(TorchSkipConnectionConv, self).__init__() - - self.conv1 = nn.Conv2d( - hparams["in_ch"], - hparams["out_ch1"], - hparams["kernel_shape1"], - padding=pad1, - stride=hparams["stride1"], - bias=True, - ) - - self.act_fn = act_fn - - self.batchnorm1 = nn.BatchNorm2d( - num_features=hparams["out_ch1"], - momentum=1 - momentum, - eps=epsilon, - affine=True, - ) - - self.conv2 = nn.Conv2d( - hparams["out_ch1"], - hparams["out_ch2"], - hparams["kernel_shape2"], - padding=pad2, - stride=hparams["stride2"], - bias=True, - ) - - self.batchnorm2 = nn.BatchNorm2d( - num_features=hparams["out_ch2"], - momentum=1 - momentum, - eps=epsilon, - affine=True, - ) - - self.conv_skip = nn.Conv2d( - hparams["in_ch"], - hparams["out_ch2"], - hparams["kernel_shape_skip"], - padding=pad_skip, - 
stride=hparams["stride_skip"], - bias=True, - ) - - self.batchnorm_skip = nn.BatchNorm2d( - num_features=hparams["out_ch2"], - momentum=1 - momentum, - eps=epsilon, - affine=True, - ) - - orig, W_swap = [0, 1, 2, 3], [-2, -1, -3, -4] - # (f[0], f[1], n_in, n_out) -> (n_out, n_in, f[0], f[1]) - W = params["components"]["conv1"]["W"] - b = params["components"]["conv1"]["b"] - W = np.moveaxis(W, orig, W_swap) - assert self.conv1.weight.shape == W.shape - assert self.conv1.bias.shape == b.flatten().shape - self.conv1.weight = nn.Parameter(torch.FloatTensor(W)) - self.conv1.bias = nn.Parameter(torch.FloatTensor(b.flatten())) - - scaler = params["components"]["batchnorm1"]["scaler"] - intercept = params["components"]["batchnorm1"]["intercept"] - self.batchnorm1.weight = nn.Parameter(torch.FloatTensor(scaler)) - self.batchnorm1.bias = nn.Parameter(torch.FloatTensor(intercept)) - - # (f[0], f[1], n_in, n_out) -> (n_out, n_in, f[0], f[1]) - W = params["components"]["conv2"]["W"] - b = params["components"]["conv2"]["b"] - W = np.moveaxis(W, orig, W_swap) - assert self.conv2.weight.shape == W.shape - assert self.conv2.bias.shape == b.flatten().shape - self.conv2.weight = nn.Parameter(torch.FloatTensor(W)) - self.conv2.bias = nn.Parameter(torch.FloatTensor(b.flatten())) - - scaler = params["components"]["batchnorm2"]["scaler"] - intercept = params["components"]["batchnorm2"]["intercept"] - self.batchnorm2.weight = nn.Parameter(torch.FloatTensor(scaler)) - self.batchnorm2.bias = nn.Parameter(torch.FloatTensor(intercept)) - - W = params["components"]["conv_skip"]["W"] - b = params["components"]["conv_skip"]["b"] - W = np.moveaxis(W, orig, W_swap) - assert self.conv_skip.weight.shape == W.shape - assert self.conv_skip.bias.shape == b.flatten().shape - self.conv_skip.weight = nn.Parameter(torch.FloatTensor(W)) - self.conv_skip.bias = nn.Parameter(torch.FloatTensor(b.flatten())) - - scaler = params["components"]["batchnorm_skip"]["scaler"] - intercept = params["components"]["batchnorm_skip"]["intercept"] - self.batchnorm_skip.weight = nn.Parameter(torch.FloatTensor(scaler)) - self.batchnorm_skip.bias = nn.Parameter(torch.FloatTensor(intercept)) - - def forward(self, X): - if not isinstance(X, torch.Tensor): - # (N, H, W, C) -> (N, C, H, W) - X = np.moveaxis(X, [0, 1, 2, 3], [0, -2, -1, -3]) - X = torchify(X) - - self.X = X - self.X.retain_grad() - - self.conv1_out = self.conv1(self.X) - self.conv1_out.retain_grad() - - self.act_fn1_out = self.act_fn(self.conv1_out) - self.act_fn1_out.retain_grad() - - self.batchnorm1_out = self.batchnorm1(self.act_fn1_out) - self.batchnorm1_out.retain_grad() - - self.conv2_out = self.conv2(self.batchnorm1_out) - self.conv2_out.retain_grad() - - self.batchnorm2_out = self.batchnorm2(self.conv2_out) - self.batchnorm2_out.retain_grad() - - self.c_skip_out = self.conv_skip(self.X) - self.c_skip_out.retain_grad() - - self.bn_skip_out = self.batchnorm_skip(self.c_skip_out) - self.bn_skip_out.retain_grad() - - self.layer3_in = self.batchnorm2_out + self.bn_skip_out - self.layer3_in.retain_grad() - - self.Y = self.act_fn(self.layer3_in) - self.Y.retain_grad() - - def extract_grads(self, X): - self.forward(X) - self.loss = self.Y.sum() - self.loss.backward() - - orig, X_swap, W_swap = [0, 1, 2, 3], [0, -1, -3, -2], [-1, -2, -4, -3] - grads = { - # layer parameters - "conv1_W": np.moveaxis(self.conv1.weight.detach().numpy(), orig, W_swap), - "conv1_b": self.conv1.bias.detach().numpy().reshape(1, 1, 1, -1), - "bn1_intercept": self.batchnorm1.bias.detach().numpy(), - "bn1_scaler": 
self.batchnorm1.weight.detach().numpy(), - "bn1_running_mean": self.batchnorm1.running_mean.detach().numpy(), - "bn1_running_var": self.batchnorm1.running_var.detach().numpy(), - "conv2_W": np.moveaxis(self.conv2.weight.detach().numpy(), orig, W_swap), - "conv2_b": self.conv2.bias.detach().numpy().reshape(1, 1, 1, -1), - "bn2_intercept": self.batchnorm2.bias.detach().numpy(), - "bn2_scaler": self.batchnorm2.weight.detach().numpy(), - "bn2_running_mean": self.batchnorm2.running_mean.detach().numpy(), - "bn2_running_var": self.batchnorm2.running_var.detach().numpy(), - "conv_skip_W": np.moveaxis( - self.conv_skip.weight.detach().numpy(), orig, W_swap - ), - "conv_skip_b": self.conv_skip.bias.detach().numpy().reshape(1, 1, 1, -1), - "bn_skip_intercept": self.batchnorm_skip.bias.detach().numpy(), - "bn_skip_scaler": self.batchnorm_skip.weight.detach().numpy(), - "bn_skip_running_mean": self.batchnorm_skip.running_mean.detach().numpy(), - "bn_skip_running_var": self.batchnorm_skip.running_var.detach().numpy(), - # layer inputs/outputs (forward step) - "X": np.moveaxis(self.X.detach().numpy(), orig, X_swap), - "conv1_out": np.moveaxis(self.conv1_out.detach().numpy(), orig, X_swap), - "act1_out": np.moveaxis(self.act_fn1_out.detach().numpy(), orig, X_swap), - "bn1_out": np.moveaxis(self.batchnorm1_out.detach().numpy(), orig, X_swap), - "conv2_out": np.moveaxis(self.conv2_out.detach().numpy(), orig, X_swap), - "bn2_out": np.moveaxis(self.batchnorm2_out.detach().numpy(), orig, X_swap), - "conv_skip_out": np.moveaxis( - self.c_skip_out.detach().numpy(), orig, X_swap - ), - "bn_skip_out": np.moveaxis(self.bn_skip_out.detach().numpy(), orig, X_swap), - "add_out": np.moveaxis(self.layer3_in.detach().numpy(), orig, X_swap), - "Y": np.moveaxis(self.Y.detach().numpy(), orig, X_swap), - # layer gradients (backward step) - "dLdY": np.moveaxis(self.Y.grad.numpy(), orig, X_swap), - "dLdAdd": np.moveaxis(self.layer3_in.grad.numpy(), orig, X_swap), - "dLdBnSkip_out": np.moveaxis(self.bn_skip_out.grad.numpy(), orig, X_swap), - "dLdConvSkip_out": np.moveaxis(self.c_skip_out.grad.numpy(), orig, X_swap), - "dLdBn2_out": np.moveaxis(self.batchnorm2_out.grad.numpy(), orig, X_swap), - "dLdConv2_out": np.moveaxis(self.conv2_out.grad.numpy(), orig, X_swap), - "dLdBn1_out": np.moveaxis(self.batchnorm1_out.grad.numpy(), orig, X_swap), - "dLdActFn1_out": np.moveaxis(self.act_fn1_out.grad.numpy(), orig, X_swap), - "dLdConv1_out": np.moveaxis(self.act_fn1_out.grad.numpy(), orig, X_swap), - "dLdX": np.moveaxis(self.X.grad.numpy(), orig, X_swap), - # layer parameter gradients (backward step) - "dLdBnSkip_intercept": self.batchnorm_skip.bias.grad.numpy(), - "dLdBnSkip_scaler": self.batchnorm_skip.weight.grad.numpy(), - "dLdConvSkip_W": np.moveaxis( - self.conv_skip.weight.grad.numpy(), orig, W_swap - ), - "dLdConvSkip_b": self.conv_skip.bias.grad.numpy().reshape(1, 1, 1, -1), - "dLdBn2_intercept": self.batchnorm2.bias.grad.numpy(), - "dLdBn2_scaler": self.batchnorm2.weight.grad.numpy(), - "dLdConv2_W": np.moveaxis(self.conv2.weight.grad.numpy(), orig, W_swap), - "dLdConv2_b": self.conv2.bias.grad.numpy().reshape(1, 1, 1, -1), - "dLdBn1_intercept": self.batchnorm1.bias.grad.numpy(), - "dLdBn1_scaler": self.batchnorm1.weight.grad.numpy(), - "dLdConv1_W": np.moveaxis(self.conv1.weight.grad.numpy(), orig, W_swap), - "dLdConv1_b": self.conv1.bias.grad.numpy().reshape(1, 1, 1, -1), - } - return grads - - -class TorchBidirectionalLSTM(nn.Module): - def __init__(self, n_in, n_out, params, **kwargs): - super(TorchBidirectionalLSTM, 
self).__init__() - - self.layer1 = nn.LSTM( - input_size=n_in, - hidden_size=n_out, - num_layers=1, - bidirectional=True, - bias=True, - ) - - Wiu = params["components"]["cell_fwd"]["Wu"][n_out:, :].T - Wif = params["components"]["cell_fwd"]["Wf"][n_out:, :].T - Wic = params["components"]["cell_fwd"]["Wc"][n_out:, :].T - Wio = params["components"]["cell_fwd"]["Wo"][n_out:, :].T - W_ih_f = np.vstack([Wiu, Wif, Wic, Wio]) - - Whu = params["components"]["cell_fwd"]["Wu"][:n_out, :].T - Whf = params["components"]["cell_fwd"]["Wf"][:n_out, :].T - Whc = params["components"]["cell_fwd"]["Wc"][:n_out, :].T - Who = params["components"]["cell_fwd"]["Wo"][:n_out, :].T - W_hh_f = np.vstack([Whu, Whf, Whc, Who]) - - assert self.layer1.weight_ih_l0.shape == W_ih_f.shape - assert self.layer1.weight_hh_l0.shape == W_hh_f.shape - - self.layer1.weight_ih_l0 = nn.Parameter(torch.FloatTensor(W_ih_f)) - self.layer1.weight_hh_l0 = nn.Parameter(torch.FloatTensor(W_hh_f)) - - Wiu = params["components"]["cell_bwd"]["Wu"][n_out:, :].T - Wif = params["components"]["cell_bwd"]["Wf"][n_out:, :].T - Wic = params["components"]["cell_bwd"]["Wc"][n_out:, :].T - Wio = params["components"]["cell_bwd"]["Wo"][n_out:, :].T - W_ih_b = np.vstack([Wiu, Wif, Wic, Wio]) - - Whu = params["components"]["cell_bwd"]["Wu"][:n_out, :].T - Whf = params["components"]["cell_bwd"]["Wf"][:n_out, :].T - Whc = params["components"]["cell_bwd"]["Wc"][:n_out, :].T - Who = params["components"]["cell_bwd"]["Wo"][:n_out, :].T - W_hh_b = np.vstack([Whu, Whf, Whc, Who]) - - assert self.layer1.weight_ih_l0_reverse.shape == W_ih_b.shape - assert self.layer1.weight_hh_l0_reverse.shape == W_hh_b.shape - - self.layer1.weight_ih_l0_reverse = nn.Parameter(torch.FloatTensor(W_ih_b)) - self.layer1.weight_hh_l0_reverse = nn.Parameter(torch.FloatTensor(W_hh_b)) - - b_f = np.concatenate( - [ - params["components"]["cell_fwd"]["bu"], - params["components"]["cell_fwd"]["bf"], - params["components"]["cell_fwd"]["bc"], - params["components"]["cell_fwd"]["bo"], - ], - axis=-1, - ).flatten() - - assert self.layer1.bias_ih_l0.shape == b_f.shape - assert self.layer1.bias_hh_l0.shape == b_f.shape - - self.layer1.bias_ih_l0 = nn.Parameter(torch.FloatTensor(b_f)) - self.layer1.bias_hh_l0 = nn.Parameter(torch.FloatTensor(b_f)) - - b_b = np.concatenate( - [ - params["components"]["cell_bwd"]["bu"], - params["components"]["cell_bwd"]["bf"], - params["components"]["cell_bwd"]["bc"], - params["components"]["cell_bwd"]["bo"], - ], - axis=-1, - ).flatten() - - assert self.layer1.bias_ih_l0_reverse.shape == b_b.shape - assert self.layer1.bias_hh_l0_reverse.shape == b_b.shape - - self.layer1.bias_ih_l0_reverse = nn.Parameter(torch.FloatTensor(b_b)) - self.layer1.bias_hh_l0_reverse = nn.Parameter(torch.FloatTensor(b_b)) - - def forward(self, X): - # (batch, input_size, seq_len) -> (seq_len, batch, input_size) - self.X = np.moveaxis(X, [0, 1, 2], [-2, -1, -3]) - - if not isinstance(self.X, torch.Tensor): - self.X = torchify(self.X) - - self.X.retain_grad() - - # initial hidden state is 0 - n_ex, n_in, n_timesteps = self.X.shape - n_out, n_out = self.layer1.weight_hh_l0.shape - - # forward pass - self.A, (At, Ct) = self.layer1(self.X) - self.A.retain_grad() - return self.A - - def extract_grads(self, X): - self.forward(X) - self.loss = self.A.sum() - self.loss.backward() - - # forward - w_ii, w_if, w_ic, w_io = self.layer1.weight_ih_l0.chunk(4, 0) - w_hi, w_hf, w_hc, w_ho = self.layer1.weight_hh_l0.chunk(4, 0) - bu_f, bf_f, bc_f, bo_f = self.layer1.bias_ih_l0.chunk(4, 0) - - Wu_f = 
torch.cat([torch.t(w_hi), torch.t(w_ii)], dim=0) - Wf_f = torch.cat([torch.t(w_hf), torch.t(w_if)], dim=0) - Wc_f = torch.cat([torch.t(w_hc), torch.t(w_ic)], dim=0) - Wo_f = torch.cat([torch.t(w_ho), torch.t(w_io)], dim=0) - - dw_ii, dw_if, dw_ic, dw_io = self.layer1.weight_ih_l0.grad.chunk(4, 0) - dw_hi, dw_hf, dw_hc, dw_ho = self.layer1.weight_hh_l0.grad.chunk(4, 0) - dbu_f, dbf_f, dbc_f, dbo_f = self.layer1.bias_ih_l0.grad.chunk(4, 0) - - dWu_f = torch.cat([torch.t(dw_hi), torch.t(dw_ii)], dim=0) - dWf_f = torch.cat([torch.t(dw_hf), torch.t(dw_if)], dim=0) - dWc_f = torch.cat([torch.t(dw_hc), torch.t(dw_ic)], dim=0) - dWo_f = torch.cat([torch.t(dw_ho), torch.t(dw_io)], dim=0) - - # backward - w_ii, w_if, w_ic, w_io = self.layer1.weight_ih_l0_reverse.chunk(4, 0) - w_hi, w_hf, w_hc, w_ho = self.layer1.weight_hh_l0_reverse.chunk(4, 0) - bu_b, bf_b, bc_b, bo_b = self.layer1.bias_ih_l0_reverse.chunk(4, 0) - - Wu_b = torch.cat([torch.t(w_hi), torch.t(w_ii)], dim=0) - Wf_b = torch.cat([torch.t(w_hf), torch.t(w_if)], dim=0) - Wc_b = torch.cat([torch.t(w_hc), torch.t(w_ic)], dim=0) - Wo_b = torch.cat([torch.t(w_ho), torch.t(w_io)], dim=0) - - dw_ii, dw_if, dw_ic, dw_io = self.layer1.weight_ih_l0_reverse.grad.chunk(4, 0) - dw_hi, dw_hf, dw_hc, dw_ho = self.layer1.weight_hh_l0_reverse.grad.chunk(4, 0) - dbu_b, dbf_b, dbc_b, dbo_b = self.layer1.bias_ih_l0_reverse.grad.chunk(4, 0) - - dWu_b = torch.cat([torch.t(dw_hi), torch.t(dw_ii)], dim=0) - dWf_b = torch.cat([torch.t(dw_hf), torch.t(dw_if)], dim=0) - dWc_b = torch.cat([torch.t(dw_hc), torch.t(dw_ic)], dim=0) - dWo_b = torch.cat([torch.t(dw_ho), torch.t(dw_io)], dim=0) - - orig, X_swap = [0, 1, 2], [-1, -3, -2] - grads = { - "X": np.moveaxis(self.X.detach().numpy(), orig, X_swap), - "Wu_f": Wu_f.detach().numpy(), - "Wf_f": Wf_f.detach().numpy(), - "Wc_f": Wc_f.detach().numpy(), - "Wo_f": Wo_f.detach().numpy(), - "bu_f": bu_f.detach().numpy().reshape(-1, 1), - "bf_f": bf_f.detach().numpy().reshape(-1, 1), - "bc_f": bc_f.detach().numpy().reshape(-1, 1), - "bo_f": bo_f.detach().numpy().reshape(-1, 1), - "Wu_b": Wu_b.detach().numpy(), - "Wf_b": Wf_b.detach().numpy(), - "Wc_b": Wc_b.detach().numpy(), - "Wo_b": Wo_b.detach().numpy(), - "bu_b": bu_b.detach().numpy().reshape(-1, 1), - "bf_b": bf_b.detach().numpy().reshape(-1, 1), - "bc_b": bc_b.detach().numpy().reshape(-1, 1), - "bo_b": bo_b.detach().numpy().reshape(-1, 1), - "y": np.moveaxis(self.A.detach().numpy(), orig, X_swap), - "dLdA": self.A.grad.numpy(), - "dLdWu_f": dWu_f.numpy(), - "dLdWf_f": dWf_f.numpy(), - "dLdWc_f": dWc_f.numpy(), - "dLdWo_f": dWo_f.numpy(), - "dLdBu_f": dbu_f.numpy().reshape(-1, 1), - "dLdBf_f": dbf_f.numpy().reshape(-1, 1), - "dLdBc_f": dbc_f.numpy().reshape(-1, 1), - "dLdBo_f": dbo_f.numpy().reshape(-1, 1), - "dLdWu_b": dWu_b.numpy(), - "dLdWf_b": dWf_b.numpy(), - "dLdWc_b": dWc_b.numpy(), - "dLdWo_b": dWo_b.numpy(), - "dLdBu_b": dbu_b.numpy().reshape(-1, 1), - "dLdBf_b": dbf_b.numpy().reshape(-1, 1), - "dLdBc_b": dbc_b.numpy().reshape(-1, 1), - "dLdBo_b": dbo_b.numpy().reshape(-1, 1), - "dLdX": np.moveaxis(self.X.grad.numpy(), orig, X_swap), - } - return grads - - -class TorchPool2DLayer(nn.Module): - def __init__(self, in_channels, hparams, **kwargs): - super(TorchPool2DLayer, self).__init__() - - if hparams["mode"] == "max": - self.layer1 = nn.MaxPool2d( - kernel_size=hparams["kernel_shape"], - padding=hparams["pad"], - stride=hparams["stride"], - ) - elif hparams["mode"] == "average": - self.layer1 = nn.AvgPool2d( - kernel_size=hparams["kernel_shape"], - 
padding=hparams["pad"], - stride=hparams["stride"], - ) - - def forward(self, X): - # (N, H, W, C) -> (N, C, H, W) - self.X = np.moveaxis(X, [0, 1, 2, 3], [0, -2, -1, -3]) - if not isinstance(self.X, torch.Tensor): - self.X = torchify(self.X) - - self.X.retain_grad() - self.Y = self.layer1(self.X) - self.Y.retain_grad() - return self.Y - - def extract_grads(self, X): - self.forward(X) - self.loss = self.Y.sum() - self.loss.backward() - - # W (theirs): (n_out, n_in, f[0], f[1]) -> W (mine): (f[0], f[1], n_in, n_out) - # X (theirs): (N, C, H, W) -> X (mine): (N, H, W, C) - # Y (theirs): (N, C, H, W) -> Y (mine): (N, H, W, C) - orig, X_swap = [0, 1, 2, 3], [0, -1, -3, -2] - grads = { - "X": np.moveaxis(self.X.detach().numpy(), orig, X_swap), - "y": np.moveaxis(self.Y.detach().numpy(), orig, X_swap), - "dLdY": np.moveaxis(self.Y.grad.numpy(), orig, X_swap), - "dLdX": np.moveaxis(self.X.grad.numpy(), orig, X_swap), - } - return grads - - -class TorchConv2DLayer(nn.Module): - def __init__(self, in_channels, out_channels, act_fn, params, hparams, **kwargs): - super(TorchConv2DLayer, self).__init__() - - W = params["W"] - b = params["b"] - self.act_fn = act_fn - - self.layer1 = nn.Conv2d( - in_channels, - out_channels, - hparams["kernel_shape"], - padding=hparams["pad"], - stride=hparams["stride"], - dilation=hparams["dilation"] + 1, - bias=True, - ) - - # (f[0], f[1], n_in, n_out) -> (n_out, n_in, f[0], f[1]) - W = np.moveaxis(W, [0, 1, 2, 3], [-2, -1, -3, -4]) - assert self.layer1.weight.shape == W.shape - assert self.layer1.bias.shape == b.flatten().shape - - self.layer1.weight = nn.Parameter(torch.FloatTensor(W)) - self.layer1.bias = nn.Parameter(torch.FloatTensor(b.flatten())) - - def forward(self, X): - # (N, H, W, C) -> (N, C, H, W) - self.X = np.moveaxis(X, [0, 1, 2, 3], [0, -2, -1, -3]) - if not isinstance(self.X, torch.Tensor): - self.X = torchify(self.X) - - self.X.retain_grad() - - self.Z = self.layer1(self.X) - self.Z.retain_grad() - - self.Y = self.act_fn(self.Z) - self.Y.retain_grad() - return self.Y - - def extract_grads(self, X): - self.forward(X) - self.loss = self.Y.sum() - self.loss.backward() - - # W (theirs): (n_out, n_in, f[0], f[1]) -> W (mine): (f[0], f[1], n_in, n_out) - # X (theirs): (N, C, H, W) -> X (mine): (N, H, W, C) - # Y (theirs): (N, C, H, W) -> Y (mine): (N, H, W, C) - orig, X_swap, W_swap = [0, 1, 2, 3], [0, -1, -3, -2], [-1, -2, -4, -3] - grads = { - "X": np.moveaxis(self.X.detach().numpy(), orig, X_swap), - "W": np.moveaxis(self.layer1.weight.detach().numpy(), orig, W_swap), - "b": self.layer1.bias.detach().numpy().reshape(1, 1, 1, -1), - "y": np.moveaxis(self.Y.detach().numpy(), orig, X_swap), - "dLdY": np.moveaxis(self.Y.grad.numpy(), orig, X_swap), - "dLdZ": np.moveaxis(self.Z.grad.numpy(), orig, X_swap), - "dLdW": np.moveaxis(self.layer1.weight.grad.numpy(), orig, W_swap), - "dLdB": self.layer1.bias.grad.numpy().reshape(1, 1, 1, -1), - "dLdX": np.moveaxis(self.X.grad.numpy(), orig, X_swap), - } - return grads - - -class TorchConv1DLayer(nn.Module): - def __init__(self, in_channels, out_channels, act_fn, params, hparams, **kwargs): - super(TorchConv1DLayer, self).__init__() - - W = params["W"] - b = params["b"] - self.act_fn = act_fn - - self.layer1 = nn.Conv1d( - in_channels, - out_channels, - hparams["kernel_width"], - padding=hparams["pad"], - stride=hparams["stride"], - dilation=hparams["dilation"] + 1, - bias=True, - ) - - # (f[0], n_in, n_out) -> (n_out, n_in, f[0]) - W = np.moveaxis(W, [0, 1, 2], [-1, -2, -3]) - assert self.layer1.weight.shape == 
W.shape - assert self.layer1.bias.shape == b.flatten().shape - - self.layer1.weight = nn.Parameter(torch.FloatTensor(W)) - self.layer1.bias = nn.Parameter(torch.FloatTensor(b.flatten())) - - def forward(self, X): - # (N, W, C) -> (N, C, W) - self.X = np.moveaxis(X, [0, 1, 2], [0, -1, -2]) - if not isinstance(self.X, torch.Tensor): - self.X = torchify(self.X) - - self.X.retain_grad() - - self.Z = self.layer1(self.X) - self.Z.retain_grad() - - self.Y = self.act_fn(self.Z) - self.Y.retain_grad() - return self.Y - - def extract_grads(self, X): - self.forward(X) - self.loss = self.Y.sum() - self.loss.backward() - - # W (theirs): (n_out, n_in, f[0]) -> W (mine): (f[0], n_in, n_out) - # X (theirs): (N, C, W) -> X (mine): (N, W, C) - # Y (theirs): (N, C, W) -> Y (mine): (N, W, C) - orig, X_swap, W_swap = [0, 1, 2], [0, -1, -2], [-1, -2, -3] - grads = { - "X": np.moveaxis(self.X.detach().numpy(), orig, X_swap), - "W": np.moveaxis(self.layer1.weight.detach().numpy(), orig, W_swap), - "b": self.layer1.bias.detach().numpy().reshape(1, 1, -1), - "y": np.moveaxis(self.Y.detach().numpy(), orig, X_swap), - "dLdY": np.moveaxis(self.Y.grad.numpy(), orig, X_swap), - "dLdZ": np.moveaxis(self.Z.grad.numpy(), orig, X_swap), - "dLdW": np.moveaxis(self.layer1.weight.grad.numpy(), orig, W_swap), - "dLdB": self.layer1.bias.grad.numpy().reshape(1, 1, -1), - "dLdX": np.moveaxis(self.X.grad.numpy(), orig, X_swap), - } - return grads - - -class TorchDeconv2DLayer(nn.Module): - def __init__(self, in_channels, out_channels, act_fn, params, hparams, **kwargs): - super(TorchDeconv2DLayer, self).__init__() - - W = params["W"] - b = params["b"] - self.act_fn = act_fn - - self.layer1 = nn.ConvTranspose2d( - in_channels, - out_channels, - hparams["kernel_shape"], - padding=hparams["pad"], - stride=hparams["stride"], - dilation=1, - bias=True, - ) - - # (f[0], f[1], n_in, n_out) -> (n_in, n_out, f[0], f[1]) - W = np.moveaxis(W, [0, 1, 2, 3], [-2, -1, -4, -3]) - assert self.layer1.weight.shape == W.shape - assert self.layer1.bias.shape == b.flatten().shape - - self.layer1.weight = nn.Parameter(torch.FloatTensor(W)) - self.layer1.bias = nn.Parameter(torch.FloatTensor(b.flatten())) - - def forward(self, X): - # (N, H, W, C) -> (N, C, H, W) - self.X = np.moveaxis(X, [0, 1, 2, 3], [0, -2, -1, -3]) - if not isinstance(self.X, torch.Tensor): - self.X = torchify(self.X) - - self.X.retain_grad() - - self.Z = self.layer1(self.X) - self.Z.retain_grad() - - self.Y = self.act_fn(self.Z) - self.Y.retain_grad() - return self.Y - - def extract_grads(self, X): - self.forward(X) - self.loss = self.Y.sum() - self.loss.backward() - - # W (theirs): (n_in, n_out, f[0], f[1]) -> W (mine): (f[0], f[1], n_in, n_out) - # X (theirs): (N, C, H, W) -> X (mine): (N, H, W, C) - # Y (theirs): (N, C, H, W) -> Y (mine): (N, H, W, C) - orig, X_swap, W_swap = [0, 1, 2, 3], [0, -1, -3, -2], [-2, -1, -4, -3] - grads = { - "X": np.moveaxis(self.X.detach().numpy(), orig, X_swap), - "W": np.moveaxis(self.layer1.weight.detach().numpy(), orig, W_swap), - "b": self.layer1.bias.detach().numpy().reshape(1, 1, 1, -1), - "y": np.moveaxis(self.Y.detach().numpy(), orig, X_swap), - "dLdY": np.moveaxis(self.Y.grad.numpy(), orig, X_swap), - "dLdZ": np.moveaxis(self.Z.grad.numpy(), orig, X_swap), - "dLdW": np.moveaxis(self.layer1.weight.grad.numpy(), orig, W_swap), - "dLdB": self.layer1.bias.grad.numpy().reshape(1, 1, 1, -1), - "dLdX": np.moveaxis(self.X.grad.numpy(), orig, X_swap), - } - return grads - - -class TorchLSTMCell(nn.Module): - def __init__(self, n_in, n_out, params, 
**kwargs): - super(TorchLSTMCell, self).__init__() - - Wiu = params["Wu"][n_out:, :].T - Wif = params["Wf"][n_out:, :].T - Wic = params["Wc"][n_out:, :].T - Wio = params["Wo"][n_out:, :].T - W_ih = np.vstack([Wiu, Wif, Wic, Wio]) - - Whu = params["Wu"][:n_out, :].T - Whf = params["Wf"][:n_out, :].T - Whc = params["Wc"][:n_out, :].T - Who = params["Wo"][:n_out, :].T - W_hh = np.vstack([Whu, Whf, Whc, Who]) - - self.layer1 = nn.LSTMCell(input_size=n_in, hidden_size=n_out, bias=True) - assert self.layer1.weight_ih.shape == W_ih.shape - assert self.layer1.weight_hh.shape == W_hh.shape - self.layer1.weight_ih = nn.Parameter(torch.FloatTensor(W_ih)) - self.layer1.weight_hh = nn.Parameter(torch.FloatTensor(W_hh)) - - b = np.concatenate( - [params["bu"], params["bf"], params["bc"], params["bo"]], axis=-1 - ).flatten() - assert self.layer1.bias_ih.shape == b.shape - assert self.layer1.bias_hh.shape == b.shape - self.layer1.bias_ih = nn.Parameter(torch.FloatTensor(b)) - self.layer1.bias_hh = nn.Parameter(torch.FloatTensor(b)) - - def forward(self, X): - self.X = X - if not isinstance(self.X, torch.Tensor): - self.X = torchify(self.X) - - self.X.retain_grad() - - # initial hidden state is 0 - n_ex, n_in, n_timesteps = self.X.shape - n_out, n_out = self.layer1.weight_hh.shape - - # initialize hidden states - a0 = torchify(np.zeros((n_ex, n_out))) - c0 = torchify(np.zeros((n_ex, n_out))) - a0.retain_grad() - c0.retain_grad() - - # forward pass - A, C = [], [] - at = a0 - ct = c0 - for t in range(n_timesteps): - A.append(at) - C.append(ct) - at1, ct1 = self.layer1(self.X[:, :, t], (at, ct)) - at.retain_grad() - ct.retain_grad() - at = at1 - ct = ct1 - - at.retain_grad() - ct.retain_grad() - A.append(at) - C.append(ct) - - # don't inclue a0 in our outputs - self.A = A[1:] - self.C = C[1:] - return self.A, self.C - - def extract_grads(self, X): - self.forward(X) - self.loss = torch.stack(self.A).sum() - self.loss.backward() - - w_ii, w_if, w_ic, w_io = self.layer1.weight_ih.chunk(4, 0) - w_hi, w_hf, w_hc, w_ho = self.layer1.weight_hh.chunk(4, 0) - bu, bf, bc, bo = self.layer1.bias_ih.chunk(4, 0) - - Wu = torch.cat([torch.t(w_hi), torch.t(w_ii)], dim=0) - Wf = torch.cat([torch.t(w_hf), torch.t(w_if)], dim=0) - Wc = torch.cat([torch.t(w_hc), torch.t(w_ic)], dim=0) - Wo = torch.cat([torch.t(w_ho), torch.t(w_io)], dim=0) - - dw_ii, dw_if, dw_ic, dw_io = self.layer1.weight_ih.grad.chunk(4, 0) - dw_hi, dw_hf, dw_hc, dw_ho = self.layer1.weight_hh.grad.chunk(4, 0) - dbu, dbf, dbc, dbo = self.layer1.bias_ih.grad.chunk(4, 0) - - dWu = torch.cat([torch.t(dw_hi), torch.t(dw_ii)], dim=0) - dWf = torch.cat([torch.t(dw_hf), torch.t(dw_if)], dim=0) - dWc = torch.cat([torch.t(dw_hc), torch.t(dw_ic)], dim=0) - dWo = torch.cat([torch.t(dw_ho), torch.t(dw_io)], dim=0) - - grads = { - "X": self.X.detach().numpy(), - "Wu": Wu.detach().numpy(), - "Wf": Wf.detach().numpy(), - "Wc": Wc.detach().numpy(), - "Wo": Wo.detach().numpy(), - "bu": bu.detach().numpy().reshape(-1, 1), - "bf": bf.detach().numpy().reshape(-1, 1), - "bc": bc.detach().numpy().reshape(-1, 1), - "bo": bo.detach().numpy().reshape(-1, 1), - "C": torch.stack(self.C).detach().numpy(), - "y": np.swapaxes( - np.swapaxes(torch.stack(self.A).detach().numpy(), 1, 0), 1, 2 - ), - "dLdA": np.array([a.grad.numpy() for a in self.A]), - "dLdWu": dWu.numpy(), - "dLdWf": dWf.numpy(), - "dLdWc": dWc.numpy(), - "dLdWo": dWo.numpy(), - "dLdBu": dbu.numpy().reshape(-1, 1), - "dLdBf": dbf.numpy().reshape(-1, 1), - "dLdBc": dbc.numpy().reshape(-1, 1), - "dLdBo": 
dbo.numpy().reshape(-1, 1), - "dLdX": self.X.grad.numpy(), - } - return grads - - -class TorchRNNCell(nn.Module): - def __init__(self, n_in, n_hid, params, **kwargs): - super(TorchRNNCell, self).__init__() - - self.layer1 = nn.RNNCell(n_in, n_hid, bias=True, nonlinearity="tanh") - - # set weights and bias to match those of RNNCell - # NB: we pass the *transpose* of the RNNCell weights and biases to - # pytorch, meaning we need to check against the *transpose* of our - # outputs for any function of the weights - self.layer1.weight_ih = nn.Parameter(torch.FloatTensor(params["Wax"].T)) - self.layer1.weight_hh = nn.Parameter(torch.FloatTensor(params["Waa"].T)) - self.layer1.bias_ih = nn.Parameter(torch.FloatTensor(params["bx"].T)) - self.layer1.bias_hh = nn.Parameter(torch.FloatTensor(params["ba"].T)) - - def forward(self, X): - self.X = X - if not isinstance(self.X, torch.Tensor): - self.X = torchify(self.X) - - self.X.retain_grad() - - # initial hidden state is 0 - n_ex, n_in, n_timesteps = self.X.shape - n_out, n_out = self.layer1.weight_hh.shape - - # initialize hidden states - a0 = torchify(np.zeros((n_ex, n_out))) - a0.retain_grad() - - # forward pass - A = [] - at = a0 - for t in range(n_timesteps): - A += [at] - at1 = self.layer1(self.X[:, :, t], at) - at.retain_grad() - at = at1 - - at.retain_grad() - A += [at] - - # don't inclue a0 in our outputs - self.A = A[1:] - return self.A - - def extract_grads(self, X): - self.forward(X) - self.loss = torch.stack(self.A).sum() - self.loss.backward() - grads = { - "X": self.X.detach().numpy(), - "ba": self.layer1.bias_hh.detach().numpy(), - "bx": self.layer1.bias_ih.detach().numpy(), - "Wax": self.layer1.weight_ih.detach().numpy(), - "Waa": self.layer1.weight_hh.detach().numpy(), - "y": torch.stack(self.A).detach().numpy(), - "dLdA": np.array([a.grad.numpy() for a in self.A]), - "dLdWaa": self.layer1.weight_hh.grad.numpy(), - "dLdWax": self.layer1.weight_ih.grad.numpy(), - "dLdBa": self.layer1.bias_hh.grad.numpy(), - "dLdBx": self.layer1.bias_ih.grad.numpy(), - "dLdX": self.X.grad.numpy(), - } - return grads - - -class TorchFCLayer(nn.Module): - def __init__(self, n_in, n_hid, act_fn, params, **kwargs): - super(TorchFCLayer, self).__init__() - self.layer1 = nn.Linear(n_in, n_hid) - - # explicitly set weights and bias - # NB: we pass the *transpose* of the weights to pytorch, meaning - # we'll need to check against the *transpose* of our outputs for - # any function of the weights - self.layer1.weight = nn.Parameter(torch.FloatTensor(params["W"].T)) - self.layer1.bias = nn.Parameter(torch.FloatTensor(params["b"])) - - self.act_fn = act_fn - self.model = nn.Sequential(self.layer1, self.act_fn) - - def forward(self, X): - self.X = X - if not isinstance(X, torch.Tensor): - self.X = torchify(X) - - self.z1 = self.layer1(self.X) - self.z1.retain_grad() - - self.out1 = self.act_fn(self.z1) - self.out1.retain_grad() - - def extract_grads(self, X): - self.forward(X) - self.loss1 = self.out1.sum() - self.loss1.backward() - grads = { - "X": self.X.detach().numpy(), - "b": self.layer1.bias.detach().numpy(), - "W": self.layer1.weight.detach().numpy(), - "y": self.out1.detach().numpy(), - "dLdy": self.out1.grad.numpy(), - "dLdZ": self.z1.grad.numpy(), - "dLdB": self.layer1.bias.grad.numpy(), - "dLdW": self.layer1.weight.grad.numpy(), - "dLdX": self.X.grad.numpy(), - } - return grads - - -class TorchEmbeddingLayer(nn.Module): - def __init__(self, vocab_size, n_out, params, **kwargs): - super(TorchEmbeddingLayer, self).__init__() - self.layer1 = 
nn.Embedding(vocab_size, n_out) - - # explicitly set embedding weights - self.layer1.weight = nn.Parameter(torch.FloatTensor(params["W"])) - self.model = nn.Sequential(self.layer1) - - def forward(self, X): - self.X = X - if not isinstance(X, torch.Tensor): - self.X = torch.from_numpy(X) - - self.out1 = self.layer1(self.X) - self.out1.retain_grad() - - def extract_grads(self, X): - self.forward(X) - self.loss1 = self.out1.sum() - self.loss1.backward() - grads = { - "X": self.X.detach().numpy(), - "W": self.layer1.weight.detach().numpy(), - "y": self.out1.detach().numpy(), - "dLdy": self.out1.grad.numpy(), - "dLdW": self.layer1.weight.grad.numpy(), - } - return grads - - -class TorchSDPAttentionLayer(nn.Module): - def __init__(self): - super(TorchSDPAttentionLayer, self).__init__() - - def forward(self, Q, K, V, mask=None): - self.Q = Q - self.K = K - self.V = V - - if not isinstance(self.Q, torch.Tensor): - self.Q = torchify(self.Q) - if not isinstance(self.K, torch.Tensor): - self.K = torchify(self.K) - if not isinstance(self.V, torch.Tensor): - self.V = torchify(self.V) - - self.Q.retain_grad() - self.K.retain_grad() - self.V.retain_grad() - - self.d_k = self.Q.size(-1) - self.scores = torch.matmul(self.Q, self.K.transpose(-2, -1)) / np.sqrt(self.d_k) - if mask is not None: - self.scores = self.scores.masked_fill(mask == 0, -1e9) - self.scores.retain_grad() - - self.weights = F.softmax(self.scores, dim=-1) - self.weights.retain_grad() - self.Y = torch.matmul(self.weights, self.V) - self.Y.retain_grad() - return self.Y, self.weights - - def extract_grads(self, Q, K, V, mask=None): - self.forward(Q, K, V, mask=mask) - self.loss1 = self.Y.sum() - self.loss1.backward() - grads = { - "Q": self.Q.detach().numpy(), - "K": self.K.detach().numpy(), - "V": self.V.detach().numpy(), - "d_k": self.d_k, - "scores": self.scores.detach().numpy(), - "weights": self.weights.detach().numpy(), - "Y": self.Y.detach().numpy(), - "dLdV": self.V.grad.numpy(), - "dWeights": self.weights.grad.numpy(), - "dScores": self.scores.grad.numpy(), - "dLdQ": self.Q.grad.numpy(), - "dLdK": self.K.grad.numpy(), - } - return grads - - -class TorchMultiHeadedAttentionModule(nn.Module): - def __init__(self, params, hparams): - "Take in model size and number of heads." 
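As a quick reference for the quantity TorchSDPAttentionLayer above reproduces (and which the multi-headed module below applies once per head), here is a minimal NumPy sketch of softmax(Q K^T / sqrt(d_k)) V for batched 3-D inputs, ignoring masking and dropout; the function name and shapes are illustrative and not taken from the test suite:

import numpy as np

def sdp_attention_ref(Q, K, V):
    # Q: (n_ex, n_q, d_k), K: (n_ex, n_k, d_k), V: (n_ex, n_k, d_v)
    d_k = Q.shape[-1]
    scores = Q @ K.transpose(0, 2, 1) / np.sqrt(d_k)       # (n_ex, n_q, n_k)
    scores = scores - scores.max(axis=-1, keepdims=True)   # subtract max for numerical stability
    weights = np.exp(scores)
    weights /= weights.sum(axis=-1, keepdims=True)          # row-wise softmax
    return weights @ V, weights                             # output: (n_ex, n_q, d_v)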
- super(TorchMultiHeadedAttentionModule, self).__init__() - assert hparams["kqv_dim"] % hparams["n_heads"] == 0 - self.n_heads = hparams["n_heads"] - self.latent_dim = hparams["kqv_dim"] // hparams["n_heads"] - self.p_dropout = hparams["dropout_p"] - self.projections = { - "Q": nn.Linear(hparams["kqv_dim"], hparams["kqv_dim"]), - "K": nn.Linear(hparams["kqv_dim"], hparams["kqv_dim"]), - "V": nn.Linear(hparams["kqv_dim"], hparams["kqv_dim"]), - "O": nn.Linear(hparams["kqv_dim"], hparams["kqv_dim"]), - } - self.projections["Q"].weight = nn.Parameter( - torch.FloatTensor(params["components"]["Q"]["W"].T) - ) - self.projections["Q"].bias = nn.Parameter( - torch.FloatTensor(params["components"]["Q"]["b"]) - ) - self.projections["K"].weight = nn.Parameter( - torch.FloatTensor(params["components"]["K"]["W"].T) - ) - self.projections["K"].bias = nn.Parameter( - torch.FloatTensor(params["components"]["K"]["b"]) - ) - self.projections["V"].weight = nn.Parameter( - torch.FloatTensor(params["components"]["V"]["W"].T) - ) - self.projections["V"].bias = nn.Parameter( - torch.FloatTensor(params["components"]["V"]["b"]) - ) - self.projections["O"].weight = nn.Parameter( - torch.FloatTensor(params["components"]["O"]["W"].T) - ) - self.projections["O"].bias = nn.Parameter( - torch.FloatTensor(params["components"]["O"]["b"]) - ) - - self.attn = None - self.dropout = nn.Dropout(p=hparams["dropout_p"]) - - def forward(self, Q, K, V, mask=None): - self.Q = Q - self.K = K - self.V = V - - if not isinstance(self.Q, torch.Tensor): - self.Q = torchify(self.Q) - if not isinstance(self.K, torch.Tensor): - self.K = torchify(self.K) - if not isinstance(self.V, torch.Tensor): - self.V = torchify(self.V) - - self.Q.retain_grad() - self.K.retain_grad() - self.V.retain_grad() - - if mask is not None: - # Same mask applied to all h heads. - mask = mask.unsqueeze(1) - n_ex = self.Q.size(0) - - self.Q_proj = ( - self.projections["Q"](self.Q) - .view(n_ex, -1, self.n_heads, self.latent_dim) - .transpose(1, 2) - ) - - self.K_proj = ( - self.projections["K"](self.K) - .view(n_ex, -1, self.n_heads, self.latent_dim) - .transpose(1, 2) - ) - - self.V_proj = ( - self.projections["V"](self.V) - .view(n_ex, -1, self.n_heads, self.latent_dim) - .transpose(1, 2) - ) - - self.Q_proj.retain_grad() - self.K_proj.retain_grad() - self.V_proj.retain_grad() - - # 2) Apply attention on all the projected vectors in batch. 
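For orientation: after the view/transpose projections above, Q_proj, K_proj, and V_proj each have shape (n_ex, n_heads, seq_len, latent_dim), so the attention call that follows operates independently on every (example, head) pair. A small NumPy shape check, with arbitrary sizes chosen only for this sketch:

import numpy as np

n_ex, n_heads, seq_len, latent_dim = 2, 4, 5, 8
Q_proj = np.random.rand(n_ex, n_heads, seq_len, latent_dim)
K_proj = np.random.rand(n_ex, n_heads, seq_len, latent_dim)

# batched matmul yields one (seq_len, seq_len) attention matrix per example and per head
scores = Q_proj @ K_proj.transpose(0, 1, 3, 2) / np.sqrt(latent_dim)
assert scores.shape == (n_ex, n_heads, seq_len, seq_len)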
- self.attn_out, self.attn = TorchSDPAttentionLayer().forward( - self.Q_proj, self.K_proj, self.V_proj, mask=mask - ) - self.attn.retain_grad() - self.attn_out.retain_grad() - - # 3) "Concat" using a view and apply a final linear transformation - self.attn_out_reshaped = ( - self.attn_out.transpose(1, 2) - .contiguous() - .view(n_ex, -1, self.n_heads * self.latent_dim) - ) - self.attn_out_reshaped.retain_grad() - print(self.attn_out_reshaped.shape) - self.Y = self.projections["O"](self.attn_out_reshaped) - print(self.Y.shape) - self.Y.retain_grad() - - def extract_grads(self, Q, K, V, mask=None): - self.forward(Q, K, V, mask=mask) - self.loss1 = self.Y.sum() - self.loss1.backward() - grads = { - "Q": self.Q.detach().numpy(), - "K": self.K.detach().numpy(), - "V": self.V.detach().numpy(), - "O_W": self.projections["O"].weight.detach().numpy().T, - "V_W": self.projections["V"].weight.detach().numpy().T, - "K_W": self.projections["K"].weight.detach().numpy().T, - "Q_W": self.projections["Q"].weight.detach().numpy().T, - "O_b": self.projections["O"].bias.detach().numpy(), - "V_b": self.projections["V"].bias.detach().numpy(), - "K_b": self.projections["K"].bias.detach().numpy(), - "Q_b": self.projections["Q"].bias.detach().numpy(), - "latent_dim": self.latent_dim, - "n_heads": self.n_heads, - "Q_proj": self.Q_proj.detach().numpy(), # .reshape(self.Q_proj.shape[0], -1), - "K_proj": self.K_proj.detach().numpy(), # .reshape(self.K_proj.shape[0], -1), - "V_proj": self.V_proj.detach().numpy(), # .reshape(self.V_proj.shape[0], -1), - "weights": self.attn.detach().numpy(), - "attn_out": self.attn_out_reshaped.detach().numpy(), # .squeeze(), - # .reshape(self.attn_out_reshaped.shape[0], -1), - "Y": self.Y.detach().numpy(), - "dO_W": self.projections["O"].weight.grad.numpy().T, - "dV_W": self.projections["V"].weight.grad.numpy().T, - "dK_W": self.projections["K"].weight.grad.numpy().T, - "dQ_W": self.projections["Q"].weight.grad.numpy().T, - "dO_b": self.projections["O"].bias.grad.numpy(), - "dV_b": self.projections["V"].bias.grad.numpy(), - "dK_b": self.projections["K"].bias.grad.numpy(), - "dQ_b": self.projections["Q"].bias.grad.numpy(), - "dLdy": self.Y.grad.numpy(), - "dAttn_out": self.attn_out_reshaped.grad.numpy(), - "dWeights": self.attn.grad.numpy(), - "dQ_proj": self.Q_proj.grad.numpy(), - "dK_proj": self.K_proj.grad.numpy(), - "dV_proj": self.V_proj.grad.numpy(), - "dQ": self.Q.grad.numpy(), - "dK": self.K.grad.numpy(), - "dV": self.V.grad.numpy(), - } - return grads - - -####################################################################### -# TF WGAN GP Gold Standard Implementation # -# adapted from: https://github.com/igul222/improved_wgan_training/ # -####################################################################### - -_params = {} -_param_aliases = {} - - -def param(name, *args, **kwargs): - """ - A wrapper for `tf.Variable` which enables parameter sharing in models. - - Creates and returns theano shared variables similarly to `tf.Variable`, - except if you try to create a param with the same name as a - previously-created one, `param(...)` will just return the old one instead of - making a new one. - - This constructor also adds a `param` attribute to the shared variables it - creates, so that you can easily search a graph for all params. 
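    Example (an illustrative sketch only; `w0` stands in for any float32 numpy
    array of initial weights):

        W1 = param("Discriminator.1.W", w0)  # first call creates the tf.Variable
        W2 = param("Discriminator.1.W", w0)  # same name, so the cached variable is returned
        assert W1 is W2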
- """ - - if name not in _params: - kwargs["name"] = name - param = tf.Variable(*args, **kwargs) - param.param = True - _params[name] = param - result = _params[name] - i = 0 - while result in _param_aliases: - i += 1 - result = _param_aliases[result] - return result - - -def params_with_name(name): - return [p for n, p in _params.items() if name in n] - - -def ReLULayer(name, n_in, n_out, inputs, w_initialization): - if isinstance(w_initialization, np.ndarray): - weight_values = w_initialization.astype("float32") - - W = param(name + ".W", weight_values) - result = tf.matmul(inputs, W) - output = tf.nn.bias_add( - result, param(name + ".b", np.zeros((n_out,), dtype="float32")) - ) - output = tf.nn.relu(output) - return output, W - - -def LinearLayer(name, n_in, n_out, inputs, w_initialization): - if isinstance(w_initialization, np.ndarray): - weight_values = w_initialization.astype("float32") - - W = param(name + ".W", weight_values) - result = tf.matmul(inputs, W) - output = tf.nn.bias_add( - result, param(name + ".b", np.zeros((n_out,), dtype="float32")) - ) - return output, W - - -def Generator(n_samples, X_real, params=None): - n_feats = 2 - W1 = W2 = W3 = W4 = "he" - noise = tf.random_normal([n_samples, 2]) - if params is not None: - noise = tf.convert_to_tensor(params["noise"], dtype="float32") - W1 = params["generator"]["FC1"]["W"] - W2 = params["generator"]["FC2"]["W"] - W3 = params["generator"]["FC3"]["W"] - W4 = params["generator"]["FC4"]["W"] - DIM = params["g_hidden"] - n_feats = params["n_in"] - - outs = {} - weights = {} - output, W = ReLULayer("Generator.1", n_feats, DIM, noise, w_initialization=W1) - outs["FC1"] = output - weights["FC1"] = W - output, W = ReLULayer("Generator.2", DIM, DIM, output, w_initialization=W2) - outs["FC2"] = output - weights["FC2"] = W - output, W = ReLULayer("Generator.3", DIM, DIM, output, w_initialization=W3) - outs["FC3"] = output - weights["FC3"] = W - output, W = LinearLayer("Generator.4", DIM, n_feats, output, w_initialization=W4) - outs["FC4"] = output - weights["FC4"] = W - return output, outs, weights - - -def Discriminator(inputs, params=None): - n_feats = 2 - W1 = W2 = W3 = W4 = "he" - if params is not None: - W1 = params["critic"]["FC1"]["W"] - W2 = params["critic"]["FC2"]["W"] - W3 = params["critic"]["FC3"]["W"] - W4 = params["critic"]["FC4"]["W"] - DIM = params["g_hidden"] - n_feats = params["n_in"] - - outs = {} - weights = {} - output, W = ReLULayer("Discriminator.1", n_feats, DIM, inputs, w_initialization=W1) - outs["FC1"] = output - weights["FC1"] = W - - output, W = ReLULayer("Discriminator.2", DIM, DIM, output, w_initialization=W2) - outs["FC2"] = output - weights["FC2"] = W - - output, W = ReLULayer("Discriminator.3", DIM, DIM, output, w_initialization=W3) - outs["FC3"] = output - weights["FC3"] = W - - output, W = LinearLayer("Discriminator.4", DIM, 1, output, w_initialization=W4) - outs["FC4"] = output - weights["FC4"] = W - - # get bias - for var in params_with_name("Discriminator"): - if "1.b:" in var.name: - weights["FC1_b"] = var - elif "2.b:" in var.name: - weights["FC2_b"] = var - elif "3.b:" in var.name: - weights["FC3_b"] = var - elif "4.b:" in var.name: - weights["FC4_b"] = var - - return tf.reshape(output, [-1]), outs, weights - - -def WGAN_GP_tf(X, lambda_, params, batch_size): - batch_size = X.shape[0] - - # get alpha value - n_steps = params["n_steps"] - c_updates_per_epoch = params["c_updates_per_epoch"] - alpha = tf.convert_to_tensor(params["alpha"], dtype="float32") - - X_real = tf.placeholder(tf.float32, 
shape=[None, params["n_in"]]) - X_fake, G_out_X_fake, G_weights = Generator(batch_size, X_real, params) - - Y_real, C_out_Y_real, C_Y_real_weights = Discriminator(X_real, params) - Y_fake, C_out_Y_fake, C_Y_fake_weights = Discriminator(X_fake, params) - - # WGAN loss - mean_fake = tf.reduce_mean(Y_fake) - mean_real = tf.reduce_mean(Y_real) - - C_loss = tf.reduce_mean(Y_fake) - tf.reduce_mean(Y_real) - G_loss = -tf.reduce_mean(Y_fake) - - # WGAN gradient penalty - X_interp = alpha * X_real + ((1 - alpha) * X_fake) - Y_interp, C_out_Y_interp, C_Y_interp_weights = Discriminator(X_interp, params) - gradInterp = tf.gradients(Y_interp, [X_interp])[0] - - norm_gradInterp = tf.sqrt( - tf.reduce_sum(tf.square(gradInterp), reduction_indices=[1]) - ) - gradient_penalty = tf.reduce_mean((norm_gradInterp - 1) ** 2) - C_loss += lambda_ * gradient_penalty - - # extract gradient of Y_interp wrt. each layer output in critic - C_bwd_Y_interp = {} - for k, v in C_out_Y_interp.items(): - C_bwd_Y_interp[k] = tf.gradients(Y_interp, [v])[0] - - C_bwd_W = {} - for k, v in C_Y_interp_weights.items(): - C_bwd_W[k] = tf.gradients(C_loss, [v])[0] - - # get gradients - dC_Y_fake = tf.gradients(C_loss, [Y_fake])[0] - dC_Y_real = tf.gradients(C_loss, [Y_real])[0] - dC_gradInterp = tf.gradients(C_loss, [gradInterp])[0] - dG_Y_fake = tf.gradients(G_loss, [Y_fake])[0] - - with tf.Session() as session: - session.run(tf.global_variables_initializer()) - - for iteration in range(n_steps): - # Train critic - for i in range(c_updates_per_epoch): - _data = X - ( - _alpha, - _X_interp, - _Y_interp, - _gradInterp, - _norm_gradInterp, - _gradient_penalty, - _C_loss, - _X_fake, - _Y_fake, - _Y_real, - _dC_Y_fake, - _dC_Y_real, - _dC_gradInterp, - _dG_Y_fake, - _mean_fake, - _mean_real, - _G_weights_FC1, - _G_weights_FC2, - _G_weights_FC3, - _G_weights_FC4, - _G_fwd_X_fake_FC1, - _G_fwd_X_fake_FC2, - _G_fwd_X_fake_FC3, - _G_fwd_X_fake_FC4, - _C_weights_Y_fake_FC1, - _C_weights_Y_fake_FC2, - _C_weights_Y_fake_FC3, - _C_weights_Y_fake_FC4, - _C_fwd_Y_fake_FC1, - _C_fwd_Y_fake_FC2, - _C_fwd_Y_fake_FC3, - _C_fwd_Y_fake_FC4, - _C_weights_Y_real_FC1, - _C_weights_Y_real_FC2, - _C_weights_Y_real_FC3, - _C_weights_Y_real_FC4, - _C_fwd_Y_real_FC1, - _C_fwd_Y_real_FC2, - _C_fwd_Y_real_FC3, - _C_fwd_Y_real_FC4, - _C_weights_Y_interp_FC1, - _C_weights_Y_interp_FC2, - _C_weights_Y_interp_FC3, - _C_weights_Y_interp_FC4, - _C_dY_interp_wrt_FC1, - _C_dY_interp_wrt_FC2, - _C_dY_interp_wrt_FC3, - _C_dY_interp_wrt_FC4, - _C_fwd_Y_interp_FC1, - _C_fwd_Y_interp_FC2, - _C_fwd_Y_interp_FC3, - _C_fwd_Y_interp_FC4, - _C_dW_FC1, - _C_db_FC1, - _C_dW_FC2, - _C_db_FC2, - _C_dW_FC3, - _C_db_FC3, - _C_dW_FC4, - _C_db_FC4, - ) = session.run( - [ - alpha, - X_interp, - Y_interp, - gradInterp, - norm_gradInterp, - gradient_penalty, - C_loss, - X_fake, - Y_fake, - Y_real, - dC_Y_fake, - dC_Y_real, - dC_gradInterp, - dG_Y_fake, - mean_fake, - mean_real, - G_weights["FC1"], - G_weights["FC2"], - G_weights["FC3"], - G_weights["FC4"], - G_out_X_fake["FC1"], - G_out_X_fake["FC2"], - G_out_X_fake["FC3"], - G_out_X_fake["FC4"], - C_Y_fake_weights["FC1"], - C_Y_fake_weights["FC2"], - C_Y_fake_weights["FC3"], - C_Y_fake_weights["FC4"], - C_out_Y_fake["FC1"], - C_out_Y_fake["FC2"], - C_out_Y_fake["FC3"], - C_out_Y_fake["FC4"], - C_Y_real_weights["FC1"], - C_Y_real_weights["FC2"], - C_Y_real_weights["FC3"], - C_Y_real_weights["FC4"], - C_out_Y_real["FC1"], - C_out_Y_real["FC2"], - C_out_Y_real["FC3"], - C_out_Y_real["FC4"], - C_Y_interp_weights["FC1"], - 
C_Y_interp_weights["FC2"], - C_Y_interp_weights["FC3"], - C_Y_interp_weights["FC4"], - C_bwd_Y_interp["FC1"], - C_bwd_Y_interp["FC2"], - C_bwd_Y_interp["FC3"], - C_bwd_Y_interp["FC4"], - C_out_Y_interp["FC1"], - C_out_Y_interp["FC2"], - C_out_Y_interp["FC3"], - C_out_Y_interp["FC4"], - C_bwd_W["FC1"], - C_bwd_W["FC1_b"], - C_bwd_W["FC2"], - C_bwd_W["FC2_b"], - C_bwd_W["FC3"], - C_bwd_W["FC3_b"], - C_bwd_W["FC4"], - C_bwd_W["FC4_b"], - ], - feed_dict={X_real: _data}, - ) - - _G_loss = session.run(G_loss, feed_dict={X_real: _data}) - - grads = { - "X_real": _data, - "X_interp": _X_interp, - "G_weights_FC1": _G_weights_FC1, - "G_weights_FC2": _G_weights_FC2, - "G_weights_FC3": _G_weights_FC3, - "G_weights_FC4": _G_weights_FC4, - "G_fwd_X_fake_FC1": _G_fwd_X_fake_FC1, - "G_fwd_X_fake_FC2": _G_fwd_X_fake_FC2, - "G_fwd_X_fake_FC3": _G_fwd_X_fake_FC3, - "G_fwd_X_fake_FC4": _G_fwd_X_fake_FC4, - "X_fake": _X_fake, - "C_weights_Y_fake_FC1": _C_weights_Y_fake_FC1, - "C_weights_Y_fake_FC2": _C_weights_Y_fake_FC2, - "C_weights_Y_fake_FC3": _C_weights_Y_fake_FC3, - "C_weights_Y_fake_FC4": _C_weights_Y_fake_FC4, - "C_fwd_Y_fake_FC1": _C_fwd_Y_fake_FC1, - "C_fwd_Y_fake_FC2": _C_fwd_Y_fake_FC2, - "C_fwd_Y_fake_FC3": _C_fwd_Y_fake_FC3, - "C_fwd_Y_fake_FC4": _C_fwd_Y_fake_FC4, - "Y_fake": _Y_fake, - "C_weights_Y_real_FC1": _C_weights_Y_real_FC1, - "C_weights_Y_real_FC2": _C_weights_Y_real_FC2, - "C_weights_Y_real_FC3": _C_weights_Y_real_FC3, - "C_weights_Y_real_FC4": _C_weights_Y_real_FC4, - "C_fwd_Y_real_FC1": _C_fwd_Y_real_FC1, - "C_fwd_Y_real_FC2": _C_fwd_Y_real_FC2, - "C_fwd_Y_real_FC3": _C_fwd_Y_real_FC3, - "C_fwd_Y_real_FC4": _C_fwd_Y_real_FC4, - "Y_real": _Y_real, - "C_weights_Y_interp_FC1": _C_weights_Y_interp_FC1, - "C_weights_Y_interp_FC2": _C_weights_Y_interp_FC2, - "C_weights_Y_interp_FC3": _C_weights_Y_interp_FC3, - "C_weights_Y_interp_FC4": _C_weights_Y_interp_FC4, - "C_fwd_Y_interp_FC1": _C_fwd_Y_interp_FC1, - "C_fwd_Y_interp_FC2": _C_fwd_Y_interp_FC2, - "C_fwd_Y_interp_FC3": _C_fwd_Y_interp_FC3, - "C_fwd_Y_interp_FC4": _C_fwd_Y_interp_FC4, - "Y_interp": _Y_interp, - "dY_interp_wrt_FC1": _C_dY_interp_wrt_FC1, - "dY_interp_wrt_FC2": _C_dY_interp_wrt_FC2, - "dY_interp_wrt_FC3": _C_dY_interp_wrt_FC3, - "dY_interp_wrt_FC4": _C_dY_interp_wrt_FC4, - "gradInterp": _gradInterp, - "gradInterp_norm": _norm_gradInterp, - "G_loss": _G_loss, - "C_loss": _C_loss, - "dC_loss_dW_FC1": _C_dW_FC1, - "dC_loss_db_FC1": _C_db_FC1, - "dC_loss_dW_FC2": _C_dW_FC2, - "dC_loss_db_FC2": _C_db_FC2, - "dC_loss_dW_FC3": _C_dW_FC3, - "dC_loss_db_FC3": _C_db_FC3, - "dC_loss_dW_FC4": _C_dW_FC4, - "dC_loss_db_FC4": _C_db_FC4, - "dC_Y_fake": _dC_Y_fake, - "dC_Y_real": _dC_Y_real, - "dC_gradInterp": _dC_gradInterp, - "dG_Y_fake": _dG_Y_fake, - } - return grads - - -def TFNCELoss(X, target_word, L): - from tensorflow.python.ops.nn_impl import _compute_sampled_logits - from tensorflow.python.ops.nn_impl import sigmoid_cross_entropy_with_logits - - in_embed = tf.placeholder(tf.float32, shape=X.shape) - in_bias = tf.placeholder(tf.float32, shape=L.parameters["b"].flatten().shape) - in_weights = tf.placeholder(tf.float32, shape=L.parameters["W"].shape) - in_target_word = tf.placeholder(tf.int64) - in_neg_samples = tf.placeholder(tf.int32) - in_target_prob = tf.placeholder(tf.float32) - in_neg_samp_prob = tf.placeholder(tf.float32) - - feed = { - in_embed: X, - in_weights: L.parameters["W"], - in_target_word: target_word, - in_bias: L.parameters["b"].flatten(), - in_neg_samples: L.derived_variables["noise_samples"][0], - 
in_target_prob: L.derived_variables["noise_samples"][1], - in_neg_samp_prob: L.derived_variables["noise_samples"][2], - } - - # Compute the NCE loss, using a sample of the negative labels each time. - nce_unreduced = tf.nn.nce_loss( - weights=in_weights, - biases=in_bias, - labels=in_target_word, - inputs=in_embed, - sampled_values=(in_neg_samples, in_target_prob, in_neg_samp_prob), - num_sampled=L.num_negative_samples, - num_classes=L.n_classes, - ) - - loss = tf.reduce_sum(nce_unreduced) - dLdW = tf.gradients(loss, [in_weights])[0] - dLdb = tf.gradients(loss, [in_bias])[0] - dLdX = tf.gradients(loss, [in_embed])[0] - - sampled_logits, sampled_labels = _compute_sampled_logits( - weights=in_weights, - biases=in_bias, - labels=in_target_word, - inputs=in_embed, - sampled_values=(in_neg_samples, in_target_prob, in_neg_samp_prob), - num_sampled=L.num_negative_samples, - num_classes=L.n_classes, - num_true=1, - subtract_log_q=True, - ) - - sampled_losses = sigmoid_cross_entropy_with_logits( - labels=sampled_labels, logits=sampled_logits - ) - - with tf.Session() as session: - session.run(tf.global_variables_initializer()) - ( - _final_loss, - _nce_unreduced, - _dLdW, - _dLdb, - _dLdX, - _sampled_logits, - _sampled_labels, - _sampled_losses, - ) = session.run( - [ - loss, - nce_unreduced, - dLdW, - dLdb, - dLdX, - sampled_logits, - sampled_labels, - sampled_losses, - ], - feed_dict=feed, - ) - tf.reset_default_graph() - return { - "final_loss": _final_loss, - "nce_unreduced": _nce_unreduced, - "dLdW": _dLdW, - "dLdb": _dLdb, - "dLdX": _dLdX, - "out_logits": _sampled_logits, - "out_labels": _sampled_labels, - "sampled_loss": _sampled_losses, - } diff --git a/numpy_ml/ngram/tests.py b/numpy_ml/ngram/tests.py deleted file mode 100644 index 09f03b4..0000000 --- a/numpy_ml/ngram/tests.py +++ /dev/null @@ -1,250 +0,0 @@ -import nltk -import numpy as np - -from ..preprocessing.nlp import tokenize_words -from .ngram import AdditiveNGram, MLENGram - - -class MLEGold: - def __init__( - self, N, K=1, unk=True, filter_stopwords=True, filter_punctuation=True - ): - self.N = N - self.K = K - self.unk = unk - self.filter_stopwords = filter_stopwords - self.filter_punctuation = filter_punctuation - - self.hyperparameters = { - "N": N, - "K": K, - "unk": unk, - "filter_stopwords": filter_stopwords, - "filter_punctuation": filter_punctuation, - } - - super().__init__() - - def train(self, corpus_fp, vocab=None, encoding=None): - N = self.N - H = self.hyperparameters - models, counts = {}, {} - grams = {n: [] for n in range(1, N + 1)} - gg = {n: [] for n in range(1, N + 1)} - filter_punc, filter_stop = H["filter_punctuation"], H["filter_stopwords"] - - n_words = 0 - tokens = set([]) - - with open(corpus_fp, "r", encoding=encoding) as text: - for line in text: - words = tokenize_words(line, filter_punc, filter_stop) - - if vocab is not None: - words = vocab.filter(words, H["unk"]) - - if len(words) == 0: - continue - - n_words += len(words) - tokens.update(words) - - # calculate n, n-1, ... 
1-grams - for n in range(1, N + 1): - grams[n].append( - nltk.ngrams( - words, - n, - pad_left=True, - pad_right=True, - left_pad_symbol="", - right_pad_symbol="", - ) - ) - - gg[n].extend( - list( - nltk.ngrams( - words, - n, - pad_left=True, - pad_right=True, - left_pad_symbol="", - right_pad_symbol="", - ) - ) - ) - - for n in range(1, N + 1): - counts[n] = nltk.FreqDist(gg[n]) - models[n] = nltk.lm.MLE(order=n) - models[n].fit(grams[n], tokens) - - self.counts = counts - self.n_words = n_words - self._models = models - self.n_tokens = len(vocab) if vocab is not None else len(tokens) - - def log_prob(self, words, N): - assert N in self.counts, "You do not have counts for {}-grams".format(N) - - if N > len(words): - err = "Not enough words for a gram-size of {}: {}".format(N, len(words)) - raise ValueError(err) - - total_prob = 0 - for ngram in nltk.ngrams(words, N): - total_prob += self._log_ngram_prob(ngram) - return total_prob - - def _log_ngram_prob(self, ngram): - N = len(ngram) - return self._models[N].logscore(ngram[-1], ngram[:-1]) - - -class AdditiveGold: - def __init__( - self, N, K=1, unk=True, filter_stopwords=True, filter_punctuation=True - ): - self.N = N - self.K = K - self.unk = unk - self.filter_stopwords = filter_stopwords - self.filter_punctuation = filter_punctuation - - self.hyperparameters = { - "N": N, - "K": K, - "unk": unk, - "filter_stopwords": filter_stopwords, - "filter_punctuation": filter_punctuation, - } - - super().__init__() - - def train(self, corpus_fp, vocab=None, encoding=None): - N = self.N - H = self.hyperparameters - models, counts = {}, {} - grams = {n: [] for n in range(1, N + 1)} - gg = {n: [] for n in range(1, N + 1)} - filter_punc, filter_stop = H["filter_punctuation"], H["filter_stopwords"] - - n_words = 0 - tokens = set() - - with open(corpus_fp, "r", encoding=encoding) as text: - for line in text: - words = tokenize_words(line, filter_punc, filter_stop) - - if vocab is not None: - words = vocab.filter(words, H["unk"]) - - if len(words) == 0: - continue - - n_words += len(words) - tokens.update(words) - - # calculate n, n-1, ... 
1-grams - for n in range(1, N + 1): - grams[n].append( - nltk.ngrams( - words, - n, - pad_left=True, - pad_right=True, - left_pad_symbol="", - right_pad_symbol="", - ) - ) - - gg[n].extend( - list( - nltk.ngrams( - words, - n, - pad_left=True, - pad_right=True, - left_pad_symbol="", - right_pad_symbol="", - ) - ) - ) - - for n in range(1, N + 1): - counts[n] = nltk.FreqDist(gg[n]) - models[n] = nltk.lm.Lidstone(order=n, gamma=self.K) - models[n].fit(grams[n], tokens) - - self.counts = counts - self._models = models - self.n_words = n_words - self.n_tokens = len(vocab) if vocab is not None else len(tokens) - - def log_prob(self, words, N): - assert N in self.counts, "You do not have counts for {}-grams".format(N) - - if N > len(words): - err = "Not enough words for a gram-size of {}: {}".format(N, len(words)) - raise ValueError(err) - - total_prob = 0 - for ngram in nltk.ngrams(words, N): - total_prob += self._log_ngram_prob(ngram) - return total_prob - - def _log_ngram_prob(self, ngram): - N = len(ngram) - return self._models[N].logscore(ngram[-1], ngram[:-1]) - - -def test_mle(): - N = np.random.randint(2, 5) - gold = MLEGold(N, unk=True, filter_stopwords=False, filter_punctuation=False) - mine = MLENGram(N, unk=True, filter_stopwords=False, filter_punctuation=False) - - gold.train("russell.txt", encoding="utf-8-sig") - mine.train("russell.txt", encoding="utf-8-sig") - - for k in mine.counts[N].keys(): - if k[0] == k[1] and k[0] in ("", ""): - continue - - err_str = "{}, mine: {}, gold: {}" - assert mine.counts[N][k] == gold.counts[N][k], err_str.format( - k, mine.counts[N][k], gold.counts[N][k] - ) - - M = mine.log_prob(k, N) - G = gold.log_prob(k, N) / np.log2(np.e) # convert to log base e - np.testing.assert_allclose(M, G) - print("PASSED") - - -def test_additive(): - K = np.random.rand() - N = np.random.randint(2, 5) - gold = AdditiveGold( - N, K, unk=True, filter_stopwords=False, filter_punctuation=False - ) - mine = AdditiveNGram( - N, K, unk=True, filter_stopwords=False, filter_punctuation=False - ) - - gold.train("russell.txt", encoding="utf-8-sig") - mine.train("russell.txt", encoding="utf-8-sig") - - for k in mine.counts[N].keys(): - if k[0] == k[1] and k[0] in ("", ""): - continue - - err_str = "{}, mine: {}, gold: {}" - assert mine.counts[N][k] == gold.counts[N][k], err_str.format( - k, mine.counts[N][k], gold.counts[N][k] - ) - - M = mine.log_prob(k, N) - G = gold.log_prob(k, N) / np.log2(np.e) # convert to log base e - np.testing.assert_allclose(M, G) - print("PASSED") diff --git a/numpy_ml/nonparametric/tests.py b/numpy_ml/nonparametric/tests.py deleted file mode 100644 index 9c8d443..0000000 --- a/numpy_ml/nonparametric/tests.py +++ /dev/null @@ -1,106 +0,0 @@ -import numpy as np - -from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier -from sklearn.gaussian_process import GaussianProcessRegressor - -from .knn import KNN -from .gp import GPRegression -from ..utils.distance_metrics import euclidean - - -def test_knn_regression(): - while True: - N = np.random.randint(2, 100) - M = np.random.randint(2, 100) - k = np.random.randint(1, N) - ls = np.min([np.random.randint(1, 10), N - 1]) - weights = np.random.choice(["uniform", "distance"]) - - X = np.random.rand(N, M) - X_test = np.random.rand(N, M) - y = np.random.rand(N) - - knn = KNN( - k=k, leaf_size=ls, metric=euclidean, classifier=False, weights=weights - ) - knn.fit(X, y) - preds = knn.predict(X_test) - - gold = KNeighborsRegressor( - p=2, - leaf_size=ls, - n_neighbors=k, - weights=weights, - 
metric="minkowski", - algorithm="ball_tree", - ) - gold.fit(X, y) - gold_preds = gold.predict(X_test) - - for mine, theirs in zip(preds, gold_preds): - np.testing.assert_almost_equal(mine, theirs) - print("PASSED") - - -def test_knn_clf(): - while True: - N = np.random.randint(2, 100) - M = np.random.randint(2, 100) - k = np.random.randint(1, N) - n_classes = np.random.randint(10) - ls = np.min([np.random.randint(1, 10), N - 1]) - weights = "uniform" - - X = np.random.rand(N, M) - X_test = np.random.rand(N, M) - y = np.random.randint(0, n_classes, size=N) - - knn = KNN(k=k, leaf_size=ls, metric=euclidean, classifier=True, weights=weights) - knn.fit(X, y) - preds = knn.predict(X_test) - - gold = KNeighborsClassifier( - p=2, - leaf_size=ls, - n_neighbors=k, - weights=weights, - metric="minkowski", - algorithm="ball_tree", - ) - gold.fit(X, y) - gold_preds = gold.predict(X_test) - - for mine, theirs in zip(preds, gold_preds): - np.testing.assert_almost_equal(mine, theirs) - print("PASSED") - - -def test_gp_regression(): - while True: - alpha = np.random.rand() - N = np.random.randint(2, 100) - M = np.random.randint(2, 100) - K = np.random.randint(1, N) - J = np.random.randint(1, 3) - - X = np.random.rand(N, M) - y = np.random.rand(N, J) - X_test = np.random.rand(K, M) - - gp = GPRegression(kernel="RBFKernel(sigma=1)", alpha=alpha) - gold = GaussianProcessRegressor( - kernel=None, alpha=alpha, optimizer=None, normalize_y=False - ) - - gp.fit(X, y) - gold.fit(X, y) - - preds, _ = gp.predict(X_test) - gold_preds = gold.predict(X_test) - np.testing.assert_almost_equal(preds, gold_preds) - - mll = gp.marginal_log_likelihood() - gold_mll = gold.log_marginal_likelihood() - np.testing.assert_almost_equal(mll, gold_mll) - - print("PASSED") diff --git a/numpy_ml/bandits/plots.py b/numpy_ml/plots/bandit_plots.py similarity index 94% rename from numpy_ml/bandits/plots.py rename to numpy_ml/plots/bandit_plots.py index 9dd99fc..e39a30f 100644 --- a/numpy_ml/bandits/plots.py +++ b/numpy_ml/plots/bandit_plots.py @@ -4,15 +4,20 @@ import numpy as np -from .bandits import ( +from numpy_ml.bandits import ( MultinomialBandit, BernoulliBandit, ShortestPathBandit, ContextualLinearBandit, ) -from .trainer import BanditTrainer -from .policies import EpsilonGreedy, UCB1, ThompsonSamplingBetaBinomial, LinUCB -from ..utils.graphs import random_DAG, DiGraph, Edge +from numpy_ml.bandits.trainer import BanditTrainer +from numpy_ml.bandits.policies import ( + EpsilonGreedy, + UCB1, + ThompsonSamplingBetaBinomial, + LinUCB, +) +from numpy_ml.utils.graphs import random_DAG, DiGraph, Edge def random_multinomial_mab(n_arms=10, n_choices_per_arm=5, reward_range=[0, 1]): diff --git a/numpy_ml/gmm/plots.py b/numpy_ml/plots/gmm_plots.py similarity index 98% rename from numpy_ml/gmm/plots.py rename to numpy_ml/plots/gmm_plots.py index 00ed952..56f9232 100644 --- a/numpy_ml/gmm/plots.py +++ b/numpy_ml/plots/gmm_plots.py @@ -1,10 +1,9 @@ +# flake8: noqa import numpy as np from sklearn.datasets.samples_generator import make_blobs from scipy.stats import multivariate_normal -import matplotlib -matplotlib.use("TkAgg") import matplotlib.pyplot as plt import seaborn as sns @@ -13,7 +12,7 @@ sns.set_style("white") sns.set_context("paper", font_scale=1) -from .gmm import GMM +from numpy_ml.gmm import GMM from matplotlib.colors import ListedColormap diff --git a/numpy_ml/hmm/plots.py b/numpy_ml/plots/hmm_plots.py similarity index 98% rename from numpy_ml/hmm/plots.py rename to numpy_ml/plots/hmm_plots.py index f400d5d..7330c55 100644 --- 
a/numpy_ml/hmm/plots.py +++ b/numpy_ml/plots/hmm_plots.py @@ -1,8 +1,5 @@ # flake8: noqa import numpy as np -import matplotlib - -matplotlib.use("TkAgg") from matplotlib import pyplot as plt import seaborn as sns @@ -13,7 +10,7 @@ sns.set_context("notebook", font_scale=0.8) from hmmlearn.hmm import MultinomialHMM as MHMM -from .hmm import MultinomialHMM +from numpy_ml.hmm import MultinomialHMM def generate_training_data(params, n_steps=500, n_examples=15): diff --git a/numpy_ml/lda/plots.py b/numpy_ml/plots/lda_plots.py similarity index 97% rename from numpy_ml/lda/plots.py rename to numpy_ml/plots/lda_plots.py index 51eda85..18b584a 100644 --- a/numpy_ml/lda/plots.py +++ b/numpy_ml/plots/lda_plots.py @@ -1,9 +1,5 @@ # flake8: noqa import numpy as np - -import matplotlib - -matplotlib.use("TkAgg") import matplotlib.pyplot as plt import seaborn as sns @@ -14,7 +10,7 @@ np.random.seed(12345) -from .lda import LDA +from numpy_ml.lda import LDA def generate_corpus(): diff --git a/numpy_ml/linear_models/plots.py b/numpy_ml/plots/lm_plots.py similarity index 99% rename from numpy_ml/linear_models/plots.py rename to numpy_ml/plots/lm_plots.py index 4a91d48..6f9d455 100644 --- a/numpy_ml/linear_models/plots.py +++ b/numpy_ml/plots/lm_plots.py @@ -1,3 +1,4 @@ +# flake8: noqa import numpy as np from sklearn.model_selection import train_test_split @@ -6,9 +7,6 @@ from sklearn.datasets import make_regression from sklearn.metrics import zero_one_loss -import matplotlib - -matplotlib.use("TkAgg") import matplotlib.pyplot as plt import seaborn as sns @@ -19,7 +17,7 @@ sns.set_context("paper", font_scale=0.5) -from .lm import ( +from numpy_ml.linear_models import ( RidgeRegression, LinearRegression, BayesianLinearRegressionKnownVariance, diff --git a/numpy_ml/ngram/plots.py b/numpy_ml/plots/ngram_plots.py similarity index 97% rename from numpy_ml/ngram/plots.py rename to numpy_ml/plots/ngram_plots.py index 0123e5e..27b0c94 100644 --- a/numpy_ml/ngram/plots.py +++ b/numpy_ml/plots/ngram_plots.py @@ -1,3 +1,4 @@ +# flake8: noqa import numpy as np import matplotlib.pyplot as plt @@ -8,7 +9,7 @@ sns.set_style("white") sns.set_context("notebook", font_scale=1) -from .ngram import MLENGram, AdditiveNGram, GoodTuringNGram +from numpy_ml.ngram import MLENGram, AdditiveNGram, GoodTuringNGram def plot_count_models(GT, N): diff --git a/numpy_ml/neural_nets/activations/plots.py b/numpy_ml/plots/nn_activations_plots.py similarity index 95% rename from numpy_ml/neural_nets/activations/plots.py rename to numpy_ml/plots/nn_activations_plots.py index c293f8a..b8db1fb 100644 --- a/numpy_ml/neural_nets/activations/plots.py +++ b/numpy_ml/plots/nn_activations_plots.py @@ -1,7 +1,5 @@ +# flake8: noqa import numpy as np -import matplotlib - -matplotlib.use("TkAgg") import matplotlib.pyplot as plt import seaborn as sns @@ -10,7 +8,7 @@ sns.set_style("white") sns.set_context("notebook", font_scale=0.7) -from .activations import ( +from numpy_ml.neural_nets.activations import ( Affine, ReLU, LeakyReLU, diff --git a/numpy_ml/neural_nets/schedulers/plots.py b/numpy_ml/plots/nn_schedulers_plots.py similarity index 98% rename from numpy_ml/neural_nets/schedulers/plots.py rename to numpy_ml/plots/nn_schedulers_plots.py index de11150..e18149b 100644 --- a/numpy_ml/neural_nets/schedulers/plots.py +++ b/numpy_ml/plots/nn_schedulers_plots.py @@ -2,9 +2,6 @@ import time import numpy as np -import matplotlib - -matplotlib.use("TkAgg") import matplotlib.pyplot as plt import seaborn as sns @@ -13,7 +10,7 @@ sns.set_style("white") 
sns.set_context("notebook", font_scale=0.7) -from .schedulers import ( +from numpy_ml.neural_nets.schedulers import ( ConstantScheduler, ExponentialScheduler, NoamScheduler, diff --git a/numpy_ml/nonparametric/plots.py b/numpy_ml/plots/nonparametric_plots.py similarity index 98% rename from numpy_ml/nonparametric/plots.py rename to numpy_ml/plots/nonparametric_plots.py index d6027ce..5671a1f 100644 --- a/numpy_ml/nonparametric/plots.py +++ b/numpy_ml/plots/nonparametric_plots.py @@ -1,3 +1,4 @@ +# flake8: noqa import numpy as np import matplotlib.pyplot as plt @@ -8,10 +9,8 @@ sns.set_style("white") sns.set_context("paper", font_scale=0.5) -from .gp import GPRegression -from ..linear_models.lm import LinearRegression -from .kernel_regression import KernelRegression -from .knn import KNN +from numpy_ml.nonparametric import GPRegression, KNN, KernelRegression +from numpy_ml.linear_models.lm import LinearRegression from sklearn.model_selection import train_test_split diff --git a/numpy_ml/rl_models/plots.py b/numpy_ml/plots/rl_plots.py similarity index 96% rename from numpy_ml/rl_models/plots.py rename to numpy_ml/plots/rl_plots.py index 8b5469c..30bfbb1 100644 --- a/numpy_ml/rl_models/plots.py +++ b/numpy_ml/plots/rl_plots.py @@ -1,8 +1,8 @@ # flake8: noqa import gym -from .trainer import Trainer -from .agents import ( +from numpy_ml.rl_models.trainer import Trainer +from numpy_ml.rl_models.agents import ( CrossEntropyAgent, MonteCarloAgent, TemporalDifferenceAgent, diff --git a/numpy_ml/trees/plots.py b/numpy_ml/plots/trees_plots.py similarity index 98% rename from numpy_ml/trees/plots.py rename to numpy_ml/plots/trees_plots.py index 4ede83a..74de5f5 100644 --- a/numpy_ml/trees/plots.py +++ b/numpy_ml/plots/trees_plots.py @@ -14,9 +14,7 @@ sns.set_style("white") sns.set_context("paper", font_scale=0.9) -from .gbdt import GradientBoostedDecisionTree -from .dt import DecisionTree -from .rf import RandomForest +from numpy_ml.trees import GradientBoostedDecisionTree, DecisionTree, RandomForest def plot(): diff --git a/numpy_ml/preprocessing/tests.py b/numpy_ml/preprocessing/tests.py deleted file mode 100644 index 333cc9a..0000000 --- a/numpy_ml/preprocessing/tests.py +++ /dev/null @@ -1,210 +0,0 @@ -from collections import Counter - -# gold-standard imports -import huffman -import numpy as np - -from scipy.fftpack import dct - -from sklearn.preprocessing import StandardScaler -from sklearn.feature_extraction.text import TfidfVectorizer - -from librosa.core.time_frequency import fft_frequencies -from librosa.feature import mfcc as lr_mfcc -from librosa.util import frame -from librosa.filters import mel - -# numpy-ml implementations -from .general import Standardizer -from .nlp import HuffmanEncoder, TFIDFEncoder -from .dsp import DCT, DFT, mfcc, to_frames, mel_filterbank, dft_bins -from ..utils.testing import random_paragraph - - -def test_huffman(): - while True: - n_words = np.random.randint(1, 100) - para = random_paragraph(n_words) - HT = HuffmanEncoder() - HT.fit(para) - my_dict = HT._item2code - their_dict = huffman.codebook(Counter(para).items()) - - for k, v in their_dict.items(): - fstr = "their_dict['{}'] = {}, but my_dict['{}'] = {}" - assert k in my_dict, "key `{}` not in my_dict".format(k) - assert my_dict[k] == v, fstr.format(k, v, k, my_dict[k]) - print("PASSED") - - -def test_standardizer(): - while True: - mean = bool(np.random.randint(2)) - std = bool(np.random.randint(2)) - N = np.random.randint(2, 100) - M = np.random.randint(2, 100) - X = np.random.rand(N, M) - - S = 
Standardizer(with_mean=mean, with_std=std) - S.fit(X) - mine = S.transform(X) - - theirs = StandardScaler(with_mean=mean, with_std=std) - gold = theirs.fit_transform(X) - - np.testing.assert_almost_equal(mine, gold) - print("PASSED") - - -def test_tfidf(): - while True: - docs = [] - n_docs = np.random.randint(1, 10) - for d in range(n_docs): - n_lines = np.random.randint(1, 1000) - lines = [random_paragraph(np.random.randint(1, 10)) for _ in range(n_lines)] - docs.append("\n".join([" ".join(l) for l in lines])) - - smooth = bool(np.random.randint(2)) - - tfidf = TFIDFEncoder( - lowercase=True, - min_count=0, - smooth_idf=smooth, - max_tokens=None, - input_type="strings", - filter_stopwords=False, - ) - gold = TfidfVectorizer( - input="content", - norm=None, - use_idf=True, - lowercase=True, - smooth_idf=smooth, - sublinear_tf=False, - ) - - tfidf.fit(docs) - mine = tfidf.transform(ignore_special_chars=True) - theirs = gold.fit_transform(docs).toarray() - - np.testing.assert_almost_equal(mine, theirs) - print("PASSED") - - -def test_dct(): - while True: - N = np.random.randint(2, 100) - signal = np.random.rand(N) - ortho = bool(np.random.randint(2)) - mine = DCT(signal, orthonormal=ortho) - theirs = dct(signal, norm="ortho" if ortho else None) - - np.testing.assert_almost_equal(mine, theirs) - print("PASSED") - - -def test_dft(): - while True: - N = np.random.randint(2, 100) - signal = np.random.rand(N) - mine = DFT(signal) - theirs = np.fft.rfft(signal) - - np.testing.assert_almost_equal(mine.real, theirs.real) - print("PASSED") - - -def test_mfcc(): - """Broken""" - while True: - N = np.random.randint(500, 100000) - fs = np.random.randint(50, 10000) - n_mfcc = 12 - window_len = 100 - stride_len = 50 - n_filters = 20 - window_dur = window_len / fs - stride_dur = stride_len / fs - signal = np.random.rand(N) - # ff = frame(signal, frame_length=window_len, hop_length=stride_len).T - # print(len(ff)) - - mine = mfcc( - signal, - fs=fs, - window="hann", - window_duration=window_dur, - stride_duration=stride_dur, - lifter_coef=0, - alpha=0, - n_mfccs=n_mfcc, - normalize=False, - center=True, - n_filters=n_filters, - replace_intercept=False, - ) - - theirs = lr_mfcc( - signal, - sr=fs, - n_mels=n_filters, - n_mfcc=n_mfcc, - n_fft=window_len, - hop_length=stride_len, - htk=True, - ).T - - np.testing.assert_almost_equal(mine, theirs, decimal=5) - print("PASSED") - - -def test_framing(): - while True: - N = np.random.randint(500, 100000) - window_len = np.random.randint(10, 100) - stride_len = np.random.randint(1, 50) - signal = np.random.rand(N) - - mine = to_frames(signal, window_len, stride_len, writeable=False) - theirs = frame(signal, frame_length=window_len, hop_length=stride_len).T - - assert len(mine) == len(theirs), "len(mine) = {}, len(theirs) = {}".format( - len(mine), len(theirs) - ) - np.testing.assert_almost_equal(mine, theirs) - print("PASSED") - - -def test_dft_bins(): - while True: - N = np.random.randint(500, 100000) - fs = np.random.randint(50, 1000) - - mine = dft_bins(N, fs=fs, positive_only=True) - theirs = fft_frequencies(fs, N) - np.testing.assert_almost_equal(mine, theirs) - print("PASSED") - - -def test_mel_filterbank(): - while True: - fs = np.random.randint(50, 10000) - n_filters = np.random.randint(2, 20) - window_len = np.random.randint(10, 100) - norm = np.random.randint(2) - - mine = mel_filterbank( - window_len, n_filters, fs, min_freq=0, max_freq=None, normalize=bool(norm) - ) - - theirs = mel( - fs, - n_fft=window_len, - n_mels=n_filters, - htk=True, - norm=norm 
if norm == 1 else None, - ) - - np.testing.assert_almost_equal(mine, theirs) - print("PASSED") diff --git a/numpy_ml/trees/tests.py b/numpy_ml/trees/tests.py deleted file mode 100644 index 87130df..0000000 --- a/numpy_ml/trees/tests.py +++ /dev/null @@ -1,522 +0,0 @@ -import numpy as np - -from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor -from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor -from sklearn.metrics import accuracy_score, mean_squared_error -from sklearn.datasets import make_regression -from sklearn.datasets.samples_generator import make_blobs -from sklearn.model_selection import train_test_split - -import matplotlib - -matplotlib.use("TkAgg") -import matplotlib.pyplot as plt - -# https://seaborn.pydata.org/generated/seaborn.set_context.html -# https://seaborn.pydata.org/generated/seaborn.set_style.html -import seaborn as sns - -sns.set_style("white") -sns.set_context("paper", font_scale=0.9) - -from .gbdt import GradientBoostedDecisionTree -from .dt import DecisionTree, Node, Leaf -from .rf import RandomForest - - -def random_tensor(shape, standardize=False): - eps = np.finfo(float).eps - offset = np.random.randint(-300, 300, shape) - X = np.random.rand(*shape) + offset - - if standardize: - X = (X - X.mean(axis=0)) / (X.std(axis=0) + eps) - return X - - -def clone_tree(dtree): - children_left = dtree.tree_.children_left - children_right = dtree.tree_.children_right - feature = dtree.tree_.feature - threshold = dtree.tree_.threshold - values = dtree.tree_.value - - def grow(node_id): - l, r = children_left[node_id], children_right[node_id] - if l == r: - return Leaf(values[node_id].argmax()) - n = Node(None, None, (feature[node_id], threshold[node_id])) - n.left = grow(l) - n.right = grow(r) - return n - - node_id = 0 - root = Node(None, None, (feature[node_id], threshold[node_id])) - root.left = grow(children_left[node_id]) - root.right = grow(children_right[node_id]) - return root - - -def compare_trees(mine, gold): - clone = clone_tree(gold) - mine = mine.root - - def test(mine, clone): - if isinstance(clone, Node) and isinstance(mine, Node): - assert mine.feature == clone.feature, "Node {} not equal".format(depth) - np.testing.assert_allclose(mine.threshold, clone.threshold) - test(mine.left, clone.left, depth + 1) - test(mine.right, clone.right, depth + 1) - elif isinstance(clone, Leaf) and isinstance(mine, Leaf): - np.testing.assert_allclose(mine.value, clone.value) - return - else: - raise ValueError("Nodes at depth {} are not equal".format(depth)) - - depth = 0 - ok = True - while ok: - if isinstance(clone, Node) and isinstance(mine, Node): - assert mine.feature == clone.feature - np.testing.assert_allclose(mine.threshold, clone.threshold) - test(mine.left, clone.left, depth + 1) - test(mine.right, clone.right, depth + 1) - elif isinstance(clone, Leaf) and isinstance(mine, Leaf): - np.testing.assert_allclose(mine.value, clone.value) - return - else: - raise ValueError("Nodes at depth {} are not equal".format(depth)) - - -def test_DecisionTree(): - i = 1 - np.random.seed(12345) - while True: - n_ex = np.random.randint(2, 100) - n_feats = np.random.randint(2, 100) - max_depth = np.random.randint(1, 5) - - classifier = np.random.choice([True, False]) - if classifier: - # create classification problem - n_classes = np.random.randint(2, 10) - X, Y = make_blobs( - n_samples=n_ex, centers=n_classes, n_features=n_feats, random_state=i - ) - X, X_test, Y, Y_test = train_test_split(X, Y, test_size=0.3, random_state=i) - - # initialize 
model - def loss(yp, y): - return 1 - accuracy_score(yp, y) - - criterion = np.random.choice(["entropy", "gini"]) - mine = DecisionTree( - classifier=classifier, max_depth=max_depth, criterion=criterion - ) - gold = DecisionTreeClassifier( - criterion=criterion, - max_depth=max_depth, - splitter="best", - random_state=i, - ) - else: - # create regeression problem - X, Y = make_regression(n_samples=n_ex, n_features=n_feats, random_state=i) - X, X_test, Y, Y_test = train_test_split(X, Y, test_size=0.3, random_state=i) - - # initialize model - criterion = "mse" - loss = mean_squared_error - mine = DecisionTree( - criterion=criterion, max_depth=max_depth, classifier=classifier - ) - gold = DecisionTreeRegressor( - criterion=criterion, max_depth=max_depth, splitter="best" - ) - - print("Trial {}".format(i)) - print("\tClassifier={}, criterion={}".format(classifier, criterion)) - print("\tmax_depth={}, n_feats={}, n_ex={}".format(max_depth, n_feats, n_ex)) - if classifier: - print("\tn_classes: {}".format(n_classes)) - - # fit 'em - mine.fit(X, Y) - gold.fit(X, Y) - - # get preds on training set - y_pred_mine = mine.predict(X) - y_pred_gold = gold.predict(X) - - loss_mine = loss(y_pred_mine, Y) - loss_gold = loss(y_pred_gold, Y) - - # get preds on test set - y_pred_mine_test = mine.predict(X_test) - y_pred_gold_test = gold.predict(X_test) - - loss_mine_test = loss(y_pred_mine_test, Y_test) - loss_gold_test = loss(y_pred_gold_test, Y_test) - - try: - np.testing.assert_almost_equal(loss_mine, loss_gold) - print("\tLoss on training: {}".format(loss_mine)) - except AssertionError as e: - print("\tTraining losses not equal:\n{}".format(e)) - - try: - np.testing.assert_almost_equal(loss_mine_test, loss_gold_test) - print("\tLoss on test: {}".format(loss_mine_test)) - except AssertionError as e: - print("\tTest losses not equal:\n{}".format(e)) - i += 1 - - -def test_RandomForest(): - np.random.seed(12345) - i = 1 - while True: - n_ex = np.random.randint(2, 100) - n_feats = np.random.randint(2, 100) - n_trees = np.random.randint(2, 100) - max_depth = np.random.randint(1, 5) - - classifier = np.random.choice([True, False]) - if classifier: - # create classification problem - n_classes = np.random.randint(2, 10) - X, Y = make_blobs( - n_samples=n_ex, centers=n_classes, n_features=n_feats, random_state=i - ) - X, X_test, Y, Y_test = train_test_split(X, Y, test_size=0.3, random_state=i) - - # initialize model - def loss(yp, y): - return 1 - accuracy_score(yp, y) - - # initialize model - criterion = np.random.choice(["entropy", "gini"]) - mine = RandomForest( - classifier=classifier, - n_feats=n_feats, - n_trees=n_trees, - criterion=criterion, - max_depth=max_depth, - ) - gold = RandomForestClassifier( - n_estimators=n_trees, - max_features=n_feats, - criterion=criterion, - max_depth=max_depth, - bootstrap=True, - ) - else: - # create regeression problem - X, Y = make_regression(n_samples=n_ex, n_features=n_feats, random_state=i) - X, X_test, Y, Y_test = train_test_split(X, Y, test_size=0.3, random_state=i) - - # initialize model - criterion = "mse" - loss = mean_squared_error - mine = RandomForest( - criterion=criterion, - n_feats=n_feats, - n_trees=n_trees, - max_depth=max_depth, - classifier=classifier, - ) - gold = RandomForestRegressor( - n_estimators=n_trees, - max_features=n_feats, - criterion=criterion, - max_depth=max_depth, - bootstrap=True, - ) - - print("Trial {}".format(i)) - print("\tClassifier={}, criterion={}".format(classifier, criterion)) - print("\tmax_depth={}, n_feats={}, 
n_ex={}".format(max_depth, n_feats, n_ex)) - if classifier: - print("\tn_classes: {}".format(n_classes)) - - # fit 'em - mine.fit(X, Y) - gold.fit(X, Y) - - # get preds - y_pred_mine = mine.predict(X) - y_pred_gold = gold.predict(X) - - loss_mine = loss(y_pred_mine, Y) - loss_gold = loss(y_pred_gold, Y) - - # get preds on test set - y_pred_mine_test = mine.predict(X_test) - y_pred_gold_test = gold.predict(X_test) - - loss_mine_test = loss(y_pred_mine_test, Y_test) - loss_gold_test = loss(y_pred_gold_test, Y_test) - - try: - np.testing.assert_almost_equal(loss_mine, loss_gold) - print("\tLoss on training: {}".format(loss_mine)) - except AssertionError as e: - print("\tTraining losses not equal:\n{}".format(e)) - - try: - np.testing.assert_almost_equal(loss_mine_test, loss_gold_test) - print("\tLoss on test: {}".format(loss_mine_test)) - except AssertionError as e: - print("\tTest losses not equal:\n{}".format(e)) - - print("PASSED") - i += 1 - - -def test_gbdt(): - np.random.seed(12345) - i = 1 - while True: - n_ex = np.random.randint(2, 100) - n_feats = np.random.randint(2, 100) - n_trees = np.random.randint(2, 100) - max_depth = np.random.randint(1, 5) - - classifier = np.random.choice([True, False]) - if classifier: - # create classification problem - n_classes = np.random.randint(2, 10) - X, Y = make_blobs( - n_samples=n_ex, centers=n_classes, n_features=n_feats, random_state=i - ) - X, X_test, Y, Y_test = train_test_split(X, Y, test_size=0.3, random_state=i) - - # initialize model - def loss(yp, y): - return 1 - accuracy_score(yp, y) - - # initialize model - criterion = np.random.choice(["entropy", "gini"]) - mine = GradientBoostedDecisionTree( - classifier=classifier, - n_trees=n_trees, - max_depth=max_depth, - learning_rate=0.1, - loss="crossentropy", - step_size="constant", - split_criterion=criterion, - ) - gold = RandomForestClassifier( - n_estimators=n_trees, - max_features=n_feats, - criterion=criterion, - max_depth=max_depth, - bootstrap=True, - ) - else: - # create regeression problem - X, Y = make_regression(n_samples=n_ex, n_features=n_feats, random_state=i) - X, X_test, Y, Y_test = train_test_split(X, Y, test_size=0.3, random_state=i) - - # initialize model - criterion = "mse" - loss = mean_squared_error - mine = GradientBoostedDecisionTree( - n_trees=n_trees, - max_depth=max_depth, - classifier=classifier, - learning_rate=0.1, - loss="mse", - step_size="constant", - split_criterion=criterion, - ) - gold = RandomForestRegressor( - n_estimators=n_trees, - max_features=n_feats, - criterion=criterion, - max_depth=max_depth, - bootstrap=True, - ) - - print("Trial {}".format(i)) - print("\tClassifier={}, criterion={}".format(classifier, criterion)) - print("\tmax_depth={}, n_feats={}, n_ex={}".format(max_depth, n_feats, n_ex)) - if classifier: - print("\tn_classes: {}".format(n_classes)) - - # fit 'em - mine.fit(X, Y) - gold.fit(X, Y) - - # get preds - y_pred_mine = mine.predict(X) - y_pred_gold = gold.predict(X) - - loss_mine = loss(y_pred_mine, Y) - loss_gold = loss(y_pred_gold, Y) - - # get preds on test set - y_pred_mine_test = mine.predict(X_test) - y_pred_gold_test = gold.predict(X_test) - - loss_mine_test = loss(y_pred_mine_test, Y_test) - loss_gold_test = loss(y_pred_gold_test, Y_test) - - try: - np.testing.assert_almost_equal(loss_mine, loss_gold) - print("\tLoss on training: {}".format(loss_mine)) - except AssertionError as e: - print("\tTraining losses not equal:\n{}".format(e)) - - try: - np.testing.assert_almost_equal(loss_mine_test, loss_gold_test) - print("\tLoss 
on test: {}".format(loss_mine_test)) - except AssertionError as e: - print("\tTest losses not equal:\n{}".format(e)) - - print("PASSED") - i += 1 - - -def plot(): - fig, axes = plt.subplots(4, 4) - fig.set_size_inches(10, 10) - for ax in axes.flatten(): - n_ex = 100 - n_trees = 50 - n_feats = np.random.randint(2, 100) - max_depth_d = np.random.randint(1, 100) - max_depth_r = np.random.randint(1, 10) - - classifier = np.random.choice([True, False]) - if classifier: - # create classification problem - n_classes = np.random.randint(2, 10) - X, Y = make_blobs(n_samples=n_ex, centers=n_classes, n_features=2) - X, X_test, Y, Y_test = train_test_split(X, Y, test_size=0.3) - n_feats = min(n_feats, X.shape[1]) - - # initialize model - def loss(yp, y): - return accuracy_score(yp, y) - - # initialize model - criterion = np.random.choice(["entropy", "gini"]) - mine = RandomForest( - classifier=classifier, - n_feats=n_feats, - n_trees=n_trees, - criterion=criterion, - max_depth=max_depth_r, - ) - mine_d = DecisionTree( - criterion=criterion, max_depth=max_depth_d, classifier=classifier - ) - mine_g = GradientBoostedDecisionTree( - n_trees=n_trees, - max_depth=max_depth_d, - classifier=classifier, - learning_rate=1, - loss="crossentropy", - step_size="constant", - split_criterion=criterion, - ) - - else: - # create regeression problem - X, Y = make_regression(n_samples=n_ex, n_features=1) - X, X_test, Y, Y_test = train_test_split(X, Y, test_size=0.3) - n_feats = min(n_feats, X.shape[1]) - - # initialize model - criterion = "mse" - loss = mean_squared_error - mine = RandomForest( - criterion=criterion, - n_feats=n_feats, - n_trees=n_trees, - max_depth=max_depth_r, - classifier=classifier, - ) - mine_d = DecisionTree( - criterion=criterion, max_depth=max_depth_d, classifier=classifier - ) - mine_g = GradientBoostedDecisionTree( - n_trees=n_trees, - max_depth=max_depth_d, - classifier=classifier, - learning_rate=1, - loss="mse", - step_size="adaptive", - split_criterion=criterion, - ) - - # fit 'em - mine.fit(X, Y) - mine_d.fit(X, Y) - mine_g.fit(X, Y) - - # get preds on test set - y_pred_mine_test = mine.predict(X_test) - y_pred_mine_test_d = mine_d.predict(X_test) - y_pred_mine_test_g = mine_g.predict(X_test) - - loss_mine_test = loss(y_pred_mine_test, Y_test) - loss_mine_test_d = loss(y_pred_mine_test_d, Y_test) - loss_mine_test_g = loss(y_pred_mine_test_g, Y_test) - - if classifier: - entries = [ - ("RF", loss_mine_test, y_pred_mine_test), - ("DT", loss_mine_test_d, y_pred_mine_test_d), - ("GB", loss_mine_test_g, y_pred_mine_test_g), - ] - (lbl, test_loss, preds) = entries[np.random.randint(3)] - ax.set_title("{} Accuracy: {:.2f}%".format(lbl, test_loss * 100)) - for i in np.unique(Y_test): - ax.scatter( - X_test[preds == i, 0].flatten(), - X_test[preds == i, 1].flatten(), - # s=0.5, - ) - else: - X_ax = np.linspace( - np.min(X_test.flatten()) - 1, np.max(X_test.flatten()) + 1, 100 - ).reshape(-1, 1) - y_pred_mine_test = mine.predict(X_ax) - y_pred_mine_test_d = mine_d.predict(X_ax) - y_pred_mine_test_g = mine_g.predict(X_ax) - - ax.scatter(X_test.flatten(), Y_test.flatten(), c="b", alpha=0.5) - # s=0.5) - ax.plot( - X_ax.flatten(), - y_pred_mine_test_g.flatten(), - # linewidth=0.5, - label="GB".format(n_trees, n_feats, max_depth_d), - color="red", - ) - ax.plot( - X_ax.flatten(), - y_pred_mine_test.flatten(), - # linewidth=0.5, - label="RF".format(n_trees, n_feats, max_depth_r), - color="cornflowerblue", - ) - ax.plot( - X_ax.flatten(), - y_pred_mine_test_d.flatten(), - # linewidth=0.5, - 
label="DT".format(max_depth_d), - color="yellowgreen", - ) - ax.set_title( - "GB: {:.1f} / RF: {:.1f} / DT: {:.1f} ".format( - loss_mine_test_g, loss_mine_test, loss_mine_test_d - ) - ) - ax.legend() - ax.xaxis.set_ticklabels([]) - ax.yaxis.set_ticklabels([]) - plt.savefig("plot.png", dpi=300) - plt.close("all") diff --git a/numpy_ml/utils/tests.py b/numpy_ml/utils/tests.py deleted file mode 100644 index 1089b0f..0000000 --- a/numpy_ml/utils/tests.py +++ /dev/null @@ -1,258 +0,0 @@ -import numpy as np - -import scipy -import networkx as nx - -from sklearn.neighbors import BallTree as sk_BallTree -from sklearn.metrics.pairwise import rbf_kernel as sk_rbf -from sklearn.metrics.pairwise import linear_kernel as sk_linear -from sklearn.metrics.pairwise import polynomial_kernel as sk_poly - - -from .distance_metrics import euclidean -from .kernels import LinearKernel, PolynomialKernel, RBFKernel -from .data_structures import BallTree -from .graphs import DiGraph, UndirectedGraph, Edge, random_unweighted_graph, random_DAG - -####################################################################### -# Kernels # -####################################################################### - - -def test_linear_kernel(): - np.random.seed(12345) - - while True: - N = np.random.randint(1, 100) - M = np.random.randint(1, 100) - C = np.random.randint(1, 1000) - - X = np.random.rand(N, C) - Y = np.random.rand(M, C) - - mine = LinearKernel()(X, Y) - gold = sk_linear(X, Y) - - np.testing.assert_almost_equal(mine, gold) - print("PASSED") - - -def test_polynomial_kernel(): - np.random.seed(12345) - - while True: - N = np.random.randint(1, 100) - M = np.random.randint(1, 100) - C = np.random.randint(1, 1000) - gamma = np.random.rand() - d = np.random.randint(1, 5) - c0 = np.random.rand() - - X = np.random.rand(N, C) - Y = np.random.rand(M, C) - - mine = PolynomialKernel(gamma=gamma, d=d, c0=c0)(X, Y) - gold = sk_poly(X, Y, gamma=gamma, degree=d, coef0=c0) - - np.testing.assert_almost_equal(mine, gold) - print("PASSED") - - -def test_radial_basis_kernel(): - np.random.seed(12345) - - while True: - N = np.random.randint(1, 100) - M = np.random.randint(1, 100) - C = np.random.randint(1, 1000) - gamma = np.random.rand() - - X = np.random.rand(N, C) - Y = np.random.rand(M, C) - - # sklearn (gamma) <-> mine (sigma) conversion: - # gamma = 1 / (2 * sigma^2) - # sigma = np.sqrt(1 / 2 * gamma) - - mine = RBFKernel(sigma=np.sqrt(1 / (2 * gamma)))(X, Y) - gold = sk_rbf(X, Y, gamma=gamma) - - np.testing.assert_almost_equal(mine, gold) - print("PASSED") - - -####################################################################### -# Distance Metrics # -####################################################################### - - -def test_euclidean(): - np.random.seed(12345) - - while True: - N = np.random.randint(1, 100) - x = np.random.rand(N) - y = np.random.rand(N) - mine = euclidean(x, y) - theirs = scipy.spatial.distance.euclidean(x, y) - np.testing.assert_almost_equal(mine, theirs) - print("PASSED") - - -####################################################################### -# Data Structures # -####################################################################### - - -def test_ball_tree(): - np.random.seed(12345) - - while True: - N = np.random.randint(2, 100) - M = np.random.randint(2, 100) - k = np.random.randint(1, N) - ls = np.min([np.random.randint(1, 10), N - 1]) - - X = np.random.rand(N, M) - BT = BallTree(leaf_size=ls, metric=euclidean) - BT.fit(X) - - x = np.random.rand(M) - mine = BT.nearest_neighbors(k, x) - 
assert len(mine) == k - - mine_neighb = np.array([n.key for n in mine]) - mine_dist = np.array([n.distance for n in mine]) - - sort_ix = np.argsort(mine_dist) - mine_dist = mine_dist[sort_ix] - mine_neighb = mine_neighb[sort_ix] - - sk = sk_BallTree(X, leaf_size=ls) - theirs_dist, ind = sk.query(x.reshape(1, -1), k=k) - sort_ix = np.argsort(theirs_dist.flatten()) - - theirs_dist = theirs_dist.flatten()[sort_ix] - theirs_neighb = X[ind.flatten()[sort_ix]] - - for i in range(len(theirs_dist)): - np.testing.assert_almost_equal(mine_neighb[i], theirs_neighb[i]) - np.testing.assert_almost_equal(mine_dist[i], theirs_dist[i]) - - print("PASSED") - - -####################################################################### -# Graphs # -####################################################################### - - -def from_networkx(G_nx): - """ Convert a networkx graph to my graph representation""" - V = list(G_nx.nodes) - edges = list(G_nx.edges) - is_weighted = "weight" in G_nx[edges[0][0]][edges[0][1]] - - E = [] - for e in edges: - if is_weighted: - E.append(Edge(e[0], e[1], G_nx[e[0]][e[1]]["weight"])) - else: - E.append(Edge(e[0], e[1])) - - return DiGraph(V, E) if nx.is_directed(G_nx) else UndirectedGraph(V, E) - - -def to_networkx(G): - """Convert my graph representation to a networkx graph""" - G_nx = nx.DiGraph() if G.is_directed else nx.Graph() - V = list(G._V2I.keys()) - G_nx.add_nodes_from(V) - - for v in V: - fr_i = G._V2I[v] - edges = G._G[fr_i] - - for edge in edges: - G_nx.add_edge(edge.fr, edge.to, weight=edge._w) - return G_nx - - -def test_all_paths(): - np.random.seed(12345) - - while True: - p = np.random.rand() - directed = np.random.rand() < 0.5 - G = random_unweighted_graph(n_vertices=10, edge_prob=p, directed=directed) - - nodes = G._I2V.keys() - G_nx = to_networkx(G) - - # for each graph, test all_paths for all pairs of start and end - # vertices. 
note that graph is not guaranteed to be connected, so many - # paths will be empty - for s_i in nodes: - for e_i in nodes: - if s_i == e_i: - continue - - paths = G.all_paths(s_i, e_i) - paths_nx = nx.all_simple_paths(G_nx, source=s_i, target=e_i, cutoff=10) - - paths = sorted(paths) - paths_nx = sorted(list(paths_nx)) - - for p1, p2 in zip(paths, paths_nx): - np.testing.assert_array_equal(p1, p2) - - print("PASSED") - - -def test_random_DAG(): - np.random.seed(12345) - - while True: - p = np.random.uniform(0.25, 1) - n_v = np.random.randint(5, 50) - - G = random_DAG(n_v, p) - G_nx = to_networkx(G) - - assert nx.is_directed_acyclic_graph(G_nx) - print("PASSED") - - -def test_topological_ordering(): - np.random.seed(12345) - - while True: - p = np.random.uniform(0.25, 1) - n_v = np.random.randint(5, 10) - - G = random_DAG(n_v, p) - G_nx = to_networkx(G) - - if nx.is_directed_acyclic_graph(G_nx): - topo_order = G.topological_ordering() - - # test topological order - seen_it = set() - for n_i in topo_order: - seen_it.add(n_i) - assert any([c_i in seen_it for c_i in G.get_neighbors(n_i)]) == False - - print("PASSED") - - -def test_is_acyclic(): - np.random.seed(12345) - - while True: - p = np.random.rand() - directed = np.random.rand() < 0.5 - G = random_unweighted_graph(n_vertices=10, edge_prob=p, directed=True) - G_nx = to_networkx(G) - - assert G.is_acyclic() == nx.is_directed_acyclic_graph(G_nx) - print("PASSED") From 984495f4a6efe20feb53b5ae321ab59105e8c91e Mon Sep 17 00:00:00 2001 From: ddbourgin Date: Sun, 7 Jun 2020 23:38:30 -0400 Subject: [PATCH 09/18] remove test module --- numpy_ml/neural_nets/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numpy_ml/neural_nets/__init__.py b/numpy_ml/neural_nets/__init__.py index a7bcf9e..d8e51ea 100644 --- a/numpy_ml/neural_nets/__init__.py +++ b/numpy_ml/neural_nets/__init__.py @@ -1,3 +1,4 @@ +"""A module of basic building blcoks for constructing neural networks""" from . import utils from . import losses from . import activations @@ -8,4 +9,3 @@ from . import initializers from . import modules from . import models -from . import tests From af962442d31d7a67583ebb9f44c6ad2283c04c17 Mon Sep 17 00:00:00 2001 From: ddbourgin Date: Tue, 9 Jun 2020 23:18:03 -0400 Subject: [PATCH 10/18] add warning for plotting and gym dependencies --- numpy_ml/bandits/bandits.py | 2 +- numpy_ml/bandits/trainer.py | 22 ++++++++++------ numpy_ml/rl_models/rl_utils.py | 48 ++++++++++++++++++++++------------ numpy_ml/utils/testing.py | 14 +++++++--- 4 files changed, 57 insertions(+), 29 deletions(-) diff --git a/numpy_ml/bandits/bandits.py b/numpy_ml/bandits/bandits.py index d66fbaa..8f04ac0 100644 --- a/numpy_ml/bandits/bandits.py +++ b/numpy_ml/bandits/bandits.py @@ -4,7 +4,7 @@ import numpy as np -from ..utils.testing import random_one_hot_matrix, is_number +from numpy_ml.utils.testing import random_one_hot_matrix, is_number class Bandit(ABC): diff --git a/numpy_ml/bandits/trainer.py b/numpy_ml/bandits/trainer.py index 79211e0..4d591ae 100644 --- a/numpy_ml/bandits/trainer.py +++ b/numpy_ml/bandits/trainer.py @@ -1,16 +1,20 @@ """A trainer/runner object for executing and comparing MAB policies.""" +import warnings import os.path as op from collections import defaultdict import numpy as np +from numpy_ml.utils.testing import DependencyWarning + try: import matplotlib.pyplot as plt _PLOTTING = True except ImportError: - print("Cannot import matplotlib. Plotting functionality disabled.") + fstr = "Cannot import matplotlib. 
Plotting functionality disabled." + warnings.warn(fstr, DependencyWarning) _PLOTTING = False @@ -114,8 +118,11 @@ def compare( 0.999. """ # noqa: E501 self.init_logs(policies) - fig, all_axes = plt.subplots(len(policies), 2, sharex=True) - fig.set_size_inches(10.5, len(policies) * 5.25) + + axes = None + if _PLOTTING: + fig, all_axes = plt.subplots(len(policies), 2, sharex=True) + fig.set_size_inches(10.5, len(policies) * 5.25) for policy, axes in zip(policies, all_axes): np.random.seed(seed) @@ -146,11 +153,10 @@ def compare( a1.set_ylim(a1_min, a1_max) a2.set_ylim(a2_min, a2_max) - sdir = get_scriptdir() - plt.savefig("{}/img/{}.png".format(sdir, "comparison"), dpi=300) - - plt.show() - plt.close("all") + if _PLOTTING: + sdir = get_scriptdir() + plt.savefig("{}/img/{}.png".format(sdir, "comparison"), dpi=300) + plt.close("all") def train( self, diff --git a/numpy_ml/rl_models/rl_utils.py b/numpy_ml/rl_models/rl_utils.py index 2d80680..245bb8e 100644 --- a/numpy_ml/rl_models/rl_utils.py +++ b/numpy_ml/rl_models/rl_utils.py @@ -1,11 +1,28 @@ +"""Utilities for training and evaluating RL models on OpenAI gym environments""" +import warnings from itertools import product from collections import defaultdict import numpy as np -import gym - -from .tiles.tiles3 import tiles, IHT +from numpy_ml.utils.testing import DependencyWarning +from numpy_ml.rl_models.tiles.tiles3 import tiles, IHT + +NO_PD = False +try: + import pandas as pd +except ModuleNotFoundError: + NO_PD = True + +try: + import gym +except ModuleNotFoundError: + fstr = ( + "Agents in `numpy_ml.rl_models` use the OpenAI gym for training. " + "To install the gym environments, run `pip install gym`. For more" + " information, see https://github.com/openai/gym." + ) + warnings.warn(fstr, DependencyWarning) class EnvModel(object): @@ -29,23 +46,24 @@ def __init__(self): self._model = defaultdict(lambda: defaultdict(lambda: 0)) def __setitem__(self, key, value): + """Set self[key] to value""" s, a, r, s_ = key self._model[(s, a)][(r, s_)] = value def __getitem__(self, key): + """Return the value associated with key""" s, a, r, s_ = key return self._model[(s, a)][(r, s_)] def __contains__(self, key): + """True if EnvModel contains `key`, else False""" s, a, r, s_ = key p1 = (s, a) in self.state_action_pairs() p2 = (r, s_) in self.reward_outcome_pairs() return p1 and p2 def state_action_pairs(self): - """ - Return all (state, action) pairs in the environment model - """ + """Return all (state, action) pairs in the environment model""" return list(self._model.keys()) def reward_outcome_pairs(self, s, a): @@ -166,7 +184,7 @@ def tile_state_space( scale = 1.0 / obs_range # scale (state-)observation vector - scale_obs = lambda obs: obs * scale + scale_obs = lambda obs: obs * scale # noqa: E731 n_tiles = np.prod(grid_size) * n_tilings n_states = np.prod([n_tiles - i for i in range(n_tilings)]) @@ -180,16 +198,12 @@ def encode_obs_as_tile(obs): def get_gym_environs(): - """ List all valid OpenAI ``gym`` environment ids. """ + """List all valid OpenAI ``gym`` environment ids""" return [e.id for e in gym.envs.registry.all()] def get_gym_stats(): - """ Return a pandas DataFrame of the environment IDs. 
""" - try: - import pandas as pd - except: - raise ImportError("Cannot import `pandas`; unable to run `get_gym_stats`") + """Return a pandas DataFrame of the environment IDs.""" df = [] for e in gym.envs.registry.all(): print(e.id) @@ -211,7 +225,7 @@ def get_gym_stats(): "tuple_actions", "tuple_observations", ] - return pd.DataFrame(df)[cols] + return df if NO_PD else pd.DataFrame(df)[cols] def is_tuple(env): @@ -305,13 +319,13 @@ def is_continuous(env, tuple_action, tuple_obs): Continuous = gym.spaces.box.Box if tuple_obs: spaces = env.observation_space.spaces - cont_obs = all([isinstance(s, Continuous) for s in spaces]) + cont_obs = all(isinstance(s, Continuous) for s in spaces) else: cont_obs = isinstance(env.observation_space, Continuous) if tuple_action: spaces = env.action_space.spaces - cont_action = all([isinstance(s, Continuous) for s in spaces]) + cont_action = all(isinstance(s, Continuous) for s in spaces) else: cont_action = isinstance(env.action_space, Continuous) return cont_action, cont_obs @@ -432,7 +446,7 @@ def env_stats(env): cont_action, cont_obs = is_continuous(env, tuple_action, tuple_obs) n_actions_per_dim, action_ids, action_dim = action_stats( - env, md_action, cont_action + env, md_action, cont_action, ) n_obs_per_dim, obs_ids, obs_dim = obs_stats(env, md_obs, cont_obs) diff --git a/numpy_ml/utils/testing.py b/numpy_ml/utils/testing.py index 0d45dbf..c56d395 100644 --- a/numpy_ml/utils/testing.py +++ b/numpy_ml/utils/testing.py @@ -1,3 +1,4 @@ +"""Utilities for writing unit tests""" import numbers import numpy as np @@ -13,9 +14,7 @@ def is_symmetric(X): def is_symmetric_positive_definite(X): - """ - Check that a matrix `X` is a symmetric and positive-definite. - """ + """Check that a matrix `X` is a symmetric and positive-definite.""" if is_symmetric(X): try: # if matrix is symmetric, check whether the Cholesky decomposition @@ -133,3 +132,12 @@ def random_paragraph(n_words, vocab=None): "gubergren", ] return [np.random.choice(vocab) for _ in range(n_words)] + + +####################################################################### +# Custom Warnings # +####################################################################### + + +class DependencyWarning(RuntimeWarning): + pass From 88191b842e0350639f5ec2f5241683ced21672a3 Mon Sep 17 00:00:00 2001 From: ddbourgin Date: Wed, 10 Jun 2020 00:26:19 -0400 Subject: [PATCH 11/18] add setup.py and misc requirements for pip packaging --- MANIFEST.in | 4 ++++ requirements-dev.txt | 17 +++++++++++++++++ requirements-test.txt | 14 ++++++++++++++ requirements.txt | 2 ++ setup.py | 42 ++++++++++++++++++++++++++++++++++++++++++ tox.ini | 6 ++++++ 6 files changed, 85 insertions(+) create mode 100644 MANIFEST.in create mode 100644 requirements-dev.txt create mode 100644 requirements-test.txt create mode 100644 requirements.txt create mode 100644 setup.py create mode 100644 tox.ini diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..56d25c9 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,4 @@ +include README.md +include requirements*.txt +include docs/*.rst +include docs/img/*.png diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 0000000..6c9f229 --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,17 @@ +numpy +scipy +sklearn +torch +networkx +matplotlib +seaborn +tensorflow +gym +keras +huffman +librosa +nltk +hmmlearn +pre-commit +tox +pytest diff --git a/requirements-test.txt b/requirements-test.txt new file mode 100644 index 0000000..40de83f --- /dev/null +++ 
b/requirements-test.txt @@ -0,0 +1,14 @@ +numpy +scipy +sklearn +torch +networkx +tensorflow +keras +gym +huffman +librosa +nltk +hmmlearn +tox +pytest diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..6bad103 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,2 @@ +numpy +scipy diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..b699c9a --- /dev/null +++ b/setup.py @@ -0,0 +1,42 @@ +# flake8: noqa +from codecs import open + +from setuptools import setup, find_packages + +with open("README.md", encoding="utf-8") as f: + LONG_DESCRIPTION = f.read() + +with open("requirements.txt") as requirements: + REQUIREMENTS = [r.strip() for r in requirements if r != "\n"] + +PROJECT_URLS = { + "Bug Tracker": "https://github.com/ddbourgin/numpy-ml/issues", + "Documentation": "https://numpy-ml.readthedocs.io/en/latest/", + "Source": "https://github.com/ddbourgin/numpy-ml", +} + +setup( + name="numpy-ml-test7", + version="0.1.0", + author="David Bourgin", + author_email="ddbourgin@gmail.com", + project_urls=PROJECT_URLS, + url="https://github.com/ddbourgin/numpy_ml", + description="Machine learning in NumPy", + long_description=LONG_DESCRIPTION, + long_description_content_type="text/markdown", + install_requires=REQUIREMENTS, + packages=find_packages(), + license="GPLv3+", + include_package_data=True, + extras_require={"rl": ["gym", "matplotlib"]}, + classifiers=[ + "Development Status :: 3 - Alpha", + "Intended Audience :: Science/Research", + "Intended Audience :: Developers", + "Topic :: Scientific/Engineering", + "License :: OSI Approved :: GNU General Public License (GPL)", + "Operating System :: OS Independent", + "Programming Language :: Python :: 3", + ], +) diff --git a/tox.ini b/tox.ini new file mode 100644 index 0000000..0c65edf --- /dev/null +++ b/tox.ini @@ -0,0 +1,6 @@ +[tox] +envlist = py36,py38 +skip_missing_interpreters=true +[testenv] +deps = -rrequirements-test.txt +commands = pytest From e2d474324d8166d3cf9aa11e135a7c94070de293 Mon Sep 17 00:00:00 2001 From: ddbourgin Date: Wed, 10 Jun 2020 00:55:14 -0400 Subject: [PATCH 12/18] fix package name and update readme --- README.md | 26 ++++++++++++++++++++++++-- setup.py | 2 +- 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 3615b09..3b34a5a 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,31 @@ # numpy-ml Ever wish you had an inefficient but somewhat legible collection of machine -learning algorithms implemented exclusively in numpy? No? +learning algorithms implemented exclusively in NumPy? No? + +## Installation + +### For rapid experimentation +To use the code in this repo as a starting point for ML prototyping and +experimentation, it is easiest to just clone/fork the repository and begin +hacking. To clone the repo and create a fresh +[virtualenv](https://pypi.org/project/virtualenv/) for development, you can run: + +```sh +$ git clone https://github.com/ddbourgin/numpy-ml.git +$ cd numpy-ml && virtualenv npml && source npml/bin/activate +$ pip install -r requirements-dev.txt +``` + +### For use as a package +If you don't plan to modify the source much, you can also install numpy-ml as a +Python package via pip: `pip install -u numpy_ml`. + +The reinforcement learning agents train on environments defined in the [OpenAI +gym](https://github.com/openai/gym). To install these alongside numpy-ml, you +can use `pip install -u numpy_ml[rl]`. 
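+
+Once installed, the models are importable from the `numpy_ml` namespace. The
+snippet below is a minimal smoke test rather than an official quick-start: the
+`numpy_ml.trees` import path and the `DecisionTree` keyword arguments are
+assumptions based on how the package's own tree tests call the model.
+
+```python
+import numpy as np
+from numpy_ml.trees import DecisionTree  # assumed export path
+
+# toy binary classification problem
+X = np.random.rand(100, 5)
+y = (X[:, 0] > 0.5).astype(int)
+
+# kwargs mirror the usage in the package's tree tests
+clf = DecisionTree(classifier=True, max_depth=3, criterion="entropy")
+clf.fit(X, y)
+print(clf.predict(X[:5]))  # predicted labels for the first five rows
+```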
## Documentation -To see all of the available models, take a look at the [project documentation](https://numpy-ml.readthedocs.io/) or see [here](https://github.com/ddbourgin/numpy-ml/blob/master/numpy_ml/README.md). +To see the available models, take a look at the [project documentation](https://numpy-ml.readthedocs.io/) or see [here](https://github.com/ddbourgin/numpy-ml/blob/master/numpy_ml/README.md). ## Contributing diff --git a/setup.py b/setup.py index b699c9a..a3fc94c 100644 --- a/setup.py +++ b/setup.py @@ -16,7 +16,7 @@ } setup( - name="numpy-ml-test7", + name="numpy-ml", version="0.1.0", author="David Bourgin", author_email="ddbourgin@gmail.com", From 9ccb746e543e0fda649b05dd1d5a2e65ca617395 Mon Sep 17 00:00:00 2001 From: David Bourgin Date: Wed, 10 Jun 2020 21:20:17 -0400 Subject: [PATCH 13/18] Update README.md --- README.md | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 3b34a5a..7a7e66f 100644 --- a/README.md +++ b/README.md @@ -5,10 +5,7 @@ learning algorithms implemented exclusively in NumPy? No? ## Installation ### For rapid experimentation -To use the code in this repo as a starting point for ML prototyping and -experimentation, it is easiest to just clone/fork the repository and begin -hacking. To clone the repo and create a fresh -[virtualenv](https://pypi.org/project/virtualenv/) for development, you can run: +To use this code as a starting point for ML prototyping / experimentation, just clone the repository, create a new [virtualenv](https://pypi.org/project/virtualenv/), and start hacking: ```sh $ git clone https://github.com/ddbourgin/numpy-ml.git @@ -17,8 +14,8 @@ $ pip install -r requirements-dev.txt ``` ### For use as a package -If you don't plan to modify the source much, you can also install numpy-ml as a -Python package via pip: `pip install -u numpy_ml`. +If you don't plan to modify the source, you can also install numpy-ml as a +Python package: `pip install -u numpy_ml`. The reinforcement learning agents train on environments defined in the [OpenAI gym](https://github.com/openai/gym). To install these alongside numpy-ml, you From 8fe01bb45cb732e46b4ed0059a5f2e80792e32a3 Mon Sep 17 00:00:00 2001 From: ddbourgin Date: Wed, 10 Jun 2020 22:26:43 -0400 Subject: [PATCH 14/18] add available models to top-level readme --- README.md | 156 +++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 154 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 7a7e66f..0951f7e 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,7 @@ $ cd numpy-ml && virtualenv npml && source npml/bin/activate $ pip install -r requirements-dev.txt ``` -### For use as a package +### As a package If you don't plan to modify the source, you can also install numpy-ml as a Python package: `pip install -u numpy_ml`. @@ -22,7 +22,159 @@ gym](https://github.com/openai/gym). To install these alongside numpy-ml, you can use `pip install -u numpy_ml[rl]`. ## Documentation -To see the available models, take a look at the [project documentation](https://numpy-ml.readthedocs.io/) or see [here](https://github.com/ddbourgin/numpy-ml/blob/master/numpy_ml/README.md). +For more details on the available models, see the [project documentation](https://numpy-ml.readthedocs.io/). + +## Examples +Coming soon! + +## Available models +1. **Gaussian mixture model** + - EM training + +2. **Hidden Markov model** + - Viterbi decoding + - Likelihood computation + - MLE parameter estimation via Baum-Welch/forward-backward algorithm + +3. 
**Latent Dirichlet allocation** (topic model) + - Standard model with MLE parameter estimation via variational EM + - Smoothed model with MAP parameter estimation via MCMC + +4. **Neural networks** + * Layers / Layer-wise ops + - Add + - Flatten + - Multiply + - Softmax + - Fully-connected/Dense + - Sparse evolutionary connections + - LSTM + - Elman-style RNN + - Max + average pooling + - Dot-product attention + - Embedding layer + - Restricted Boltzmann machine (w. CD-n training) + - 2D deconvolution (w. padding and stride) + - 2D convolution (w. padding, dilation, and stride) + - 1D convolution (w. padding, dilation, stride, and causality) + * Modules + - Bidirectional LSTM + - ResNet-style residual blocks (identity and convolution) + - WaveNet-style residual blocks with dilated causal convolutions + - Transformer-style multi-headed scaled dot product attention + * Regularizers + - Dropout + * Normalization + - Batch normalization (spatial and temporal) + - Layer normalization (spatial and temporal) + * Optimizers + - SGD w/ momentum + - AdaGrad + - RMSProp + - Adam + * Learning Rate Schedulers + - Constant + - Exponential + - Noam/Transformer + - Dlib scheduler + * Weight Initializers + - Glorot/Xavier uniform and normal + - He/Kaiming uniform and normal + - Standard and truncated normal + * Losses + - Cross entropy + - Squared error + - Bernoulli VAE loss + - Wasserstein loss with gradient penalty + - Noise contrastive estimation loss + * Activations + - ReLU + - Tanh + - Affine + - Sigmoid + - Leaky ReLU + - ELU + - SELU + - Exponential + - Hard Sigmoid + - Softplus + * Models + - Bernoulli variational autoencoder + - Wasserstein GAN with gradient penalty + - word2vec encoder with skip-gram and CBOW architectures + * Utilities + - `col2im` (MATLAB port) + - `im2col` (MATLAB port) + - `conv1D` + - `conv2D` + - `deconv2D` + - `minibatch` + +5. **Tree-based models** + - Decision trees (CART) + - [Bagging] Random forests + - [Boosting] Gradient-boosted decision trees + +6. **Linear models** + - Ridge regression + - Logistic regression + - Ordinary least squares + - Bayesian linear regression w/ conjugate priors + - Unknown mean, known variance (Gaussian prior) + - Unknown mean, unknown variance (Normal-Gamma / Normal-Inverse-Wishart prior) + +7. **n-Gram sequence models** + - Maximum likelihood scores + - Additive/Lidstone smoothing + - Simple Good-Turing smoothing + +8. **Multi-armed bandit models** + - UCB1 + - LinUCB + - Epsilon-greedy + - Thompson sampling w/ conjugate priors + - Beta-Bernoulli sampler + - LinUCB + +8. **Reinforcement learning models** + - Cross-entropy method agent + - First visit on-policy Monte Carlo agent + - Weighted incremental importance sampling Monte Carlo agent + - Expected SARSA agent + - TD-0 Q-learning agent + - Dyna-Q / Dyna-Q+ with prioritized sweeping + +9. **Nonparameteric models** + - Nadaraya-Watson kernel regression + - k-Nearest neighbors classification and regression + - Gaussian process regression + +10. **Matrix factorization** + - Regularized alternating least-squares + - Non-negative matrix factorization + +11. 
**Preprocessing** + - Discrete Fourier transform (1D signals) + - Discrete cosine transform (type-II) (1D signals) + - Bilinear interpolation (2D signals) + - Nearest neighbor interpolation (1D and 2D signals) + - Autocorrelation (1D signals) + - Signal windowing + - Text tokenization + - Feature hashing + - Feature standardization + - One-hot encoding / decoding + - Huffman coding / decoding + - Term frequency-inverse document frequency (TF-IDF) encoding + - MFCC encoding + +12. **Utilities** + - Similarity kernels + - Distance metrics + - Priority queue + - Ball tree + - Discrete sampler + - Graph processing and generators ## Contributing From 337a5fd9401e76aad20217bdea638cd4705b0d0b Mon Sep 17 00:00:00 2001 From: David Bourgin Date: Wed, 10 Jun 2020 22:40:29 -0400 Subject: [PATCH 15/18] Update README.md --- README.md | 3 --- 1 file changed, 3 deletions(-) diff --git a/README.md b/README.md index 0951f7e..6654fb5 100644 --- a/README.md +++ b/README.md @@ -24,9 +24,6 @@ can use `pip install -u numpy_ml[rl]`. ## Documentation For more details on the available models, see the [project documentation](https://numpy-ml.readthedocs.io/). -## Examples -Coming soon! - ## Available models 1. **Gaussian mixture model** - EM training From 080870e76b205310030871457e6c1100ed143797 Mon Sep 17 00:00:00 2001 From: ddbourgin Date: Sat, 20 Jun 2020 00:54:49 -0400 Subject: [PATCH 16/18] update installation documentation --- README.md | 9 +++------ setup.py | 3 ++- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 0951f7e..0ecc1e8 100644 --- a/README.md +++ b/README.md @@ -10,23 +10,20 @@ To use this code as a starting point for ML prototyping / experimentation, just ```sh $ git clone https://github.com/ddbourgin/numpy-ml.git $ cd numpy-ml && virtualenv npml && source npml/bin/activate -$ pip install -r requirements-dev.txt +$ pip3 install -r requirements-dev.txt ``` ### As a package If you don't plan to modify the source, you can also install numpy-ml as a -Python package: `pip install -u numpy_ml`. +Python package: `pip3 install -u numpy_ml`. The reinforcement learning agents train on environments defined in the [OpenAI gym](https://github.com/openai/gym). To install these alongside numpy-ml, you -can use `pip install -u numpy_ml[rl]`. +can use `pip3 install -u 'numpy_ml[rl]'`. ## Documentation For more details on the available models, see the [project documentation](https://numpy-ml.readthedocs.io/). -## Examples -Coming soon! - ## Available models 1. 
**Gaussian mixture model** - EM training diff --git a/setup.py b/setup.py index a3fc94c..311d26f 100644 --- a/setup.py +++ b/setup.py @@ -17,7 +17,7 @@ setup( name="numpy-ml", - version="0.1.0", + version="0.1.1", author="David Bourgin", author_email="ddbourgin@gmail.com", project_urls=PROJECT_URLS, @@ -29,6 +29,7 @@ packages=find_packages(), license="GPLv3+", include_package_data=True, + python_requires=">=3.5", extras_require={"rl": ["gym", "matplotlib"]}, classifiers=[ "Development Status :: 3 - Alpha", From d26f422570137bc9fdd84e02d01f77d12c027554 Mon Sep 17 00:00:00 2001 From: ddbourgin Date: Sat, 20 Jun 2020 00:55:31 -0400 Subject: [PATCH 17/18] add best_arm to bandit oracle output --- numpy_ml/bandits/bandits.py | 35 ++++++--- numpy_ml/bandits/trainer.py | 144 ++++++++++++++++++++---------------- 2 files changed, 108 insertions(+), 71 deletions(-) diff --git a/numpy_ml/bandits/bandits.py b/numpy_ml/bandits/bandits.py index 8f04ac0..e5a14c3 100644 --- a/numpy_ml/bandits/bandits.py +++ b/numpy_ml/bandits/bandits.py @@ -104,6 +104,7 @@ def __init__(self, payoffs, payoff_probs): self.payoff_probs = payoff_probs self.arm_evs = np.array([sum(p * v) for p, v in zip(payoff_probs, payoffs)]) self.best_ev = np.max(self.arm_evs) + self.best_arm = np.argmax(self.arm_evs) @property def hyperparameters(self): @@ -127,8 +128,10 @@ def oracle_payoff(self, context=None): ------- optimal_rwd : float The expected reward under an optimal policy. + optimal_arm : float + The arm ID with the largest expected reward. """ - return self.best_ev + return self.best_ev, self.best_arm def _pull(self, arm_id, context): payoffs = self.payoffs[arm_id] @@ -159,6 +162,7 @@ def __init__(self, payoff_probs): self.arm_evs = self.payoff_probs self.best_ev = np.max(self.arm_evs) + self.best_arm = np.argmax(self.arm_evs) @property def hyperparameters(self): @@ -181,8 +185,10 @@ def oracle_payoff(self, context=None): ------- optimal_rwd : float The expected reward under an optimal policy. + optimal_arm : float + The arm ID with the largest expected reward. """ - return self.best_ev + return self.best_ev, self.best_arm def _pull(self, arm_id, context): return int(np.random.rand() <= self.payoff_probs[arm_id]) @@ -217,6 +223,7 @@ def __init__(self, payoff_dists, payoff_probs): self.payoff_probs = payoff_probs self.arm_evs = np.array([mu for (mu, var) in payoff_dists]) self.best_ev = np.max(self.arm_evs) + self.best_arm = np.argmax(self.arm_evs) @property def hyperparameters(self): @@ -249,8 +256,10 @@ def oracle_payoff(self, context=None): ------- optimal_rwd : float The expected reward under an optimal policy. + optimal_arm : float + The arm ID with the largest expected reward. """ - return self.best_ev + return self.best_ev, self.best_arm class ShortestPathBandit(Bandit): @@ -282,6 +291,7 @@ def __init__(self, G, start_vertex, end_vertex): self.arm_evs = self._calc_arm_evs() self.best_ev = np.max(self.arm_evs) + self.best_arm = np.argmax(self.arm_evs) placeholder = [None] * len(self.paths) super().__init__(placeholder, placeholder) @@ -309,8 +319,10 @@ def oracle_payoff(self, context=None): ------- optimal_rwd : float The expected reward under an optimal policy. + optimal_arm : float + The arm ID with the largest expected reward. 
""" - return self.best_ev + return self.best_ev, self.best_arm def _calc_arm_evs(self): I2V = self.G.get_vertex @@ -353,7 +365,8 @@ def __init__(self, context_probs): self.context_probs = context_probs self.arm_evs = self.context_probs - self.best_ev = self.arm_evs.max(axis=1) + self.best_evs = self.arm_evs.max(axis=1) + self.best_arms = self.arm_evs.argmax(axis=1) @property def hyperparameters(self): @@ -386,15 +399,17 @@ def oracle_payoff(self, context): Parameters ---------- context : :py:class:`ndarray ` of shape `(D, K)` or None - The current context matrix for each of the bandit arms, if - applicable. Default is None. + The current context matrix for each of the bandit arms. Returns ------- optimal_rwd : float The expected reward under an optimal policy. + optimal_arm : float + The arm ID with the largest expected reward. """ - return context[:, 0] @ self.best_ev + context_id = context[:, 0].argmax() + return self.best_evs[context_id], self.best_arms[context_id] def _pull(self, arm_id, context): D, K = self.context_probs.shape @@ -499,9 +514,11 @@ def oracle_payoff(self, context): ------- optimal_rwd : float The expected reward under an optimal policy. + optimal_arm : float + The arm ID with the largest expected reward. """ best_arm = np.argmax(self.arm_evs) - return self.arm_evs[best_arm] + return self.arm_evs[best_arm], best_arm def _pull(self, arm_id, context): K, thetas = self.K, self.thetas diff --git a/numpy_ml/bandits/trainer.py b/numpy_ml/bandits/trainer.py index 4d591ae..f925d32 100644 --- a/numpy_ml/bandits/trainer.py +++ b/numpy_ml/bandits/trainer.py @@ -88,11 +88,12 @@ def compare( self, policies, bandit, - ep_length, - n_episodes, + n_trials, n_duplicates, - seed=12345, + plot=True, + seed=None, smooth_weight=0.999, + out_dir=None, ): """ Compare the performance of multiple policies on the same bandit @@ -104,41 +105,49 @@ def compare( The multi-armed bandit policies to compare. bandit : :class:`Bandit ` instance The environment to train the policies on. - ep_length : int - The number of pulls allowed in each episode - n_episodes : int - The number of episodes per run + n_trials : int + The number of trials per run. n_duplicates: int - The number of runs to evaluate + The number of times to evaluate each policy on the bandit + environment. Larger values permit a better estimate of the + variance in payoff / cumulative regret for each policy. + plot : bool + Whether to generate a plot of the policy's average reward and + regret across the episodes. Default is True. seed : int - The seed for the random number generator. Default is 12345. + The seed for the random number generator. Default is None. smooth_weight : float in [0, 1] The smoothing weight. Values closer to 0 result in less smoothing, values closer to 1 produce more aggressive smoothing. Default is 0.999. + out_dir : str or None + Plots will be saved to this directory if `plot` is True. If + `out_dir` is None, plots will not be saved. Default is None. 
""" # noqa: E501 self.init_logs(policies) - axes = None - if _PLOTTING: + all_axes = [None] * len(policies) + if plot and _PLOTTING: fig, all_axes = plt.subplots(len(policies), 2, sharex=True) fig.set_size_inches(10.5, len(policies) * 5.25) for policy, axes in zip(policies, all_axes): - np.random.seed(seed) + if seed: + np.random.seed(seed) + bandit.reset() policy.reset() self.train( policy, bandit, - ep_length, - n_episodes, + n_trials, n_duplicates, axes=axes, - plot=True, + plot=plot, verbose=False, - smooth_weight=0.999, + out_dir=out_dir, + smooth_weight=smooth_weight, ) # enforce the same y-ranges across plots for straightforward comparison @@ -153,23 +162,23 @@ def compare( a1.set_ylim(a1_min, a1_max) a2.set_ylim(a2_min, a2_max) - if _PLOTTING: - sdir = get_scriptdir() - plt.savefig("{}/img/{}.png".format(sdir, "comparison"), dpi=300) - plt.close("all") + if plot and _PLOTTING: + if out_dir is not None: + plt.savefig(op.join(out_dir, "bandit_comparison.png"), dpi=300) + plt.show() def train( self, policy, bandit, - ep_length, - n_episodes, + n_trials, n_duplicates, plot=True, axes=None, verbose=True, print_every=100, smooth_weight=0.999, + out_dir=None, ): """ Train a MAB policies on a multi-armed bandit problem, logging training @@ -181,10 +190,8 @@ def train( The multi-armed bandit policy to train. bandit : :class:`Bandit ` instance The environment to run the policy on. - ep_length : int - The number of pulls allowed in each episode - n_episodes : int - The number of episodes per run + n_trials : int + The number of trials per run. n_duplicates: int The number of runs to evaluate plot : bool @@ -203,6 +210,9 @@ def train( The smoothing weight. Values closer to 0 result in less smoothing, values closer to 1 produce more aggressive smoothing. Default is 0.999. + out_dir : str or None + Plots will be saved to this directory if `plot` is True. If + `out_dir` is None, plots will not be saved. Default is None. Returns ------- @@ -224,33 +234,34 @@ def train( policy.reset() avg_oracle_reward, cregret = 0, 0 - for e_id in range(n_episodes): - oracle_reward, ep_reward = 0, 0 - - for s in range(ep_length): - rwd, arm, orwd = self._train_step(bandit, policy) - ep_reward += rwd - oracle_reward += orwd + for trial_id in range(n_trials): + rwd, arm, orwd, oarm = self._train_step(bandit, policy) loss = mse(bandit, policy) - regret = oracle_reward - ep_reward - avg_oracle_reward += oracle_reward / n_episodes + regret = orwd - rwd + + avg_oracle_reward += orwd cregret += regret - L[p]["mse"][e_id + 1].append(loss) - L[p]["regret"][e_id + 1].append(regret) - L[p]["cregret"][e_id + 1].append(cregret) - L[p]["reward"][e_id + 1].append(ep_reward) + L[p]["mse"][trial_id + 1].append(loss) + L[p]["reward"][trial_id + 1].append(rwd) + L[p]["regret"][trial_id + 1].append(regret) + L[p]["cregret"][trial_id + 1].append(cregret) + L[p]["optimal_arm"][trial_id + 1].append(oarm) + L[p]["selected_arm"][trial_id + 1].append(arm) + L[p]["optimal_reward"][trial_id + 1].append(orwd) + + if (trial_id + 1) % print_every == 0 and verbose: + fstr = "Trial {}/{}, {}/{}, Regret: {:.4f}" + print(fstr.format(trial_id + 1, n_trials, d + 1, D, regret)) - if (e_id + 1) % print_every == 0 and verbose: - fstr = "Ep. 
{}/{}, {}/{}, Regret: {:.4f}" - print(fstr.format(e_id + 1, n_episodes, d + 1, D, regret)) + avg_oracle_reward /= n_trials if verbose: self._print_run_summary(bandit, policy, regret) - if plot: - self._plot_reward(avg_oracle_reward, policy, smooth_weight, axes) + if plot and _PLOTTING: + self._plot_reward(avg_oracle_reward, policy, smooth_weight, axes, out_dir) return policy @@ -258,8 +269,8 @@ def _train_step(self, bandit, policy): P, B = policy, bandit C = B.get_context() if hasattr(B, "get_context") else None rwd, arm = P.act(B, C) - oracle_rwd = B.oracle_payoff(C) - return rwd, arm, oracle_rwd + oracle_rwd, oracle_arm = B.oracle_payoff(C) + return rwd, arm, oracle_rwd, oracle_arm def init_logs(self, policies): """ @@ -267,20 +278,30 @@ def init_logs(self, policies): Notes ----- - In the logs, keys are episode numbers, and values are lists of length - ``n_duplicates`` holding the metric values for each duplicate of that - episode. For example, ``logs['regret'][3][1]`` holds the regret value - accrued on the 2nd duplicate of the 4th episode. + Training logs are represented as a nested set of dictionaries with the + following structure: + + log[model_id][metric][trial_number][duplicate_number] + + For example, ``logs['model1']['regret'][3][1]`` holds the regret value + accrued on the 3rd trial of the 2nd duplicate run for model1. + + Available fields are 'regret', 'cregret' (cumulative regret), 'reward', + 'mse' (mean-squared error between estimated arm EVs and the true EVs), + 'optimal_arm', 'selected_arm', and 'optimal_reward'. """ if not isinstance(policies, list): policies = [policies] self.logs = { str(p): { + "mse": defaultdict(lambda: []), "regret": defaultdict(lambda: []), - "cregret": defaultdict(lambda: []), "reward": defaultdict(lambda: []), - "mse": defaultdict(lambda: []), + "cregret": defaultdict(lambda: []), + "optimal_arm": defaultdict(lambda: []), + "selected_arm": defaultdict(lambda: []), + "optimal_reward": defaultdict(lambda: []), } for p in policies } @@ -299,11 +320,7 @@ def _print_run_summary(self, bandit, policy, regret): fstr = "\nFinal MSE: {:.4f}\nFinal Regret: {:.4f}\n\n" print(fstr.format(np.mean(se), regret)) - def _plot_reward(self, optimal_rwd, policy, smooth_weight, axes=None): - if not _PLOTTING: - print("Cannot import matplotlib. 
Plotting functionality disabled.") - return - + def _plot_reward(self, optimal_rwd, policy, smooth_weight, axes=None, out_dir=None): L = self.logs[str(policy)] smds = self._smoothed_metrics(policy, optimal_rwd, smooth_weight) @@ -341,12 +358,10 @@ def _plot_reward(self, optimal_rwd, policy, smooth_weight, axes=None): fig.suptitle(str(policy)) fig.tight_layout() - sdir = get_scriptdir() - bid = policy.hyperparameters["id"] - plt.savefig("{}/img/{}.png".format(sdir, bid), dpi=300) - + if out_dir is not None: + bid = policy.hyperparameters["id"] + plt.savefig(op.join(out_dir, f"{bid}.png"), dpi=300) plt.show() - plt.close("all") return ax1, ax2 def _smoothed_metrics(self, policy, optimal_rwd, smooth_weight): @@ -355,6 +370,9 @@ def _smoothed_metrics(self, policy, optimal_rwd, smooth_weight): # pre-allocate smoothed data structure smds = {} for m in L.keys(): + if m == "selections": + continue + smds["sm_{}_avg".format(m)] = np.zeros(len(L["reward"])) smds["sm_{}_avg".format(m)][0] = np.mean(L[m][1]) @@ -364,6 +382,8 @@ def _smoothed_metrics(self, policy, optimal_rwd, smooth_weight): smoothed = {m: L[m][1] for m in L.keys()} for e_id in range(2, len(L["reward"]) + 1): for m in L.keys(): + if m == "selections": + continue prev, cur = smoothed[m], L[m][e_id] smoothed[m] = [smooth(p, c, smooth_weight) for p, c in zip(prev, cur)] smds["sm_{}_avg".format(m)][e_id - 1] = np.mean(smoothed[m]) From bd438cacd342bf1934bbeedbd182311c583e164e Mon Sep 17 00:00:00 2001 From: ddbourgin Date: Sat, 20 Jun 2020 00:58:04 -0400 Subject: [PATCH 18/18] fix github link --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 311d26f..45ff20b 100644 --- a/setup.py +++ b/setup.py @@ -17,11 +17,11 @@ setup( name="numpy-ml", - version="0.1.1", + version="0.1.2", author="David Bourgin", author_email="ddbourgin@gmail.com", project_urls=PROJECT_URLS, - url="https://github.com/ddbourgin/numpy_ml", + url="https://github.com/ddbourgin/numpy-ml", description="Machine learning in NumPy", long_description=LONG_DESCRIPTION, long_description_content_type="text/markdown",