<a href="https://colab.research.google.com/github/davidwhogg/GenerativeVsDiscriminative/blob/master/ipynb/generative_vs_discriminative.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Generative *vs* Discriminative models for inference

- Do generative models generally outperform discriminative models in various ways?
- How do both of these compare to information-theory-optimal estimators?
- Are generative models less subject to adversarial attack?
- Are generative and discriminative models equally good for de-noising noisy labels?
- ... and related matters.

## authors
- **David W Hogg** *(NYU) (MPIA) (Flatiron)*
- **Soledad Villar** *(NYU)*

## license
Copyright 2019, 2020 the authors. All rights reserved (for now).

## notes and to-do items
- The latent space Gaussian should have a non-zero mean and the discriminative model needs `np.ones()` appended to the `X`.
- Decide what are our main results / observations.
- Fix colors in plots so objects of similar ontology are in similar colors and styles?
- It's cool that the noise contributions are mean zero, but I (DWH) don't think the latent pdf for `Z` should be mean zero.

In [0]:
import numpy as np
import pylab as plt
%matplotlib inline

In [0]:
# integer settings shared by all of the experiments below
# (every size is a power of two, written here as a bit shift)

# data dimensionality
maximalD = 1 << 9  # 512; ESLII calls this "p"
fiducialD = 1 << 7  # 128
M = 1  # ESLII always sets this to 1

# training-set and test-set sizes
fiducialN = 1 << 6  # 64
maximalN = maximalD  # maximum possible size of a training set
Ntest = 1 << 8  # 256 points used to compute biases and variances
Ntrial = 1 << 4  # 16 training trials used to compute biases and variances

In [0]:
# set God's X-Y covariance matrix (M+D x M+D)

# start with a random matrix
# (the seed is fixed so the same covariance is produced on every run)
np.random.seed(13)
foo = np.random.normal(size=(M + maximalD, M + maximalD))
bar = foo @ foo.T  # Gram matrix of the random draw; symmetric by construction

# and then manipulate eigenvalues
u, ss, v = np.linalg.svd(bar)
# s = 1.e5 * np.exp(-3.00 * np.arange(M + maximalD)) \
#   + 0.01 * np.exp(-0.01 * np.arange(M + maximalD))
s = 1. * ss  # multiplying by 1. makes a new array, so ss itself is not modified
s[:8] = 1.e4 * s[:8]  # boost the first eight singular values by a factor of 10^4
maximalC = (u * s) @ v  # rebuild the matrix from the modified spectrum

# NOTE: the bare string below is disabled code (an alternative low-rank
# construction of maximalC); it is evaluated as a string and then discarded.
"""
GodK = 16
Q = np.random.normal(size=(maximalD, GodK))
P = np.random.normal(size=(M, GodK))
foo = np.random.normal(size=(GodK, GodK + 4))
V = foo @ foo.T
foo = np.random.normal(size=(maximalD, 2 * maximalD))
Cx = 2 ** -4 * foo @ foo.T
foo = np.random.normal(size=(M, 2 * M))
Cy = 2 ** -4 * foo @ foo.T
maximalC = np.vstack((np.hstack((Q @ V @ Q.T + Cx, Q @ V @ P.T)),
                      np.hstack((P @ V @ Q.T, P @ V @ P.T + Cy))))
"""

In [0]:
def make_data_set(N, C, M):
  """
  Draw `N` zero-mean Gaussian samples with covariance `C` (the variance of
  God) and split every sample into a feature part and a label part.

  ## inputs:
  - N - number of samples to draw
  - C - square covariance matrix of side D + M
  - M - number of trailing columns of each sample returned as labels

  ## returns:
  - (X, Y) - the first D columns (N x D) and the last M columns (N x M) of
    the drawn samples

  ## bugs:
  - Needs to be updated to take God's mean; the mean is hard-wired to zero.
  """
  total_dim, _ = C.shape
  n_features = total_dim - M
  zero_mean = np.zeros(total_dim)
  samples = np.random.multivariate_normal(zero_mean, C, size=N)
  features = samples[:, :n_features]
  labels = samples[:, n_features:]
  return features, labels

In [0]:
# make maximal train and test sets
# (the seed is set once; all training trials are drawn before any test trial,
# so the random stream is consumed in a fixed order)
np.random.seed(8675309 + 1)
# training sets: one (maximalN x maximalD) X and one (maximalN x M) Y per trial
maximalX = np.zeros((Ntrial, maximalN, maximalD))
maximalY = np.zeros((Ntrial, maximalN, M))
for trial in range(Ntrial):
  maximalX[trial], maximalY[trial] = make_data_set(maximalN, maximalC, M)
# test sets: Ntest points per trial, drawn from the same covariance
maximalXtest = np.zeros((Ntrial, Ntest, maximalD))
maximalYtest = np.zeros((Ntrial, Ntest, M))
for trial in range(Ntrial):
  maximalXtest[trial], maximalYtest[trial] = make_data_set(Ntest, maximalC, M)
print(maximalX.shape, maximalY.shape)

In [0]:
# compare the spectrum of the fiducial training set (zeroth trial) with the
# spectrum of the covariance block for the same trailing dimensions
# NOTE: this cell overwrites the globals u, s, v, ss.
u, s, v = np.linalg.svd(maximalX[0, :fiducialN, -fiducialD:], full_matrices=False)
plt.plot(s ** 2, "ko")  # squared singular values of the data
C = maximalC[-fiducialD-M:, -fiducialD-M:]  # trailing square block of side fiducialD + M
uu, ss, vv = np.linalg.svd(C)
plt.plot((fiducialN * ss)[:fiducialN], "ko", alpha=0.24)  # covariance spectrum scaled by fiducialN
print(np.sum(ss), np.trace(C))  # the two printed numbers can be compared by eye
plt.semilogy()
plt.ylabel("contribution to variance")
plt.xlabel("component number")
plt.title("SVD of the fiducial data set, zeroth trial")

In [0]:
def gods_estimator(C, M):
  """
  Build the linear estimator of y from x implied by the true covariance `C`:
  the least-squares solution W of Cxx W.T = Cxy, returned with shape M x D.

  This is meant to be the best possible estimator, in terms of mean squared
  error on y (this, we hope, is what everything approaches, in the limit
  N -> infinity).

  ## inputs:
  - C - square covariance matrix of side D + M, x block first, y block last
  - M - number of trailing rows/columns that belong to y

  ## bugs:
  - WARNING: This function needs to know about God's mean mu, and be fixed
    to incorporate it.
  """
  n_total, _ = C.shape
  split = n_total - M
  cov_xx = C[:split, :split]
  cov_xy = C[:split, split:]
  solution, *_ = np.linalg.lstsq(cov_xx, cov_xy, rcond=None)
  return solution.T

In [0]:
def train_discriminative_model(xs, ys, lamb=0.):
  """
  Train the discriminative model y = B x + noise.

  ## inputs:
  - xs - N x D array of training data
  - ys - N x M array of training labels
  - lamb - multiplier of the identity added to xs.T @ xs (default 0.)

  ## returns:
  - B - M x D discriminative matrix

  ## comments
  Informally speaking, this finds the B that minimizes || ys - B . xs ||
  (plus some regularization) by solving the normal equations
  (xs.T xs + lamb I) B.T = xs.T ys in the least-squares sense.

  ## bugs:
  - Properly, this should take in an estimate of the inverse variances. These
    might be some processing of the xivars and yivars; we need to figure that
    out.
  - Doesn't fit for a bias term (that is, a zero-offset).
  """
  n_x, n_features = xs.shape
  n_y, _ = ys.shape
  assert n_x == n_y
  gram = xs.T @ xs + lamb * np.eye(n_features)
  moment = xs.T @ ys
  B_transposed, *_ = np.linalg.lstsq(gram, moment, rcond=None)
  return B_transposed.T

def train_trivial_generative_model(xs, ys):
  """
  Fit the trivial generative model x = y H + noise by least squares.

  ## inputs:
  - xs - N x D array of training data
  - ys - N x M array of training labels

  ## returns:
  - the M x D least-squares solution H of ys @ H = xs

  ## bugs:
  - Doesn't fit for a bias term (that is, a zero-offset).
  """
  solution, *_ = np.linalg.lstsq(ys, xs, rcond=None)
  return solution

def initialize_a_generative_model(Y, P):
  """
  Build starting latents for the alternating generative-model fits.

  ## inputs:
  - Y - N x M array of training labels
  - P - M x K matrix mapping latents to labels

  ## returns:
  - Z0 - K x N: the least-squares solution of P Z = Y.T, projected with
    proj_P = P.T (P P.T)^-1 P
  - Z1 - K x N: unit Gaussian noise projected with (identity - proj_P)
  - proj_Pperp - K x K: the projector (identity - proj_P) itself

  ## comments:
  Draws K x N numbers from the global numpy random state.
  """
  _, n_latent = P.shape
  latents = np.linalg.lstsq(P, Y.T, rcond=None)[0]
  gram_solve = np.linalg.lstsq(P @ P.T, P, rcond=None)[0]
  onto_rows = P.T @ gram_solve
  off_rows = np.identity(n_latent) - onto_rows
  constrained = onto_rows @ latents
  free = off_rows @ np.random.normal(size=latents.shape)
  return constrained, free, off_rows

def train_msv_generative_model(X, Y, P, maxiter=10000, Z=None):
  """
  Train the MSV generative model by alternating least squares.

  ## Inputs:
  X: N x D
  Y: N x M
  P: M x K
  maxiter: maximum number of alternating iterations (must be >= 1)
  Z: optional K x N starting latents (note: transposed relative to the
     N x K latents that are returned); it is copied, never modified

  ## Returns:
  (G, Z): the M x D matrix G = P (A A.T)^-1 A that predicts y from x, and
  the N x K latents from the final iteration

  ## Comments:
  among all solutions Z such that Y=Z*P.T (ie Y.T=P*Z.T)
  we minimize ||X - Z*A|| by alternating minimization
  we write Z = Z0 + Z1, proj_P*Z=Z0, proj_Pperp*Z = Z1

  The loop stops at the first iteration whose cost does not decrease; the
  A and Z of that last iteration are the ones used for the return values.

  ## bugs:
  - Doesn't use xivars, yivars.
  - Doesn't fit for a bias term (that is, a zero-offset).
  - The iteration is unstable (the cost can go up).
  """
  if maxiter < 1:
    # without at least one pass A would never be defined
    raise ValueError("train_msv_generative_model(): maxiter must be at least 1.")

  # always called, even when Z is supplied: Z0 and the projector are needed
  Z0, Z1, proj_Pperp = initialize_a_generative_model(Y, P)
  if Z is None:
    Z = Z0 + Z1
  else:
    # private float copy: the normalization below updates Z in place and
    # must not modify the caller's array
    Z = np.array(Z, dtype=float)
  tiny = 1.e-9 # guards the division when a perpendicular row is all zeros
  unconverged = True
  last_cost = np.inf
  for _ in range(maxiter):
    # normalize the perpendicular part of Z (unit rms along each row)
    Z1 = proj_Pperp @ Z
    Z -= Z1
    Z1 /= tiny + np.sqrt(np.mean(Z1 ** 2, axis=1))[:, None]
    Z += Z1

    # fix Z, optimize for A (K x D)
    A = np.linalg.lstsq(Z.T, X, rcond=None)[0]

    # fix A, optimize for the perpendicular part of Z
    a = A.T @ proj_Pperp.T
    b = X.T - A.T @ Z0
    Z1 = np.linalg.lstsq(a, b, rcond=None)[0]
    Z1 = proj_Pperp @ Z1
    Z = Z0 + Z1

    # convergence: stop as soon as the cost fails to decrease
    cost = np.linalg.norm(X.T - A.T @ Z)
    if cost >= last_cost:
      unconverged = False
      break
    last_cost = cost
  if unconverged: print("train_msv_generative_model(): WARNING: did not converge.")
  return P @ np.linalg.lstsq(A @ A.T, A, rcond=None)[0], Z.T

In [0]:
def z_step(X, Y, A, P, sqrtCxinv, sqrtCyinv):
  """
  Update the latents at fixed A: one least-squares solve of the stacked,
  whitened system [sqrtCxinv A; sqrtCyinv P] Z.T = [X sqrtCxinv, Y sqrtCyinv].T.

  Returns the solution transposed, one row per row of X.
  """
  whitened_x_design = sqrtCxinv @ A
  whitened_y_design = sqrtCyinv @ P
  design = np.vstack((whitened_x_design, whitened_y_design))
  targets = np.hstack((X @ sqrtCxinv, Y @ sqrtCyinv))
  latents_T, *_ = np.linalg.lstsq(design, targets.T, rcond=None)
  return latents_T.T

def a_step(X, Z):
  """
  Update A at fixed latents: the least-squares solution of Z @ A.T = X,
  returned as A (transposed solution).

  BUG: This doesn't currently use the Cx, Cy correctly.
  BUG: meaning: It treats the data as homoskedastic.
  """
  solution, _residuals, _rank, _singular_values = np.linalg.lstsq(Z, X, rcond=None)
  return solution.T

def dwh_cost(X, Y, Z, A, P, sqrtCxinv, sqrtCyinv):
  """
  Sum of squared whitened residuals of X about A Z.T and of Y about P Z.T.

  ## Bugs:
  - Untested!
  """
  x_residual = X.T - (A @ Z.T)
  y_residual = Y.T - (P @ Z.T)
  x_term = np.sum((sqrtCxinv @ x_residual) ** 2)
  y_term = np.sum((sqrtCyinv @ y_residual) ** 2)
  return x_term + y_term

def train_dwh_generative_model(X, Y, P, Cx, Cy, maxiter=10000, lltol=0.00001, Z=None):
  """
  Train the DWH generative model by alternating `a_step()` and `z_step()`.

  ## Inputs:
  X: N x D
  Y: N x M
  P: M x K
  Cx: D x D matrix; its inverse square root whitens the X residuals
  Cy: M x M matrix; its inverse square root whitens the Y residuals
  maxiter: maximum number of alternating iterations (must be >= 1)
  lltol: the loop stops once the cost fails to drop by more than this amount
  Z: optional N x K starting latents; they are copied, never modified

  ## Returns:
  (G, Z): the M x D matrix G = P (A.T Cx^-1 A)^-1 A.T Cx^-1 that predicts y
  from x, and the N x K latents from the final iteration

  ## Bugs:
  - Initialization bad here. This should probably initialize at the internal
    state of the MSV generative model optimized earlier.
  - Takes forever to converge when K = M, apparently. This may be related to
    initialization also.
  - When the cost goes the wrong way, A is restored but Z is not.
  """
  if maxiter < 1:
    # without at least one pass A would still be the placeholder 0.
    raise ValueError("train_dwh_generative_model(): maxiter must be at least 1.")

  # get ready the square roots of the inverses of Cx, Cy
  u, s, v = np.linalg.svd(Cx)
  sqrtCxinv = (u / np.sqrt(s)) @ v
  u, s, v = np.linalg.svd(Cy)
  sqrtCyinv = (u / np.sqrt(s)) @ v

  # initialize loop
  # (always called, even when Z is supplied: the projector is needed)
  Z0, Z1, proj_P_perp = initialize_a_generative_model(Y, P)
  if Z is None:
    Z = (Z0 + Z1).T
  else:
    # private float copy: the normalization step updates Z in place and
    # must not modify the caller's array
    Z = np.array(Z, dtype=float)
  tiny = 1.e-9 # guards the division when a perpendicular column is all zeros
  cost = np.inf
  unconverged = True
  A = 0.

  # loop
  for ii in range(maxiter):
    old_cost = 1. * cost
    oldA = 1. * A

    # normalization step (unit rms along each latent column of Zperp)
    Zperp = (proj_P_perp @ Z.T).T
    Z -= Zperp
    Zperp /= tiny + np.sqrt(np.mean(Zperp ** 2, axis=0))[None, :]
    Z += Zperp

    # standard iteration
    A = a_step(X, Z)
    Z = z_step(X, Y, A, P, sqrtCxinv, sqrtCyinv)

    # convergence control
    cost = dwh_cost(X, Y, Z, A, P, sqrtCxinv, sqrtCyinv)
    if cost > 1. and cost > (old_cost + lltol): # note insane condition
      print("train_dwh_generative_model(): WARNING: cost went the wrong way!")
      print(ii, cost, old_cost)
      A = oldA # restore
    if cost > (old_cost - lltol):
      unconverged = False
      break
  if unconverged: print("train_dwh_generative_model(): WARNING: did not converge.")
  wA = np.linalg.lstsq(Cx, A, rcond=None)[0] # Cx^-1 A, by least squares
  return P @ np.linalg.lstsq(wA.T @ A, wA.T, rcond=None)[0], Z

In [0]:
def train_pca_generative_model(X, Y, K):
  """
  Regress Y on the first K SVD components of X and map the result back to
  the original X coordinates.

  ## inputs:
  - X - N x D array of training data
  - Y - N x M array of training labels
  - K - number of leading components to keep

  ## returns:
  - M x D matrix that predicts y from x

  ## bugs:
  - DOESN'T DEAL WITH MEAN X CORRECTLY.
  """
  left, singular_values, right = np.linalg.svd(X, full_matrices=False)
  scores = (left * singular_values)[:, :K]
  components = right[:K, :]
  return train_discriminative_model(scores, Y) @ components

def wow_a_step(X, Z):
  """
  internal function for `train_wow_generative_model()`.

  Solves Z @ A.T = X for A by least squares at fixed latents Z.
  """
  A_transposed = np.linalg.lstsq(Z, X, rcond=None)[0]
  return A_transposed.T

def wow_z_step(X, Y, P, wyoverwx, A):
  """
  internal function for `train_wow_generative_model()`.

  Solves for the latents at fixed A by one least-squares solve in which the
  squared X residuals carry weight wx and the squared Y residuals carry
  weight wy, with wx * wy = 1 and wy / wx = wyoverwx.
  """
  root_ratio = np.sqrt(wyoverwx)
  wx, wy = 1. / root_ratio, root_ratio
  # residuals are scaled by the square roots of their weights
  amp_x, amp_y = np.sqrt(wx), np.sqrt(wy)
  bigA = np.vstack((A * amp_x, P * amp_y))
  bigX = np.hstack((X * amp_x, Y * amp_y))
  latents_T = np.linalg.lstsq(bigA, bigX.T, rcond=None)[0]
  return latents_T.T

def train_wow_generative_model(X, Y, P, wyoverwx, maxiter=10000, Z=None):
  """
  Train the WOW generative model by alternating `wow_a_step()` and
  `wow_z_step()`.

  ## Inputs:
  X: N x D
  Y: N x M
  P: M x K
  wyoverwx: ratio of the weight on the Y residuals to the weight on the
    X residuals, passed to `wow_z_step()`
  maxiter: maximum number of alternating iterations (must be >= 1)
  Z: optional N x K starting latents; they are copied, never modified

  ## Returns:
  (G, Z): the M x D matrix G = P (A.T A)^-1 A.T that predicts y from x, and
  the N x K latents from the final iteration

  ## Bugs:
  - Initialization bad here. This should probably initialize at the internal
    state of the MSV generative model optimized earlier.
  - Also it converges very very slowly when wyoverwx is small.
  """
  if maxiter < 1:
    # without at least one pass A would still be the placeholder 0.
    raise ValueError("train_wow_generative_model(): maxiter must be at least 1.")

  # initialize loop
  # (always called, even when Z is supplied: the projector is needed)
  Z0, Z1, proj_P_perp = initialize_a_generative_model(Y, P)
  if Z is None:
    Z = (Z0 + Z1).T
  else:
    # private float copy: the normalization step updates Z in place and
    # must not modify the caller's array
    Z = np.array(Z, dtype=float)
  tiny = 1.e-9 # guards the division when a perpendicular column is all zeros
  unconverged = True
  A = 0.

  # loop
  for ii in range(maxiter):
    oldA = 1. * A

    # normalization step (unit rms along each latent column of Zperp)
    Zperp = (proj_P_perp @ Z.T).T
    Z -= Zperp
    Zperp /= tiny + np.sqrt(np.mean(Zperp ** 2, axis=0))[None, :]
    Z += Zperp

    # standard iteration
    A = wow_a_step(X, Z)
    Z = wow_z_step(X, Y, P, wyoverwx, A)

    # convergence control: stop once A no longer changes
    if np.allclose(A, oldA):
      unconverged = False
      break
  if unconverged: print("train_wow_generative_model(): WARNING: did not converge.")
  return P @ np.linalg.lstsq(A.T @ A, A.T, rcond=None)[0], Z


In [0]:
# run on the first trial case, for the fiducials
# (the last fiducialD feature columns, and the matching trailing block of C)
xs_train = maximalX[0, :fiducialN, -fiducialD:]
ys_train = maximalY[0, :fiducialN, :]
xs_test = maximalXtest[0, :, -fiducialD:]
ys_test = maximalYtest[0, :, :]
C = maximalC[-fiducialD-M:, -fiducialD-M:]
W = gods_estimator(C, M)

print(xs_train.shape, ys_train.shape)
B = train_discriminative_model(xs_train, ys_train)
Hdagger = train_trivial_generative_model(xs_train, ys_train)
print(B.shape, Hdagger.shape)

K = 16 # stupid
assert K >= M
P = np.eye(K)[:M, :] # first M rows of the K x K identity
G_msv, zs = train_msv_generative_model(xs_train, ys_train, P)
G_pca = train_pca_generative_model(xs_train, ys_train, K)
# the MSV latents zs initialize the WOW model; zs is then rebound to its latents
G_wow, zs = train_wow_generative_model(xs_train, ys_train, P, 1000., Z = zs)
# hack: Use MSV's latents to initialize DWH's model
# G_dwh, zs = train_dwh_generative_model(xs_train, ys_train, P, Cx, Cy, Z=zs)

# predictions of every estimator on the held-out test inputs
ys_efficient_test = (W @ xs_test.T).T #W is information theory optimal
ys_msv_test = (G_msv @ xs_test.T).T
ys_pca_test = (G_pca @ xs_test.T).T
ys_wow_test = (G_wow @ xs_test.T).T
ys_discriminative_test = (B @ xs_test.T).T

In [0]:
def rms(x):
  """
  Return the root-mean-square of all elements of `x`.
  """
  mean_square = np.mean(x * x)
  return np.sqrt(mean_square)

# report the held-out RMS prediction error of every estimator for label column m
m = 0
for label, ys_predicted in (
    ("best possible", ys_efficient_test),
    ("discriminative", ys_discriminative_test),
    ("MSV generative", ys_msv_test),
    ("PCA generative", ys_pca_test),
    ("WOW generative", ys_wow_test),
):
  print(label, rms(ys_test[:, m] - ys_predicted[:, m]))

In [0]:
# histograms of the held-out residuals (label column m) for every estimator,
# side by side in one row of five panels
plt.figure(figsize=(20,4))
plt.subplot(151)
# the bin edges chosen for this first histogram are reused by all the others
foo, bins, bar = plt.hist(ys_test[:, m] - ys_discriminative_test[:, m], bins=32)
plt.title("discriminative")
plt.subplot(155)
foo, bins, bar = plt.hist(ys_test[:, m] - ys_efficient_test[:, m], bins=bins)
plt.title("God's best")
plt.subplot(152)
plt.hist(ys_test[:, m] - ys_msv_test[:, m], bins=bins)
plt.title("MSV generative")
plt.subplot(153)
plt.hist(ys_test[:, m] - ys_wow_test[:, m], bins=bins)
plt.title("WOW generative")
plt.subplot(154)
plt.hist(ys_test[:, m] - ys_pca_test[:, m], bins=bins)
plt.title("PCA generative")

In [0]:
# make K plot at fiducialN, fiducialD
D = fiducialD
N = fiducialN
# grid of K in quarter-octave steps from 1 up to 2^0.75 * D, rounded and
# de-duplicated
Ks = np.unique(np.round(2. ** np.arange(0., np.log2(D) + 0.76, 0.25)).astype(int))
# residuals; slots along axis 1 are 0 = discriminative, 1 = MSV generative,
# 2 = PCA generative, 3 = God's estimator; slot 4 is never filled and stays NaN
dys_K = np.zeros((len(Ks), 5, Ntrial, Ntest, M)) + np.nan
# the covariance block and God's estimator do not depend on K: compute them once
C = maximalC[-D-M:, -D-M:]
W = gods_estimator(C, M)
for ii, K in enumerate(Ks):
  print("starting trials for K =", K)
  P = np.eye(K)[:M, :]
  Bs, Gs_msv, Gs_pca = [], [], []
  for trial in range(Ntrial):
    xs_train = maximalX[trial, :N, -D:]
    ys_train = maximalY[trial, :N, :]
    xs_test = maximalXtest[trial, :, -D:]
    ys_test = maximalYtest[trial, :, :]
    B = train_discriminative_model(xs_train, ys_train)
    Bs.append(B)
    G_msv, zs = train_msv_generative_model(xs_train, ys_train, P)
    Gs_msv.append(G_msv)
    G_pca = train_pca_generative_model(xs_train, ys_train, K)
    Gs_pca.append(G_pca)
    dys_K[ii, 0, trial] = ys_test - (B @ xs_test.T).T
    dys_K[ii, 1, trial] = ys_test - (G_msv @ xs_test.T).T
    dys_K[ii, 2, trial] = ys_test - (G_pca @ xs_test.T).T
    dys_K[ii, 3, trial] = ys_test - (W @ xs_test.T).T


In [0]:
# mean squared residual per (K, estimator): average over the last three axes
mses_K = np.mean(np.mean(np.mean(dys_K ** 2, axis=-1), axis=-1), axis=-1)
# best K: the one where the better of columns 1 and 2 (MSV, PCA) is smallest
bestK = Ks[np.argmin(np.nanmin(mses_K[:, 1:3], axis=-1))]
K = bestK
plt.figure(figsize=(7, 5))
plt.subplot(111)
# y limits padded by a factor 0.7 on either side of the finite data range
y1 = 0.7 * np.nanmin(mses_K)
y2 = np.nanmax(mses_K) / 0.7
# height for the text labels: 95 percent of the way up the logarithmic y range
yt = np.exp(np.log(y1) + 0.95 * np.log(y2 / y1))
# shade the region below the column-3 (God's estimator) MSE
plt.fill_between(Ks, mses_K[:, 3], y1 + 0. * mses_K[:, 3], color="k", alpha=0.1)
plt.plot(Ks, mses_K[:, 0], "k.", label="discriminative")
plt.plot(Ks, mses_K[:, 1], "ro", mfc="none", label="MSV generative")
plt.plot(Ks, mses_K[:, 2], "go", mfc="none", ms=10, label="PCA generative")
plt.plot(Ks, mses_K[:, 3], "k-", lw=0.5, label="best possible MSE")
plt.loglog()
# vertical markers at D, N, and the selected K
plt.axvline(fiducialD, color="k", alpha=0.5)
plt.text(fiducialD, yt, " D", color="k", alpha=0.5)
plt.axvline(fiducialN, color="b", alpha=0.5)
plt.text(fiducialN, yt, " N", color="b", alpha=0.5)
plt.axvline(K, color="r", alpha=0.5)
plt.text(K, yt, " K", color="r", alpha=0.5)
plt.legend()
plt.ylim(y1, y2)
plt.ylabel("MSE (variance + bias^2) in prediction of held-out y")
plt.xlabel("size K of human-created latent space")
plt.title("varying K at fixed D, N, C")

In [0]:
# make D plot at fiducialN
K = bestK
N = fiducialN
# grid of D in quarter-octave steps from 1 to maximalD, rounded and de-duplicated
Ds = np.unique(np.round(2 ** np.arange(0., np.log2(maximalD) + 0.01, 0.25)).astype(int))
# residuals; slots along axis 1 are 0 = discriminative, 1 = MSV generative,
# 2 = PCA generative, 3 = God's estimator; the remaining slots stay NaN
dys_D = np.zeros((len(Ds), 6, Ntrial, Ntest, M)) + np.nan
# the estimators of the zeroth trial are kept for every D
Ws, Bs, Gs_msv, Gs_pca = [], [], [], []
# P depends only on K and M, which are fixed for this sweep: build it once
P = np.eye(K)[:M, :]
for ii, D in enumerate(Ds):
  print("starting trials for D =", D)
  C = maximalC[-D-M:, -D-M:]
  W = gods_estimator(C, M)
  Ws.append(W)
  for trial in range(Ntrial):
    xs_train = maximalX[trial, :N, -D:]
    ys_train = maximalY[trial, :N, :]
    xs_test = maximalXtest[trial, :, -D:]
    ys_test = maximalYtest[trial, :, :]
    B = train_discriminative_model(xs_train, ys_train)
    G_msv, zs = train_msv_generative_model(xs_train, ys_train, P)
    G_pca = train_pca_generative_model(xs_train, ys_train, K)
    if trial == 0:
      Bs.append(B)
      Gs_msv.append(G_msv)
      Gs_pca.append(G_pca)
    dys_D[ii, 0, trial] = ys_test - (B @ xs_test.T).T
    dys_D[ii, 1, trial] = ys_test - (G_msv @ xs_test.T).T
    dys_D[ii, 2, trial] = ys_test - (G_pca @ xs_test.T).T
    dys_D[ii, 3, trial] = ys_test - (W @ xs_test.T).T

In [0]:
# mean squared residual per (D, estimator): average over the last three axes
mses_D = np.mean(np.mean(np.mean(dys_D ** 2, axis=-1), axis=-1), axis=-1)
plt.figure(figsize=(7, 5))
plt.subplot(111)
# y limits padded by a factor 0.7 on either side of the finite data range
y1 = 0.7 * np.nanmin(mses_D)
y2 = np.nanmax(mses_D) / 0.7
# height for the text labels: 95 percent of the way up the logarithmic y range
yt = np.exp(np.log(y1) + 0.95 * np.log(y2 / y1))
# shade the region below the column-3 (God's estimator) MSE
plt.fill_between(Ds, mses_D[:, 3], y1 + 0. * mses_D[:, 3], color="k", alpha=0.1)
plt.plot(Ds, mses_D[:, 0], "k.", label="discriminative")
plt.plot(Ds, mses_D[:, 1], "ro", mfc="none", label="MSV generative")
plt.plot(Ds, mses_D[:, 2], "go", mfc="none", ms=10, label="PCA generative")
plt.plot(Ds, mses_D[:, 3], "k-", lw=0.5, label="God's best MSE")
plt.loglog()
# vertical markers at N and the selected K
plt.axvline(fiducialN, color="b", alpha=0.5)
plt.text(fiducialN, yt, " N", color="b", alpha=0.5)
plt.axvline(K, color="r", alpha=0.5)
plt.text(K, yt, " K", color="r", alpha=0.5)
plt.legend()
plt.ylim(y1, y2)
plt.ylabel("MSE (variance + bias^2) in prediction of held-out y")
plt.xlabel("number of image pixels D")
plt.title("varying D at fixed K, N; truncating or extending C")

In [0]:
# make N plot at fiducialD
K = bestK
D = fiducialD
# grid of N in quarter-octave steps from 1 to maximalN, rounded and de-duplicated
Ns = np.unique(np.round(2 ** np.arange(0., np.log2(maximalN) + 0.01, 0.25)).astype(int))
# residuals; slots along axis 1 are 0 = discriminative, 1 = MSV generative,
# 2 = PCA generative, 3 = God's estimator; slot 4 is never filled and stays NaN
dys_N = np.zeros((len(Ns), 5, Ntrial, Ntest, M)) + np.nan
# nothing below depends on N: build the covariance block, God's estimator,
# and P once, outside the loops
C = maximalC[-D-M:, -D-M:]
W = gods_estimator(C, M)
P = np.eye(K)[:M, :]
for ii, N in enumerate(Ns):
  print("starting trials for N =", N)
  for trial in range(Ntrial):
    xs_train = maximalX[trial, :N, -D:]
    ys_train = maximalY[trial, :N, :]
    xs_test = maximalXtest[trial, :, -D:]
    ys_test = maximalYtest[trial, :, :]
    B = train_discriminative_model(xs_train, ys_train)
    G_msv, zs = train_msv_generative_model(xs_train, ys_train, P)
    G_pca = train_pca_generative_model(xs_train, ys_train, K)
    dys_N[ii, 0, trial] = ys_test - (B @ xs_test.T).T
    dys_N[ii, 1, trial] = ys_test - (G_msv @ xs_test.T).T
    dys_N[ii, 2, trial] = ys_test - (G_pca @ xs_test.T).T
    dys_N[ii, 3, trial] = ys_test - (W @ xs_test.T).T

In [0]:
# mean squared residual per (N, estimator): average over the last three axes
mses_N = np.mean(np.mean(np.mean(dys_N ** 2, axis=-1), axis=-1), axis=-1)
plt.figure(figsize=(7, 5))
plt.subplot(111)
# y limits padded by a factor 0.7 on either side of the finite data range
y1 = 0.7 * np.nanmin(mses_N)
y2 = np.nanmax(mses_N) / 0.7
# height for the text labels: 95 percent of the way up the logarithmic y range
yt = np.exp(np.log(y1) + 0.95 * np.log(y2 / y1))
# shade the region below the column-3 (God's estimator) MSE
plt.fill_between(Ns, mses_N[:, 3], y1 + 0. * mses_N[:, 3], color="k", alpha=0.1)
plt.plot(Ns, mses_N[:, 0], "k.", label="discriminative")
plt.plot(Ns, mses_N[:, 1], "ro", mfc="none", label="MSV generative")
plt.plot(Ns, mses_N[:, 2], "go", mfc="none", ms=10, label="PCA generative")
plt.plot(Ns, mses_N[:, 3], "k-", lw=0.5, label="best possible MSE")
plt.loglog()
# vertical markers at D and the selected K
plt.axvline(fiducialD, color="k", alpha=0.5)
plt.text(fiducialD, yt, " D", color="k", alpha=0.5)
plt.axvline(K, color="r", alpha=0.5)
plt.text(K, yt, " K", color="r", alpha=0.5)
plt.legend()
plt.ylim(y1, y2)
plt.ylabel("MSE (variance + bias^2) in prediction of held-out y")
plt.xlabel("number of training-set objects N")
plt.title("varying N at fixed K, D, C")

In [0]:
# make weight-ratio plot at fiducialN, fiducialD
K = bestK
D = fiducialD
N = fiducialN
# weight ratios in factor-of-two steps from 2^-2 to 2^10
wows = 2. ** np.arange(-2., 10.01, 1.)
# residuals; slots along axis 1 are 0 = discriminative, 1 = MSV generative,
# 2 = PCA generative, 3 = God's estimator, 4 = WOW generative
dys_w = np.zeros((len(wows), 5, Ntrial, Ntest, M))
# nothing below depends on the weight ratio: build P, the covariance block,
# and God's estimator once, outside the loops
P = np.eye(K)[:M, :]
C = maximalC[-D-M:, -D-M:]
W = gods_estimator(C, M)
for ii, wow in enumerate(wows):
  print("starting trials for wow =", wow)
  for trial in range(Ntrial):
    xs_train = maximalX[trial, :N, -D:]
    ys_train = maximalY[trial, :N, :]
    xs_test = maximalXtest[trial, :, -D:]
    ys_test = maximalYtest[trial, :, :]
    # the models are retrained for every weight ratio on purpose: the MSV
    # fit draws from the global random state, so moving it out of the loop
    # would change the random stream and therefore the results
    B = train_discriminative_model(xs_train, ys_train)
    G_msv, zs = train_msv_generative_model(xs_train, ys_train, P)
    # the MSV latents initialize the WOW model
    G_wow, zs = train_wow_generative_model(xs_train, ys_train, P, wow, Z=zs)
    G_pca = train_pca_generative_model(xs_train, ys_train, K)
    dys_w[ii, 0, trial] = ys_test - (B @ xs_test.T).T
    dys_w[ii, 1, trial] = ys_test - (G_msv @ xs_test.T).T
    dys_w[ii, 2, trial] = ys_test - (G_pca @ xs_test.T).T
    dys_w[ii, 3, trial] = ys_test - (W @ xs_test.T).T
    dys_w[ii, 4, trial] = ys_test - (G_wow @ xs_test.T).T

In [0]:
# mean squared residual per (weight ratio, estimator): average over the last
# three axes
mses_w = np.mean(np.mean(np.mean(dys_w ** 2, axis=-1), axis=-1), axis=-1)
plt.figure(figsize=(7, 5))
plt.subplot(111)
# y limits padded by a factor 0.7 on either side of the finite data range
y1 = 0.7 * np.nanmin(mses_w)
y2 = np.nanmax(mses_w) / 0.7
# shade the region below the column-3 (God's estimator) MSE
plt.fill_between(wows, mses_w[:, 3], y1 + 0. * mses_w[:, 3], color="k", alpha=0.1)
plt.plot(wows, mses_w[:, 0], "k.", label="discriminative")
plt.plot(wows, mses_w[:, 1], "ro", mfc="none", label="MSV generative")
plt.plot(wows, mses_w[:, 4], "gx", ms=10, label="WOW generative")
plt.plot(wows, mses_w[:, 2], "go", mfc="none", ms=10, label="PCA generative")
plt.plot(wows, mses_w[:, 3], "k-", lw=0.5, label="best possible MSE")
plt.loglog()
plt.legend()
plt.ylim(y1, y2)
plt.ylabel("MSE (variance + bias^2) in prediction of held-out y")
plt.xlabel("ratio of weights")
plt.title("varying weight ratio at fixed K, D, N, C")