<a href="https://colab.research.google.com/github/davidwhogg/GenerativeVsDiscriminative/blob/master/ipynb/generative_vs_discriminative.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Generative *vs* Discriminative models for inference

- Do generative models generally outperform discriminative models in various ways?
- How do both of these compare to information-theory-optimal estimators?
- Are generative models less subject to adversarial attack?
- Are generative and discriminative models equally good for de-noising noisy labels?
- ... and related matters.

## authors
- **David W Hogg** *(NYU) (MPIA) (Flatiron)*
- **Soledad Villar** *(NYU)*

## license
Copyright 2019, 2020 the authors. All rights reserved (for now).

## notes and to-do items
- Replace all `pseudo_solve()` and `pseudo_inverse()` calls with `np.linalg.lstsq()[0]` calls.
- Consider making a method that fits a Gaussian to the joint of `X` and `Y`.
- Rearrange things so we can loop over some amplitudes of `C_x` and `C_y` as well.
- Decide what are our main results / observations.

In [0]:
import numpy as np
import pylab as plt
#matplotlib inline

In [0]:
# set up the data-generating matrices of God
# BUG: This should probably be a function call.
# NOTE: everything in this cell is a module-level global read by later cells.
# The seed / draw order below fixes the random matrices, so do not reorder.

# set God's integers
maximalD = 2 ** 9 # ESLII calls this "p"; largest number of x features (rows of maximalQ)
fiducialD = 2 ** 7 # default number of x features used in the experiments
M = 1 # ESLII always sets this to 1; number of y labels (rows of P)
K = 2 ** 4 # number of true latent dimensions (columns of P and maximalQ)
assert(M <= K)

# set training set size
maximalN = maximalD # largest training-set size
fiducialN = 2 ** 6 # default training-set size
Ntest = 2 ** 8 # number of points to use to compute biases and variances
Ntrial = 2 ** 4 # number of training trials to use to compute biases and variances

# set God's matrices
# the true world can be a (K-1)-order Legendre (orthogonal) or Chebyshev (non-orthogonal) polynomial
#Q = np.vstack([np.polynomial.legendre.legval(np.arange(-1. + 1./D, 1., 2./D),
#                                             np.eye(K)[k])
#    for k in range(K)]).T
# or the true world can be a set of random matrices
np.random.seed(42)
maximalQ = np.random.normal(size=(maximalD, K)) # maps latents to noise-free x
np.random.seed(17)
P = np.random.normal(size=(M, K)) # maps latents to noise-free y

# make noise covariance matrices
# each covariance is built as foo @ foo.T, which is symmetric and
# non-negative-definite by construction
np.random.seed(13)
foo = np.random.normal(size=(maximalD, 2 * maximalD))
maximalCx = 2 ** -2 * foo @ foo.T # noise covariance of x
foo = np.random.normal(size=(M, 2 * M))
Cy = 2 ** -2 * foo @ foo.T # noise covariance of y
foo = np.random.normal(size=(K, 2 * K))
V = foo @ foo.T # covariance of the latents

In [0]:
# define functions to take and use pseudo-inverses.

def pseudo_solve(A, x, Cinv=None, weights=None, rcond=None, lamb=0.):
  """
  return solve(ATA, ATx) but be somewhat clever about numerics

  ## inputs:
  - A: rectangular matrix
  - x: right-hand side with as many rows as A, or None to get the (weighted)
      pseudo-inverse matrix itself instead of its product with x.
  - Cinv (optional): square non-negative semi-definite matrix that can multiply
      AT on the right or A on the left, and x on the left.
  - weights (optional): diagonal of Cinv if Cinv is diagonal (conflicts with
      Cinv).
  - rcond (optional): a regularization parameter for np.linalg.lstsq(); it is
      honored in both the wide and the tall branch.
  - lamb (optional): L2 regularization parameter (use at your own risk).

  ## outputs:
  - Pseudo-inverse of A applied to x (or the pseudo-inverse itself if x is
    None).

  ## raises:
  - AssertionError if both Cinv and weights are given.
  """
  nrows, ncols = A.shape # need this to decide whether to do ATA or AAT

  # use weights or Cinv to weight A.T
  if weights is not None:
    assert Cinv is None, "pseudo_solve(): You can't set both Cinv and weights"
    wAT = weights * A.T
  elif Cinv is not None:
    wAT = A.T @ Cinv
  else:
    wAT = A.T

  # wide A: solve the smaller (nrows x nrows) system built from A AT
  if nrows < ncols:
    AwAT = A @ wAT + lamb * np.eye(nrows)
    if x is None:
      return np.linalg.lstsq(AwAT, wAT.T, rcond=rcond)[0].T
    return wAT @ np.linalg.lstsq(AwAT, x, rcond=rcond)[0] # There are many ways to write this line

  # tall or square A: solve the (ncols x ncols) normal equations
  wATx = wAT if x is None else wAT @ x
  wATA = wAT @ A + lamb * np.eye(ncols)
  # the caller's rcond is passed through here too (it used to be ignored)
  return np.linalg.lstsq(wATA, wATx, rcond=rcond)[0]

def pseudo_inverse(A, Cinv=None, weights=None, rcond=None, lamb=0.):
  """
  return solve(ATA, AT) but be somewhat clever about numerics

  Thin wrapper: calls pseudo_solve() with no right-hand side (x=None) and
  forwards every option unchanged.
  """
  options = dict(Cinv=Cinv, weights=weights, rcond=rcond, lamb=lamb)
  return pseudo_solve(A, None, **options)

In [0]:
def gods_estimator(P, Q, V, Cx):
  """
  make the best possible estimator, in terms of mean squared error on Y
  (this, we hope, is what everything approaches, in the limit N -> infinity)

  ## inputs:
  - P: matrix multiplied as Q V P.T (rows are labels, columns are latents)
  - Q: matrix multiplied as Q V Q.T (rows are features, columns are latents)
  - V: square matrix sandwiched between Q and Q.T / P.T
  - Cx: square matrix added to Q V Q.T

  ## outputs:
  - W, the transpose of the least-squares solution of
    (Q V Q.T + Cx) W.T = Q V P.T
    so that predictions are made as W @ x.
  """
  QV = Q @ V # shared by both sides of the linear system
  return np.linalg.lstsq(QV @ Q.T + Cx, QV @ P.T,
                         rcond=None)[0].T

In [0]:
# build the estimator from the full-size (maximalD) matrices
maximalW = gods_estimator(P, maximalQ, V, maximalCx)
# bare expression: presumably here so the notebook displays the shape
maximalW.shape

In [0]:
def make_data_set(N, P, Q, V, Cx, Cy):
  """
  actually make the random data, using the matrices of God

  ## inputs:
  - N: number of objects to draw
  - P, Q: matrices that map latents to noise-free ys and xs respectively;
      they must have the same number of columns
  - V: covariance of the zero-mean Gaussian latents
  - Cx, Cy: covariances of the zero-mean Gaussian noise added to xs and ys

  ## outputs:
  - xs, ys: the noisy data and noisy labels, one row per object

  ## comments:
  - Uses the global np.random state; the three draws happen in the order
    latents, x noise, y noise.
  """
  n_labels, n_latents = P.shape
  n_features, n_latents_Q = Q.shape
  assert n_latents == n_latents_Q

  # all random draws, in a fixed order
  latents = np.random.multivariate_normal(np.zeros(n_latents), V, size=N)
  x_noise = np.random.multivariate_normal(np.zeros(n_features), Cx, size=N)
  y_noise = np.random.multivariate_normal(np.zeros(n_labels), Cy, size=N)

  # "true" values plus noise
  xs = (Q @ latents.T).T + x_noise
  ys = (P @ latents.T).T + y_noise
  return xs, ys

In [0]:
# make maximal train and test sets
# NOTE THAT THE TEST Y HAS INFINITE yivars OR ZERO NOISE
# One (maximalN x maximalD) training set and one (Ntest x maximalD) test set
# per trial; later cells slice these down to smaller N and D.
np.random.seed(8675309 + 1)
maximalX = np.zeros((Ntrial, maximalN, maximalD))
maximalY = np.zeros((Ntrial, maximalN, M))
for trial in range(Ntrial):
  maximalX[trial], maximalY[trial] = make_data_set(maximalN, P, maximalQ, V, maximalCx, Cy)
maximalXtest = np.zeros((Ntrial, Ntest, maximalD))
maximalYtest = np.zeros((Ntrial, Ntest, M))
for trial in range(Ntrial):
  # zero-noise test labels maximalYtest
  # (the y-noise covariance passed in is 0. * Cy)
  maximalXtest[trial], maximalYtest[trial] = make_data_set(Ntest, P, maximalQ, V, maximalCx, 0. * Cy)
print(maximalX.shape, maximalY.shape)

In [0]:
# singular values of the fiducial-size training block of trial 0
# (only s is plotted; u and v are not used in this cell)
u, s, v = np.linalg.svd(maximalX[0, :fiducialN, :fiducialD], full_matrices=False)
plt.axvline(K) # mark the true latent dimension K
plt.plot(s, "ko")
plt.semilogy()
# NOTE(review): the plotted values are the singular values s, not s ** 2;
# confirm that the y label says what is intended.
plt.ylabel("contribution to variance")
plt.xlabel("component number")
plt.title("SVD of the fiducial data set, zeroth trial")

In [0]:
#examples = np.arange(4)
#colors=['red', 'purple', 'green', 'blue']
#for i,n in enumerate(examples):
#  plt.step(np.arange(maximalD), xs_train[n], linestyle='-',alpha=0.5, color=colors[i], where="mid")

In [0]:
def train_discriminative_model(xs, ys):
  """
  train discriminative model y = B x + noise

  ## inputs:
  - xs - array of training data (one row per object)
  - ys - array of training labels (one row per object)

  ## outputs:
  - B, the transpose of the np.linalg.lstsq() solution of xs @ B.T = ys, so
    that predictions are made as B @ x.

  ## comments
  Informally speaking, this finds the B that minimizes || ys - B . xs ||, and
  returns the discriminative matrix B.

  ## bugs:
  - Properly, this should take in an estimate of the inverse variances. These
    might be some processing of the xivars and yivars; we need to figure that
    out.
  - Doesn't fit for a bias term (that is, a zero-offset).
  """
  solution, _, _, _ = np.linalg.lstsq(xs, ys, rcond=None)
  return solution.T

def train_trivial_generative_model(xs, ys):
  """
  fit the data from the labels: least-squares solve ys @ H = xs for H

  ## inputs:
  - xs - array of training data (one row per object)
  - ys - array of training labels (one row per object)

  ## outputs:
  - H, the np.linalg.lstsq() solution (NOT transposed), with one row per label
    column and one column per data column.

  ## bugs:
  - Doesn't fit for a bias term (that is, a zero-offset).
  """
  solution, _, _, _ = np.linalg.lstsq(ys, xs, rcond=None)
  return solution

def initialize_a_generative_model(Y, P):
  """
  make starting latents for the generative-model trainers

  ## inputs:
  - Y - array of training labels (one row per object)
  - P - projector such that Y.T = P @ Z

  ## outputs:
  - the np.linalg.lstsq() solution of P @ Z = Y.T, projected with
    P.T @ pseudo_inverse(P.T)
  - random Gaussian latents of the same shape, projected with the
    complementary projector
  - the complementary projector itself (identity minus the projector above)

  ## comments:
  - Draws from the global np.random state.
  """
  _, latent_dim = P.shape
  PT = P.T
  onto_P = PT @ pseudo_inverse(PT)
  off_P = np.identity(latent_dim) - onto_P

  Z_from_labels = np.linalg.lstsq(P, Y.T, rcond=None)[0]
  Z_random = np.random.normal(size=Z_from_labels.shape)
  return onto_P @ Z_from_labels, off_P @ Z_random, off_P

def train_msv_generative_model(X, Y, P, maxiter=10000, Z=None):
  """
  ## Inputs:
  X: N x D
  Y: N x M
  P: M x humanK
  maxiter: maximum number of alternating iterations
  Z (optional): humanK x N starting latents; it is NOT modified in place. If
    None, the start is built from initialize_a_generative_model().

  ## Outputs:
  - the M x D prediction matrix P @ pseudo_inverse(A).T
  - the final latents, transposed (N x humanK)

  ## Comments:
  among all solutions Z such that Y=Z*P.T (ie Y.T=P*Z.T)
  we minimize ||X - Z*A|| by alternating minimization
  we write Z = Z0 + Z1, proj_P*Z=Z0, proj_Pperp*Z = Z1
  The loop stops at the first iteration whose cost does not decrease.

  ## bugs:
  - Can be simplified with pseudo_inverse().
  - Doesn't use xivars, yivars.
  - Doesn't fit for a bias term (that is, a zero-offset).
  - NOTE(review): on exit the A and Z of the last (non-improving) iteration
    are returned, not those of the best iteration; confirm this is intended.
  """
  # always called, even when Z is given, so that the global random state
  # advances exactly as it did before
  Z0, Z1, proj_Pperp = initialize_a_generative_model(Y, P)
  if Z is None:
    Z = Z0 + Z1
  tiny = 1.e-9
  unconverged = True
  last_cost = np.inf
  for i in range(maxiter):
    # normalize the perpendicular part of Z
    # (written out-of-place so that a caller-supplied Z is never mutated; the
    # arithmetic is the same as the old in-place -=, /=, += sequence)
    Z1 = proj_Pperp @ Z
    Z = (Z - Z1) + Z1 / (tiny + np.sqrt(np.mean(Z1 ** 2, axis=1))[:, None])

    #fix Z optimize for A
    A = np.linalg.lstsq(Z.T, X, rcond=None)[0]

    #fix A optimize for Z
    a = A.T @ proj_Pperp.T
    b = X.T - A.T @ Z0
    Z1 = np.linalg.lstsq(a, b, rcond=None)[0]
    Z1 = proj_Pperp @ Z1

    #convergence
    Z = Z0 + Z1
    #BUG: unstable!
    cost = np.linalg.norm(X.T - A.T @ Z)

    if cost >= last_cost:
      unconverged = False
      break
    last_cost = cost
  if unconverged: print("train_msv_generative_model(): WARNING: did not converge.")
  return  P @ pseudo_inverse(A).T, Z.T

In [0]:
def z_step(X, Y, A, P, sqrtCxinv, sqrtCyinv):
  """
  update the latents Z at fixed A and P

  Stacks the weighted x and y problems into one least-squares system and
  solves it with np.linalg.lstsq(); returns the solution transposed (one row
  per object).

  NOTE(review): the data are weighted as X @ sqrtCxinv while the model is
  weighted as sqrtCxinv @ A, which is only consistent if the weight matrices
  are symmetric -- confirm against the caller.
  """
  weighted_model = np.vstack((sqrtCxinv @ A, sqrtCyinv @ P))
  weighted_data = np.hstack((X @ sqrtCxinv, Y @ sqrtCyinv))
  solution = np.linalg.lstsq(weighted_model, weighted_data.T, rcond=None)
  return solution[0].T

def a_step(X, Z):
  """
  update A at fixed latents Z: least-squares solve Z @ A.T = X and return A

  BUG: This doesn't currently use the Cx, Cy correctly.
  BUG: meaning: It treats the data as homoskedastic.
  """
  solution, *_ = np.linalg.lstsq(Z, X, rcond=None)
  return solution.T

def dwh_cost(X, Y, Z, A, P, sqrtCxinv, sqrtCyinv):
  """
  total chi-squared-like cost: the sum of squares of the weighted x residuals
  sqrtCxinv @ (X.T - A @ Z.T) plus that of the weighted y residuals
  sqrtCyinv @ (Y.T - P @ Z.T)

  ## Bugs:
  - Untested!
  """
  x_residual = X.T - (A @ Z.T)
  y_residual = Y.T - (P @ Z.T)
  weighted_residuals = (sqrtCxinv @ x_residual, sqrtCyinv @ y_residual)
  return sum(np.sum(chi ** 2) for chi in weighted_residuals)

def train_dwh_generative_model(X, Y, P, Cx, Cy, maxiter=10000, lltol=0.00001, Z=None):
  """
  train a generative model by alternating a_step() and z_step(), with the
  cost dwh_cost() used for convergence control

  ## Inputs:
  - X, Y: training data and labels (one row per object)
  - P: human-set projector such that Y.T ~ P @ Z.T
  - Cx, Cy: noise covariances of X and Y; their inverse square roots are built
      from an SVD, so they must have no zero singular values
  - maxiter: maximum number of iterations
  - lltol: tolerance on the change in cost used to declare convergence
  - Z (optional): starting latents (one row per object); it is NOT modified
      in place. If None, the start is built from
      initialize_a_generative_model().

  ## Outputs:
  - the prediction matrix P @ pseudo_inverse(A, Cinv=Cxinv)
  - the latents Z that go with the returned A

  ## Bugs:
  - Initialization bad here. This should probably initialize at the internal
    state of the MSV generative model optimized earlier.
  - Takes forever to converge when K = M, apparently. This may be related to
    initialization also.
  """
  # get ready the square roots of the inverses of Cx, Cy
  u, s, v = np.linalg.svd(Cx)
  sqrtCxinv = (u / np.sqrt(s)) @ v
  Cxinv = (u / s) @ v
  u, s, v = np.linalg.svd(Cy)
  sqrtCyinv = (u / np.sqrt(s)) @ v

  # initialize loop
  # (always called, even when Z is given, so that the global random state
  # advances exactly as it did before)
  Z0, Z1, proj_P_perp = initialize_a_generative_model(Y, P)
  if Z is None:
    Z = (Z0 + Z1).T
  tiny = 1.e-9
  cost = np.inf
  unconverged = True
  A = 0.

  # loop
  for ii in range(maxiter):
    # A and Z are only ever rebound below, never mutated, so plain references
    # are enough to remember the previous state
    old_cost = cost
    oldA, oldZ = A, Z

    # normalization step
    # (written out-of-place so that a caller-supplied Z is never mutated; the
    # arithmetic is the same as the old in-place -=, /=, += sequence)
    Zperp = (proj_P_perp @ Z.T).T
    Z = (Z - Zperp) + Zperp / (tiny + np.sqrt(np.mean(Zperp ** 2, axis=0))[None, :])

    # standard iteration
    A = a_step(X, Z)
    Z = z_step(X, Y, A, P, sqrtCxinv, sqrtCyinv)

    # convergence control
    cost = dwh_cost(X, Y, Z, A, P, sqrtCxinv, sqrtCyinv)
    if cost > 1. and cost > (old_cost + lltol): # note insane condition
      print("train_dwh_generative_model(): WARNING: cost went the wrong way!")
      print(ii, cost, old_cost)
      # restore BOTH halves of the previous state, so that the returned A and
      # Z belong to the same iteration
      A, Z = oldA, oldZ
    if cost > (old_cost - lltol):
      unconverged = False
      break
  if unconverged: print("train_dwh_generative_model(): WARNING: did not converge.")
  return P @ pseudo_inverse(A, Cinv=Cxinv), Z

In [0]:
# run on the first trial case, for the fiducials
# (slice trial 0 of the maximal sets down to fiducialN objects and fiducialD
# features)
xs_train = maximalX[0, :fiducialN, :fiducialD]
ys_train = maximalY[0, :fiducialN, :]
xs_test = maximalXtest[0, :, :fiducialD]
ys_test = maximalYtest[0, :, :]
Cx = maximalCx[:fiducialD, :fiducialD]
W = gods_estimator(P, maximalQ[:fiducialD, :], V, Cx)

print(xs_train.shape, ys_train.shape)
B = train_discriminative_model(xs_train, ys_train)
Hdagger = train_trivial_generative_model(xs_train, ys_train)
print(B.shape, Hdagger.shape)

# alternative choices for the human-set projector, kept for reference:
# humanP = P
# humanP = np.hstack((P, np.zeros((M, 2))))
# humanP = P[:, :9]
humanK = K # stupid
assert humanK >= M
# the human-set projector picks out the first M of the humanK latents
humanP = np.eye(humanK)[:M, :]
print(P.shape, humanP.shape)
G_msv, zs = train_msv_generative_model(xs_train, ys_train, humanP)
# hack: Use MSV's latents to initialize DWH's model
G_dwh, zs = train_dwh_generative_model(xs_train, ys_train, humanP, Cx, Cy, Z=zs)

# predictions of every estimator on the test set, one row per test object
ys_efficient_test = (W @ xs_test.T).T #W is information theory optimal
ys_msv_test = (G_msv @ xs_test.T).T
ys_dwh_test = (G_dwh @ xs_test.T).T
ys_discriminative_test = (B @ xs_test.T).T
ys_trivial_test = (Hdagger @ xs_test.T).T

In [0]:
def rms(x):
  """
  return the root-mean-square of x, taken over all of its elements
  """
  mean_square = np.mean(x * x)
  return np.sqrt(mean_square)

m = 0
# (name, predictions) pairs, reported in this order: RMS error of label
# column m on the test set for each estimator
reports = (
  ("best possible", ys_efficient_test),
  ("discriminative", ys_discriminative_test),
  ("alt-trivial generative", ys_trivial_test),
  ("MSV generative", ys_msv_test),
  ("DWH generative", ys_dwh_test),
)
for label, predictions in reports:
  print(label, rms(ys_test[:, m] - predictions[:, m]))

In [0]:
# histograms of the test-set residuals of label column m, one panel per
# estimator
plt.figure(figsize=(20,4))
plt.subplot(151)
# the first histogram chooses the 32 bin edges ...
foo, bins, bar = plt.hist(ys_test[:, m] - ys_discriminative_test[:, m], bins=32)
plt.title("discriminative")
plt.subplot(155)
# ... and every later panel is drawn with the bin edges of the panel before
# it, so all panels share one binning (the panels are filled out of order:
# 1, 5, 2, 3, 4)
foo, bins, bar = plt.hist(ys_test[:, m] - ys_efficient_test[:, m], bins=bins)
plt.title("God's best")
plt.subplot(152)
plt.hist(ys_test[:, m] - ys_trivial_test[:, m], bins=bins)
plt.title("alt-trivial generative")
plt.subplot(153)
plt.hist(ys_test[:, m] - ys_msv_test[:, m], bins=bins)
plt.title("MSV generative")
plt.subplot(154)
plt.hist(ys_test[:, m] - ys_dwh_test[:, m], bins=bins)
plt.title("DWH generative")

In [0]:
def compute_mean_variance(ixs):
  """
  return the mean and the (population, ddof=0) variance of all elements of ixs

  ## inputs:
  - ixs: anything np.array() can turn into a numeric array

  ## outputs:
  - mean, var

  ## comments:
  - The variance is computed from the deviations about the mean (np.var), not
    as mean(x ** 2) - mean(x) ** 2, which loses all precision when the spread
    is small compared to the mean.
  """
  xs = np.array(ixs)
  return np.mean(xs), np.var(xs)

In [0]:
# make D plot at fiducialN
# For each number of features D, train every model on each trial's training
# set (sliced to N objects and D features) and store the test-set residuals.
# The trainers draw from the global np.random state, so the results depend on
# the order of the calls below.

humanK = K # stupid
N = fiducialN
# D values: maximalD times powers of two, in steps of a quarter of a power
Ds = np.round((maximalD * 2 ** np.arange(-6.5, 0.01, 0.25))).astype(int)
# axes: D value, estimator (0 discriminative, 1 MSV, 2 DWH, 3 God's), trial,
# test object, label
dys_D = np.zeros((len(Ds), 4, Ntrial, Ntest, M))
Ws, Bs, Gs_msv, Gs_dwh = [], [], [], []
for ii, D in enumerate(Ds):
  print("starting trials for D =", D)
  Cx = maximalCx[:D, :D]
  W = gods_estimator(P, maximalQ[:D, :], V, Cx)
  Ws.append(W)
  for trial in range(Ntrial):
    xs_train = maximalX[trial, :N, :D]
    ys_train = maximalY[trial, :N, :]
    xs_test = maximalXtest[trial, :, :D]
    ys_test = maximalYtest[trial, :, :]
    B = train_discriminative_model(xs_train, ys_train)
    # only the trial-0 matrices are kept in the Bs / Gs_* lists
    if trial == 0:
      Bs.append(B)
    humanP = np.eye(humanK)[:M, :]
    G_msv, zs = train_msv_generative_model(xs_train, ys_train, humanP)
    if trial == 0:
      Gs_msv.append(G_msv)
    # the DWH model is started from the MSV latents
    G_dwh, zs = train_dwh_generative_model(xs_train, ys_train, humanP, Cx, Cy, Z=zs)
    if trial == 0:
      Gs_dwh.append(G_dwh)
    dys_D[ii, 0, trial] = ys_test - (B @ xs_test.T).T
    dys_D[ii, 1, trial] = ys_test - (G_msv @ xs_test.T).T
    dys_D[ii, 2, trial] = ys_test - (G_dwh @ xs_test.T).T
    dys_D[ii, 3, trial] = ys_test - (W @ xs_test.T).T

In [0]:
# mean squared residual over labels, test objects and trials (the last three
# axes of dys_D), leaving one value per (D, estimator)
mses_D = np.mean(np.mean(np.mean(dys_D ** 2, axis=-1), axis=-1), axis=-1)
plt.figure(figsize=(7, 5))
plt.subplot(111)
# y limits with some margin, and a height 95 percent of the way up (in the
# log) at which to write the text labels
y1 = 0.7 * np.min(mses_D)
y2 = np.max(mses_D) / 0.7
yt = np.exp(np.log(y1) + 0.95 * np.log(y2 / y1))
# shade the region between the bottom of the plot and God's estimator
plt.fill_between(Ds, mses_D[:, 3], y1 + 0. * mses_D[:, 3], color="k", alpha=0.1)
plt.plot(Ds, mses_D[:, 0], "k.", label="discriminative")
plt.plot(Ds, mses_D[:, 1], "ro", mfc="none", label="MSV generative")
plt.plot(Ds, mses_D[:, 2], "go", mfc="none", ms=10, label="DWH generative")
plt.plot(Ds, mses_D[:, 3], "k-", lw=0.5, label="best possible")
# for each D: the flattened W @ Q - P, and then its squared norm
biases = [(W @ maximalQ[:D, :] - P).flatten() for D, W in zip(Ds, Ws)]
foo = [np.dot(bias, bias) for bias in biases]
# plt.plot(Ds, variances, "r-", lw=0.5, label="expected variances")
plt.plot(Ds, foo, "b-", lw=0.5, label="bias of the best")
plt.loglog()
plt.axvline(fiducialN, color="b", alpha=0.5)
plt.text(fiducialN, yt, " N", color="b", alpha=0.5)
plt.axvline(K, color="r", alpha=0.5)
plt.text(K, yt, " K = humanK", color="r", alpha=0.5)
plt.legend()
plt.ylim(y1, y2)
plt.ylabel("MSE (variance + bias^2) in prediction of TRUE y")
plt.xlabel("number of image pixels D")
plt.title("varying D at fixed N, K, P, C_y; truncating or extending Q, C_x")

In [0]:
# make N plot at fiducialD
# For each training-set size N, train every model on each trial's training
# set (sliced to N objects and D features) and store the test-set residuals.
# The trainers draw from the global np.random state, so the order of the
# training calls is kept exactly as it was.

humanK = K # stupid
D = fiducialD
Ns = np.round((maximalN * 2 ** np.arange(-6., 0.01, 0.25))).astype(int)
# axes: N value, estimator (0 discriminative, 1 MSV, 2 DWH, 3 God's), trial,
# test object, label
dys_N = np.zeros((len(Ns), 4, Ntrial, Ntest, M))
# none of these depend on N or on the trial, so compute them once
Cx = maximalCx[:D, :D]
W = gods_estimator(P, maximalQ[:D, :], V, Cx)
humanP = np.eye(humanK)[:M, :]
for ii, N in enumerate(Ns):
  print("starting trials for N =", N)
  Bs, Gs_msv, Gs_dwh = [], [], []
  for trial in range(Ntrial):
    xs_train = maximalX[trial, :N, :D]
    ys_train = maximalY[trial, :N, :]
    xs_test = maximalXtest[trial, :, :D]
    ys_test = maximalYtest[trial, :, :]
    B = train_discriminative_model(xs_train, ys_train)
    Bs.append(B)
    G_msv, zs = train_msv_generative_model(xs_train, ys_train, humanP)
    Gs_msv.append(G_msv)
    # the DWH model is started from the MSV latents
    G_dwh, zs = train_dwh_generative_model(xs_train, ys_train, humanP, Cx, Cy, Z=zs)
    Gs_dwh.append(G_dwh)
    # re-run as a test
    # G_msv, zs = train_msv_generative_model(xs_train, ys_train, humanP, Z=zs.T)
    dys_N[ii, 0, trial] = ys_test - (B @ xs_test.T).T
    dys_N[ii, 1, trial] = ys_test - (G_msv @ xs_test.T).T
    dys_N[ii, 2, trial] = ys_test - (G_dwh @ xs_test.T).T
    dys_N[ii, 3, trial] = ys_test - (W @ xs_test.T).T

In [0]:
# mean squared residual over labels, test objects and trials (the last three
# axes of dys_N), leaving one value per (N, estimator)
mses_N = np.mean(np.mean(np.mean(dys_N ** 2, axis=-1), axis=-1), axis=-1)
plt.figure(figsize=(7, 5))
plt.subplot(111)
# y limits with some margin, and a height 95 percent of the way up (in the
# log) at which to write the text labels
y1 = 0.7 * np.min(mses_N)
y2 = np.max(mses_N) / 0.7
yt = np.exp(np.log(y1) + 0.95 * np.log(y2 / y1))
# shade the region between the bottom of the plot and God's estimator
plt.fill_between(Ns, mses_N[:, 3], y1 + 0. * mses_N[:, 3], color="k", alpha=0.1)
plt.plot(Ns, mses_N[:, 0], "k.", label="discriminative")
plt.plot(Ns, mses_N[:, 1], "ro", mfc="none", label="MSV generative")
plt.plot(Ns, mses_N[:, 2], "go", mfc="none", ms=10, label="DWH generative")
plt.plot(Ns, mses_N[:, 3], "k-", lw=0.5, label="best possible")
plt.loglog()
plt.axvline(fiducialD, color="k", alpha=0.5)
plt.text(fiducialD, yt, " D", color="k", alpha=0.5)
plt.axvline(K, color="r", alpha=0.5)
plt.text(K, yt, " K = humanK", color="r", alpha=0.5)
plt.legend()
plt.ylim(y1, y2)
plt.ylabel("MSE (variance + bias^2) in prediction of TRUE y")
plt.xlabel("number of training-set objects N")
plt.title("varying N at fixed D, K, P, Q, C_x, C_y")

In [0]:
# make humanK plot at fiducialN, fiducialD
# For each size humanK of the human-created latent space, train every model
# on each trial's training set and store the test-set residuals.
# The trainers draw from the global np.random state, so the order of the
# training calls is kept exactly as it was.

D = fiducialD
N = fiducialN
humanKs = np.unique(np.round(2. ** np.arange(np.log2(K) - 1.5, np.log2(D) + 1.01, 0.25)).astype(int))
# axes: humanK value, estimator (0 discriminative, 1 MSV, 2 DWH, 3 God's),
# trial, test object, label
dys_K = np.zeros((len(humanKs), 4, Ntrial, Ntest, M))
# neither of these depends on humanK or on the trial, so compute them once
Cx = maximalCx[:D, :D]
W = gods_estimator(P, maximalQ[:D, :], V, Cx)
for ii, humanK in enumerate(humanKs):
  print("starting trials for humanK =", humanK)
  humanP = np.eye(humanK)[:M, :]
  Bs, Gs_msv, Gs_dwh = [], [], []
  for trial in range(Ntrial):
    xs_train = maximalX[trial, :N, :D]
    ys_train = maximalY[trial, :N, :]
    xs_test = maximalXtest[trial, :, :D]
    ys_test = maximalYtest[trial, :, :]
    B = train_discriminative_model(xs_train, ys_train)
    Bs.append(B)
    G_msv, zs = train_msv_generative_model(xs_train, ys_train, humanP)
    Gs_msv.append(G_msv)
    # the DWH model is started from the MSV latents
    G_dwh, zs = train_dwh_generative_model(xs_train, ys_train, humanP, Cx, Cy, Z=zs)
    Gs_dwh.append(G_dwh)
    dys_K[ii, 0, trial] = ys_test - (B @ xs_test.T).T
    dys_K[ii, 1, trial] = ys_test - (G_msv @ xs_test.T).T
    dys_K[ii, 2, trial] = ys_test - (G_dwh @ xs_test.T).T
    dys_K[ii, 3, trial] = ys_test - (W @ xs_test.T).T

In [0]:
# mean squared residual over labels, test objects and trials (the last three
# axes of dys_K), leaving one value per (humanK, estimator)
mses_K = np.mean(np.mean(np.mean(dys_K ** 2, axis=-1), axis=-1), axis=-1)
plt.figure(figsize=(7, 5))
plt.subplot(111)
# y limits with some margin, and a height 95 percent of the way up (in the
# log) at which to write the text labels
y1 = 0.7 * np.min(mses_K)
y2 = np.max(mses_K) / 0.7
yt = np.exp(np.log(y1) + 0.95 * np.log(y2 / y1))
# shade the region between the bottom of the plot and God's estimator
plt.fill_between(humanKs, mses_K[:, 3], y1 + 0. * mses_K[:, 3], color="k", alpha=0.1)
plt.plot(humanKs, mses_K[:, 0], "k.", label="discriminative")
plt.plot(humanKs, mses_K[:, 1], "ro", mfc="none", label="MSV generative")
plt.plot(humanKs, mses_K[:, 2], "go", mfc="none", ms=10, label="DWH generative")
plt.plot(humanKs, mses_K[:, 3], "k-", lw=0.5, label="best possible")
plt.loglog()
plt.axvline(fiducialD, color="k", alpha=0.5)
plt.text(fiducialD, yt, " D", color="k", alpha=0.5)
plt.axvline(fiducialN, color="b", alpha=0.5)
plt.text(fiducialN, yt, " N", color="b", alpha=0.5)
plt.axvline(K, color="r", alpha=0.5)
plt.text(K, yt, " K", color="r", alpha=0.5)
plt.legend()
plt.ylim(y1, y2)
plt.ylabel("MSE (variance + bias^2) in prediction of TRUE y")
plt.xlabel("size humanK of human-created latent space")
plt.title("varying humanK at fixed D, N, K, P, Q, C_y, C_x")

# OLD STUFF

# OLD STUFF FOLLOWS

In [0]:
# make things to histogram
# NOTE(review): this cell sits under the "OLD STUFF" heading and looks stale:
# `trainings`, `Q`, `xivars` and `yivars` are not assigned in any cell visible
# here, and make_data_set() as defined in this file takes six arguments
# (N, P, Q, V, Cx, Cy) while the call below passes five. Confirm before
# running. It also overwrites the globals M, D and Ntest.
Bs, Gs_msv, Gs_dwh = trainings # unpack
Ntrials, M, D = Bs.shape
Ntest = 2 ** 5 + 5
dys_discriminative = np.zeros((Ntrials, Ntest, M))
dys_msv = np.zeros((Ntrials, Ntest, M))
dys_dwh = np.zeros((Ntrials, Ntest, M))
for nn in range(Ntrials):
  xtest, ytest = make_data_set(Ntest, P, Q, xivars, yivars) # generate with God's knowledge
  # residuals of each trained model on a fresh test set
  dys_discriminative[nn] = ytest - (Bs[nn] @ xtest.T).T
  dys_msv[nn] = ytest - (Gs_msv[nn] @ xtest.T).T
  dys_dwh[nn] = ytest - (Gs_dwh[nn] @ xtest.T).T

In [0]:
# stacked histograms of the residuals, one stack layer per trial; the
# reshape to (Ntrials, Ntest) only works when M == 1
plt.figure(figsize=(15,5))
plt.subplot(131)
# the first histogram chooses the bin edges that the other two panels reuse
foo, bins, bar = plt.hist(dys_discriminative.reshape((Ntrials, Ntest)).T, bins=Ntest // 5, stacked=True)
plt.title("discriminative prediction of y")
plt.subplot(132)
plt.hist(dys_msv.reshape((Ntrials, Ntest)).T, bins=bins, stacked=True)
plt.title("MSV generative prediction of y")
plt.subplot(133)
plt.hist(dys_dwh.reshape((Ntrials, Ntest)).T, bins=bins, stacked=True)
plt.title("DWH generative prediction of y")

In [0]:
# compute means and variances
# NOTE(review): `trainings` is not assigned in any cell visible here;
# presumably it is the array returned by train_many_times() -- confirm.
# mean and mean-square are taken along axis 1, then the variances are summed
# over the last two axes, leaving one number per entry along axis 0
means = np.mean(trainings, axis=1)
mean2s = np.mean(trainings ** 2, axis=1)
variances = mean2s - means ** 2
variances = np.sum(np.sum(variances, axis=-1), axis=-1)
print(variances)

In [0]:
# OLD plotting macro
def plot_inference(labels, inference, title, name):
  """
  make a two-panel figure comparing inferred values to labels, and print the
  RMS difference between them

  ## inputs:
  - labels: the label values (x axis of both panels)
  - inference: the inferred values, same shape as labels
  - title, name: strings pasted into the panel titles

  ## comments:
  - Left panel: inference against labels, with the one-to-one line.
  - Middle panel: residual (inference - labels) against labels, with the zero
    line.
  - A third panel comparing to the truth was disabled in the original, so the
    figure only fills two of its three slots.
  """
  line_ends = np.array([-20, 20])
  half_range = 1.1 * np.max(np.abs(labels))
  plt.figure(figsize=(15,5))

  # left panel: inference vs labels
  plt.subplot(131)
  plt.scatter(labels, inference)
  plt.title(title + " inference (vs labels) for " + name)
  plt.xlabel("label slope")
  plt.ylabel("inferred slope")
  plt.plot(line_ends, line_ends, "k-", alpha=0.25)
  plt.xlim(-half_range, half_range)
  plt.ylim(-half_range, half_range)
  print("RMSE vs labels:", rms(labels - inference))

  # middle panel: residuals vs labels
  residuals = inference - labels
  plt.subplot(132)
  plt.scatter(labels, residuals)
  plt.title(title + " inference for " + name)
  plt.xlabel("label slope")
  plt.ylabel("residual (inferred - label)")
  plt.plot(line_ends, 0. * line_ends, "k-", alpha=0.25)
  plt.xlim(-half_range, half_range)
  plt.ylim(-0.3 * half_range, 0.3 * half_range)
  plt.tight_layout()

In [0]:
# plot and report outcomes for the efficient estimator
# this should be the best you can possibly do
m = 0 # label column to plot; also read by the plotting cells that follow
plot_inference(ys_test[:, m], ys_efficient_test[:, m],
               "", "the efficient estimator")

In [0]:
# plot and report outcomes for the discriminative model
# (uses the global label column m)
plot_inference(ys_test[:, m], ys_discriminative_test[:, m],
               "", "the discriminative model")

In [0]:
# plot and report outcomes for the MSV generative model
plot_inference(ys_test[:, m], ys_msv_test[:, m],
               "", "the MSV generative model")

In [0]:
# plot and report outcomes for the alt-trivial generative model
plot_inference(ys_test[:, m], ys_trivial_test[:, m],
               "", "the alt-trivial generative model")

In [0]:
# NOTE(review): stale call. `xivars` and `yivars` are not assigned in any cell
# visible here, and train_dwh_generative_model() as defined in this file takes
# covariances (Cx, Cy) and returns a (G, Z) pair, so G_dwh would be a tuple.
G_dwh = train_dwh_generative_model(xs_train, ys_train, humanP, xivars, yivars)

In [0]:
# predictions of the DWH generative model on the test set, then plot and
# report them (G_dwh must be the prediction matrix itself, not a (G, Z) pair)
ys_dwh_test = (G_dwh @ xs_test.T).T
plot_inference(ys_test[:, m], ys_dwh_test[:, m],
               "", "the DWH generative model")

In [0]:
# plot the rows of each estimator matrix side by side: God's, discriminative,
# MSV generative and alt-trivial generative
fig, ax = plt.subplots(1,4,sharey=True, figsize=(18,4.5))
for m in range(M):
  ax[0].plot(W[m, :], alpha=0.75)
ax[0].set_title("true (information-theory-optimal) result")
for m in range(M):
  ax[1].plot(B[m, :], alpha=0.75)
ax[1].set_title("discriminative result")
for m in range(M):
  ax[2].plot(G_msv[m, :], alpha=0.75)
ax[2].set_title("MSV generative result")
for m in range(M):
  ax[3].plot(Hdagger[m, :], alpha=0.75)
ax[3].set_title("alt-trivial generative result")
# y range taken from W; with sharey=True this presumably applies to all four
# panels
ax[3].set_ylim(np.min(W), np.max(W))

In [0]:
# plot the rows of each estimator matrix side by side: God's, MSV generative,
# DWH generative and alt-trivial generative
fig, ax = plt.subplots(1,4,sharey=True, figsize=(18,4.5))
for m in range(M):
  ax[0].plot(W[m, :], alpha=0.75)
ax[0].set_title("true (information-theory-optimal) result")
for m in range(M):
  ax[1].plot(G_msv[m, :], alpha=0.75)
ax[1].set_title("MSV generative result")
for m in range(M):
  ax[2].plot(G_dwh[m, :], alpha=0.75)
ax[2].set_title("DWH generative result")
for m in range(M):
  ax[3].plot(Hdagger[m, :], alpha=0.75)
ax[3].set_title("alt-trivial generative result")
# y range taken from W; with sharey=True this presumably applies to all four
# panels
ax[3].set_ylim(np.min(W), np.max(W))

In [0]:
def train_many_times(Ntrain, humanP, P, Q, xivars, yivars, Ntrials=2**5, V=None):
  """
  Perform many trainings of each model for the purposes of empirically testing
  biases and variances.

  ## inputs:
  - Ntrain - training set size N
  - humanP - human-set projector P
  - P - God's projector P
  - Q - God's embeddor Q
  - xivars - diagonal of the inverse of C_x
  - yivars - diagonal of the inverse of C_y
  - Ntrials (optional) - number of independent trials
  - V (optional) - covariance of the latents handed to make_data_set(); if
      None, the identity is used (NOTE(review): the original call passed no
      latent covariance at all, so this default is an assumption -- confirm)

  ## outputs:
  - array stacking, in this order, the discriminative, MSV generative and DWH
    generative prediction matrices, each with one entry per trial

  ## comments:
  - make_data_set() and train_dwh_generative_model() take full covariance
    matrices, so the inverse variances are converted to diagonal covariances
    first.
  - Both generative trainers return a (matrix, latents) pair; only the
    matrix is kept.
  """
  Cx = np.diag(1. / np.asarray(xivars, dtype=float))
  Cy = np.diag(1. / np.asarray(yivars, dtype=float))
  if V is None:
    V = np.eye(Q.shape[1])
  Bs, Gs_msv, Gs_dwh = [], [], []
  for trial in range(Ntrials):
    xtrain, ytrain = make_data_set(Ntrain, P, Q, V, Cx, Cy) # generate with God's knowledge
    Bs.append(train_discriminative_model(xtrain, ytrain))
    G_msv, _ = train_msv_generative_model(xtrain, ytrain, humanP, maxiter=1000) # infer with human choices
    Gs_msv.append(G_msv)
    G_dwh, _ = train_dwh_generative_model(xtrain, ytrain, humanP, Cx, Cy, maxiter=1000)
    Gs_dwh.append(G_dwh)
  return np.array([np.array(Bs), np.array(Gs_msv), np.array(Gs_dwh)])