# Learning Using NTK - v2.0

In [1]:
#Autocomplete
%config Completer.use_jedi = False

In [2]:
%matplotlib inline
%time

import numpy as np
import matplotlib.pyplot as plt
import os, time, glob
from tqdm.notebook import tqdm
from sklearn import svm
from scipy import signal

Wall time: 0 ns


## Fully Connected

Here we do not assume the width to go to infinty. Instead we fix $H_0$ using its parameters, and learning a convex optimization problem.

More specifically, let $g(x)= \frac{df(x;\theta)}{d\theta}$, then let's learn a linear separator (using SVM) on $g(x)$.

Here we will assume a three-layer fully-connected network:
$$ f(x) = sign(u^T \sigma (V \sigma (Wx)))$$
and compute $g$ for each layer separately.

In three layers we have:
$$ \frac{df}{dW} = u^T diag(\dot\sigma(V\sigma(Wx))) V diag(\dot\sigma(Wx))\cdot(I_d \otimes x)$$
$$ \frac{df}{dV} = u^T diag(\dot\sigma(V\sigma(Wx)))\cdot(I_d\otimes \sigma(Wx))$$
$$ \frac{df}{du} = \sigma(V\sigma(Wx))$$

### Experiments

In [3]:
relu = lambda A: np.maximum(A, 0)
drelu = lambda A: (A > 0).astype(np.float)
sgn = lambda A: (A > 0).astype(np.float)*2 - 1

In [4]:
def calc_zero_one_error (y_est, y):
    return (y_est != y).mean()

In [5]:
def calc_zero_one_network_error (X, y, u, V, W):
    y_est = sgn(u.T @ relu(V @ relu(W @ X))).T
    return calc_zero_one_error(y_est, y)

In [6]:
d = 25
m = 100
m_test = 100000

# ground truth
np.random.seed(124)
X = np.random.randn(d, m)
X_test = np.random.randn(d, m_test)
W_real = np.random.randn(d, d) / np.sqrt(d)
V_real = np.random.randn(d, d) / np.sqrt(d)
u_real = np.random.randn(d, 1) / np.sqrt(d)
y = sgn(u_real.T @ relu(V_real @ relu(W_real @ X))).T
y_test = sgn(u_real.T @ relu(V_real @ relu(W_real @ X_test))).T

In [7]:
def map_to_df_dW(X, u, V, W):
    m = X.shape[1]
    out = np.zeros((W.size, m))
    for s in range(m):
        x = X[:,s]
        a = drelu(V @ relu(W @ x))
        b = drelu(W @ x)
        c = np.tensordot(a,b,axes=0) * V
        d = np.squeeze(u.T @ c)
        res = np.ndarray.flatten(np.tensordot(d,x,axes=0))
        
        out[:, s] = res
    return out

In [8]:
def map_to_df_dV(X, u, V, W):
    m = X.shape[1]
    out = np.zeros((W.size, m))
    for s in range(m):
        x = X[:,s]
        a = drelu(V @ relu(W @ x))
        b = relu(W @ x)
        c = np.squeeze(u) * np.squeeze(a)
        d = np.tensordot(c,b,axes=0)
        res = np.ndarray.flatten(d)
        
        out[:, s] = res
    return out

In [9]:
def map_to_df_du(X, u, V, W):
    return relu(V @ relu(W @ X))

In [10]:
# Learn W
np.random.seed(0)
W_0 = np.random.randn(d, d) / np.sqrt(d)
V_0 = np.random.randn(d, d) / np.sqrt(d)
u_0 = np.random.randn(d, 1) / np.sqrt(d)
X_g = map_to_df_dW(X, u_0, V_0, W_0)
X_test_g = map_to_df_dW(X_test, u_0, V_0, W_0)

In [11]:
%%time

clfW = svm.SVC(kernel='linear', C = 1.0)
clfW.fit(X_g.T, np.squeeze(y))

train_err = calc_zero_one_error(clfW.predict(X_g.T), np.squeeze(y))
test_err = calc_zero_one_error(clfW.predict(X_test_g.T), np.squeeze(y_test))
train_err, test_err

Wall time: 1.34 s


(0.0, 0.31055)

In [12]:
# Check which error to use
get_err = lambda W: [calc_zero_one_network_error(X, y, u_0, V_0, W), calc_zero_one_network_error(X_test, y_test, u_0, V_0, W)]
W_clf = np.reshape(clfW.coef_,[25,25])
[get_err(W_0), get_err(W_clf), get_err(-W_clf), get_err(W_0 * W_clf), get_err(- W_0 * W_clf)]

[[0.47, 0.57134],
 [0.34, 0.44753],
 [0.69, 0.67522],
 [0.62, 0.6172],
 [0.45, 0.52883]]

It seems that the parameters I should use for the new W is hyperplane learned by the SVM.

In [13]:
%%time

# Learn V
np.random.seed(1)
V_0 = np.random.randn(d, d) / np.sqrt(d)
u_0 = np.random.randn(d, 1) / np.sqrt(d)
X_g = map_to_df_dV(X, u_0, V_0, W_clf)
X_test_g = map_to_df_dV(X_test, u_0, V_0, W_clf)

clfV = svm.SVC(kernel='linear', C = 1.0)
clfV.fit(X_g.T, np.squeeze(y))

train_err = calc_zero_one_error(clfV.predict(X_g.T), np.squeeze(y))
test_err = calc_zero_one_error(clfV.predict(X_test_g.T), np.squeeze(y_test))
train_err, test_err

Wall time: 4.57 s


(0.01, 0.3046)

In [14]:
# Check which error to use
get_err = lambda V: [calc_zero_one_network_error(X, y, u_0, V, W_clf), calc_zero_one_network_error(X_test, y_test, u_0, V, W_clf)]
V_clf = np.reshape(clfV.coef_,[25,25])
[get_err(V_0), get_err(V_clf), get_err(-V_clf), get_err(V_0 * V_clf), get_err(- V_0 * V_clf)]

[[0.39, 0.43147],
 [0.32, 0.41944],
 [0.63, 0.52795],
 [0.28, 0.35861],
 [0.65, 0.5797]]

In [15]:
%%time

# Learn u
np.random.seed(2)
u_0 = np.random.randn(d, 1) / np.sqrt(d)
X_g = map_to_df_du(X, u_0, V_clf, W_clf)
X_test_g = map_to_df_du(X_test, u_0, V_clf, W_clf)

clfu = svm.SVC(kernel='linear', C = 1.0)
clfu.fit(X_g.T, np.squeeze(y))

train_err = calc_zero_one_error(clfu.predict(X_g.T), np.squeeze(y))
test_err = calc_zero_one_error(clfu.predict(X_test_g.T), np.squeeze(y_test))
train_err, test_err

Wall time: 201 ms


(0.1, 0.30021)

In [16]:
# Check which error to use
get_err = lambda u: [calc_zero_one_network_error(X, y, u, V_clf, W_clf), calc_zero_one_network_error(X_test, y_test, u, V_clf, W_clf)]
u_clf = np.reshape(clfu.coef_,[25,1])
[get_err(u_0), get_err(u_clf), get_err(-u_clf), get_err(u_0 * u_clf), get_err(- u_0 * u_clf)]

[[0.51, 0.44247],
 [0.24, 0.36317],
 [0.76, 0.63683],
 [0.28, 0.36707],
 [0.72, 0.63293]]

## Learning with the Loss

Maybe we should not use $g(x)= \frac{df(x;\theta)}{d\theta}$,
but rather
$$g(x)=-\frac{d\ell}{d\theta} = -\frac{d\ell}{df(x;\theta)} \cdot \frac{df(x;\theta)}{d\theta}.$$

Let's try this.

We will assume that $y\in\{0,1\}$, hence use the cross entropy loss:
$$ \ell(f, (x,y))=-y \log f(x) -(1-y)\log(1-f(x))$$
$$ \frac{d\ell}{df} = -\frac{y}{f(x)} + \frac{1-y}{1-f(x)}. $$

Instead of sign, I will use softmax:
$$ softmax(f, x) = \frac{1}{1+\exp( -f(x))} $$
$$ \frac{d softmax}{df} = softmax(f)(1-softmax(f))$$

### Experiments

In [57]:
def calc_crossentropy_error (y_est, y):
    return (-y * np.log(y_est) - (1-y) * np.log(1-y_est)).mean()

In [30]:
def calc_crossentropy_network_error (X, y, u, V, W):
    y_est = (u.T @ relu(V @ relu(W @ X))).T
    return calc_crossentropy_error(y_est, y)

In [52]:
def calc_gradient_crossentropy_network_error (X, y, u, V, W):
    y_est = (u.T @ relu(V @ relu(W @ X))).T
    return np.squeeze(-y / y_est + (1-y)/(1-y_est))

In [53]:
def calc_softmax (y):
    return (1 + np.exp(-y))**(-1)

In [54]:
def calc_softmax_network (X, u, V, W):
    y_est = (u.T @ relu(V @ relu(W @ X))).T
    return calc_softmax(y_est)

In [55]:
def calc_gradient_softmax_network (X, u, V, W):
    softmax = calc_softmax_network(X, u, V, W)
    return np.squeeze(softmax * (1 - softmax))

In [60]:
np.unique(clfW.predict(X_g.T))

array([-1.,  1.])

In [58]:
%%time

# Learn W
np.random.seed(0)
W_0 = np.random.randn(d, d) / np.sqrt(d)
V_0 = np.random.randn(d, d) / np.sqrt(d)
u_0 = np.random.randn(d, 1) / np.sqrt(d)
X_g = -map_to_df_dW(X, u_0, V_0, W_0) * \
      calc_gradient_crossentropy_network_error(X, y, u_0, V_0, W_0) * \
      calc_gradient_softmax_network(X, u_0, V_0, W_0)
X_test_g = -map_to_df_dW(X_test, u_0, V_0, W_0) * \
           calc_gradient_logistic_network_error(X_test, y_test, u_0, V_0, W_0) * \
           calc_gradient_softmax_network(X_test, u_0, V_0, W_0)

clfW = svm.SVC(kernel='linear', C = 1.0)
clfW.fit(X_g.T, np.squeeze(y))

train_err = calc_crossentropy_error(clfW.predict(X_g.T), np.squeeze(y))
test_err = calc_crossentropy_error(clfW.predict(X_test_g.T), np.squeeze(y_test))
train_err, test_err

  return (-y * np.log(y_est) - (1-y) * np.log(1-y_est)).mean()
  return (-y * np.log(y_est) - (1-y) * np.log(1-y_est)).mean()
  return (-y * np.log(y_est) - (1-y) * np.log(1-y_est)).mean()


Wall time: 7.94 s


  return (-y * np.log(y_est) - (1-y) * np.log(1-y_est)).mean()
  return (-y * np.log(y_est) - (1-y) * np.log(1-y_est)).mean()


(nan, nan)

In [36]:
# Check which error to use
get_err = lambda W: [calc_crossentropy_network_error(X, y, u_0, V_0, W), calc_crossentropy_network_error(X_test, y_test, u_0, V_0, W)]
W_clf = np.reshape(clfW.coef_,[25,25])
[get_err(W_0), get_err(W_clf), get_err(-W_clf), get_err(W_0 * W_clf), get_err(- W_0 * W_clf)]

[[0.4981982288238005, 0.5244541269264624],
 [0.4680629266194377, 0.4997948678116393],
 [0.5697477566238739, 0.6043530391595552],
 [0.5041212218975728, 0.5128685118451799],
 [0.4991570466706234, 0.5256086173236509]]

In [22]:
%%time

# Learn V
np.random.seed(1)
V_0 = np.random.randn(d, d) / np.sqrt(d)
u_0 = np.random.randn(d, 1) / np.sqrt(d)
X_g = -map_to_df_dV(X, u_0, V_0, W_clf) * calc_gradient_logistic_network_error(X, y, u_0, V_0, W_clf)
X_test_g = -map_to_df_dV(X_test, u_0, V_0, W_clf) * calc_gradient_logistic_network_error(X_test, y_test, u_0, V_0, W_clf)

clfV = svm.SVC(kernel='linear', C = 1.0)
clfV.fit(X_g.T, np.squeeze(y))

train_err = calc_zero_one_error(clfV.predict(X_g.T), np.squeeze(y))
test_err = calc_zero_one_error(clfV.predict(X_test_g.T), np.squeeze(y_test))
train_err, test_err

Wall time: 4.66 s


(0.0, 2e-05)

In [23]:
# Check which error to use
get_err = lambda V: [calc_logistic_network_error(X, y, u_0, V, W_clf), calc_logistic_network_error(X_test, y_test, u_0, V, W_clf)]
V_clf = np.reshape(clfV.coef_,[25,25])
[get_err(V_0), get_err(V_clf), get_err(-V_clf), get_err(V_0 * V_clf), get_err(- V_0 * V_clf)]

[[0.49771350381225027, 0.48997952391950983],
 [0.5123109366021568, 0.556989813233313],
 [0.47744995694044995, 0.4262528199773297],
 [0.5000159734448995, 0.505321195821316],
 [0.49900676168480856, 0.49082802883311355]]

In [24]:
%%time

# Learn u
np.random.seed(2)
u_0 = np.random.randn(d, 1) / np.sqrt(d)
X_g = -map_to_df_du(X, u_0, V_clf, W_clf) * calc_gradient_logistic_network_error(X, y, u_0, V_clf, W_clf)
X_test_g = -map_to_df_du(X_test, u_0, V_clf, W_clf) * calc_gradient_logistic_network_error(X_test, y_test, u_0, V_clf, W_clf)

clfu = svm.SVC(kernel='linear', C = 1.0)
clfu.fit(X_g.T, np.squeeze(y))

train_err = calc_zero_one_error(clfu.predict(X_g.T), np.squeeze(y))
test_err = calc_zero_one_error(clfu.predict(X_test_g.T), np.squeeze(y_test))
train_err, test_err

Wall time: 181 ms


(0.0, 0.00018)

In [25]:
# Check which error to use
get_err = lambda u: [calc_logistic_network_error(X, y, u, V_clf, W_clf), calc_logistic_network_error(X_test, y_test, u, V_clf, W_clf)]
u_clf = np.reshape(clfu.coef_,[25,1])
[get_err(u_0), get_err(u_clf), get_err(-u_clf), get_err(u_0 * u_clf), get_err(- u_0 * u_clf)]

[[0.49584718772502756, 0.5008252216856964],
 [0.5278108760267323, 0.5831306185572848],
 [0.4721891239732677, 0.4168693814427154],
 [0.49781226959470337, 0.5011523064269487],
 [0.5021877304052967, 0.4988476935730513]]