In [1]:
import numpy as np
import scipy.integrate as sci
import scipy.optimize as sco
import theano.tensor as T
import theano
import downhill
import climate
import itertools as it
import matplotlib
import matplotlib.pyplot as plt
from IPython.core.display import display, HTML

Details for gaussian distribution as an exponential family

In [2]:
def s_1(x):
    return x

def s_2(x):
    return -x*x

def F_1D(theta1,theta2):
    return 0.25*theta1*theta1/theta2 + 0.5*np.log(np.pi) - 0.5*np.log(theta2) 

def gradF_1_1D(theta_1,theta_2):
    return 0.5*theta_1/theta_2

def gradF_2_1D(theta_1, theta_2):
    temp_1 = 0.5 / theta_2
    temp_2 = temp_1 * theta_1
    return -1. * (temp_2 * temp_2 + temp_1)

def gradF_1_nD(theta_1,theta_2):
    return 0.5*np.dot(np.inv(theta_2), theta_1)

def gradF_2_nD(theta_1, theta_2):
    temp_1 = 0.5*np.inv(theta_2)
    temp_2 = np.dot(temp1,theta_1)
    return - np.outer(temp_2,temp_2) - temp_1

def gradG_1_1D(eta_1,eta_2):
    return eta1 / (-eta_1 * eta1 - eta2)

def gradG_2_1D(eta_1,eta_2):
    return 0.5  / (-eta_1 * eta1 - eta2)

def gradG_1_nD(eta_1,eta_2):
    return np.dot(np.inv(-np.outer(eta_1, eta1) - eta2), eta1)

def gradG_2_nD(eta_1,eta_2):
    return 0.5*np.inv(-np.outer(eta_1, eta1) - eta2)


Cas 1D - Data

In [3]:
seed = 13
np.random.seed(seed)
N, batch_size = 2000, 100
mu_true, sigma_true = 1, 2
sigma2_true = sigma_true*sigma_true
theta1_true = mu_true / sigma2_true
theta2_true = 0.5 / sigma2_true
eta1_true = mu_true
eta2_true = - (mu_true*mu_true + sigma2_true)

X = np.random.normal(mu_true,np.sqrt(sigma2_true), N)

def batches(Y, bs):
    nb = np.int(np.ceil(len(Y) / bs))
    Yb = [Y[i * bs : (i+1) * bs] for i in range(nb)]
    def __temp(i):
        return Yb[i]
    return nb, Yb, __temp

Xt = X[: N//4]
Nt = len(Xt)
NtB, XtB, XtB_f = batches(Xt, batch_size)
Xv = X[N//4 : N//2]
Nv = len(Xv)
NvB, XvB, XvB_f = batches(Xv, batch_size)
Xtest = X[N//2:]
# print (Xt, XtB, NtB, XtB_f(1))

In [4]:
def ll(x, mu, sigma2):
    return -(x - mu)**2 /(2 * sigma2) - np.log(np.sqrt(2 * sigma2 * np.pi)) 

def ll_theta(x, theta1, theta2):
    return s_1(x) * theta1 + s_2(x) * theta2 - F_1D(theta1, theta2)

def ll_eta(x, eta1, eta2):
    theta1 = - eta1 / (eta1*eta1 + eta2)
    theta2 = - 0.5  / (eta1*eta1 + eta2)
    return ll_theta(x, theta1, theta2)
    

def ave_ll(mu, sigma2, chi):
    N = len(chi)
    return (1. / N) * sum(ll(x, mu, sigma2) for x in chi)
    
def C_N(mu, sigma2, chi):
    return -ave_ll(mu, sigma2, chi)

np.testing.assert_allclose(ll(4, mu_true, sigma2_true),
                          ll_theta(4, theta1_true, theta2_true))
np.testing.assert_allclose(ll(4, mu_true, sigma2_true),
                          ll_eta(4, eta1_true, eta2_true))

In [5]:
class ListTable(list):
    def _repr_html_(self):
        html = ["<table>"]
        for row in self:
            html.append("<tr>")
            for col in row:
                html.append("<td>{0}</td>".format(col))
            html.append("</tr>")
        html.append("</table>")
        return ''.join(html)
    
def disp_results(results):
    n = len(results)
    fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 5))
    colors = plt.cm.jet(np.linspace(0, 1, n))
    table = ListTable()
    table.append(['', 'Train', 'Test', 'mu', 'sigma^2'])
    table.append(['True', ave_ll(mu_true, sigma2_true, Xt), 
                  ave_ll(mu_true, sigma2_true, Xtest),
                  str(mu_true), str(sigma_true**2)])
    for c, result in enumerate(results):
        train, test, mu, sig, method = result
        axes[0,0].plot(train, color=colors[c], linewidth=1, linestyle='-', label=method)
        axes[0,0].set_title('Average likelihood on train set')
        axes[0,0].legend(loc=4)
        axes[0,1].plot(test, color=colors[c], linewidth=1, linestyle='-', label=method)
        axes[0,1].set_title('Average likelihood on test set')
        axes[0,1].legend(loc=4)
        axes[1,0].plot(mu, color=colors[c], linewidth=1, linestyle='-', label=method)
        axes[1,0].set_title('Estimates of $\mu$')
        axes[1,0].legend(loc=4)
        axes[1,1].plot(sig, color=colors[c], linewidth=1, linestyle='-', label=method)
        axes[1,1].set_title('Estimates of $\sigma^2$')
        axes[1,1].legend(loc=4)
        table.append([method, train[-1], test[-1],mu[-1], sig[-1]])
    display(HTML(table._repr_html_()))

# Points stationnaires de $C(\theta) = \mathbb{E}_{\pi} [C(\theta,x)] = \mathbb{E}_{\pi} [-\log p(x;\theta)]$ par Robbins-Monro


### Robbins-Monro

On dispose d'une fonction inconnue (supposée monotone) $M(\theta)$ telle que 
$$M(\theta) = \mathbb{E}_{\pi(\beta|\theta)} [\beta]$$ 
avec $\beta$ une v.a désignant des observations bruitées de $M(\theta)$.

On cherche la valeur $\theta^*$ telle que $M(\theta^*) = \alpha$.

Suite convergente de Robbins-Monro : $$\theta^{(t+1)} - \theta^{(t)} = a^{(t)} (\alpha - \beta^{(t)})$$

### Robbins-Monro  et gradient stochastique

$M(\theta) := \nabla_{\theta} C(\theta)$ est le gradient d'une fonction inconnue $C$. 

On cherche la valeur $\theta^*$ telle que $M(\theta^*) = \nabla C(\theta^*) = 0$.

Suite convergente de Robbins-Monro : $$\theta^{(t+1)} = \theta^{(t)} - a^{(t)} \beta^{(t)}$$
où $\beta^{(t)}$ est une observation bruitée de $\nabla_{\theta} C(\theta^{(t)})$.

### Pour notre cas
La fonction à minimiser est:
$$C(\theta) = \mathbb{E}_{\pi} [- \log p(x;\theta)]$$
où $\pi$ est la distribution inconnue dont on cherche une approximation $p$ paramétrée par $\theta$ (identique à la minimisation sur $\theta$ de $KL(\pi || p(.;\theta))$)

Son équivalent en discret:
$$C_N(\theta) = - N^{-1} \sum_i \log p(x_i;\theta)$$
avec $\lim_{N \rightarrow +\infty} C_N(\theta) = C(\theta)$ 

Sous conditions de regularité et dans la famille exponentielle, 
$$\nabla_{\theta} C(\theta) = \mathbb{E}_{\pi} [- \nabla_{\theta}\log p(x;\theta)]  = \mathbb{E}_{\pi} [- s(x) + \nabla_{\theta} F (\theta)]$$

Pour le relier à Robbin-Monro, on a une observation bruitée
$$\beta^{(t)} = - s(x^{(t)}) + \nabla_{\theta} F (\theta^{(t)})$$
et donc la suite convergente $$\theta^{(t+1)} = \theta^{(t)} + a^{(t)} (s(x^{(t)}) - \nabla_{\theta} F (\theta^{(t)}))$$

Est ce que la formulation suivante est équivalente ? sans doute non ...
$$\eta^{(t+1)} = \eta^{(t)} + a^{(t)} (s(x^{(t)}) - \eta^{(t)}))$$

Dans l'espace $H$ (paramètre d'espérance), la même optimisation:
$$C(\eta) = \mathbb{E}_{\pi} [- \log p(x;\eta)] = \mathbb{E}_{\pi} [B_{F^*}(s(x) : \eta) - F^*(s(x)) - k(x)]$$
$$\nabla_{\eta} C(\eta) = \mathbb{E}_{\pi} [\nabla_{\eta} B_{F^*}(s(x) : \eta)] = \mathbb{E}_{\pi} [\nabla_{\eta} (F^*(s(x)) - F^*(\eta) - <s(x) - \eta, \nabla_\eta F^*(\eta)>)]$$
$$ = \mathbb{E}_{\pi} [ - \nabla_{\eta} F^*(\eta) - [-1,0;0,-1]*\nabla_\eta F^*(\eta) - Hess F^*(\eta) (s(x) - \eta)]$$
$$ = \mathbb{E}_{\pi} [ - \nabla_{\eta} F^*(\eta) + \nabla_{\eta} F^*(\eta) - Hess F^*(\eta) (s(x) - \eta)] = \mathbb{E}_{\pi} [- Hess F^*(\eta) (s(x) - \eta)]$$
et donc la suite convergente 
$$\eta^{(t+1)} = \eta^{(t)} + a^{(t)} Hess F^*(\eta^{(t)}) (s(x^{(t)}) - \eta^{(t)})$$

In [6]:
from sympy import Function, Derivative, var, simplify
from sympy.abc import x, y, z, t
from sympy import diff, log, pi
s1 = Function("s1")(x,y)
s2 = Function("s2")(x,y)
F = Function("F")(z,t)
F1 = Derivative(F, z)
F2 = Derivative(F, t)
expr = -((s1 - z) * F1 + (s2 - t)*F2)
print (diff(expr, z))
print (diff(expr, t))
x, eta1, eta2 = var('x eta1 eta2')
s1 = x
s2 = -x*x
ld = log(2*(- eta1*eta1 - eta2))
F =  - 0.5 * (1 + log(pi) + ld)
dF1 = diff(F, eta1)
dF2 = diff(F, eta2)
dF11 = diff(dF1, eta1)
dF12 = diff(dF1, eta2)
dF21 = diff(dF2, eta1)
dF22 = diff(dF2, eta2)
print (dF11)
print (dF12)
print (dF21)
print (dF22)
print (-simplify(dF11*(s1-eta1)+dF12*(s2-eta2)))
print (-simplify(dF21*(s1-eta1)+dF22*(s2-eta2)))

-(-t + s2(x, y))*Derivative(F(z, t), t, z) - (-z + s1(x, y))*Derivative(F(z, t), z, z) + Derivative(F(z, t), z)
-(-t + s2(x, y))*Derivative(F(z, t), t, t) - (-z + s1(x, y))*Derivative(F(z, t), t, z) + Derivative(F(z, t), t)
8.0*eta1**2/(-2*eta1**2 - 2*eta2)**2 + 2.0/(-2*eta1**2 - 2*eta2)
4.0*eta1/(-2*eta1**2 - 2*eta2)**2
4.0*eta1/(-2*eta1**2 - 2*eta2)**2
2.0/(-2*eta1**2 - 2*eta2)**2
(1.0*eta1*(eta2 + x**2) + (eta1 - x)*(8.0*eta1**2 - 8.0*eta2)/8)/(eta1**2 + eta2)**2
(1.0*eta1*(eta1 - x) + 0.5*eta2 + 0.5*x**2)/(eta1**2 + eta2)**2


# Debut des manips

In [7]:
results = []

## Points stationnaires de $C_N(\theta)$ par dérivation exacte

   $$\nabla C_N(\theta) = 0 \equiv -N^{-1} \sum_i (s(x_i) - \nabla F(\theta)) = 0 \equiv  \nabla F(\theta) = N^{-1} \sum_i s(x_i)$$

In [8]:
gradF_pt_stat_1 = np.sum(s_1(x) for x in Xt) / Nt
gradF_pt_stat_2 = np.sum(s_2(x) for x in Xt) / Nt
pt_stat = gradF_pt_stat_1, - gradF_pt_stat_1**2 - gradF_pt_stat_2
print ((mu_true, sigma2_true), ' vs ', pt_stat)
print('average ll on training set : ', 
      ave_ll(mu_true, sigma2_true, Xt), ' vs ',
      ave_ll(pt_stat[0], pt_stat[1], Xt))
print('average ll on test set : ', 
      ave_ll(mu_true, sigma2_true, Xtest), ' vs ',
      ave_ll(pt_stat[0], pt_stat[1], Xtest))
gradF_pt_stat_1 = np.cumsum([s_1(x) for x in Xt]) / np.arange(1,Nt+1)
gradF_pt_stat_2 = np.cumsum([s_2(x) for x in Xt]) / np.arange(1,Nt+1)
mu_list = gradF_pt_stat_1
sig_list = -gradF_pt_stat_1**2 -gradF_pt_stat_2
ave_ll_train = [ave_ll(mu, sig, Xt) for mu,sig in zip(mu_list, sig_list)]
ave_ll_test = [ave_ll(mu, sig, Xtest) for mu,sig in zip(mu_list, sig_list)]
this_result = [ave_ll_train, ave_ll_test, mu_list, sig_list, 'Exact']
results.append(this_result)

((1, 4), ' vs ', (0.93141726306687489, 3.5299590612896603))
('average ll on training set : ', -2.0539185454014794, ' vs ', -2.0495816699594678)
('average ll on test set : ', -2.1015292277818531, ' vs ', -2.1047983045813878)


In [9]:
disp_results([this_result])

0,1,2,3,4
,Train,Test,mu,sigma^2
True,-2.0539185454,-2.10152922778,1,4
Exact,-2.04958166996,-2.10479830458,0.931417263067,3.52995906129


# Optimization via scipy

In [10]:
def fun_C_N(mu_sigma2):
    return C_N(mu_sigma2[0], mu_sigma2[1], Xt)
np.random.rand(seed)
x0 = (np.random.randn(), np.random.random())
bnds = ((-np.inf, np.inf), (1e-6, np.inf))   # variance is positive
res_C_N = sco.minimize(fun_C_N, x0, bounds=bnds) #, options={'gtol': 1e-6, 'disp': True})
print(res_C_N)
print('average ll on training set : ', 
      ave_ll(mu_true, sigma2_true, Xt), ' vs ',
      ave_ll(res_C_N.x[0], res_C_N.x[1], Xt))
print('average ll on test set : ', 
      ave_ll(mu_true, sigma2_true, Xtest), ' vs ',
      ave_ll(res_C_N.x[0], res_C_N.x[1], Xtest))

   status: 0
  success: True
     njev: 38
     nfev: 152
 hess_inv: array([[  3.72127750e+00,  -8.51409550e-03],
       [ -8.51409550e-03,   2.47521288e+01]])
      fun: 2.049581669992999
        x: array([ 0.9314326 ,  3.52996235])
  message: 'Optimization terminated successfully.'
      jac: array([  4.20212746e-06,   2.38418579e-07])
('average ll on training set : ', -2.0539185454014794, ' vs ', -2.0495816699929992)
('average ll on test set : ', -2.1015292277818531, ' vs ', -2.1047979700467199)




## Optimization via Stochastic Gradient Descent  (natural space)
mostly fail:
- $\alpha = 0.01$

success:
- $\alpha = 0.001$

In [11]:
np.random.seed(seed)

alpha = 0.001
epochs = 10

mu_0, sigma2_0 = np.random.randn(), np.random.random()
theta1, theta2 = mu_0 / sigma2_0, 0.5 / sigma2_0

def grad_nll_1(x, theta1, theta2):
    return - (s_1(x) - gradF_1_1D(theta1, theta2))

def grad_nll_2(x, theta1, theta2):
    return - (s_2(x) - gradF_2_1D(theta1, theta2))
    
mu_list, sig_list, ave_ll_list_train, ave_ll_list_test = [], [], [], []
for __ in range(epochs):
    for xt in Xt:
        #print("avant theta",theta1, theta2, " x ", xt)
        #print ("grad 1 ", gradF_1_1D(theta1, theta2), grad_nll_1(xt, theta1, theta2))
        #print ("grad 2 ", gradF_2_1D(theta1, theta2), grad_nll_2(xt, theta1, theta2))
        theta1 -= alpha*grad_nll_1(xt, theta1, theta2)
        theta2 -= alpha*grad_nll_2(xt, theta1, theta2)
        #print("apres theta", theta1, theta2)
        mu_est, sigma2_est = 0.5*theta1/theta2, 0.5/theta2
        #print("apres mu_sd2",   mu_est, sigma2_est)
        mu_list.append(mu_est)
        sig_list.append(sigma2_est)
        ave_ll_list_train.append(ave_ll(mu_est, sigma2_est, Xt)) 
        ave_ll_list_test.append(ave_ll(mu_est, sigma2_est, Xtest)) 
this_result = [ave_ll_list_train, ave_ll_list_test, mu_list, sig_list, 'SGD(T)-'+str(alpha)]
results.append(this_result)

In [12]:
#disp_results([this_result])
disp_results(results)

0,1,2,3,4
,Train,Test,mu,sigma^2
True,-2.0539185454,-2.10152922778,1,4
Exact,-2.04958166996,-2.10479830458,0.931417263067,3.52995906129
SGD(T)-0.001,-2.07042166281,-2.14411145006,0.723824689204,2.82863963155


## Optimization via Stochastic Gradient Descent  

Il s'agit d'un réécriture simple en remplacant $\nabla F$ par $\eta$. 

Cela ressemble dans la forme à l'approximation stochastique du Online EM.

L'algorihme converge car les deux optimisations (avec celles du dessus) sont liées. 

(J'ai fait le calcul $\eta^{(n+1)} - \eta^{(n)}$ vers $\theta^{(n+1)} - \theta^{(n)}$)

Il ne correspond pas à un SGD dand $H$.

In [13]:
np.random.seed(seed)

alpha = 0.001
epochs = 10
mu_0, sigma2_0 = np.random.randn(), np.random.random()
eta1, eta2 = mu_0 , -(mu_0*mu_0 + sigma2_0)
def grad_nll_1(x, eta1, eta2):
    return -(s_1(x) - eta1)

def grad_nll_2(x, eta1, eta2):
    return -(s_2(x) - eta2)
    
mu_list, sig_list, ave_ll_list_train, ave_ll_list_test = [], [], [], []
for __ in range(epochs):
    for x in Xt:
        #print("avant eta",eta1, eta2, " x ", x)
        #print ("grad 1 ", grad_nll_1(x, eta1, eta2))
        #print ("grad 2 ", grad_nll_2(x, eta1, eta2))
        eta1 -= alpha*grad_nll_1(x, eta1, eta2)
        eta2 -= alpha*grad_nll_2(x, eta1, eta2)
        #print("apres eta", eta1, eta2)
        mu_est, sigma2_est = eta1, -(eta1*eta1+eta2)
        #print("apres mu_sd2", mu_est, sd_est)
        mu_list.append(mu_est)
        sig_list.append(sigma2_est)
        ave_ll_list_train.append(ave_ll(mu_est, sigma2_est, Xt)) 
        ave_ll_list_test.append(ave_ll(mu_est, sigma2_est, Xtest))
this_result = [ave_ll_list_train, ave_ll_list_test, mu_list, sig_list, 'SGD(M)-'+str(alpha)]
results.append(this_result)

In [14]:
#disp_results([this_result])
disp_results(results)

0,1,2,3,4
,Train,Test,mu,sigma^2
True,-2.0539185454,-2.10152922778,1,4
Exact,-2.04958166996,-2.10479830458,0.931417263067,3.52995906129
SGD(T)-0.001,-2.07042166281,-2.14411145006,0.723824689204,2.82863963155
SGD(M)-0.001,-2.04961404007,-2.10498674198,0.916552594579,3.53750540054


## Optimization via Stochastic Gradient Descent  (Expectation space)

In [15]:
np.random.seed(seed)

alpha = 0.001
epochs = 10
mu_0, sigma2_0 = np.random.randn(), np.random.random()
eta1, eta2 = mu_0 , -(mu_0*mu_0 + sigma2_0)
def grad_1(x, eta1, eta2):
    return (1.0*eta1*(eta2 + x**2) + (eta1 - x)*(8.0*eta1**2 - 8.0*eta2)/8.)/(eta1**2 + eta2)**2

def grad_2(x, eta1, eta2):
    return (1.0*eta1*(eta1 - x) + 0.5*eta2 + 0.5*x**2)/(eta1**2 + eta2)**2
    
mu_list, sig_list, ave_ll_list_train, ave_ll_list_test = [], [], [], []
for __ in range(epochs):
    for x in Xt:
        #print("avant eta",eta1, eta2, " x ", x)
        #print ("grad 1 ", grad_nll_1(x, eta1, eta2))
        #print ("grad 2 ", grad_nll_2(x, eta1, eta2))
        eta1 -= alpha*grad_1(x, eta1, eta2)
        eta2 -= alpha*grad_2(x, eta1, eta2)
        #print("apres eta", eta1, eta2)
        mu_est, sigma2_est = eta1, -(eta1*eta1+eta2)
        #print("apres mu_sd2", mu_est, sd_est)
        mu_list.append(mu_est)
        sig_list.append(sigma2_est)
        ave_ll_list_train.append(ave_ll(mu_est, sigma2_est, Xt)) 
        ave_ll_list_test.append(ave_ll(mu_est, sigma2_est, Xtest))
this_result = [ave_ll_list_train, ave_ll_list_test, mu_list, sig_list, 'SGD(H)-'+str(alpha)]
results.append(this_result)

In [16]:
#disp_results([this_result])
disp_results(results)

0,1,2,3,4
,Train,Test,mu,sigma^2
True,-2.0539185454,-2.10152922778,1,4
Exact,-2.04958166996,-2.10479830458,0.931417263067,3.52995906129
SGD(T)-0.001,-2.07042166281,-2.14411145006,0.723824689204,2.82863963155
SGD(M)-0.001,-2.04961404007,-2.10498674198,0.916552594579,3.53750540054
SGD(H)-0.001,-2.1278333041,-2.22108770547,0.544059656752,2.36082847621


# Theano

In [17]:
def gradient_updates(cost, params, alpha):
    updates = [(param, param - alpha*T.grad(cost, param)) for param in params]
    return updates

def gradient_updates_momentum(cost, params, alpha, momentum):
    """
    http://caffe.berkeleyvision.org/tutorial/solver.html
    """
    assert momentum < 1 and momentum >= 0
    updates = []
    for param in params:
        V = theano.shared(param.get_value()*0.)
        updates.append((param, param + V))
        updates.append((V, momentum*V - alpha * T.grad(cost, param)))
    return updates

# on peut rajouter d'autres méthodes ici

def gradient_updates_adam(cost, params, alpha, beta1, beta2):
    assert beta1 < 1 and beta1 >= 0
    assert beta2 < 1 and beta2 >= 0
    updates = []
    t = theano.shared(1.)
    updates.append((t, t+1))
    for param in params:
        gt = T.grad(cost, param) 
        mt = theano.shared(0.)
        updates.append((mt, beta1 * mt + (1-beta1) * gt))
        vt = theano.shared(0.)
        updates.append((vt, beta2 * vt + (1-beta2) * gt * gt))
        alpha_t = theano.shared(0.)
        updates.append((alpha_t, 
                        alpha*T.sqrt(1 - beta2**t)/(1 - beta1**t)))
        updates.append((param, param - alpha_t*mt/(T.sqrt(vt + 1e-8))))
    return updates   

## Optimization SGD via Theano (source space)

In [18]:
np.random.seed(seed)

alpha = 0.001
epochs = 10
# Declare Theano symbolic variables
x = T.scalar()
mu_0, sigma2_0 = np.random.randn(), np.random.random()
mu = theano.shared(mu_0, name="mu")
sigma2 = theano.shared(sigma2_0, name="sigma2") # ensure initial value is positive

# Construct Theano expression graph to minimize (- log-likehood for univariate gaussian)
nll =  (x - mu)**2 /(2 * sigma2) + T.log(T.sqrt(2 * sigma2 *np.pi))

# Compile
train = theano.function(
          inputs=[x],
          updates=gradient_updates(nll, [mu, sigma2], alpha))

# Train
mu_list, sig_list, ave_ll_list_train, ave_ll_list_test = [], [], [], []
for i in range(epochs):
    for xt in Xt:
        __ = train(xt)
        mu_est, sigma2_est = mu.get_value(), sigma2.get_value()
        mu_list.append(mu_est)
        sig_list.append(sigma2_est)
        ave_ll_list_train.append(ave_ll(mu_est, sigma2_est, Xt)) 
        ave_ll_list_test.append(ave_ll(mu_est, sigma2_est, Xtest)) 
this_result = [ave_ll_list_train, ave_ll_list_test, mu_list, sig_list, 'SGD(S)-'+str(alpha)]
results.append(this_result)

00001	#include <Python.h>
00002	#include <iostream>
00003	#include <math.h>
00004	#include <numpy/arrayobject.h>
00005	#include <numpy/arrayscalars.h>
00006	//////////////////////
00007	////  Support Code
00008	//////////////////////
00009	
00010	
00011	    namespace {
00012	    struct __struct_compiled_op_e0b28ec03e051f80e055d0173c808bbb {
00013	        PyObject* __ERROR;
00014	
00015	        PyObject* storage_V3;
00016	PyObject* storage_V1;
00017	        
00018	
00019	        __struct_compiled_op_e0b28ec03e051f80e055d0173c808bbb() {}
00020	        ~__struct_compiled_op_e0b28ec03e051f80e055d0173c808bbb(void) {
00021	            cleanup();
00022	        }
00023	
00024	        int init(PyObject* __ERROR, PyObject* storage_V3, PyObject* storage_V1) {
00025	            Py_XINCREF(storage_V3);
00026	Py_XINCREF(storage_V1);
00027	            this->storage_V3 = storage_V3;
00028	this->storage_V1 = storage_V1;
00029	            
00030	
00031	
00032	
00033	            this->__ERROR = __ERROR;


clang: error: unknown argument: '-target-feature'
clang: error: unknown argument: '-sse4a'
clang: error: unknown argument: '-target-feature'
clang: error: unknown argument: '-target-feature'
clang: error: unknown argument: '-target-feature'
clang: error: unknown argument: '-tbm'
clang: error: unknown argument: '-target-feature'
clang: error: unknown argument: '-target-feature'
clang: error: unknown argument: '-fma4'
clang: error: unknown argument: '-target-feature'
clang: error: unknown argument: '-target-feature'
clang: error: unknown argument: '-prfchw'
clang: error: unknown argument: '-target-feature'
clang: error: unknown argument: '-target-feature'
clang: error: unknown argument: '-target-feature'
clang: error: unknown argument: '-target-feature'
clang: error: unknown argument: '-target-feature'
clang: error: unknown argument: '-target-feature'
clang: error: unknown argument: '-target-feature'
clang: error: unknown argument: '-target-feature'
clang: error: unknown argument: '-targ

00001	#include <Python.h>
00002	#include <iostream>
00003	#include <math.h>
00004	#include <numpy/arrayobject.h>
00005	#include <numpy/arrayscalars.h>
00006	//////////////////////
00007	////  Support Code
00008	//////////////////////
00009	
00010	
00011	    namespace {
00012	    struct __struct_compiled_op_e0b28ec03e051f80e055d0173c808bbb {
00013	        PyObject* __ERROR;
00014	
00015	        PyObject* storage_V3;
00016	PyObject* storage_V1;
00017	        
00018	
00019	        __struct_compiled_op_e0b28ec03e051f80e055d0173c808bbb() {}
00020	        ~__struct_compiled_op_e0b28ec03e051f80e055d0173c808bbb(void) {
00021	            cleanup();
00022	        }
00023	
00024	        int init(PyObject* __ERROR, PyObject* storage_V3, PyObject* storage_V1) {
00025	            Py_XINCREF(storage_V3);
00026	Py_XINCREF(storage_V1);
00027	            this->storage_V3 = storage_V3;
00028	this->storage_V1 = storage_V1;
00029	            
00030	
00031	
00032	
00033	            this->__ERROR = __ERROR;



clang: error: unknown argument: '-target-feature'
clang: error: unknown argument: '-sse4a'
clang: error: unknown argument: '-target-feature'
clang: error: unknown argument: '-target-feature'
clang: error: unknown argument: '-target-feature'
clang: error: unknown argument: '-tbm'
clang: error: unknown argument: '-target-feature'
clang: error: unknown argument: '-target-feature'
clang: error: unknown argument: '-fma4'
clang: error: unknown argument: '-target-feature'
clang: error: unknown argument: '-target-feature'
clang: error: unknown argument: '-prfchw'
clang: error: unknown argument: '-target-feature'
clang: error: unknown argument: '-target-feature'
clang: error: unknown argument: '-target-feature'
clang: error: unknown argument: '-target-feature'
clang: error: unknown argument: '-target-feature'
clang: error: unknown argument: '-target-feature'
clang: error: unknown argument: '-target-feature'
clang: error: unknown argument: '-target-feature'
clang: error: unknown argument: '-tar

00001	#include <Python.h>
00002	#include <iostream>
00003	#include <math.h>
00004	#include <numpy/arrayobject.h>
00005	#include <numpy/arrayscalars.h>
00006	#include <vector>
00007	#include <algorithm>
00008	//////////////////////
00009	////  Support Code
00010	//////////////////////
00011	
00012	
00013	    namespace {
00014	    struct __struct_compiled_op_7851c42462294f62fdd1e58a1ef7d1d0 {
00015	        PyObject* __ERROR;
00016	
00017	        PyObject* storage_V3;
00018	PyObject* storage_V5;
00019	PyObject* storage_V1;
00020	        
00021	
00022	        __struct_compiled_op_7851c42462294f62fdd1e58a1ef7d1d0() {}
00023	        ~__struct_compiled_op_7851c42462294f62fdd1e58a1ef7d1d0(void) {
00024	            cleanup();
00025	        }
00026	
00027	        int init(PyObject* __ERROR, PyObject* storage_V3, PyObject* storage_V5, PyObject* storage_V1) {
00028	            Py_XINCREF(storage_V3);
00029	Py_XINCREF(storage_V5);
00030	Py_XINCREF(storage_V1);
00031	            this->storage_V3 = s

Exception: ('The following error happened while compiling the node', Elemwise{sub,no_inplace}(<TensorType(float64, scalar)>, mu), '\n', "Compilation failed (return status=1): clang: error: unknown argument: '-target-feature'. clang: error: unknown argument: '-sse4a'. clang: error: unknown argument: '-target-feature'. clang: error: unknown argument: '-target-feature'. clang: error: unknown argument: '-target-feature'. clang: error: unknown argument: '-tbm'. clang: error: unknown argument: '-target-feature'. clang: error: unknown argument: '-target-feature'. clang: error: unknown argument: '-fma4'. clang: error: unknown argument: '-target-feature'. clang: error: unknown argument: '-target-feature'. clang: error: unknown argument: '-prfchw'. clang: error: unknown argument: '-target-feature'. clang: error: unknown argument: '-target-feature'. clang: error: unknown argument: '-target-feature'. clang: error: unknown argument: '-target-feature'. clang: error: unknown argument: '-target-feature'. clang: error: unknown argument: '-target-feature'. clang: error: unknown argument: '-target-feature'. clang: error: unknown argument: '-target-feature'. clang: error: unknown argument: '-target-feature'. clang: error: unknown argument: '-target-feature'. clang: error: unknown argument: '-target-feature'. clang: error: unknown argument: '-target-feature'. clang: error: unknown argument: '-target-feature'. clang: error: unknown argument: '-target-feature'. clang: error: unknown argument: '-target-feature'. clang: error: unknown argument: '-target-feature'. clang: error: unknown argument: '-target-feature'. clang: error: unknown argument: '-target-feature'. clang: error: unknown argument: '-target-feature'. clang: error: unknown argument: '-target-feature'. clang: error: unknown argument: '-target-feature'. clang: error: unknown argument: '-target-feature'. clang: error: unknown argument: '-target-feature'. clang: error: unknown argument: '-target-feature'. clang: error: unknown argument: '-target-feature'. clang: error: unknown argument: '-target-feature'. clang: error: unknown argument: '-target-feature'. clang: error: unknown argument: '-target-feature'. clang: error: unknown argument: '-rdseed'. clang: error: unknown argument: '-target-feature'. clang: error: unknown argument: '-target-feature'. clang: error: unknown argument: '-target-feature'. clang: error: unknown argument: '-target-feature'. clang: error: unknown argument: '-sha'. clang: error: unknown argument: '-target-feature'. clang: error: unknown argument: '-target-feature'. clang: error: unknown argument: '-target-feature'. clang: error: no such file or directory: '+cx16'. clang: error: no such file or directory: '+xsave'. clang: error: no such file or directory: '+bmi2'. clang: error: language not recognized: 'savec'. clang: error: no such file or directory: '+fsgsbase'. clang: error: no such file or directory: '+avx'. clang: error: no such file or directory: '+rtm'. clang: error: no such file or directory: '+popcnt'. clang: error: no such file or directory: '+fma'. clang: error: no such file or directory: '+bmi'. clang: error: no such file or directory: '+aes'. clang: error: no such file or directory: '+rdrnd'. clang: error: language not recognized: 'saves'. clang: error: no such file or directory: '+sse4.1'. clang: error: no such file or directory: '+sse4.2'. clang: error: no such file or directory: '+avx2'. clang: error: no such file or directory: '+sse'. clang: error: no such file or directory: '+lzcnt'. clang: error: no such file or directory: '+pclmul'. clang: error: no such file or directory: '+f16c'. clang: error: no such file or directory: '+ssse3'. clang: error: no such file or directory: '+mmx'. clang: error: no such file or directory: '+cmov'. clang: error: language not recognized: 'op'. clang: error: no such file or directory: '+movbe'. clang: error: no such file or directory: '+hle'. clang: error: no such file or directory: '+xsaveopt'. clang: error: no such file or directory: '+sse2'. clang: error: no such file or directory: '+sse3'. ", '[Elemwise{sub,no_inplace}(<TensorType(float64, scalar)>, mu)]')

# Optimization SGD via Theano (natural space)

Fails for $\alpha =0.005$

In [None]:
np.random.seed(seed)

alpha = 0.001
epochs = 10

# Declare Theano symbolic variables
x = T.scalar()

mu_0, sigma2_0 = np.random.randn(), np.random.random() 
theta1 = theano.shared(mu_0 / sigma2_0, name="theta1") # ensure initial value is positive
theta2 = theano.shared(0.5 / sigma2_0, name="theta2") # ensure initial value is positive

# Construct Theano expression graph to minimize (- log-likehood for univariate gaussian)
nll_theta =  -(x*theta1 - theta2*x*x - 0.25*theta1*theta1/theta2 - 0.5*np.log(np.pi) +0.5*T.log(theta2))

# Compile
train = theano.function(
          inputs=[x],
          updates=gradient_updates(nll_theta, [theta1, theta2], alpha))

# Train
mu_list, sig_list, ave_ll_list_train, ave_ll_list_test = [], [], [], []
for i in range(epochs):
    for xt in Xt:
        train(xt)
        mu_est = 0.5*theta1.get_value()/theta2.get_value()
        sigma2_est = 0.5/theta2.get_value()
        mu_list.append(mu_est)
        sig_list.append(sigma2_est)
        ave_ll_list_train.append(ave_ll(mu_est, sigma2_est, Xt)) 
        ave_ll_list_test.append(ave_ll(mu_est, sigma2_est, Xtest))
this_result = [ave_ll_list_train, ave_ll_list_test, mu_list, sig_list, 'SGD(T)-'+str(alpha)]
results.append(this_result)

# Optimization SGD via Theano (expectation space)

In [None]:
np.random.seed(seed)

alpha = 0.001
epochs = 10

# Declare Theano symbolic variables
x = T.scalar()

mu_0, sigma2_0 = np.random.randn(), np.random.random()
eta1 = theano.shared(mu_0, name="eta1") # ensure initial value is positive
eta2 = theano.shared(-(mu_0*mu_0 + sigma2_0), name="eta2") # ensure initial value is positive
print(eta1.get_value(), eta2.get_value())
# Construct Theano expression graph to minimize (- log-likehood for univariate gaussian)
theta1 = - eta1 / (eta1*eta1 + eta2)
theta2 = - 0.5  / (eta1*eta1 + eta2)
nll_eta =  -(x*theta1 - theta2*x*x - 0.25*theta1*theta1/theta2 - 0.5*np.log(np.pi) +0.5*T.log(theta2))

# Compile
train = theano.function(
          inputs=[x],
          updates=gradient_updates(nll_eta, [eta1, eta2], alpha))

# Train
mu_list, sig_list, ave_ll_list_train, ave_ll_list_test = [], [], [], []
for i in range(epochs):
    for xt in Xt:
        train(xt)
        # print(g1, s_1(xt) - eta1.get_value())
        # print(g2, s_2(xt) - eta2.get_value())
        mu_est = eta1.get_value()
        sigma2_est = - (eta1.get_value()*eta1.get_value()+eta2.get_value())
        mu_list.append(mu_est)
        sig_list.append(sigma2_est)
        ave_ll_list_train.append(ave_ll(mu_est, sigma2_est, Xt)) 
        ave_ll_list_test.append(ave_ll(mu_est, sigma2_est, Xtest))
this_result = [ave_ll_list_train, ave_ll_list_test, mu_list, sig_list, 'SGD(H)-'+str(alpha)]
results.append(this_result)

## Optimization via Theano +  SGD Nesterov momemtum (Source space)

In [None]:
np.random.seed(seed)

alpha = 0.01
momentum = 0.9
epochs = 10
# Declare Theano symbolic variables
x = T.scalar()
mu_0, sigma2_0 = np.random.randn(), np.random.random()
mu = theano.shared(mu_0, name="mu")
sigma2 = theano.shared(sigma2_0, name="sigma2") # ensure initial value is positive

# Construct Theano expression graph to minimize (- log-likehood for univariate gaussian)
nll =  (x - mu)**2 /(2 * sigma2) + T.log(T.sqrt(2 * sigma2 *np.pi))

# Compile
train = theano.function(
          inputs=[x],
          updates=gradient_updates_momentum(nll, [mu, sigma2], alpha, momentum))

# Train
mu_list, sig_list, ave_ll_list_train, ave_ll_list_test = [], [], [], []
for i in range(epochs):
    for xt in Xt:
        train(xt)
        mu_est, sigma2_est = mu.get_value(), sigma2.get_value()
        mu_list.append(mu_est)
        sig_list.append(sigma2_est)
        ave_ll_list_train.append(ave_ll(mu_est, sigma2_est, Xt)) 
        ave_ll_list_test.append(ave_ll(mu_est, sigma2_est, Xtest))
this_result = [ave_ll_list_train, ave_ll_list_test, mu_list, sig_list, 'Mom. SGD(S)-'+str(momentum)+','+str(alpha)]
results.append(this_result)

## Optimization via Theano +  ADAM (Source space)

In [None]:
np.random.seed(seed)

alpha = 0.005
beta1 = 0.9
beta2 = 0.995
epochs = 10
# Declare Theano symbolic variables
x = T.scalar()
mu_0, sigma2_0 = np.random.randn(), np.random.random()
mu = theano.shared(mu_0, name="mu")
sigma2 = theano.shared(sigma2_0, name="sigma2") # ensure initial value is positive

# Construct Theano expression graph to minimize (- log-likehood for univariate gaussian)
nll =  (x - mu)**2 /(2 * sigma2) + T.log(T.sqrt(2 * sigma2 *np.pi))

# Compile
train = theano.function(
          inputs=[x],
          updates=gradient_updates_adam(nll, [mu, sigma2], 
                                        alpha, beta1, beta2))

# Train
mu_list, sig_list, ave_ll_list_train, ave_ll_list_test = [], [], [], []
for i in range(epochs):
    for xt in Xt:
        train(xt)
        mu_est, sigma2_est = mu.get_value(), sigma2.get_value()
        mu_list.append(mu_est)
        sig_list.append(sigma2_est)
        ave_ll_list_train.append(ave_ll(mu_est, sigma2_est, Xt)) 
        ave_ll_list_test.append(ave_ll(mu_est, sigma2_est, Xtest)) 
this_result = [ave_ll_list_train, ave_ll_list_test, mu_list, sig_list, 'Adam(S)-'+str(beta1)+','+str(beta2)+','+str(alpha)]
results.append(this_result)

In [None]:
disp_results(results)

# Optimization via downhill (adam)

Pas encore terminé...

In [None]:
climate.enable_default_logging()

np.random.seed(seed)

alpha = 0.01
momentum = 0.9
epochs = 10
# Declare Theano symbolic variables
x = T.vector()
mu_0, sigma2_0 = np.random.randn(), np.random.random()
mu = theano.shared(mu_0, name="mu")
sigma2 = theano.shared(sigma2_0, name="sigma2") # ensure initial value is positive

# Construct Theano expression graph to minimize (- log-likehood for univariate gaussian)
#nll =  (x - mu)**2 /(2 * sigma2) + T.log(T.sqrt(2 * sigma2 *np.pi))
navell =  T.sum(T.sqr(x - mu) /(2 * sigma2) + T.log(T.sqrt(2 * sigma2 *np.pi)))
train = downhill.Dataset(Xt)
#valid = downhill.Dataset(Xtest)

downhill.minimize(
    loss=navell,
    train=Xt,
    patience=0,
    max_gradient_norm=1,          # Prevent gradient explosion!
    learning_rate=alpha)


print(mu.get_value())
print(sigma2.get_value())

In [None]:
help(downhill.minimize)

## L'exemple de la doc de downhill ne marche pas

In [None]:
climate.enable_default_logging()


import theano.tensor as TT
def rand(a, b): return np.random.randn(a, b).astype('f')

A, B, K = 20, 5, 3

# Set up a matrix factorization problem to optimize.
u = theano.shared(rand(A, K), name='u')
v = theano.shared(rand(K, B), name='v')
e = TT.sqr(TT.matrix() - TT.dot(u, v))

# Minimize the regularized loss with respect to a data matrix.
y = np.dot(rand(A, K), rand(K, B)) + rand(A, B)

downhill.minimize(
    loss=e.mean() + abs(u).mean() + (v * v).mean(),
    train=[y],
    patience=0,
    batch_size=A,                 # Process y as a single batch.
    max_gradient_norm=1,          # Prevent gradient explosion!
    learning_rate=0.1,
    monitors=(('err', e.mean()),  # Monitor during optimization.
              ('|u|<0.1', (abs(u) < 0.1).mean()),
              ('|v|<0.1', (abs(v) < 0.1).mean())),
    monitor_gradients=True)

# Print out the optimized coefficients u and basis v.
print('u =', u.get_value())
print('v =', v.get_value())

In [None]:
num2str