In [1]:
import tensorflow as tf
import numpy as np
np.random.seed(3690)

In [2]:
# Simulate a small linear-regression problem: 10 samples, 3 features.
# np.float64 replaces the `np.float` alias, which was removed in NumPy 1.24.
beta_true = np.array([2, 1, 3], dtype=np.float64)
# Features: Gaussian noise (std 0.2) around the per-column means (3, 1, 2).
X = 0.2*np.random.randn(10, 3) + np.array([3, 1, 2])
# Responses: the exact linear model plus Gaussian noise (std 0.1).
Y = X.dot(beta_true) + 0.1*np.random.randn(10)

In [3]:
# Inspect the simulated (10, 3) feature matrix.
X

array([[ 3.09503707,  0.40299582,  2.06102421],
       [ 2.98285144,  0.83189275,  2.37346666],
       [ 3.20810042,  1.00242926,  2.04051188],
       [ 2.84539432,  1.14191149,  2.18715818],
       [ 3.27580542,  0.82608776,  1.86972986],
       [ 2.5099321 ,  0.78008051,  2.40497419],
       [ 2.8538918 ,  1.00111678,  2.29768634],
       [ 2.86603255,  1.03767859,  2.26433845],
       [ 3.23374931,  0.84731398,  1.94832026],
       [ 3.25913163,  0.96937172,  1.95759511]])

In [4]:
# Inspect the 10 simulated responses.
Y

array([ 12.86750223,  13.96153011,  13.53130887,  13.42630843,
        12.97549328,  13.19593226,  13.57207838,  13.59171186,
        13.16687516,  13.33501955])

In [5]:
# Closed-form OLS estimate: solve the normal equations (X^T X) beta = X^T Y.
# np.linalg.solve is used instead of multiplying by an explicit inverse,
# which is both cheaper and numerically more accurate.
beta_ols = np.linalg.solve(X.T.dot(X), X.T.dot(Y))
beta_ols

array([ 1.94279897,  0.82758608,  3.16593475])

$$
\begin{aligned}
\text{loss} &= \frac{1}{2N}(Y - X\beta)^T(Y - X\beta) \\
\frac{\partial\,\text{loss}}{\partial \beta} &= \frac{1}{N}(X^TX\beta - X^TY)
\end{aligned}
$$

In [6]:
def grad_np(Y, X, beta):
    """Gradient of the OLS loss ``(1/2N) * ||Y - X beta||^2`` with respect to ``beta``.

    Parameters
    ----------
    Y : array of N responses.
    X : array of shape (N, p), one row per sample.
    beta : array of p coefficients at which the gradient is evaluated.

    Returns
    -------
    Array of p values equal to ``X^T (X beta - Y) / N``, which is the same
    quantity as ``(X^T X beta - X^T Y) / N``.
    """
    N = len(Y)
    # Work from the residual so the p-by-p Gram matrix X^T X is never formed;
    # this saves a matrix-matrix product on every call.
    residual = X.dot(beta) - Y
    return X.T.dot(residual)/N

In [7]:
# estimate beta with gradient descent (1000 steps, learning rate 0.1)
# np.float64 replaces the `np.float` alias, which was removed in NumPy 1.24
beta_np = np.zeros(3, dtype=np.float64)
for _ in range(1000):
    beta_np -= 0.1*grad_np(Y, X, beta_np)
beta_np # approaches the OLS estimate, which is close to the true beta [2, 1, 3]

array([ 1.94272213,  0.83972984,  3.16101019])

## TensorFlow implementation

A simple showcase of how `tf.gradients` works in `TensorFlow`.

In [8]:
# OLS gradient descent with Tensorflow
tf.reset_default_graph()  # clear whatever was previously in the default graph

# The data enter the graph as float64 constants; beta is the only variable.
tf_X = tf.constant(X, dtype=tf.float64, name="tf_X")
tf_Y = tf.constant(Y, dtype=tf.float64, name="tf_Y")
tf_beta = tf.Variable(tf.zeros(3, dtype=tf.float64), name="beta")

# Number of samples, as a float64 scalar so it can divide the loss.
tf_N = tf.cast(tf.shape(tf_Y)[0], tf.float64)
# Row-wise dot product of X with beta, i.e. the fitted values.
tf_Y_hat = tf.reduce_sum(tf_X*tf_beta, axis=1)
# Same loss as the numpy version: sum of squared residuals over 2N.
tf_loss = tf.reduce_sum(tf.square(tf_Y - tf_Y_hat))/(2*tf_N)
# tf.gradients returns one tensor per entry of the list; unpack the single one.
(tf_gradients,) = tf.gradients(tf_loss, [tf_beta])

# updating ops (mimic Optimizer.apply_gradients)
# The assign op is built once here and fed through a placeholder later.
tf_new_beta = tf.placeholder(dtype=tf.float64, name="tf_new_beta")
tf_assign_beta = tf_beta.assign(tf_new_beta)

In [9]:
# Open a session and run the initializer so that beta holds its starting zeros.
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())

In [10]:
# Evaluate the gradient at the initial beta (all zeros); only the gradient is
# needed here, so beta itself is not fetched.
grad = sess.run(tf_gradients)

In [11]:
np.allclose(grad_np(Y, X, np.zeros(3)), grad) # the same as the numpy implementation

True

In [12]:
# run iteration: 1000 gradient-descent steps with step size 0.1
for _ in range(1000):
    # Fetch the current gradient and coefficients in a single call.
    grad, beta = sess.run([tf_gradients, tf_beta])
    beta = beta - 0.1*grad
    # Slow alternative, kept for reference only:
    #     _ = sess.run(tf_beta.assign(beta))
    # It is extremely slow because every call to tf_beta.assign(...) adds a
    # new op to the graph. Feeding the prebuilt assign op is much faster:
    _ = sess.run(tf_assign_beta, feed_dict={tf_new_beta: beta})
beta

array([ 1.94272213,  0.83972984,  3.16101019])

In [13]:
# Close the session now that the experiment is finished.
sess.close()