# Basic Concepts and API

All TF code follows this process:
1. Create a **computation graph** that defines your computational structure
2. Create a TF session
3. Run the computation graph in the session

In [175]:
import tensorflow as tf

In [176]:
# Define variables and operations in the graph

x = tf.Variable(3, name="x") # declare a symbolic name, x
y = tf.Variable(4, name="y")
g = x*x*y
h = y**3
print(type(g))
print(type(h))
f = g + h
print(type(f))

<class 'tensorflow.python.framework.ops.Tensor'>
<class 'tensorflow.python.framework.ops.Tensor'>
<class 'tensorflow.python.framework.ops.Tensor'>


The *type* of each computation is a TF **op**.

In [177]:
# Build a session and run. Using the "with" context block automatically closes the session.
with tf.Session() as tf_sess:
    x.initializer.run()
    y.initializer.run()
    result = f.eval()

In [178]:
result

100

An alternative to initializing variables individually is to call the <code>global_variables_initializer</code> function.

In [179]:
init = tf.global_variables_initializer() # Creates an init node

with tf.Session() as tf_sess:
    init.run()
    result = f.eval()
    print(result)

100


## Graphs

We can build graphs and then merge them together programmatically. Otherwise, it is assumed that declared computations are applied to the **same graph**.

In [180]:
x1 = tf.Variable(1)
# check where this x1 node lives:
x1.graph is tf.get_default_graph()

True

In [181]:
# Now, make another graph and add a new variable to it:
new_graph = tf.Graph()
with new_graph.as_default():
    x2 = tf.Variable(2)
    
print(x2.graph is tf.get_default_graph())
print(x2.graph is new_graph)

False
True


## More on Nodes

TF node evaluation determines the set of nodes that the node depends on and evaluates them. **All node values (except variables) are dropped between graph runs!**

Varialbes start their life when initialized and end when the session closes.

In [182]:
tf.reset_default_graph()

w = tf.constant(9)
x = w * 7
y = x + 2
z = x**2

with tf.Session() as tf_sess:
    print(y.eval())
    print(z.eval())

65
3969


The above code is not efficient, as the computation of x and w will happen twice! Instead, have y and evaluate in a single graph run.

In [183]:
with tf.Session() as tf_sess:
    y_val, z_val = tf_sess.run([y,z])
    print(y_val)
    print(z_val)

65
3969


## Operations

TF "ops" can take *any* number of inputs and produce *any* number of outputs. Sources are constants and Variables. The inputs and outputs of operations are always **tensors** - multi-dimensional arrays. In TF, tensors are numpy `ndarray`s.

The following example performs linear regression using the closed form Normal Equation embedded as a TF op. 
This example code uses the California housing data set.

In [184]:
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

housing_dataset = fetch_california_housing()
m,n = housing_dataset.data.shape
print("Data shape: " + str(m) + " instances, " + str(n) + " features")

X_raw = housing_dataset.data
y_raw = housing_dataset.target

# Split up the data
X_train, X_test, y_train, y_test = train_test_split(X_raw, y_raw, test_size=0.2)
print("Training size: " + str(X_train.shape[0]) + "; Test size: " + str(X_test.shape[0]))

# Scale the data sets
housing_scaler = StandardScaler()
X_train_scaled = housing_scaler.fit_transform(X_train)
X_test_scaled = housing_scaler.transform(X_test)

Data shape: 20640 instances, 8 features
Training size: 16512; Test size: 4128


In [185]:
# Add a bias of 1 to model the linear regression.
X_train_biased = np.c_[np.ones((X_train_scaled.shape[0],1)), X_train_scaled]
X_test_biased = np.c_[np.ones((X_test_scaled.shape[0],1)), X_test_scaled]
print("Biased train data shape: " + str(X_train_biased.shape[0]) + " instances, " + str(X_train_biased.shape[1]) + " features")
print("Biased test data shape: " + str(X_test_biased.shape[0]) + " instances, " + str(X_test_biased.shape[1]) + " features")

Biased train data shape: 16512 instances, 9 features
Biased test data shape: 4128 instances, 9 features


In [186]:
X = tf.constant(X_train_biased, dtype=tf.float32, name="X")
print("Target array shape: " + str(X_train_biased.shape))
# Explicitly turn into an m x 1 vector
y = tf.constant(y_train.reshape(-1,1), dtype=tf.float32, name="y")
print("...as TF constant: " + str(y_train.shape))
XT = tf.transpose(X)

Target array shape: (16512, 9)
...as TF constant: (16512,)


Implement the **Normal Equation**:
$\theta^{\star} = (X\cdot X^T)^{-1}\cdot{X^T}\cdot{y}$

In [187]:
inv = tf.matrix_inverse( tf.matmul(XT, X) )
theta = tf.matmul( tf.matmul(inv, XT), y )

In [188]:
# Vroom vroom!
with tf.Session() as tf_sess:
    theta_val = theta.eval()

In [189]:
print("Performed a linear regression over the data set:")
print(str(theta_val) + "\n " + str(theta_val.shape))

Performed a linear regression over the data set:
[[ 2.0682957 ]
 [ 0.84008783]
 [ 0.1174375 ]
 [-0.28864682]
 [ 0.3346616 ]
 [-0.00617133]
 [-0.03975914]
 [-0.9041359 ]
 [-0.86882347]]
 (9, 1)


## Manual Gradient Descent via TF

I will re-use the scaled data from above and implement gradient descent manually rather than use the normal equation solution.

In [190]:
m = X_train_biased.shape[0]
n = X_train_biased.shape[1]

n_epochs = 2000
alpha = 0.01 # learning rate

In [191]:
# For grins, make a new graph for this implementation.
gd_graph = tf.Graph()

with gd_graph.as_default():
    X = tf.constant(X_train_biased, dtype=tf.float32, name="X")
    y = tf.constant(y_train.reshape(-1,1), dtype=tf.float32, name="y")
    # Initialize theta variables with uniform random values
    theta = tf.Variable( tf.random_uniform([n, 1], -1.0, 1.0), name="theta" )
    # Compute the predictions and error
    y_pred = tf.matmul( X, theta, name="predictions" )
    error = y_pred - y
    # Call on TF's mse function
    mse = tf.reduce_mean( tf.square(error), name="mse" )
    # Gradient calculations
    dJdtheta = (2.0/m) * tf.matmul( tf.transpose(X), error )
    # Training/learning op. assign() computes a new value and assigns it to a TF variable
    train_op = tf.assign( theta, theta - alpha*dJdtheta )
    
    init_op = tf.global_variables_initializer()

In [192]:
with tf.Session( graph=gd_graph ) as sess:
    sess.run(init_op)
    
    for i in range(n_epochs):
        
        if i % 100 == 0:
            print("Epoch ", i, "MSE = ", mse.eval())
        sess.run(train_op)
    
    # At the end, print the current thetas
    print(theta.eval())


Epoch  0 MSE =  7.6506877
Epoch  100 MSE =  0.9170531
Epoch  200 MSE =  0.73852617
Epoch  300 MSE =  0.6789211
Epoch  400 MSE =  0.6374691
Epoch  500 MSE =  0.60767645
Epoch  600 MSE =  0.5862031
Epoch  700 MSE =  0.57069904
Epoch  800 MSE =  0.55948186
Epoch  900 MSE =  0.5513509
Epoch  1000 MSE =  0.5454397
Epoch  1100 MSE =  0.5411326
Epoch  1200 MSE =  0.53798336
Epoch  1300 MSE =  0.53567326
Epoch  1400 MSE =  0.5339736
Epoch  1500 MSE =  0.53271604
Epoch  1600 MSE =  0.5317826
Epoch  1700 MSE =  0.531086
Epoch  1800 MSE =  0.53056514
Epoch  1900 MSE =  0.53017145
[[ 2.068289  ]
 [ 0.85245454]
 [ 0.12877123]
 [-0.29521805]
 [ 0.3322528 ]
 [-0.00219188]
 [-0.04099313]
 [-0.8094103 ]
 [-0.77475417]]


The results are pretty good compared to the normal equation. But it would be nice to not have to compute the derivative by hand all the time, especially for more difficult functions, e.g. regularized cost functions. Next, I will use *autodiff* to automatically compute the gradient.

In [193]:
gd_graph2 = tf.Graph()
with gd_graph2.as_default():
    X = tf.constant(X_train_biased, dtype=tf.float32, name="X")
    y = tf.constant(y_train.reshape(-1,1), dtype=tf.float32, name="y")
    # Initialize theta variables with uniform random values
    theta = tf.Variable( tf.random_uniform([n, 1], -1.0, 1.0), name="theta" )
    # Compute the predictions and error
    y_pred = tf.matmul( X, theta, name="predictions" )
    error = y_pred - y
    # Call on TF's mse function
    mse = tf.reduce_mean( tf.square(error), name="mse" )
    # Using tf's autodiff capability compute the derivative of the MSE
    dJdtheta = tf.gradients( mse, [theta], name="dJdtheta" )[0]
    print(dJdtheta)
    
    # Training/learning op. assign() computes a new value and assigns it to a TF variable
    # This *is* the optimization process - simple gradient descent
    train_op = tf.assign( theta, theta - alpha*dJdtheta )
    
    init_op = tf.global_variables_initializer()

Tensor("dJdtheta/predictions_grad/MatMul_1:0", shape=(9, 1), dtype=float32)


In [194]:
with tf.Session( graph=gd_graph2 ) as sess:
    sess.run(init_op)
    
    for i in range(n_epochs):
        
        if i % 100 == 0:
            print("Epoch ", i, "MSE = ", mse.eval())
        sess.run(train_op)
    
    # At the end, print the current thetas
    print(theta.eval())


Epoch  0 MSE =  7.619579
Epoch  100 MSE =  0.9780748
Epoch  200 MSE =  0.77403235
Epoch  300 MSE =  0.70987123
Epoch  400 MSE =  0.66503584
Epoch  500 MSE =  0.63195205
Epoch  600 MSE =  0.6073477
Epoch  700 MSE =  0.5889569
Epoch  800 MSE =  0.5751409
Epoch  900 MSE =  0.5647041
Epoch  1000 MSE =  0.55677783
Epoch  1100 MSE =  0.5507234
Epoch  1200 MSE =  0.5460721
Epoch  1300 MSE =  0.5424777
Epoch  1400 MSE =  0.5396848
Epoch  1500 MSE =  0.5375022
Epoch  1600 MSE =  0.53578717
Epoch  1700 MSE =  0.53443295
Epoch  1800 MSE =  0.5333583
Epoch  1900 MSE =  0.532501
[[ 2.0682893e+00]
 [ 8.8227671e-01]
 [ 1.3487199e-01]
 [-3.4767443e-01]
 [ 3.7444222e-01]
 [-2.9430713e-04]
 [-4.2214889e-02]
 [-7.3923689e-01]
 [-7.0793283e-01]]


It is possible to roll all of the above into a simple call to a tf `Optimizer`!

In [207]:
gdwithopt_graph = tf.Graph()
# All the same intialization code, but then call on a MomentumOptimizer (or whatever other flavor)
with gdwithopt_graph.as_default():
    X = tf.constant(X_train_biased, dtype=tf.float32, name="X")
    y = tf.constant(y_train.reshape(-1,1), dtype=tf.float32, name="y")
    # Initialize theta variables with uniform random values
    theta = tf.Variable( tf.random_uniform([n, 1], -1.0, 1.0), name="theta" )
    # Compute the predictions and error
    y_pred = tf.matmul( X, theta, name="predictions" )
    error = y_pred - y
    # Call on TF's mse function
    mse = tf.reduce_mean( tf.square(error), name="mse" )
    
    # The optimizer:
#     optimizer = tf.train.GradientDescentOptimizer(learning_rate=alpha)
    # Uncomment to use MomentumOptimizer
    optimizer = tf.train.MomentumOptimizer(learning_rate=alpha, momentum=0.9)

    training_op = optimizer.minimize(mse)
    
    init_op = tf.global_variables_initializer()
    
    # Added a saver node
    saver = tf.train.Saver()

In [208]:
with tf.Session(graph=gdwithopt_graph) as sess:
    sess.run(init_op)
    
    for i in range(n_epochs):
        if i % 100 == 0:
            print("Epoch ", i, "MSE = ", mse.eval())
        sess.run(training_op)
    
    # At the end, print the current thetas
    print(theta.eval())
    # Save out the model
    save_path = saver.save(sess, "./models/lin_reg_final.ckpt")


Epoch  0 MSE =  8.937419
Epoch  100 MSE =  0.53223294
Epoch  200 MSE =  0.5291265
Epoch  300 MSE =  0.5288685
Epoch  400 MSE =  0.52883744
Epoch  500 MSE =  0.5288341
Epoch  600 MSE =  0.5288338
Epoch  700 MSE =  0.52883357
Epoch  800 MSE =  0.5288333
Epoch  900 MSE =  0.52883375
Epoch  1000 MSE =  0.52883416
Epoch  1100 MSE =  0.52883404
Epoch  1200 MSE =  0.52883404
Epoch  1300 MSE =  0.52883404
Epoch  1400 MSE =  0.52883404
Epoch  1500 MSE =  0.52883404
Epoch  1600 MSE =  0.52883404
Epoch  1700 MSE =  0.52883404
Epoch  1800 MSE =  0.52883404
Epoch  1900 MSE =  0.52883404
[[ 2.0682945 ]
 [ 0.84008884]
 [ 0.11743774]
 [-0.28864896]
 [ 0.33466324]
 [-0.00617125]
 [-0.03975926]
 [-0.9041329 ]
 [-0.8688196 ]]


The above code still performed a batch learning process: the whole data set was consumed and the model was trained. Next step in thie evolution is to move to a **mini-batch** process. $X$ and $y$ get replaced with new values from the data set on each epoch iteration. In TF, we use `placeholder` nodes to accomplish this modification. (Note that `None` means any size.)

To pass in a value to placeholder nodes, create a `feed_dict` and assign the value with the variable as a keyword.

In [197]:
import numpy as np
from numpy.random import randn

placeholder_ex_graph = tf.Graph()
with placeholder_ex_graph.as_default():
    A = tf.placeholder(tf.float32, shape=(None,4), name='A')
    B = tf.placeholder(tf.int64, shape=(None,6), name='B')
    C = A * 2 
    D = B + 2

with tf.Session(graph=placeholder_ex_graph) as sess:
    C_result = C.eval( feed_dict={A: randn(1,4)} )
    D_result = D.eval( feed_dict={B: [[8,9,1,2,7,3]]} )

In [198]:
print(C_result)
print(D_result)

[[-2.865427   -3.4994688   0.97427607 -0.68409264]]
[[10 11  3  4  9  5]]


TensorFlow has a nice batching function, `tf.train.batch`. See the details [here](https://www.tensorflow.org/api_docs/python/tf/train/batch).

A quick review of the input data shapes...

In [199]:
print(X_train_biased.shape)
print("Sample X:\n{0}".format(X_train_biased[0]))
print(y_train.shape)
print("Sample y:\n{0}".format(y_train[0]))


(16512, 9)
Sample X:
[ 1.         -0.7946698  -0.69067362 -0.19501342 -0.09686171  0.22136642
 -0.0748938   2.40728738 -1.67456641]
(16512,)
Sample y:
0.912


In [200]:
batch_size = 256
n_batches = int(np.ceil(X_train_biased.shape[0] / batch_size))
print("Number of batches = {0}".format(n_batches))

Number of batches = 65


In [201]:
from random import sample

def next_batch_rand(Xs,ys,batch_size=64):
    """
    This function randomly samples from the training set based on the size of the batch.
    """
    data_len = Xs.shape[0]
    idxs = sample( list(range(0,data_len)), batch_size )
    return Xs[idxs,:], ys[idxs].reshape(-1,1)

In [202]:
X_b, y_b = next_batch_rand(X_train_biased, y_train, 16)
print(X_b.shape)
print(y_b.shape)

(16, 9)
(16, 1)


In [203]:
# Mini-batch Gradient Descent!
gd_minibatch = tf.Graph()
with gd_minibatch.as_default():
    
    # Now X and y are fed into the graph. n is the number of features (dimensions) in X
    X = tf.placeholder( tf.float32, shape=(None, n), name='X' )
    y = tf.placeholder( tf.float32, shape=(None, 1), name='y')

    # Initialize theta variables with uniform random values
    theta = tf.Variable( tf.random_uniform([n, 1], -1.0, 1.0), name="theta" )
    # Compute the predictions and error
    y_pred = tf.matmul( X, theta, name="predictions" )
    error = y_pred - y
    # Call on TF's mse function
    mse = tf.reduce_mean( tf.square(error), name="mse" )
    
    optimizer = tf.train.MomentumOptimizer(learning_rate=alpha, momentum=0.9)

    training_op = optimizer.minimize(mse)
    
    init_op = tf.global_variables_initializer()

I am not sure why I get an exception with the following code. I get an `InvalidArgumentError` because of the `y` being the wrong shape. But my batch function always returns a `(?,1)` sized array, based on the size of the batch.

The exact error is:
```
You must feed a value for placeholder tensor 'y' with dtype float and shape [?,1]
	 [[Node: y = Placeholder[dtype=DT_FLOAT, shape=[?,1], _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]
```
I will skip this issue for now.

In [204]:
# with tf.Session(graph=gd_minibatch) as sess:
#     sess.run(init_op)
    
#     for i in range(n_epochs):
#         # Within each epoch, train over all mini batches in the training data
#         for batch_idx in range(n_batches):
#             X_batch, y_batch = next_batch_rand(X_train_biased, y_train, batch_size=128)
#             sess.run(training_op, feed_dict={X:X_batch, y:y_batch})
#         if i % 100 == 0:
#             print("Epoch ", i, "MSE = ", mse.eval())

#     print(theta.eval())
