# Basic Concepts and API

All TF code follows this process:
1. Create a **computation graph** that defines your computational structure
2. Create a TF session
3. Run the computation graph in the session

In [1]:
import tensorflow as tf

In [2]:
# Define variables and operations in the graph

x = tf.Variable(3, name="x") # declare a symbolic name, x
y = tf.Variable(4, name="y")
g = x*x*y
h = y**3
print(type(g))
print(type(h))
f = g + h
print(type(f))

<class 'tensorflow.python.framework.ops.Tensor'>
<class 'tensorflow.python.framework.ops.Tensor'>
<class 'tensorflow.python.framework.ops.Tensor'>


The *type* of each computation is a TF **op**.

In [3]:
# Build a session and run. Using the "with" context block automatically closes the session.
with tf.Session() as tf_sess:
    x.initializer.run()
    y.initializer.run()
    result = f.eval()

In [4]:
result

100

An alternative to initializing variables individually is to call the <code>global_variables_initializer</code> function.

In [5]:
init = tf.global_variables_initializer() # Creates an init node

with tf.Session() as tf_sess:
    init.run()
    result = f.eval()
    print(result)

100


## Graphs

We can build graphs and then merge them together programmatically. Otherwise, it is assumed that declared computations are applied to the **same graph**.

In [6]:
x1 = tf.Variable(1)
# check where this x1 node lives:
x1.graph is tf.get_default_graph()

True

In [7]:
# Now, make another graph and add a new variable to it:
new_graph = tf.Graph()
with new_graph.as_default():
    x2 = tf.Variable(2)
    
print(x2.graph is tf.get_default_graph())
print(x2.graph is new_graph)

False
True


## More on Nodes

TF node evaluation determines the set of nodes that the node depends on and evaluates them. **All node values (except variables) are dropped between graph runs!**

Varialbes start their life when initialized and end when the session closes.

In [8]:
tf.reset_default_graph()

w = tf.constant(9)
x = w * 7
y = x + 2
z = x**2

with tf.Session() as tf_sess:
    print(y.eval())
    print(z.eval())

65
3969


The above code is not efficient, as the computation of x and w will happen twice! Instead, have y and evaluate in a single graph run.

In [9]:
with tf.Session() as tf_sess:
    y_val, z_val = tf_sess.run([y,z])
    print(y_val)
    print(z_val)

65
3969


## Operations

TF "ops" can take *any* number of inputs and produce *any* number of outputs. Sources are constants and Variables. The inputs and outputs of operations are always **tensors** - multi-dimensional arrays. In TF, tensors are numpy `ndarray`s.

The following example performs linear regression using the closed form Normal Equation embedded as a TF op. 
This example code uses the California housing data set.

In [10]:
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

housing_dataset = fetch_california_housing()
m,n = housing_dataset.data.shape
print("Data shape: " + str(m) + " instances, " + str(n) + " features")

X_raw = housing_dataset.data
y_raw = housing_dataset.target

# Split up the data
X_train, X_test, y_train, y_test = train_test_split(X_raw, y_raw, test_size=0.2)
print("Training size: " + str(X_train.shape[0]) + "; Test size: " + str(X_test.shape[0]))

# Scale the data sets
housing_scaler = StandardScaler()
X_train_scaled = housing_scaler.fit_transform(X_train)
X_test_scaled = housing_scaler.transform(X_test)

Data shape: 20640 instances, 8 features
Training size: 16512; Test size: 4128


In [11]:
# Add a bias of 1 to model the linear regression.
X_train_biased = np.c_[np.ones((X_train_scaled.shape[0],1)), X_train_scaled]
X_test_biased = np.c_[np.ones((X_test_scaled.shape[0],1)), X_test_scaled]
print("Biased train data shape: " + str(X_train_biased.shape[0]) + " instances, " + str(X_train_biased.shape[1]) + " features")
print("Biased test data shape: " + str(X_test_biased.shape[0]) + " instances, " + str(X_test_biased.shape[1]) + " features")

Biased train data shape: 16512 instances, 9 features
Biased test data shape: 4128 instances, 9 features


In [12]:
X = tf.constant(X_train_biased, dtype=tf.float32, name="X")
print("Target array shape: " + str(X_train_biased.shape))
# Explicitly turn into an m x 1 vector
y = tf.constant(y_train.reshape(-1,1), dtype=tf.float32, name="y")
print("...as TF constant: " + str(y_train.shape))
XT = tf.transpose(X)

Target array shape: (16512, 9)
...as TF constant: (16512,)


Implement the **Normal Equation**:
$\theta^{\star} = (X\cdot X^T)^{-1}\cdot{X^T}\cdot{y}$

In [13]:
inv = tf.matrix_inverse( tf.matmul(XT, X) )
theta = tf.matmul( tf.matmul(inv, XT), y )

In [14]:
# Vroom vroom!
with tf.Session() as tf_sess:
    theta_val = theta.eval()

In [15]:
print("Performed a linear regression over the data set:")
print(str(theta_val) + "\n " + str(theta_val.shape))

Performed a linear regression over the data set:
[[ 2.0732806e+00]
 [ 8.4796059e-01]
 [ 1.1729030e-01]
 [-2.7714539e-01]
 [ 3.2005358e-01]
 [-1.8042531e-03]
 [-4.1402280e-02]
 [-9.2463315e-01]
 [-8.9791656e-01]]
 (9, 1)


## Manual Gradient Descent via TF

I will re-use the scaled data from above and implement gradient descent manually rather than use the normal equation solution.

In [16]:
m = X_train_biased.shape[0]
n = X_train_biased.shape[1]

n_epochs = 2000
alpha = 0.01 # learning rate

In [17]:
# For grins, make a new graph for this implementation.
gd_graph = tf.Graph()

with gd_graph.as_default():
    X = tf.constant(X_train_biased, dtype=tf.float32, name="X")
    y = tf.constant(y_train.reshape(-1,1), dtype=tf.float32, name="y")
    # Initialize theta variables with uniform random values
    theta = tf.Variable( tf.random_uniform([n, 1], -1.0, 1.0), name="theta" )
    # Compute the predictions and error
    y_pred = tf.matmul( X, theta, name="predictions" )
    error = y_pred - y
    # Call on TF's mse function
    mse = tf.reduce_mean( tf.square(error), name="mse" )
    # Gradient calculations
    dJdtheta = (2.0/m) * tf.matmul( tf.transpose(X), error )
    # Training/learning op. assign() computes a new value and assigns it to a TF variable
    train_op = tf.assign( theta, theta - alpha*dJdtheta )
    
    init_op = tf.global_variables_initializer()

In [18]:
with tf.Session( graph=gd_graph ) as sess:
    sess.run(init_op)
    
    for i in range(n_epochs):
        
        if i % 100 == 0:
            print("Epoch ", i, "MSE = ", mse.eval())
        sess.run(train_op)
    
    # At the end, print the current thetas
    print(theta.eval())


Epoch  0 MSE =  5.955667
Epoch  100 MSE =  0.667297
Epoch  200 MSE =  0.57017946
Epoch  300 MSE =  0.5572751
Epoch  400 MSE =  0.54890317
Epoch  500 MSE =  0.54260564
Epoch  600 MSE =  0.53780556
Epoch  700 MSE =  0.53412336
Epoch  800 MSE =  0.5312809
Epoch  900 MSE =  0.52907455
Epoch  1000 MSE =  0.5273521
Epoch  1100 MSE =  0.5260012
Epoch  1200 MSE =  0.52493477
Epoch  1300 MSE =  0.52408993
Epoch  1400 MSE =  0.5234178
Epoch  1500 MSE =  0.5228803
Epoch  1600 MSE =  0.52244985
Epoch  1700 MSE =  0.5221027
Epoch  1800 MSE =  0.5218228
Epoch  1900 MSE =  0.5215965
[[ 2.0732749e+00]
 [ 8.7297672e-01]
 [ 1.2570815e-01]
 [-3.1378302e-01]
 [ 3.4614247e-01]
 [ 9.1348443e-04]
 [-4.2238470e-02]
 [-8.4004754e-01]
 [-8.1584382e-01]]


The results are pretty good compared to the normal equation. But it would be nice to not have to compute the derivative by hand all the time, especially for more difficult functions, e.g. regularized cost functions. Next, I will use *autodiff* to automatically compute the gradient.

In [19]:
gd_graph2 = tf.Graph()
with gd_graph2.as_default():
    X = tf.constant(X_train_biased, dtype=tf.float32, name="X")
    y = tf.constant(y_train.reshape(-1,1), dtype=tf.float32, name="y")
    # Initialize theta variables with uniform random values
    theta = tf.Variable( tf.random_uniform([n, 1], -1.0, 1.0), name="theta" )
    # Compute the predictions and error
    y_pred = tf.matmul( X, theta, name="predictions" )
    error = y_pred - y
    # Call on TF's mse function
    mse = tf.reduce_mean( tf.square(error), name="mse" )
    # Using tf's autodiff capability compute the derivative of the MSE
    dJdtheta = tf.gradients( mse, [theta], name="dJdtheta" )[0]
    print(dJdtheta)
    
    # Training/learning op. assign() computes a new value and assigns it to a TF variable
    # This *is* the optimization process - simple gradient descent
    train_op = tf.assign( theta, theta - alpha*dJdtheta )
    
    init_op = tf.global_variables_initializer()

Tensor("dJdtheta/predictions_grad/MatMul_1:0", shape=(9, 1), dtype=float32)


In [20]:
with tf.Session( graph=gd_graph2 ) as sess:
    sess.run(init_op)
    
    for i in range(n_epochs):
        
        if i % 100 == 0:
            print("Epoch ", i, "MSE = ", mse.eval())
        sess.run(train_op)
    
    # At the end, print the current thetas
    print(theta.eval())


Epoch  0 MSE =  7.85264
Epoch  100 MSE =  0.77685225
Epoch  200 MSE =  0.65164685
Epoch  300 MSE =  0.6166419
Epoch  400 MSE =  0.59215075
Epoch  500 MSE =  0.5742215
Epoch  600 MSE =  0.5610168
Epoch  700 MSE =  0.55124444
Epoch  800 MSE =  0.543973
Epoch  900 MSE =  0.5385329
Epoch  1000 MSE =  0.53444034
Epoch  1100 MSE =  0.5313431
Epoch  1200 MSE =  0.52898556
Epoch  1300 MSE =  0.5271791
Epoch  1400 MSE =  0.52578795
Epoch  1500 MSE =  0.52471
Epoch  1600 MSE =  0.52386934
Epoch  1700 MSE =  0.523211
Epoch  1800 MSE =  0.52269274
Epoch  1900 MSE =  0.52228254
[[ 2.0732749e+00]
 [ 8.7503898e-01]
 [ 1.2893695e-01]
 [-3.1248391e-01]
 [ 3.4260067e-01]
 [ 2.0070914e-03]
 [-4.2432308e-02]
 [-8.1376356e-01]
 [-7.8953385e-01]]


It is possible to roll all of the above into a simple call to a tf `Optimizer`! 
The following code includes logging (`tf.summary`) and model saving commands (`tf.train.Saver`) as well.

In [21]:
from datetime import datetime # for logging

now = datetime.utcnow().strftime("%Y%m%d%H%M%S")
logdir = "./logs/run-{0}".format(now)

gdwithopt_graph = tf.Graph()
# All the same intialization code, but then call on a MomentumOptimizer (or whatever other flavor)
with gdwithopt_graph.as_default():
    X = tf.constant(X_train_biased, dtype=tf.float32, name="X")
    y = tf.constant(y_train.reshape(-1,1), dtype=tf.float32, name="y")
    # Initialize theta variables with uniform random values
    theta = tf.Variable( tf.random_uniform([n, 1], -1.0, 1.0), name="theta" )
    # Compute the predictions and error
    y_pred = tf.matmul( X, theta, name="y_pred" )
    with tf.name_scope("loss") as scope:
        error = y_pred - y
        # Call on TF's mse function
        mse = tf.reduce_mean( tf.square(error), name="mse" )
    
    # The optimizer:
    optimizer = tf.train.GradientDescentOptimizer(learning_rate=alpha)
    # Uncomment to use MomentumOptimizer
#     optimizer = tf.train.MomentumOptimizer(learning_rate=alpha, momentum=0.9)

    training_op = optimizer.minimize(mse)
    
    init_op = tf.global_variables_initializer()
    
    # Added a saver node
    saver = tf.train.Saver()
    # Logging for Tensorboard
    mse_summary = tf.summary.scalar('MSE',mse) # Creates a node that outputs value of mse
    file_writer = tf.summary.FileWriter(logdir, tf.get_default_graph())

In [22]:
with tf.Session(graph=gdwithopt_graph) as sess:
    sess.run(init_op)
    
    for i in range(n_epochs):
        if i % 50 == 0:
            # Log current MSE
            file_writer.add_summary(mse_summary.eval(), i)
            # Save a model checkpoint
            save_path = saver.save(sess, "./models/lin_reg_mid.ckpt")
        sess.run(training_op)
    
    # At the end, print the current thetas
    print(theta.eval())
    # Save out the model
    save_path = saver.save(sess, "./models/lin_reg_final.ckpt")
    file_writer.close()


[[ 2.0732749 ]
 [ 0.8661438 ]
 [ 0.13141075]
 [-0.29014128]
 [ 0.32104757]
 [ 0.00291607]
 [-0.04240436]
 [-0.8021911 ]
 [-0.7765645 ]]


The above code still performed a batch learning process: the whole data set was consumed and the model was trained. Next step in thie evolution is to move to a **mini-batch** process. $X$ and $y$ get replaced with new values from the data set on each epoch iteration. In TF, we use `placeholder` nodes to accomplish this modification. (Note that `None` means any size.)

To pass in a value to placeholder nodes, create a `feed_dict` and assign the value with the variable as a keyword.

In [23]:
import numpy as np
from numpy.random import randn

placeholder_ex_graph = tf.Graph()
with placeholder_ex_graph.as_default():
    A = tf.placeholder(tf.float32, shape=(None,4), name='A')
    B = tf.placeholder(tf.int64, shape=(None,6), name='B')
    C = A * 2 
    D = B + 2

with tf.Session(graph=placeholder_ex_graph) as sess:
    C_result = C.eval( feed_dict={A: randn(1,4)} )
    D_result = D.eval( feed_dict={B: [[8,9,1,2,7,3]]} )

In [24]:
print(C_result)
print(D_result)

[[-0.2006731  2.7963092  1.562574  -0.6011623]]
[[10 11  3  4  9  5]]


TensorFlow has a batching function, `tf.train.batch`. See the details [here](https://www.tensorflow.org/api_docs/python/tf/train/batch).

A quick review of the input data shapes...

In [25]:
print(X_train_biased.shape)
print("Sample X:\n{0}".format(X_train_biased[0]))
print(y_train.shape)
print("Sample y:\n{0}".format(y_train[0]))


(16512, 9)
Sample X:
[ 1.          0.9144781   0.5066612   0.49843741 -0.28258584  0.00519773
  0.00825509 -0.84464416  0.83965343]
(16512,)
Sample y:
2.008


In [26]:
batch_size = 256
n_batches = int(np.ceil(X_train_biased.shape[0] / batch_size))
print("Number of batches = {0}".format(n_batches))

Number of batches = 65


In [83]:
from random import sample

def next_batch_rand(Xs,ys,batch_size):
    """
    This function randomly samples from the training set based on the size of the batch.
    """
    data_len = Xs.shape[0]
    idxs = sample( list(range(0,data_len)), batch_size )
    return Xs[idxs,:], ys[idxs].reshape(-1,1)

In [84]:
X_b, y_b = next_batch_rand(X_train_biased, y_train, 16)
print(X_b.shape)
print(y_b.shape)

(16, 9)
(16, 1)


In [85]:
# Mini-batch Gradient Descent!
gd_minibatch = tf.Graph()
with gd_minibatch.as_default():
    
    # Now X and y are fed into the graph. n is the number of features (dimensions) in X
    X = tf.placeholder( tf.float32, shape=(None, n), name='X' )
    y = tf.placeholder( tf.float32, shape=(None, 1), name='y')

    # Initialize theta variables with uniform random values
    theta = tf.Variable( tf.random_uniform([n, 1], -1.0, 1.0), name="theta" )
    # Compute the predictions and error
    y_pred = tf.matmul( X, theta, name="y_pred" )
    error = y_pred - y
    # Call on TF's mse function
    mse = tf.reduce_mean( tf.square(error), name="mse" )
    
    optimizer = tf.train.MomentumOptimizer(learning_rate=alpha, momentum=0.9)

    training_op = optimizer.minimize(mse)
    
    init_op = tf.global_variables_initializer()

I am not sure why I get an exception with the following code. I get an `InvalidArgumentError` because of the `y` being the wrong shape. But my batch function always returns a `(?,1)` sized array, based on the size of the batch. It appears to be fine with the shape of `X_batch`.

The exact error is:
```
You must feed a value for placeholder tensor 'y' with dtype float and shape [?,1]
	 [[Node: y = Placeholder[dtype=DT_FLOAT, shape=[?,1], _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]
```
According to docs, values passed into the `feed_dict` with `(?,n)` shape requirement must have rank 2 (2d array).

**Issue resolved**: I did not pass in the `feed_dict` into `mse.eval()`...*facepalm*. That said, my random batch generation appears to be causing instabilities in the mini-batch training! Will return to fix later...

Key takeaway: ensure that the data passed into the `feed_dict` is a numpy array with rank 2.

In [90]:
# with tf.Session(graph=gd_minibatch) as sess:
#     sess.run(init_op)
    
#     for i in range(n_epochs):
#         # Within each epoch, train over all mini batches in the training data
#         for batch_idx in range(n_batches):
#             X_batch, y_batch = next_batch_rand(X_train_biased, y_train, batch_size=batch_size)
#             print(batch_idx)
#             print(X_batch.shape)
#             print(y_batch.shape)
#             if batch_idx % 64 == 0:
#                 print(y_batch)
#             sess.run(training_op, feed_dict={X:X_batch, y:y_batch})
#             print("Ran...")
#         if i % 100 == 0:
#             print("Epoch ", i, "MSE = ", mse.eval(feed_dict={X:X_batch, y:y_batch}))

#     print(theta.eval())


## More Graph Construction Tools

TF allows you to define functions over the tf components. It also has facilities to share variables among graph components.

Let's make a modular rectified linear unit (ReLU) from scratch.
A ReLU outputs the maximum of a linear combination of features ($X$) and weights ($w$) or 0:

$$
h_{w,b}(X) = max(X\cdot w + b, 0)
$$



In [31]:
def relu(X):
    """X is an input feature tensor."""
    with tf.name_scope("relu") as scope:
        with tf.variable_scope("relu", reuse=True): # allow the reuse of a variable outside of scope
            w_shape = (int(X.get_shape()[1]), 1) # w must align with X's columns
            w = tf.Variable(tf.random_normal(w_shape), name="w")
            threshold = tf.get_variable("threshold")
#             b = tf.Variable(0.0, name="b")
            z = tf.add( tf.matmul(X,w), threshold, name="z" )
#             z = tf.add( tf.matmul(X,w), b, name="z" )
        return tf.maximum(z, 0.)

In [32]:
logdir = "./logs/relu-{0}".format(datetime.utcnow().strftime("%Y%m%d%H%M%S"))

n_features = 4

relu_graph = tf.Graph()
with relu_graph.as_default():
    X = tf.placeholder( tf.float32, shape=(None,n_features), name="X" )
    # Create the relu variable scope and define a threshold value
    with tf.variable_scope("relu"):
        threshold = tf.get_variable("threshold", shape=(),
                                   initializer=tf.constant_initializer(0.0))
    relus = [relu(X) for i in range(6)]
    output = tf.add_n(relus, name="output") # Add results of all n relu tensors
    
    init_op = tf.global_variables_initializer()
    file_writer = tf.summary.FileWriter(logdir, tf.get_default_graph())

In [36]:
with tf.Session(graph=relu_graph) as sess:
    sess.run(init_op)
    result = output.eval( feed_dict = {X:randn(1,n_features)} )
    print("Output = {0}".format(result))
    file_writer.close()

Output = [[6.4203606]]
