# Andrew Ng Coursera Machine Learning Course - Ex 1
**Dean's TensorFlow Reimplementation Attempt**

*9/3/2017*

In [4]:
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
%matplotlib inline

## 0. Dean's additional playing around with TensorFlow
https://www.tensorflow.org/get_started/get_started

In [5]:
# A constant node in the default graph.  Displaying it shows the symbolic
# tensor (name, shape, dtype), not the value 3.0.
C = tf.constant(3.0, dtype=tf.float32)
C

<tf.Tensor 'Const:0' shape=() dtype=float32>

In [6]:
# Three equivalent ways to evaluate the constant inside a session.
with tf.Session() as sess:
    print(C.eval(session=sess))  # explicit session argument
    print(C.eval())              # uses the session opened by this `with` block
    print(sess.run(C))           # ask the session to run the node

3.0
3.0
3.0


In [7]:
# A placeholder is an input node whose value is supplied at run time through
# a feed dict.  No shape is declared, so its shape is reported as unknown.
P = tf.placeholder(dtype=tf.float32)
P

<tf.Tensor 'Placeholder:0' shape=<unknown> dtype=float32>

In [8]:
# The overloaded `+` adds an op to the graph; nothing is computed until the
# node is run in a session.
#add_three_node = tf.add(P, C)  # Same as next line but not as compact
add_three_node = P + C
add_three_node

<tf.Tensor 'add:0' shape=<unknown> dtype=float32>

In [9]:
# Evaluate the add node with several different feeds for the placeholder.
with tf.Session() as sess:
    print(add_three_node.eval({P: 2.0}))
    print(sess.run(add_three_node, {P: 2.0}))
    # Feeding by tensor name also works, but only while the placeholder's
    # name is exactly 'Placeholder:0'.
    print(sess.run(add_three_node, {'Placeholder:0': 2.0})) # Fails if name above diff
    # The placeholder has no fixed shape, so a list is accepted and the
    # constant is added to each element.
    print(sess.run(add_three_node, {P: [2.0, 4.0]}))

5.0
5.0
5.0
[ 5.  7.]


In [10]:
# Manage a session by hand instead of through a `with` block.
S = tf.Session()
print(S.run(add_three_node, {P: 2.0}))

# Release the session's resources explicitly; the `with` context manager
# used in the cells above does this automatically on exit.
S.close() 

# print(S.run(add_three_node, {P: 2.0}))  # would error

5.0


## 1. Simple function

In [11]:
# 5x5 int8 identity matrix as a graph node; displaying it shows only the
# symbolic tensor, not the values.
A = tf.eye(5, dtype=tf.int8)
A

<tf.Tensor 'eye/MatrixDiag:0' shape=(5, 5) dtype=int8>

In [9]:
# Evaluate the identity matrix two equivalent ways; both print the same array.
with tf.Session() as sess:
    print(sess.run(A))
    print(A.eval())

[[1 0 0 0 0]
 [0 1 0 0 0]
 [0 0 1 0 0]
 [0 0 0 1 0]
 [0 0 0 0 1]]
[[1 0 0 0 0]
 [0 1 0 0 0]
 [0 0 1 0 0]
 [0 0 0 1 0]
 [0 0 0 0 1]]


## 2. Linear regression with one variable

**For the TensorFlow reimplementation, skip down to section 3 (multivariate linear regression).**  

Non-tensorflow implementation is here for reference.

### 2.1. Plotting the Data

In [None]:
# Load the single-variable data set from a comma-separated text file.
data = np.loadtxt('ex1/ex1data1.txt', delimiter=',')
# Uncomment to inspect what was loaded:
# print(type(data))
# print(data.shape)
# print(data[:5])

In [None]:
# Column 0 holds the input feature and column 1 the target value.
Xnp, ynp = data[:, 0], data[:, 1]
# Uncomment to inspect:
# m = len(ynp)  # Number of training examples.
# print(Xnp[:5])
# print(ynp[:5])
# print(m)

In [None]:
# Scatter plot of the training data.  Xnp (column 0 of the file) is the city
# population and ynp (column 1) is the profit, so population belongs on the
# x-axis and profit on the y-axis.
plt.plot(Xnp, ynp, 'rx', markersize=5)
plt.xlabel('Population of City in 10,000s')
plt.ylabel('Profit in $10,000s')

## 2.2. Gradient Descent
### 2.2.1. Update Equations

Hypothesis (univariate):

$$h_\theta(x) = \theta_0 + \theta_1x_1$$

Hypothesis (multivariate):

$$h_\theta(x) = \theta_0 + \theta_1x_1 + \theta_2x_2 + \cdots + \theta_nx_n$$

Hypothesis incorporating an $x_0$ (always 1) to make the bias term more consistent:

$$h_\theta(x) = \theta_0x_0 + \theta_1x_1 + \theta_2x_2 + \cdots + \theta_nx_n  \mid  x_0 = 1$$

Hypothesis vectorized:

$$h_\theta(x) = \theta^\top x $$

Cost function:

$$J(\theta) = \frac{1}{2m}\sum_{i=1}^m (h_\theta(x^{(i)}) - y^{(i)})^2$$

**Note:** A superscript in parentheses, $^{(i)}$, just refers to the $i^{th}$ training example; it does not raise to a power.  A superscript that is not in parentheses, like the final $^2$ in the cost function, does raise to a power.

Batch gradient descent procedure, repeatedly updating $\theta_j$ for all $j$:

$$\theta_j := \theta_j - \alpha \frac{\partial}{\partial\theta_j}J(\theta) $$

... which is ...

$$\theta_j := \theta_j - \alpha\frac{1}{m} \sum_{i=1}^m (h_\theta(x^{(i)}) - y^{(i)})x_j^{(i)} $$

... updating $\theta_j$ for all $j$ **simultaneously**.

### 2.2.2. Implementation

In [None]:
# Make X and y two dimensional.
#print(X.shape)
#X = X[:, np.newaxis]
#y = y[:, np.newaxis]
#print(X.shape)
#print(X[:5])

In [None]:
# Initialize fitting parameters
#theta = np.zeros((2,1))
#theta
# tf.Variable requires an initial value; start from zeros with the same
# (2, 1) shape as the NumPy theta above.
W = tf.Variable(tf.zeros([2, 1], dtype=tf.float32))

In [None]:
# Initialize gradient descent settings
#iterations = 1500
#alpha = 0.01 # Learning rate?

### 2.2.3. Computing the cost $J(\theta)$

In [None]:
# Student implements...
def computeCost(X, y, theta):
    """Return the squared-error cost J(theta) for linear regression.

    J(theta) = sum((X @ theta - y)**2) / (2 * m), where m = len(y).

    Parameters
    ----------
    X : array, shape (m, n)
        Design matrix, one row per training example.
    y : array, shape (m,) or (m, 1)
        Target values.
    theta : array, shape (n,) or (n, 1)
        Parameter vector.

    Returns
    -------
    Scalar cost.
    """
    m = len(y)
    h = X @ theta
    # y and the predictions may arrive with different ranks, e.g. y of shape
    # (m,) against h of shape (m, 1).  Subtracting those would broadcast to an
    # (m, m) matrix and silently produce a wrong cost, so align y with h.
    if np.shape(y) != np.shape(h) and np.size(y) == np.size(h):
        y = np.reshape(y, np.shape(h))
    return np.sum((h - y) ** 2) / (2 * m)

In [None]:
# Build the NumPy inputs for the reference implementation: a design matrix
# with a leading column of ones (intercept term) followed by the feature, and
# the targets as a column vector so shapes line up with the (2, 1) theta.
X = np.hstack((np.ones((len(Xnp), 1)), Xnp[:, np.newaxis]))
y = ynp[:, np.newaxis]
theta = np.zeros((2, 1))
J = computeCost(X, y, theta)
print('Cost at theta 0, 0: %f' % J)
print('Expected cost about 32.07')

In [None]:
# Re-evaluate the cost at a second, non-zero parameter column vector.
theta = np.array([[-1], [2]])
J = computeCost(X, y, theta)
print('Cost at theta -1, 2: %f' % J)
print('Expected cost about 54.24')

### 2.2.4. Gradient descent

In [None]:
# Quick look at the first rows of the inputs and at the current parameters.
print(X[:5], y[:5], theta, sep='\n')

In [None]:
# Student implements...
def gradientDescent(X, y, theta, alpha, num_iters):
    """Fit linear-regression parameters with batch gradient descent.

    Each iteration applies the simultaneous update
    theta := theta - alpha * (X.T @ (X @ theta - y)) / m, with m = len(y).

    Parameters
    ----------
    X : array, shape (m, n)
        Design matrix, one row per training example.
    y : array, shape (m,) or (m, 1)
        Target values.
    theta : array, shape (n,) or (n, 1)
        Starting parameters; the caller's array is not modified.
    alpha : float
        Learning rate.
    num_iters : int
        Number of update steps to perform.

    Returns
    -------
    theta : array
        Parameters after the final step.
    J_history : array, shape (num_iters,)
        Cost after each step, as reported by computeCost.
    """
    m = len(y)
    # Align y with the prediction shape once, up front.  A 1-D y against an
    # (m, 1) prediction would otherwise broadcast to (m, m) in `h - y` and
    # corrupt both the gradient and the recorded costs.
    h = X @ theta
    if np.shape(y) != np.shape(h) and np.size(y) == np.size(h):
        y = np.reshape(y, np.shape(h))
    J_history = np.zeros(num_iters)
    for n in range(num_iters):
        h = X @ theta
        gradient = (X.T @ (h - y)) / m
        # Builds a new array, so every component is updated simultaneously.
        theta = theta - alpha * gradient
        J_history[n] = computeCost(X, y, theta)
    return theta, J_history  # J_history used in optional exercises far below

In [None]:
# Gradient descent settings for the single-variable fit (the values from the
# commented-out settings cell earlier in this section).
iterations = 1500
alpha = 0.01  # learning rate
initial_theta = np.zeros((2,1))
trained_theta, _ = gradientDescent(X, y, initial_theta, alpha, iterations)

In [None]:
# Display the fitted parameters for comparison with the reference values.
# Expected theta values (approx) -3.6303, 1.1664
trained_theta

In [None]:
# Training data with the fitted regression line drawn on top.  X[:, 1] is
# plotted as the feature (city population) and y is the profit, so
# population goes on the x-axis and profit on the y-axis.
plt.plot(X[:,1], y, 'rx', markersize=5)
plt.xlabel('Population of City in 10,000s')
plt.ylabel('Profit in $10,000s')
plt.plot(X[:,1], X@trained_theta)

## 2.4. Visualizing $J(\theta)$
(skipping)

# Optional Exercises
# 3. Linear regression with multiple variables
**Note: Some of this is not tested as well as the above.**

First get data for multivariate regression.

In [12]:
# Load the multivariate data set: every column but the last is a feature and
# the last column is the target.
data = np.loadtxt('ex1/ex1data2.txt', delimiter=',')
Xnp, ynp = data[:, :-1], data[:, -1]
m = len(Xnp)  # number of training examples
print(Xnp.shape)
print(ynp.shape)

(47, 2)
(47,)


In [13]:
# First five feature rows, straight from the file.
Xnp[:5]

array([[  2.10400000e+03,   3.00000000e+00],
       [  1.60000000e+03,   3.00000000e+00],
       [  2.40000000e+03,   3.00000000e+00],
       [  1.41600000e+03,   2.00000000e+00],
       [  3.00000000e+03,   4.00000000e+00]])

In [14]:
# First five target values.
ynp[:5]

array([ 399900.,  329900.,  369000.,  232000.,  539900.])

## 3.1. Feature normalization

Feature normalization (aka feature scaling?) is adjusting the input features to be similar in size.  It can help gradient descent to converge more quickly.

For example, mean normalization:

$$x_j^{(i)} := \frac{x_j^{(i)} - \mu_j}{\sigma_j}$$

...where $\mu_j$ is the mean of all $x_j$, and $\sigma_j$ is the standard deviation of all $x_j$

**Note:  Remember to apply the same treatment to any input variables of new data you make predictions on.**

In [15]:
# Features before normalization, for comparison with the normalized values.
Xnp[:5]

array([[  2.10400000e+03,   3.00000000e+00],
       [  1.60000000e+03,   3.00000000e+00],
       [  2.40000000e+03,   3.00000000e+00],
       [  1.41600000e+03,   2.00000000e+00],
       [  3.00000000e+03,   4.00000000e+00]])

In [16]:
# Student implements
def featureNormalize(X):
    """Standardize each feature column to zero mean and unit spread.

    Parameters
    ----------
    X : array-like, shape (m, n)
        One row per example, one column per feature.

    Returns
    -------
    X_norm : array
        (X - mu) / sigma, computed column-wise.
    mu : array
        Per-column mean of X.
    sigma : array
        Per-column population standard deviation (NumPy's default, ddof=0),
        with zeros replaced by 1 so the value is safe to divide by when
        normalizing new inputs.
    """
    X = np.asarray(X)
    mu = X.mean(axis=0)
    sigma = X.std(axis=0)
    # A constant feature has zero spread; dividing by it would give NaN/inf.
    # Use 1 instead so such a column simply becomes all zeros.
    sigma = np.where(sigma == 0, 1.0, sigma)
    X_norm = (X - mu) / sigma

    return X_norm, mu, sigma

In [17]:
# Always normalize from the raw feature columns of `data` rather than from
# Xnp itself.  Re-running this cell then cannot re-normalize data that is
# already normalized, which would reset mu/sigma to roughly 0/1 and break
# the scaling of new inputs at prediction time.
Xnp, mu, sigma = featureNormalize(data[:, :-1])

In [18]:
# Features after normalization.
Xnp[:5]

array([[ 0.13141542, -0.22609337],
       [-0.5096407 , -0.22609337],
       [ 0.5079087 , -0.22609337],
       [-0.74367706, -1.5543919 ],
       [ 1.27107075,  1.10220517]])

In [17]:
# Prepend ones for bias/intercept term
#Xnp = np.hstack((np.ones((m, 1)), Xnp))
#Xnp[:5]

## 3.1.0. Make Tensorflow Graph

In [28]:
# Build the linear-regression graph in its own tf.Graph so its nodes stay
# separate from the ones created in the default graph earlier on.
g = tf.Graph()
with g.as_default():
    # Inputs are fed at run time; no static shape is declared for them.
    X = tf.placeholder(tf.float32, name='X')
    y = tf.placeholder(tf.float32, name='y')
    feature_count = Xnp.shape[-1]
    # One weight per feature plus a scalar bias, all starting at zero.
    W = tf.Variable(tf.zeros([feature_count], dtype=tf.float32))
    b = tf.Variable(tf.zeros([], dtype=tf.float32))
    print(y)
    dummy = y**2  # scratch op from experimenting with the placeholder
    #w = tf.Variable(X.get_shape)

Tensor("y:0", dtype=float32)


In [33]:
# Sanity-check the graph: initialize the variables, then read back the fed
# targets and the initial weights and bias.
with tf.Session(graph=g) as sess:
    sess.run(tf.global_variables_initializer())
    inputs = {X: Xnp, y: ynp}
    print(y.eval(feed_dict=inputs))  # echoes the fed ynp values
    print(sess.run(W, feed_dict=inputs))
    print(sess.run(b, feed_dict=inputs))
    

[ 399900.  329900.  369000.  232000.  539900.  299900.  314900.  198999.
  212000.  242500.  239999.  347000.  329999.  699900.  259900.  449900.
  299900.  199900.  499998.  599000.  252900.  255000.  242900.  259900.
  573900.  249900.  464500.  469000.  475000.  299900.  349900.  169900.
  314900.  579900.  285900.  249900.  229900.  345000.  549000.  287000.
  368500.  329900.  314000.  299000.  179900.  299900.  239500.]
[ 0.  0.]
0.0


In [21]:
# I tried splitting these into multiple cells, and it didn't seem to work.
# So I guess no multicell try finally's to close tf.Session's, etc.
# Demonstration of the order in which try / except / finally blocks run.
try:
    print('Before exception')
    raise Exception('test')
    print('After exception')  # never reached: the raise above leaves the try body
except Exception as E:
    print('In Exception %s' % E)
finally:
    print('In finally')  # runs after the handler, exception or not

Before exception
In Exception test
In finally


## 3.2. Gradient Descent - Choosing Learning Rate $\alpha$

You want a learning rate small enough to converge, but not so small that it takes forever to converge.  It can be helpful to graph cost $J(\theta)$ for each iteration of gradient descent.

In [None]:
alpha = 0.1
num_iters = 400
# X and y currently name the TensorFlow placeholders from the graph cell, and
# the NumPy routines below cannot run on those.  Rebind them to NumPy arrays:
# the normalized features with a leading column of ones for the intercept
# term, and the raw targets.
X = np.hstack((np.ones((m, 1)), Xnp))
y = ynp
theta = np.zeros(X.shape[-1])
theta

In [None]:
# Fit the multivariate model; J_history records the cost after every iteration.
theta_trained, J_history = gradientDescent(X, y, theta, alpha, num_iters)

In [None]:
# Display the fitted parameters.
theta_trained

### 3.2.1. Optional (ungraded) exercise: Selecting learning rates

Try out different learning rates $\alpha$, (e.g. 0.3, 0.1, 0.03, 0.01), looking for a rate that converges quickly.  Plot the cost function $J(\theta)$ convergence over the first 50 iterations.

In [None]:
# Cost per iteration on a log scale for the most recent training run.
plt.yscale('log')
plt.plot(range(len(J_history)), J_history)
plt.xlabel('Iteration')
plt.ylabel(r'Cost $J(\theta)$');

In [None]:
# Trial run with a smaller learning rate.  Only the cost curve matters here,
# so the fitted parameters are discarded: overwriting theta_trained with a
# 50-iteration fit would feed a poorly converged model into the prediction
# cells further down.
alpha = 0.03
num_iters = 50
_, J_history = gradientDescent(X, y, theta, alpha, num_iters)
plt.yscale('log')
plt.plot(range(len(J_history)), J_history)
plt.xlabel('Iteration')
plt.ylabel(r'Cost $J(\theta)$')
plt.title('alpha = %g' % alpha);

In [None]:
# Trial run with a larger learning rate.  Only the cost curve matters here,
# so the fitted parameters are discarded: overwriting theta_trained with a
# 50-iteration fit would change the model used by the prediction cells
# further down.
alpha = 0.3
num_iters = 50
_, J_history = gradientDescent(X, y, theta, alpha, num_iters)
plt.yscale('log')
plt.plot(range(len(J_history)), J_history)
plt.xlabel('Iteration')
plt.ylabel(r'Cost $J(\theta)$')
plt.title('alpha = %g' % alpha);

Use the trained value of $\theta$ to predict the price of a house with 1650 square feet and 3 bedrooms.  (Don't forget to normalize your features when you make this prediction.)

In [None]:
# Query point: 1650 square feet, 3 bedrooms.  Scale it with the training-set
# mu and sigma, then prepend the constant 1 for the intercept term.
Xpred = np.hstack(([1], (np.array([1650, 3]) - mu) / sigma))
Xpred

In [None]:
# Parameters that the prediction below will use.
theta_trained

In [None]:
# Predicted price for the query point.  When both operands are 1-D arrays
# (theta built from np.zeros(n), Xpred from np.hstack of 1-D pieces), `@` is
# a plain inner product, so no transpose is needed.
ypred = theta_trained @ Xpred
ypred

## 3.3. Normal Equations

Gradient descent is applicable to many other situations.  For linear regression, however, there is a way to solve for the optimal parameters directly, without feature scaling or iterating.  The drawback is that this normal equation can be slow if the number of features $n$ is very large, e.g. $n > 10{,}000$: it scales approximately as $O(n^3)$.

$$\theta = (X^\top X)^{-1} X^\top \vec{y}$$

You may want to use a pseudo-inverse function (`np.linalg.pinv()`) instead of a normal inverse, just in case $X^\top X$ is non-invertible.

In [None]:
def normalEqn(X, y):
    """Solve linear least squares in closed form with the normal equations.

    Evaluates pinv(X^T X) X^T y, using the pseudo-inverse so that a
    non-invertible X^T X still yields a solution.

    Parameters
    ----------
    X : array, shape (m, n)
        Design matrix.
    y : array, shape (m,) or (m, 1)
        Target values.

    Returns
    -------
    Array of fitted parameters.
    """
    X_t = X.T
    gram_pinv = np.linalg.pinv(X_t @ X)
    return gram_pinv @ X_t @ y

In [None]:
# Closed-form fit on the same X and y.  Note that this overwrites the
# theta_trained produced by gradient descent.
theta_trained = normalEqn(X, y)
theta_trained