# Section 6 Lecture 31: TensorFlow Regression Example

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import tensorflow as tf

In [None]:
# We want a big data set this time: one million evenly spaced x values in [0, 10].
x_data = np.linspace(0.0, 10.0, 1000000)
# Draw the noise from a dedicated, seeded generator so that Restart & Run All
# reproduces the same data set. The seed 101 matches the random_state used
# for the train/test split later in this notebook. The global NumPy random
# state is deliberately left untouched.
noise = np.random.RandomState(101).randn(len(x_data))

In [None]:
#noise
#x_data

$y = mx + b$

$m = 0.5,\quad b = 5$

In [None]:
# Ground truth: a line with slope 0.5 and intercept 5, plus the noise so the
# points do not sit on a perfect line.
y_true = 0.5 * x_data + 5 + noise

In [None]:
#Use pandas to make charts and other ops easier

In [None]:
# Wrap each array in its own single-column DataFrame.
x_df = pd.DataFrame({'X Data': x_data})
y_df = pd.DataFrame({'Y': y_true})

In [None]:
# Join the two frames side by side (column-wise) rather than stacking them
# on top of each other like "pancakes".
my_data = pd.concat(objs=[x_df, y_df], axis='columns')

In [None]:
# my_data.plot() # too much data: plotting all of the points at once may crash the kernel

In [None]:
# Scatter-plot a random sample of 250 rows instead of the whole data set.
my_data.sample(n=250).plot.scatter(x='X Data', y='Y')

In [None]:
# ^^^ The trend is linear, but the points do not form a straight line because we added noise.
# Can we get TF to find the line here? We know the intercept should be ~5
# and the slope should be ~0.5.
# But we can't feed 1M points to TF at once, so we have to feed it in batches.

Steps to using TF:
* Define constants
* Define variables
* Define placeholders
* Define model
* Define graph
* Define loss function
* Define optimizer

In [None]:
# Number of (x, y) points fed to the graph in each training step.
batch_size = 8 # grab 8 pts at a time, choosing it is an art

In [None]:
# Draw two standard-normal samples. Presumably this is where the hard-coded
# starting values for m and b in the next cell came from -- TODO confirm.
np.random.randn(2)

In [None]:
# Trainable slope (m) and intercept (b), starting from arbitrary values that
# are far from the true 0.5 and 5.
m = tf.Variable(initial_value=-0.00361972)
b = tf.Variable(initial_value=-1.4707748)

In [None]:
# Placeholders for one batch of inputs and targets; each holds exactly
# batch_size float32 values and is filled through the feed dictionary.
xph = tf.placeholder(dtype=tf.float32, shape=[batch_size])
yph = tf.placeholder(dtype=tf.float32, shape=[batch_size])

In [None]:
# The model is the line y = m*x + b, evaluated on the current batch.
y_model = tf.add(tf.multiply(m, xph), b)

In [None]:
# Loss function: the sum of squared residuals over the batch.
# tf.square is used instead of `** 2` for consistency with the documentation.
residuals = yph - y_model
error = tf.reduce_sum(tf.square(residuals))

In [None]:
# Optimizer and training op: plain gradient descent with a learning rate of
# 0.001, minimizing the squared-error loss defined above.
optimizer = tf.train.GradientDescentOptimizer(0.001)
train = optimizer.minimize(loss=error)

In [None]:
# Op that initializes the graph's global variables (here m and b) when it is
# run inside a session.
init = tf.global_variables_initializer()

In [None]:
# Test passing a sequence of indexes (rather than a scalar) as an index:
# NumPy returns the element at each listed position. The training loop
# relies on this to pull a random batch out of x_data and y_true.
multi_index = range(0,100,7)
x_data[multi_index]
# it works!

In [None]:
# History of the fit: one (slope, intercept, step) tuple per training step.
val = []
with tf.Session() as sess:
    # Run the global initializer so m and b hold their starting values.
    sess.run(init)
    # Run "some" batches: 1,000 batches of batch_size points each.
    # 8,000 samples _should_ be enough.
    batches = 1000
    for step in range(batches):
        # Pick batch_size random positions in the data set (with replacement).
        batch_idx = np.random.randint(len(x_data), size=batch_size)
        # Feed the selected points into the graph's placeholders.
        batch_feed = {xph: x_data[batch_idx], yph: y_true[batch_idx]}
        # One gradient-descent update on this batch.
        sess.run(train, feed_dict=batch_feed)
        # Read back the updated parameters and record them with the step number.
        model_m, model_b = sess.run([m, b])
        val.append((model_m, model_b, step))

    print(f'len {len(val)} 0={val[0]} last={val[-1]}')

In [None]:
from mpl_toolkits.mplot3d import Axes3D

In [None]:
# Plot the path the optimizer took through (slope, intercept) space, with the
# training step on the z axis.
fig = plt.figure(figsize=(10,10))
ax = fig.add_subplot(111, projection='3d')
# Transpose the list of (slope, intercept, step) tuples into three sequences.
val_x, val_y, val_z = list(zip(*val))
# Draw through the 3D axes' own methods rather than the pyplot wrappers.
ax.plot(val_x, val_y, val_z, marker='.')
# pyplot.scatter(x, y, s, ...) treats its third positional argument as the
# marker *size*, so calling it here would draw the final point at z=0 with a
# size equal to the step number. Axes3D.scatter takes z as its third
# argument, which puts the red '+' on the last point of the path.
ax.scatter(val_x[-1], val_y[-1], val_z[-1], c='r', marker='+')
print(val_x[-1],val_y[-1],len(val[:10]))
ax.set_xlabel('slope')
ax.set_ylabel('intercept')
ax.set_zlabel('time')
# Fit each axis tightly to the range covered by the recorded history.
ax.set_xlim(min(val_x), max(val_x))
ax.set_ylim(min(val_y), max(val_y))
ax.set_zlim(min(val_z), max(val_z))

In [None]:
# The data were generated with a true slope of 0.5 and then perturbed with noise, so m should be close to 0.5
# Likewise, the true intercept is 5, so b should end up being close to 5
print(f'm={model_m} b={model_b}')

In [None]:
# Visualize the fit: a sample of the data, ten intermediate fits taken at
# evenly spaced training steps, and the final fitted line in red.
y_hat = model_m * x_data + model_b
fit_ax = my_data.sample(250).plot(kind='scatter', x='X Data', y='Y')
snapshot_idx = np.linspace(0, len(val) - 1, 10, dtype=int)
my_val_samples = np.array(val)[snapshot_idx]
for slope, intercept, step in my_val_samples:
    # The red channel falls from 1 towards 0 as the step number grows, so
    # early and late fits are drawn in different shades.
    fade = 1 - step / len(val)
    fit_ax.plot(x_data, x_data * slope + intercept, color=(fade, .8, .8))
fit_ax.plot(x_data, y_hat, 'r')
fit_ax.set_ylim(0, my_data['Y'].max())

# **Section 6, Lecture 32: Using the Estimator API (and Train/Test Fit)**

Here we use TF's Estimator API, but there are other high-level APIs, such as Keras and Layers; we will look at them at the end of the course.

# TF Estimator

In [None]:
# We have a single feature column, 'x', and it happens to be numeric.
x_feature = tf.feature_column.numeric_column(key='x', shape=[1])
feat_cols = [x_feature]

In [None]:
# Now set up the estimator (ignore any errors/warnings it prints).
# It plays a role similar to a scikit-learn estimator.
# It is built from the single numeric feature column defined in feat_cols.
estimator = tf.estimator.LinearRegressor(feature_columns=feat_cols)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the points for evaluation; the fixed random_state makes
# the split repeatable.
x_train, x_eval, y_train, y_eval = train_test_split(
    x_data,
    y_true,
    test_size=0.3,
    random_state=101,
)

In [None]:
# Sanity-check the size of the training split.
print(x_train.shape) # 70% of 1 mil == 700k

In [None]:
# Sanity-check the size of the evaluation split.
x_eval.shape # same, 30% of 1mil == 300k

In [None]:
# Input function: it acts like the feed dictionary and the batch-size
# indicator all at once. It can take in numpy *and* pandas.

# "x" is a dictionary, so its key must match the key used in feat_cols.
input_func = tf.estimator.inputs.numpy_input_fn(
    x={'x': x_train},  # training features
    y=y_train,         # training targets
    batch_size=8,
    num_epochs=None,
    shuffle=True,
)

In [None]:
# Input function over the *training* split, used for computing metrics.
# shuffle=False because this is used for evaluation, to be compared against
# the same metrics on the held-out split.
train_input_func = tf.estimator.inputs.numpy_input_fn(
    x={'x': x_train},
    y=y_train,
    batch_size=8,
    num_epochs=1000,
    shuffle=False,
)

In [None]:
# Input function over the held-out *evaluation* split.
# shuffle=False because this is used for evaluation, not for training.
eval_input_func = tf.estimator.inputs.numpy_input_fn(
    x={'x': x_eval},
    y=y_eval,
    batch_size=8,
    num_epochs=1000,
    shuffle=False,
)

### Train estimator

In [None]:
# Train for 1000 steps, drawing batches from input_func.
estimator.train(input_fn=input_func, steps=1000)

### Now lets get estimator metrics

In [None]:
# Metrics on the training split (1000 evaluation steps), to be compared with
# the metrics on the held-out split.
train_metrics = estimator.evaluate(input_fn=train_input_func, steps=1000)

In [None]:
# Metrics on the held-out evaluation split, using the same number of steps.
eval_metrics = estimator.evaluate(input_fn=eval_input_func,steps=1000)

### now we compare results

In [None]:
# Show the training-set metrics under a heading.
print('TRAINING DATA METRICS', train_metrics, sep='\n')

In [None]:
# now compare to test set metrics

In [None]:
# Show the evaluation-set metrics under a heading.
print('EVAL METRICS', eval_metrics, sep='\n')

Check for overfitting by comparing the training loss to the test loss: if the test loss is much larger, the model is overfit.
You should expect the test loss to be somewhat higher, but the two should be close.

## So how do we predict new values??

In [None]:
# Ten evenly spaced x values in [0, 10] to predict on.
brand_new_data = np.linspace(start=0, stop=10, num=10)

In [None]:
# Prediction input: features only (no targets), kept in their original order.
input_fn_predict = tf.estimator.inputs.numpy_input_fn(
    x={'x': brand_new_data},
    shuffle=False,
)

In [None]:
# Calling predict does not return the values directly; display what it
# returns before consuming it in the next cell.
estimator.predict(input_fn=input_fn_predict)

_Note: the output is a generator object, so we can cast it to a list or iterate through its values._

In [None]:
# Consume the prediction generator and keep the 'predictions' entry of each
# yielded result.
predictions = []
for pred in estimator.predict(input_fn=input_fn_predict):
    predictions.append(pred['predictions'])

In [None]:
# Display the collected predictions, one entry per value in brand_new_data.
predictions

### now plot and see how we did

In [None]:
# Overlay the predictions (red stars) on a random sample of the original data.
sample_ax = my_data.sample(n=250).plot.scatter(x='X Data', y='Y')
sample_ax.plot(brand_new_data, predictions, 'r*')
# Note how close the plotted predicted points are to the existing data