In [1]:
import tensorflow as tf
import numpy as np 
from sklearn.datasets import fetch_california_housing
from IPython.display import clear_output, Image, display, HTML

###### Do not modify here ###### 
def strip_consts(graph_def, max_const_size=32):
    """Return a copy of ``graph_def`` with large constant payloads replaced.

    Every node is copied into a fresh ``tf.GraphDef``. For ``Const`` nodes
    whose ``tensor_content`` is longer than ``max_const_size`` bytes, the
    payload is swapped for a short ``<stripped N bytes>`` placeholder.

    Args:
        graph_def: Graph definition whose ``node`` entries are copied.
        max_const_size: Largest ``tensor_content`` length, in bytes, that is
            left untouched.

    Returns:
        A new ``tf.GraphDef``; the nodes of the input are copied, not edited.
    """
    strip_def = tf.GraphDef()
    for n0 in graph_def.node:
        n = strip_def.node.add()
        n.MergeFrom(n0)
        if n.op == 'Const':
            tensor = n.attr['value'].tensor
            size = len(tensor.tensor_content)
            if size > max_const_size:
                # tensor_content is a protobuf `bytes` field: on Python 3 it
                # rejects `str`, so the placeholder has to be encoded first.
                placeholder = "<stripped %d bytes>" % size
                tensor.tensor_content = placeholder.encode("utf-8")
    return strip_def

def show_graph(graph_def, max_const_size=32):
    """Render a TensorFlow graph inline in the notebook.

    Args:
        graph_def: Either a graph definition, or any object exposing
            ``as_graph_def()`` (it is converted first).
        max_const_size: Currently unused; constant stripping is not applied
            and the definition is embedded as-is.

    The graph's text form is injected into a ``<tf-graph-basic>`` element,
    which is wrapped in an iframe and shown through ``display(HTML(...))``.
    """
    # Accept graph-like objects as well as ready-made definitions.
    if hasattr(graph_def, 'as_graph_def'):
        graph_def = graph_def.as_graph_def()

    # Python-literal form of the text-format graph, usable as a JS string.
    pbtxt_literal = repr(str(graph_def))
    # Random suffix gives each rendered viewer its own element id.
    element_id = 'graph'+str(np.random.rand())

    viewer_template = """
        <script>
          function load() {{
            document.getElementById("{id}").pbtxt = {data};
          }}
        </script>
        <link rel="import" href="https://tensorboard.appspot.com/tf-graph-basic.build.html" onload=load()>
        <div style="height:600px">
          <tf-graph-basic id="{id}"></tf-graph-basic>
        </div>
    """
    viewer_html = viewer_template.format(data=pbtxt_literal, id=element_id)

    frame_template = """
        <iframe seamless style="width:1200px;height:620px;border:0" srcdoc="{}"></iframe>
    """
    # Double quotes must be escaped because the viewer sits inside srcdoc="...".
    escaped_viewer = viewer_html.replace('"', '&quot;')
    display(HTML(frame_template.format(escaped_viewer)))
###### Do not modify  here ######

###### Implement Data Preprocess here ######
housing = fetch_california_housing()

# Print the dataset's own description, then its feature names and array shapes
print(housing.DESCR)
print('Features')
print('----------')
feature_list = ', '.join(housing.feature_names)
print(feature_list)
data_shape, label_shape = housing.data.shape, housing.target.shape
print("\nShape of dataset:", data_shape)
print("Shape of label:", label_shape)

California housing dataset.

The original database is available from StatLib

    http://lib.stat.cmu.edu/

The data contains 20,640 observations on 9 variables.

This dataset contains the average house value as target variable
and the following input variables (features): average income,
housing average age, average rooms, average bedrooms, population,
average occupation, latitude, and longitude in that order.

References
----------

Pace, R. Kelley and Ronald Barry, Sparse Spatial Autoregressions,
Statistics and Probability Letters, 33 (1997) 291-297.


Features
----------
MedInc, HouseAge, AveRooms, AveBedrms, Population, AveOccup, Latitude, Longitude

Shape of dataset: (20640, 8)
Shape of label: (20640,)


## Get training set and testing set

In [2]:
# Index separating the first 90% of the rows (training) from the rest (testing)
cut = int(len(housing.data) * 0.9)
# Labels are needed as a column vector of shape (n, 1) rather than a flat (n,)
housing.target = housing.target.reshape(-1, 1)
# Plain positional split: rows keep the dataset's original order (no shuffling)
x_train, y_train = housing.data[:cut], housing.target[:cut]
x_test, y_test = housing.data[cut:], housing.target[cut:]

for caption, split_array in (
    ("\nShape of training set:", x_train),
    ("Shape of testing set:", x_test),
    ("Shape of training label:", y_train),
    ("Shape of testing label:", y_test),
):
    print(caption, split_array.shape)


Shape of training set: (18576, 8)
Shape of testing set: (2064, 8)
Shape of training label: (18576, 1)
Shape of testing label: (2064, 1)


## Define Model Graph

In [3]:
# Clear the default graph before defining the model
tf.reset_default_graph()

n_features = x_train.shape[1]

# Placeholders fed at run time: feature rows x and their labels y (one column)
x = tf.placeholder(tf.float64, shape=[None, n_features], name="Input")
y = tf.placeholder(tf.float64, shape=[None, 1], name="Ground_True")

# One weight per feature, initialized from a uniform distribution between -1 and 1
W = tf.Variable(
    tf.random_uniform(shape=[n_features, 1], minval=-1, maxval=1, dtype=tf.float64),
    name="Weights",
)

# Bias of the linear function, starting at zero
b = tf.Variable(tf.zeros(1, dtype=tf.float64), name="Bias")

# Model output: x W + b
linear_regression = tf.matmul(x, W) + b

# Loss: mean of the absolute relative error |(y - prediction) / y|
relative_error = (y - linear_regression) / y
error_rate = tf.abs(relative_error, name="Error_Rate")
loss = tf.reduce_mean(error_rate, name="Mean_Error")

# Gradient descent with a learning rate of 0.05
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.05)
train = optimizer.minimize(loss)


## Model Graph

In linear regression, we need to model a line following the equation: 

$\hat y = \sum_{i=1}^n w_i x_i + b$ 

or

$\hat y = W^T x + b$

Here $i$ indexes the features, and each feature $x_i$ in $x$ is given its own weight $w_i$.

We will walk through this graph from the bottom up:

- Linear Function

    1. In the bottom layer, we have the **Input** and **Weights** elements.

      a. **Input**, representing $x$, is the input node through which we feed in the features of the houses.
      
      b. **Weights**, representing $w$, holds the weight given to each feature $x_i$. To initialize the weights, we use the random_uniform method to draw a set of values ($-1 \leq w_i \leq 1$) from a uniform distribution.

    2. Perform the matrix multiplication with the tf.matmul function to calculate $W^T x$, which equals $\sum_{i=1}^n x_i w_i$. At this point, we have completed half of the equation.

    3. Next, we add the **Bias** $b$ to $W^T x$ to complete the equation $\hat y = W^T x + b$.


- **Error Rate** and **Loss** Calculation

    1. To calculate the error, we define the error function as $|(y - \hat y)/y|$. In the graph, $y$ is the **Ground_True** label; we subtract $\hat y$ from it and divide the result by $y$ (the node labeled **truediv**), then take the absolute value. This step gives us the **Error Rate**, which is labeled **Error_Rate** in the graph.

    2. To calculate the loss, labeled **Mean_Error** in the graph, we take the mean of all the **Error Rate** values using *tf.reduce_mean*.


- **GradientDescent** is the element that connects to the **Weights** and **Bias**, the parameters that need to be adjusted.


- **gradients** connects to several elements to calculate the gradients; with these, the network can perform backpropagation to adjust the **Weights** and **Bias**.

In [4]:
###### Show model graph ######
# Rendering only reads the graph definition, so no tf.Session is opened here.
show_graph(tf.get_default_graph().as_graph_def())

## Directly Train
Let's first try training the model directly, without any data pre-processing.

In [5]:
###### Start TF session ######
# Feeds for the raw (unscaled) training and testing splits
raw_train_feed = {x: x_train, y: y_train}
raw_test_feed = {x: x_test, y: y_test}

with tf.Session() as sess:
    # Give Weights and Bias their initial values for this session
    sess.run(tf.global_variables_initializer())

    # 1000 full-batch training steps; both losses are reported every 200 steps
    for i in range(1000):
        sess.run(train, raw_train_feed)
        if i % 200 != 0:
            continue
        print("Round ",i+1)
        l = sess.run(loss, raw_train_feed)
        print("Training - loss: %f"%(l))
        l = sess.run(loss, raw_test_feed)
        print("Testing  - loss: %f\n"%(l))

    # Final evaluation on both splits.
    # Note: y_ is the fed label placeholder echoed back, not the prediction.
    y_, e_, l = sess.run([y, error_rate, loss], raw_train_feed)
    print("Training - loss: %f"%(l))
    lr_1, e_1, l1 = sess.run([linear_regression, error_rate, loss], raw_test_feed)
    print("Testing  - loss: %f"%(l1))

Round  1
Training - loss: 41101.473097
Testing  - loss: 48728.478295

Round  201
Training - loss: 41007.510317
Testing  - loss: 48613.009103

Round  401
Training - loss: 41007.510317
Testing  - loss: 48613.009103

Round  601
Training - loss: 41007.510317
Testing  - loss: 48613.009103

Round  801
Training - loss: 41007.510317
Testing  - loss: 48613.009103

Training - loss: 642.051792
Testing  - loss: 761.141444


## Train after pre-processing
### Pre-processing

In [6]:
import pandas as pd

In [7]:
# Wrap the training matrix in a DataFrame whose columns carry the feature names
x_train_pd = pd.DataFrame(data=x_train, columns=housing.feature_names)
# Summary statistics for every feature column
x_train_pd.describe()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
count,18576.0,18576.0,18576.0,18576.0,18576.0,18576.0,18576.0,18576.0
mean,3.917221,29.118594,5.403187,1.095654,1433.185831,3.020782,35.430371,-119.409006
std,1.932617,12.65881,2.54363,0.485395,1142.294876,6.078652,2.061388,1.993468
min,0.4999,1.0,0.846154,0.375,3.0,0.692308,32.54,-124.35
25%,2.585725,19.0,4.39144,1.005917,793.0,2.421582,33.9,-121.54
50%,3.5719,30.0,5.188982,1.048474,1170.0,2.819232,34.16,-118.37
75%,4.8036,38.0,6.031433,1.098765,1727.0,3.296951,37.58,-117.95
max,15.0001,52.0,141.909091,34.066667,35682.0,599.714286,41.95,-114.31


### Standardization

In [8]:
# Training data: subtract each column's mean, then divide by its standard deviation
train_mean = x_train_pd.mean()
train_std = x_train_pd.std()
x_train_standard_pd = (x_train_pd - train_mean) / train_std
# Plain array form, used to feed the model
x_train_standard = x_train_standard_pd.values
x_train_standard_pd.head(10)

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,2.280834,0.938588,0.621529,-0.148012,-0.972766,-0.076534,1.18834,-1.415119
1,2.268519,-0.641339,0.328251,-0.254995,0.847254,-0.149859,1.178637,-1.410103
2,1.728319,1.807548,1.134185,-0.045751,-0.820441,-0.035949,1.173786,-1.420135
3,0.893027,1.807548,0.162824,-0.046549,-0.766165,-0.077786,1.173786,-1.425152
4,-0.036749,1.807548,0.345438,-0.030022,-0.760037,-0.138076,1.173786,-1.425152
5,0.061874,1.807548,-0.25221,0.016426,-0.893102,-0.144915,1.173786,-1.425152
6,-0.13356,1.807548,-0.185279,-0.297267,-0.296934,-0.146805,1.168935,-1.425152
7,-0.412508,1.807548,-0.238109,-0.069696,-0.241782,-0.202763,1.168935,-1.425152
8,-0.950432,1.017584,-0.436018,0.04531,-0.198885,-0.163505,1.168935,-1.430168
9,-0.116951,1.807548,-0.170071,-0.217261,0.103138,-0.139589,1.168935,-1.425152


### Values before and after standardization

In [9]:
# First training row: raw values first, then the standardized values
for first_row in (x_train[0], x_train_standard[0]):
    print(first_row)

[   8.3252       41.            6.98412698    1.02380952  322.
    2.55555556   37.88       -122.23      ]
[ 2.28083372  0.9385879   0.62152884 -0.14801185 -0.97276619 -0.07653445
  1.18833965 -1.415119  ]


In [10]:
# Testing data: scaled with the TRAINING mean and std, so both splits go
# through exactly the same transformation
x_test_pd = pd.DataFrame(data=x_test, columns=housing.feature_names)
x_test_centered = x_test_pd - x_train_pd.mean()
x_test_standard = (x_test_centered / x_train_pd.std()).values

### Training

In [15]:
###### Start TF session ######
# Feeds for the standardized training and testing splits
std_train_feed = {x: x_train_standard, y: y_train}
std_test_feed = {x: x_test_standard, y: y_test}

with tf.Session() as sess:
    # Re-assign initial values to Weights and Bias for this session
    sess.run(tf.global_variables_initializer())

    # 400 full-batch training steps; both losses are reported every 100 steps
    for i in range(400):
        sess.run(train, std_train_feed)
        if i % 100 != 0:
            continue
        print("Round ",i+1)
        l = sess.run(loss, std_train_feed)
        print("Training - loss: %f"%(l))
        l = sess.run(loss, std_test_feed)
        print("Testing  - loss: %f\n"%(l))

    # Final evaluation on both splits
    print("Final")
    lr_, e_, l = sess.run([linear_regression, error_rate, loss], std_train_feed)
    print("Training - loss: %f"%(l))
    lr_1, e_1, l1 = sess.run([linear_regression, error_rate, loss], std_test_feed)
    print("Testing  - loss: %f\n"%(l1))
    # Same metric as the graph's loss, recomputed in NumPy from the test predictions
    test_relative_error = np.abs((y_test - lr_1)/y_test)
    print("Model Error Rate %f"% np.mean(test_relative_error))

Round  1
Training - loss: 1.036947
Testing  - loss: 1.007736

Round  101
Training - loss: 0.373263
Testing  - loss: 0.342342

Round  201
Training - loss: 0.279289
Testing  - loss: 0.286243

Round  301
Training - loss: 0.269301
Testing  - loss: 0.275412

Final
Training - loss: 0.265120
Testing  - loss: 0.268059

Model Error Rate 0.268059
