In [1]:
import tensorflow as tf
import numpy as np 
from sklearn.datasets import fetch_california_housing
from IPython.display import clear_output, Image, display, HTML

###### Do not modify here ###### 
def strip_consts(graph_def, max_const_size=32):
    """Strip large constant values from graph_def."""
    strip_def = tf.GraphDef()
    for n0 in graph_def.node:
        n = strip_def.node.add() 
        n.MergeFrom(n0)
        if n.op == 'Const':
            tensor = n.attr['value'].tensor
            size = len(tensor.tensor_content)
            if size > max_const_size:
                tensor.tensor_content = "<stripped %d bytes>"%size
    return strip_def

def show_graph(graph_def, max_const_size=32):
    """Visualize TensorFlow graph."""
    if hasattr(graph_def, 'as_graph_def'):
        graph_def = graph_def.as_graph_def()
    strip_def = graph_def
    #strip_def = strip_consts(graph_def, max_const_size=max_const_size)
    code = """
        <script>
          function load() {{
            document.getElementById("{id}").pbtxt = {data};
          }}
        </script>
        <link rel="import" href="https://tensorboard.appspot.com/tf-graph-basic.build.html" onload=load()>
        <div style="height:600px">
          <tf-graph-basic id="{id}"></tf-graph-basic>
        </div>
    """.format(data=repr(str(strip_def)), id='graph'+str(np.random.rand()))

    iframe = """
        <iframe seamless style="width:1200px;height:620px;border:0" srcdoc="{}"></iframe>
    """.format(code.replace('"', '&quot;'))
    display(HTML(iframe))
###### Do not modify  here ######

###### Implement Data Preprocess here ######
housing = fetch_california_housing()
# Show description/ statisics about the dataset
print(housing.DESCR)
print('Features')
print('----------')
print(', '.join(housing.feature_names))
print("\nShape of dataset:", housing.data.shape)
print("Shape of label:", housing.target.shape)

California housing dataset.

The original database is available from StatLib

    http://lib.stat.cmu.edu/

The data contains 20,640 observations on 9 variables.

This dataset contains the average house value as target variable
and the following input variables (features): average income,
housing average age, average rooms, average bedrooms, population,
average occupation, latitude, and longitude in that order.

References
----------

Pace, R. Kelley and Ronald Barry, Sparse Spatial Autoregressions,
Statistics and Probability Letters, 33 (1997) 291-297.


Features
----------
MedInc, HouseAge, AveRooms, AveBedrms, Population, AveOccup, Latitude, Longitude

Shape of dataset: (20640, 8)
Shape of label: (20640,)


## Get training set and testing set

In [2]:
cut = int(housing.data.shape[0]*0.9)
x_train, x_test = housing.data[:cut], housing.data[cut:]
y_train, y_test = housing.target[:cut], housing.target[cut:]

print("\nShape of training set:", x_train.shape)
print("Shape of testing set:", x_test.shape)
print("Shape of training label:", y_train.shape)
print("Shape of testing label:", y_test.shape)


Shape of training set: (18576, 8)
Shape of testing set: (2064, 8)
Shape of training label: (18576,)
Shape of testing label: (2064,)


## Define Model Graph

In [79]:
# Clear graph
tf.reset_default_graph()

# Define variables to take input feature x, label y
x = tf.placeholder(tf.float32, shape = [None, x_train.shape[1]], name="Input")
y = tf.placeholder(tf.float32, name="Y")
# y = tf.placeholder(tf.float32, shape = [None, 1], name="Input")

# Define the weights for each element of input x, and random assign value as normal distribution
W = tf.Variable(tf.random_normal(shape = [x_train.shape[1], 1]), name="weights")
# W = tf.Variable(tf.zeros([8,1]), dtype=tf.float32)

# Define the bias for the linear function
b = tf.Variable([0.0], dtype=tf.float32, name="Bias")

# Connect each elements for model
linear_regression = tf.reshape(tf.matmul(x, W), [-1]) + b
# linear_regression = tf.matmul(x, W) + b

# Define cost/loss function
error_rate = tf.abs((y - linear_regression)/y , name="Absolute_value")
# error_rate = (y - linear_regression)/y 
loss = tf.reduce_sum(error_rate, name="Sum_Error")

# Define training optimizer
optimizer = tf.train.GradientDescentOptimizer(0.01)
train = optimizer.minimize(loss)

# Initialize Weights
init = tf.global_variables_initializer()

TypeError: Failed to convert object of type <class 'list'> to Tensor. Contents: [None, 1]. Consider casting elements to a supported type.

## Model Graph

In [34]:
###### Start TF session ######
with tf.Session() as sess:
    show_graph(tf.get_default_graph().as_graph_def())

## Directly Train
Let's try directly train model without data pre-processing

In [76]:
y_train_ = y_train.reshape((y_train.shape[0], 1))
y_test_ = y_test.reshape((y_test.shape[0], 1))

In [78]:
###### Start TF session ######
with tf.Session() as sess:
    # training loop
    sess.run(init) # reset values to wrong
    for i in range(1000):
        sess.run(train, {x: x_train, y: y_train_})
        if i % 100 == 0:
            print("\nRound ",i+1)
            l = sess.run(loss, {x: x_train, y: y_train_})
            print("Training - loss: %.4d"%(l/ y_train_.shape[0]))
            l = sess.run(loss, {x: x_test, y: y_test_})
            print("Testing  - loss: %.4d"%(l/ y_test_.shape[0]))
            
    # evaluate training accuracy
    y_, e_, l = sess.run([y, error_rate, loss], {x: x_train, y: y_train_})
    print("Final Training - loss: %.4d"%(l/ y_train_.shape[0]))
    lr_1, e_1, l1 = sess.run([linear_regression, error_rate, loss], {x: x_test, y: y_test_})
    print("Final Testing  - loss: %.4d"%(l1/ y_test_.shape[0]))
    
#     show_graph(tf.get_default_graph().as_graph_def())


Round  1
Training - loss: 154735802
Testing  - loss: 183434041

Round  101
Training - loss: 154735787
Testing  - loss: 183434041

Round  201
Training - loss: 154735802
Testing  - loss: 183434041

Round  301
Training - loss: 154735802
Testing  - loss: 183434041

Round  401
Training - loss: 154735802
Testing  - loss: 183434041

Round  501
Training - loss: 154735802
Testing  - loss: 183434041

Round  601
Training - loss: 154735802
Testing  - loss: 183434041

Round  701
Training - loss: 154735787
Testing  - loss: 183434041

Round  801
Training - loss: 154735787
Testing  - loss: 183434041

Round  901
Training - loss: 154735802
Testing  - loss: 183434041
Final Training - loss: 0650
Final Testing  - loss: 0773


In [66]:
y_test

array([ 1.214,  1.904,  1.843, ...,  0.923,  0.847,  0.894])

In [67]:
lr_1

array([ 494130.96875,  558902.5625 ,  658362.6875 , ...,  390438.21875,
        305750.125  ,  511571.375  ], dtype=float32)

In [68]:
e_1

array([ 407026.15625,  293540.28125,  357222.375  , ...,  423008.96875,
        360979.09375,  572226.5    ], dtype=float32)

In [69]:
l1

8.31296e+08

## Train after pre-processing
### Pre-processing

In [14]:
import pandas as pd

In [15]:
x_train_pd = pd.DataFrame(x_train, columns=housing.feature_names)
x_train_pd.describe()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
count,18576.0,18576.0,18576.0,18576.0,18576.0,18576.0,18576.0,18576.0
mean,3.917221,29.118594,5.403187,1.095654,1433.185831,3.020782,35.430371,-119.409006
std,1.932617,12.65881,2.54363,0.485395,1142.294876,6.078652,2.061388,1.993468
min,0.4999,1.0,0.846154,0.375,3.0,0.692308,32.54,-124.35
25%,2.585725,19.0,4.39144,1.005917,793.0,2.421582,33.9,-121.54
50%,3.5719,30.0,5.188982,1.048474,1170.0,2.819232,34.16,-118.37
75%,4.8036,38.0,6.031433,1.098765,1727.0,3.296951,37.58,-117.95
max,15.0001,52.0,141.909091,34.066667,35682.0,599.714286,41.95,-114.31


### Standardization

In [8]:
# Training data
x_train_standard_pd = x_train_pd.sub(x_train_pd.mean(),axis=1).div(x_train_pd.std(),axis=1)
x_train_standard = x_train_standard_pd.values
x_train_standard_pd

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,2.280834,0.938588,0.621529,-0.148012,-0.972766,-0.076534,1.188340,-1.415119
1,2.268519,-0.641339,0.328251,-0.254995,0.847254,-0.149859,1.178637,-1.410103
2,1.728319,1.807548,1.134185,-0.045751,-0.820441,-0.035949,1.173786,-1.420135
3,0.893027,1.807548,0.162824,-0.046549,-0.766165,-0.077786,1.173786,-1.425152
4,-0.036749,1.807548,0.345438,-0.030022,-0.760037,-0.138076,1.173786,-1.425152
5,0.061874,1.807548,-0.252210,0.016426,-0.893102,-0.144915,1.173786,-1.425152
6,-0.133560,1.807548,-0.185279,-0.297267,-0.296934,-0.146805,1.168935,-1.425152
7,-0.412508,1.807548,-0.238109,-0.069696,-0.241782,-0.202763,1.168935,-1.425152
8,-0.950432,1.017584,-0.436018,0.045310,-0.198885,-0.163505,1.168935,-1.430168
9,-0.116951,1.807548,-0.170071,-0.217261,0.103138,-0.139589,1.168935,-1.425152


### Value change between standardization before and after

In [9]:
print(x_train[0])
print(x_train_standard[0])

[   8.3252       41.            6.98412698    1.02380952  322.
    2.55555556   37.88       -122.23      ]
[ 2.28083372  0.9385879   0.62152884 -0.14801185 -0.97276619 -0.07653445
  1.18833965 -1.415119  ]


In [12]:
# Testing data
x_test_pd = pd.DataFrame(x_test, columns=housing.feature_names)
x_test_standard = x_test_pd.sub(x_train_pd.mean(),axis=1).div(x_train_pd.std(),axis=1).values


### Training

In [60]:
###### Start TF session ######
with tf.Session() as sess:
    sess.run(init) # re-assign Weights
    # training loop
    for i in range(1000):
        sess.run(train, {x: x_train_standard, y: y_train})
        if i % 100 == 0:
            print("\n Round ",i+1)
            l = sess.run(loss, {x: x_train, y: y_train})
            print("Training - loss: %.4d"%(l/ y_train.shape[0]))
            l = sess.run(loss, {x: x_test, y: y_test})
            print("Testing  - loss: %.4d"%(l/ y_test.shape[0]))

    # evaluate training accuracy
    l = sess.run(loss, {x: x_train, y: y_train})
    print("Final Training - loss: %.4d"%(l/ y_train.shape[0]))
    lr_1, e_1, l1 = sess.run([linear_regression, error_rate, loss], {x: x_test, y: y_test})
    print("Final Testing  - loss: %.4d"%(l1/ y_test.shape[0]))


 Round  1
Training - loss: 22460
Testing  - loss: 27764

 Round  101
Training - loss: 4132
Testing  - loss: 4791

 Round  201
Training - loss: 4121
Testing  - loss: 4781

 Round  301
Training - loss: 4180
Testing  - loss: 4838

 Round  401
Training - loss: 4132
Testing  - loss: 4790

 Round  501
Training - loss: 4173
Testing  - loss: 4831

 Round  601
Training - loss: 4161
Testing  - loss: 4820

 Round  701
Training - loss: 4165
Testing  - loss: 4824

 Round  801
Training - loss: 4193
Testing  - loss: 4852

 Round  901
Training - loss: 4216
Testing  - loss: 4875
Final Training - loss: 2659
Final Testing  - loss: 3108


In [59]:
y_test

array([ 1.214,  1.904,  1.843, ...,  0.923,  0.847,  0.894])

In [56]:
lr_1

array([  761.96386719,   879.55963135,  1044.70043945, ...,   583.88165283,
         453.7093811 ,   790.68371582], dtype=float32)

In [57]:
e_1

array([ 626.64733887,  460.95358276,  565.84771729, ...,  631.59118652,
        534.6663208 ,  883.43371582], dtype=float32)

In [58]:
l1

1319083.5