In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing
from IPython.display import clear_output, Image, display, HTML

###### Do not modify here ###### 
def strip_consts(graph_def, max_const_size=32):
    """Strip large constant values from graph_def."""
    strip_def = tf.GraphDef()
    for n0 in graph_def.node:
        n = strip_def.node.add() 
        n.MergeFrom(n0)
        if n.op == 'Const':
            tensor = n.attr['value'].tensor
            size = len(tensor.tensor_content)
            if size > max_const_size:
                tensor.tensor_content = "<stripped %d bytes>"%size
    return strip_def

def show_graph(graph_def, max_const_size=32):
    """Visualize TensorFlow graph."""
    if hasattr(graph_def, 'as_graph_def'):
        graph_def = graph_def.as_graph_def()
    strip_def = graph_def
    #strip_def = strip_consts(graph_def, max_const_size=max_const_size)
    code = """
        <script>
          function load() {{
            document.getElementById("{id}").pbtxt = {data};
          }}
        </script>
        <link rel="import" href="https://tensorboard.appspot.com/tf-graph-basic.build.html" onload=load()>
        <div style="height:600px">
          <tf-graph-basic id="{id}"></tf-graph-basic>
        </div>
    """.format(data=repr(str(strip_def)), id='graph'+str(np.random.rand()))

    iframe = """
        <iframe seamless style="width:1200px;height:620px;border:0" srcdoc="{}"></iframe>
    """.format(code.replace('"', '&quot;'))
    display(HTML(iframe))
###### Do not modify  here ######

###### Implement Data Preprocess here ######
housing = fetch_california_housing()
# Show description/ statisics about the dataset
print(housing.DESCR)

California housing dataset.

The original database is available from StatLib

    http://lib.stat.cmu.edu/

The data contains 20,640 observations on 9 variables.

This dataset contains the average house value as target variable
and the following input variables (features): average income,
housing average age, average rooms, average bedrooms, population,
average occupation, latitude, and longitude in that order.

References
----------

Pace, R. Kelley and Ronald Barry, Sparse Spatial Autoregressions,
Statistics and Probability Letters, 33 (1997) 291-297.




## Data Pre-processing

In [2]:
# Define cut point
cut = int(housing.data.shape[0]*0.9)
# Convert to column vector format, which is (n,1) not (n,)
housing.target = housing.target.reshape(housing.target.shape[0],1)
# Split dataset
x_train, x_test = housing.data[:cut], housing.data[cut:]
y_train, y_test = housing.target[:cut], housing.target[cut:]

print("\nShape of training set:", x_train.shape)
print("Shape of testing set:", x_test.shape)
print("Shape of training label:", y_train.shape)
print("Shape of testing label:", y_test.shape)


Shape of training set: (18576, 8)
Shape of testing set: (2064, 8)
Shape of training label: (18576, 1)
Shape of testing label: (2064, 1)


## Standardization & Include Bias to Weight Matrix

(Useless in this case, it's linear transformation which can be solve in coeifficient calculation step)

In [3]:
# x_train_ = np.concatenate((x_train,np.ones([x_train.shape[0],1])),axis=1)
# x_test_  = np.concatenate((x_test ,np.ones([ x_test.shape[0],1])),axis=1)
x_train_ = np.concatenate((np.divide(x_train - np.mean(x_train, axis=0, keepdims=True), np.std(x_train, axis=0, keepdims=True)),np.ones([x_train.shape[0],1])),axis=1)
x_test_  = np.concatenate((np.divide(x_test  - np.mean(x_train, axis=0, keepdims=True), np.std(x_train, axis=0, keepdims=True)),np.ones([ x_test.shape[0],1])),axis=1)

## Define Graph

In [4]:
# Clear graph
tf.reset_default_graph()

# Define variables to take input feature x, label y
x = tf.placeholder(tf.float32, shape = [None, x_train_.shape[1]], name="Input")
y = tf.placeholder(tf.float32, shape = [None, 1], name="Y")
W = tf.placeholder(tf.float32, shape = [x_train_.shape[1], 1], name="W")

# Calculate W
XT = tf.matrix_transpose(x, name="X_Transpose") # calculate transpose x
W_train = tf.matmul(tf.matmul(tf.matrix_inverse(tf.matmul(XT, x, name="XT_X") , name="Inverse_XT_X"),XT, name="Multiple_XT"),y, name="Multiple_y") # calculate w by formula

# Systems of linear equation
linear_regression = tf.matmul(x, W, name="Linear_Regression")

# Calculate the mean of error rate
mean_error_rate = tf.reduce_mean(tf.divide(tf.abs(tf.subtract(y,linear_regression, name="Predict_Difference")),y,name="Error_Rate"), name="Mean_Error_Rate")


### Graph Explain and Result

To get the weight $W$ for linear equation 

$\hat y = XW + b$

We can utilizing following equation to solve it

$W=(X^{(train)\top} X^{(train)})^{-1} X^{(train)\top} y^{(train)}$

1. We set the placeholder to take the input feature as variable x and label as variable y, which corresponding to **Input** and **Y** in the graph below. Note that the x is original feature concatenate with 1 which represented the Bias $b$.

2. To calculate the Weight $W$,

    a. We calculate the element transpose X as variable XT which is **Inverse_XT_X** in graph.
    
    b. Calculate though the equation $W=(X^{(train)\top} X^{(train)})^{-1} X^{(train)\top} y^{(train)}$ to get W.
    
    c. By this step, we already get W for this systems of linear equation. Therefore, we are able to calculate the predict y via $XW$ as **Linear_Regression** in the graph.
    
    d. Finally, we calculte the mean error rate via $ \frac{1}{m} \sum \frac{\lvert(y -\hat y) \rvert}{y}$. which is the right-top part in the graph. Calculate the **Predict_Difference** via $y - \hat y$, take absolute value (**Abs**), calculate **Error_Rate** ($y$ as denominator), then take the mean of error rate (**Mean_Error_Rate**).

## Before Pre-processing

In [5]:
init_w = np.ones((9,1),dtype=np.float64)
with tf.Session() as sess:
    ###### Start TF session ######
    W_t = sess.run(W_train, {x: x_train_, y: y_train, W:init_w}) # Calculate W
#     print(W_t)
    print("Training Error Rate: ", sess.run(mean_error_rate, {x: x_train_, y: y_train, W:W_t}))
    print(" Testing Error Rate: ", sess.run(mean_error_rate, {x: x_test_, y: y_test, W:W_t}))
    
    show_graph(tf.get_default_graph().as_graph_def())

Training Error Rate:  0.316861
 Testing Error Rate:  0.344206


## Feature Engineering

In [6]:
from geopy.distance import vincenty

In [7]:
x_train_pd = pd.DataFrame(x_train, columns=housing.feature_names)
x_test_pd = pd.DataFrame(x_test, columns=housing.feature_names)

In [8]:
x_train_data = pd.DataFrame(x_train_pd.values, columns=housing.feature_names)
x_test_data = pd.DataFrame(x_test_pd.values, columns=housing.feature_names)

In [9]:
LA_coords = (36.778259, -119.417931)
SF_coords = (37.773972, -122.431297)

In [10]:
# Calculate the distance between the target and Los Angels
x_train_data['Distance_LA'] = x_train_data.apply(lambda x: vincenty((x['Latitude'], x['Longitude']), LA_coords).miles, axis=1)
x_test_data['Distance_LA'] = x_test_data.apply(lambda x: vincenty((x['Latitude'], x['Longitude']), LA_coords).miles, axis=1)

In [11]:
# Calculate the distance between the target and San Francisco
x_train_data['Distance_SF'] = x_train_data.apply(lambda x: vincenty((x['Latitude'], x['Longitude']), SF_coords).miles, axis=1)
x_test_data['Distance_SF'] = x_test_data.apply(lambda x: vincenty((x['Latitude'], x['Longitude']), SF_coords).miles, axis=1)
print(x_train_data.shape)
print(x_test_data.shape)

(18576, 10)
(2064, 10)


# Non-linear scaler + Polynominal (Final Result)

Doesn't need to adding the bias, cause polynominal feature is already include it.
Because Polynominal Feature will increase feature number, thus we need to modify the dimension in tensorflow graph

In [12]:
from sklearn.preprocessing import QuantileTransformer, PolynomialFeatures

In [13]:
normalizer = QuantileTransformer(n_quantiles=2)
x_train__ = normalizer.fit_transform(x_train_data)
x_test__ = normalizer.transform(x_test_data)

poly = PolynomialFeatures(2)
x_train__ = poly.fit_transform(x_train__)
x_test__ = poly.transform(x_test__)

# Clear graph
tf.reset_default_graph()

# Define variables to take input feature x, label y
x = tf.placeholder(tf.float64, shape = [None, x_train__.shape[1]], name="Input")
y = tf.placeholder(tf.float64, shape = [None, 1], name="Y")
W = tf.placeholder(tf.float64, shape = [x_train__.shape[1], 1], name="W")

# Calculate W
XT = tf.matrix_transpose(x, name="X_Transpose") # calculate transpose x
W_train = tf.matmul(tf.matmul(tf.matrix_inverse(tf.matmul(XT, x, name="XT_X") , name="Inverse_XT_X"),XT, name="Multiple_XT"),y, name="Multiple_y") # calculate w by formula

# Systems of linear equation
linear_regression = tf.matmul(x, W, name="Linear_Regression")

# Calculate the mean of error rate
mean_error_rate = tf.reduce_mean(tf.divide(tf.abs(tf.subtract(y,linear_regression, name="Predict_Difference")),y,name="Error_Rate"), name="Mean_Error_Rate")

init_w = np.ones((x_train__.shape[1],1),dtype=np.float32)
with tf.Session() as sess:
    
    W_t = sess.run(W_train, {x: x_train__, y: y_train, W:init_w}) # Calculate W
    
    print("Train Error Rate: ", sess.run(mean_error_rate, {x: x_train__, y: y_train, W:W_t}))
    print("Testing Error Rate: ", sess.run(mean_error_rate, {x: x_test__, y: y_test, W:W_t}))

Train Error Rate:  0.253196666012
Testing Error Rate:  0.284491391691
