In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from IPython.display import display

## Dataset 

California Housing Data

This data set contains information about all the block groups in California from the 1990 Census. In this sample a block group on average includes 1425.5 individuals living in a geographically compact area. 

The task is to aproximate the median house value of each block from the values of the rest of the variables. 

 It has been obtained from the LIACC repository. The original page where the data set can be found is: http://www.liaad.up.pt/~ltorgo/Regression/DataSets.html.
 

In [2]:
data = pd.read_csv('cal_housing_clean.csv')

display(data.info())
display(data.head())
display(data.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 7 columns):
housingMedianAge    20640 non-null float64
totalRooms          20640 non-null float64
totalBedrooms       20640 non-null float64
population          20640 non-null float64
households          20640 non-null float64
medianIncome        20640 non-null float64
medianHouseValue    20640 non-null float64
dtypes: float64(7)
memory usage: 1.1 MB


None

Unnamed: 0,housingMedianAge,totalRooms,totalBedrooms,population,households,medianIncome,medianHouseValue
0,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0
1,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0
2,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0
3,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0
4,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0


Unnamed: 0,housingMedianAge,totalRooms,totalBedrooms,population,households,medianIncome,medianHouseValue
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,28.639486,2635.763081,537.898014,1425.476744,499.53968,3.870671,206855.816909
std,12.585558,2181.615252,421.247906,1132.462122,382.329753,1.899822,115395.615874
min,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,18.0,1447.75,295.0,787.0,280.0,2.5634,119600.0
50%,29.0,2127.0,435.0,1166.0,409.0,3.5348,179700.0
75%,37.0,3148.0,647.0,1725.0,605.0,4.74325,264725.0
max,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


## Train/Test Split

In [3]:
X = data.drop(['medianHouseValue'],axis=1)
y = data['medianHouseValue']

In [4]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,random_state=101)

display(X_train.shape)
display(y_train.shape)
display(X_test.shape)
display(y_test.shape)

(14448, 6)

(14448,)

(6192, 6)

(6192,)

## Feature Engineering

In [5]:
# Min Max Scaling
from sklearn.preprocessing import MinMaxScaler

mms = MinMaxScaler()
mms.fit(X_train)

X_train_mms = pd.DataFrame(
    data=mms.transform(X_train),
    columns=X_train.columns,
    index=X_train.index)

X_test_mms = pd.DataFrame(
    data=mms.transform(X_test),
    columns=X_test.columns,
    index=X_test.index)

## Model

In [6]:
import tensorflow as tf

In [7]:
# Features
age = tf.feature_column.numeric_column('housingMedianAge')
rooms = tf.feature_column.numeric_column('totalRooms')
bedrooms = tf.feature_column.numeric_column('totalBedrooms')
pop = tf.feature_column.numeric_column('population')
households = tf.feature_column.numeric_column('households')
income = tf.feature_column.numeric_column('medianIncome')

feat_cols = [age,rooms,bedrooms,pop,households,income]

In [8]:
# Estimator model: DNNRegressor
model = tf.estimator.DNNRegressor(
    hidden_units=[6,6,6],
    feature_columns=feat_cols)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmp_dmaif3a', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fd4853a7160>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [9]:
# Train Input Function
train_input_func = tf.estimator.inputs.pandas_input_fn(
    x=X_train_mms, y=y_train,
    batch_size=10, num_epochs=1000, shuffle=True)

In [10]:
# Verbose = False
tf.logging.set_verbosity(tf.logging.WARN)

# Train Estimator
model.train(
    input_fn=train_input_func,
    steps=25000, saving_listeners=None)

<tensorflow.python.estimator.canned.dnn.DNNRegressor at 0x7fd44d87a7f0>

## Evaluate Train

In [11]:
train_metrics = model.evaluate(
    input_fn=train_input_func,
    steps=1000)

print("train metrics: {}".format(train_metrics))

train metrics: {'average_loss': 9410256000.0, 'loss': 94102560000.0, 'global_step': 25000}


## Evaluate Test

In [12]:
# Test Input Function
test_input_func = tf.estimator.inputs.pandas_input_fn(
      x=X_test_mms, y=y_test,
      batch_size=10, num_epochs=1, shuffle=False)

In [13]:
test_metrics = model.evaluate(
    input_fn=test_input_func,
    steps=1000)

print("test metrics: {}".format(test_metrics))

test metrics: {'average_loss': 9628412000.0, 'loss': 96159880000.0, 'global_step': 25000}


## Predict

In [14]:
# Predict Input Function
predict_input_func = tf.estimator.inputs.pandas_input_fn(
      x=X_test_mms,
      batch_size=10, num_epochs=1, shuffle=False)

In [15]:
y_test_pred = model.predict(predict_input_func)

y_test_pred_list = []
for pred in list(y_test_pred):
    y_test_pred_list.append(pred['predictions'])

In [16]:
# MSE
from sklearn.metrics import mean_squared_error

mean_squared_error(y_test, y_test_pred_list)**0.5

98124.4604703076