In [1]:
import tensorflow as tf
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
from sklearn.preprocessing import StandardScaler

## Importing the data

In [3]:
data = pd.read_csv('cal_housing_clean.csv')
data.head()

Unnamed: 0,housingMedianAge,totalRooms,totalBedrooms,population,households,medianIncome,medianHouseValue
0,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0
1,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0
2,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0
3,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0
4,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 7 columns):
housingMedianAge    20640 non-null float64
totalRooms          20640 non-null float64
totalBedrooms       20640 non-null float64
population          20640 non-null float64
households          20640 non-null float64
medianIncome        20640 non-null float64
medianHouseValue    20640 non-null float64
dtypes: float64(7)
memory usage: 1.1 MB


In [5]:
data.isnull().any()

housingMedianAge    False
totalRooms          False
totalBedrooms       False
population          False
households          False
medianIncome        False
medianHouseValue    False
dtype: bool

In [6]:
data.describe()

Unnamed: 0,housingMedianAge,totalRooms,totalBedrooms,population,households,medianIncome,medianHouseValue
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,28.639486,2635.763081,537.898014,1425.476744,499.53968,3.870671,206855.816909
std,12.585558,2181.615252,421.247906,1132.462122,382.329753,1.899822,115395.615874
min,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,18.0,1447.75,295.0,787.0,280.0,2.5634,119600.0
50%,29.0,2127.0,435.0,1166.0,409.0,3.5348,179700.0
75%,37.0,3148.0,647.0,1725.0,605.0,4.74325,264725.0
max,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


All columns are continuous.


## Exploratory Profiling

In [7]:
#!conda install -c conda-forge pandas_profiling
# Thank you for this awesome package
# https://github.com/pandas-profiling/pandas-profiling/graphs/contributors
from pandas_profiling import ProfileReport
# Note: problem with chesterish jupyter theme. reset to default. O0ps!

In [8]:
ProfileReport(data)

0,1
Number of variables,7
Number of observations,20640
Total Missing (%),0.0%
Total size in memory,1.1 MiB
Average record size in memory,56.0 B

0,1
Numeric,5
Categorical,0
Boolean,0
Date,0
Text (Unique),0
Rejected,2
Unsupported,0

0,1
Correlation,0.90722

0,1
Distinct count,52
Unique (%),0.3%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,28.639
Minimum,1
Maximum,52
Zeros (%),0.0%

0,1
Minimum,1
5-th percentile,8
Q1,18
Median,29
Q3,37
95-th percentile,52
Maximum,52
Range,51
Interquartile range,19

0,1
Standard deviation,12.586
Coef of variation,0.43945
Kurtosis,-0.80063
Mean,28.639
MAD,10.552
Skewness,0.060331
Sum,591120
Variance,158.4
Memory size,161.3 KiB

Value,Count,Frequency (%),Unnamed: 3
52.0,1273,6.2%,
36.0,862,4.2%,
35.0,824,4.0%,
16.0,771,3.7%,
17.0,698,3.4%,
34.0,689,3.3%,
26.0,619,3.0%,
33.0,615,3.0%,
18.0,570,2.8%,
25.0,566,2.7%,

Value,Count,Frequency (%),Unnamed: 3
1.0,4,0.0%,
2.0,58,0.3%,
3.0,62,0.3%,
4.0,191,0.9%,
5.0,244,1.2%,

Value,Count,Frequency (%),Unnamed: 3
48.0,177,0.9%,
49.0,134,0.6%,
50.0,136,0.7%,
51.0,48,0.2%,
52.0,1273,6.2%,

0,1
Distinct count,3842
Unique (%),18.6%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,206860
Minimum,14999
Maximum,500000
Zeros (%),0.0%

0,1
Minimum,14999
5-th percentile,66200
Q1,119600
Median,179700
Q3,264720
95-th percentile,489810
Maximum,500000
Range,485000
Interquartile range,145120

0,1
Standard deviation,115400
Coef of variation,0.55786
Kurtosis,0.32787
Mean,206860
MAD,91170
Skewness,0.97776
Sum,4269500000
Variance,13316000000
Memory size,161.3 KiB

Value,Count,Frequency (%),Unnamed: 3
500001.0,965,4.7%,
137500.0,122,0.6%,
162500.0,117,0.6%,
112500.0,103,0.5%,
187500.0,93,0.5%,
225000.0,92,0.4%,
350000.0,79,0.4%,
87500.0,78,0.4%,
275000.0,65,0.3%,
150000.0,64,0.3%,

Value,Count,Frequency (%),Unnamed: 3
14999.0,4,0.0%,
17500.0,1,0.0%,
22500.0,4,0.0%,
25000.0,1,0.0%,
26600.0,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
498800.0,1,0.0%,
499000.0,1,0.0%,
499100.0,1,0.0%,
500000.0,27,0.1%,
500001.0,965,4.7%,

0,1
Distinct count,12928
Unique (%),62.6%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,3.8707
Minimum,0.4999
Maximum,15
Zeros (%),0.0%

0,1
Minimum,0.4999
5-th percentile,1.6006
Q1,2.5634
Median,3.5348
Q3,4.7432
95-th percentile,7.3003
Maximum,15.0
Range,14.5
Interquartile range,2.1799

0,1
Standard deviation,1.8998
Coef of variation,0.49082
Kurtosis,4.9525
Mean,3.8707
MAD,1.4016
Skewness,1.6467
Sum,79891
Variance,3.6093
Memory size,161.3 KiB

Value,Count,Frequency (%),Unnamed: 3
3.125,49,0.2%,
15.0001,49,0.2%,
2.875,46,0.2%,
2.625,44,0.2%,
4.125,44,0.2%,
3.875,41,0.2%,
3.375,38,0.2%,
3.0,38,0.2%,
4.0,37,0.2%,
3.625,37,0.2%,

Value,Count,Frequency (%),Unnamed: 3
0.4999,12,0.1%,
0.536,10,0.0%,
0.5495,1,0.0%,
0.6433,1,0.0%,
0.6775,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
14.4219,1,0.0%,
14.5833,1,0.0%,
14.9009,1,0.0%,
15.0,2,0.0%,
15.0001,49,0.2%,

0,1
Distinct count,3888
Unique (%),18.8%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,1425.5
Minimum,3
Maximum,35682
Zeros (%),0.0%

0,1
Minimum,3
5-th percentile,348
Q1,787
Median,1166
Q3,1725
95-th percentile,3288
Maximum,35682
Range,35679
Interquartile range,938

0,1
Standard deviation,1132.5
Coef of variation,0.79444
Kurtosis,73.553
Mean,1425.5
MAD,714.24
Skewness,4.9359
Sum,29422000
Variance,1282500
Memory size,161.3 KiB

Value,Count,Frequency (%),Unnamed: 3
891.0,25,0.1%,
761.0,24,0.1%,
1227.0,24,0.1%,
850.0,24,0.1%,
1052.0,24,0.1%,
825.0,23,0.1%,
999.0,22,0.1%,
782.0,22,0.1%,
1005.0,22,0.1%,
781.0,21,0.1%,

Value,Count,Frequency (%),Unnamed: 3
3.0,1,0.0%,
5.0,1,0.0%,
6.0,1,0.0%,
8.0,4,0.0%,
9.0,2,0.0%,

Value,Count,Frequency (%),Unnamed: 3
15507.0,1,0.0%,
16122.0,1,0.0%,
16305.0,1,0.0%,
28566.0,1,0.0%,
35682.0,1,0.0%,

0,1
Correlation,0.92989

0,1
Distinct count,5926
Unique (%),28.7%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,2635.8
Minimum,2
Maximum,39320
Zeros (%),0.0%

0,1
Minimum,2.0
5-th percentile,620.95
Q1,1447.8
Median,2127.0
Q3,3148.0
95-th percentile,6213.2
Maximum,39320.0
Range,39318.0
Interquartile range,1700.2

0,1
Standard deviation,2181.6
Coef of variation,0.8277
Kurtosis,32.631
Mean,2635.8
MAD,1344.5
Skewness,4.1473
Sum,54402000
Variance,4759400
Memory size,161.3 KiB

Value,Count,Frequency (%),Unnamed: 3
1527.0,18,0.1%,
1613.0,17,0.1%,
1582.0,17,0.1%,
2127.0,16,0.1%,
1703.0,15,0.1%,
1471.0,15,0.1%,
2053.0,15,0.1%,
1722.0,15,0.1%,
1607.0,15,0.1%,
1717.0,15,0.1%,

Value,Count,Frequency (%),Unnamed: 3
2.0,1,0.0%,
6.0,1,0.0%,
8.0,1,0.0%,
11.0,1,0.0%,
12.0,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
30450.0,1,0.0%,
32054.0,1,0.0%,
32627.0,1,0.0%,
37937.0,1,0.0%,
39320.0,1,0.0%,

Unnamed: 0,housingMedianAge,totalRooms,totalBedrooms,population,households,medianIncome,medianHouseValue
0,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0
1,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0
2,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0
3,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0
4,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0


## Feature Selection

In [9]:
data = data.drop(['households', 'totalBedrooms'], axis=1) # removing features

In [10]:
data.head()

Unnamed: 0,housingMedianAge,totalRooms,population,medianIncome,medianHouseValue
0,41.0,880.0,322.0,8.3252,452600.0
1,21.0,7099.0,2401.0,8.3014,358500.0
2,52.0,1467.0,496.0,7.2574,352100.0
3,52.0,1274.0,558.0,5.6431,341300.0
4,52.0,1627.0,565.0,3.8462,342200.0


## Preprocessing

In [11]:
standardScaler = StandardScaler(with_mean=True, with_std=True)
standardScaler.fit(data)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [12]:
# Normal fit transform returns a numpy ndarray, so converting it to pd.df
columns = data.columns
indexes = data.index

In [13]:
scaledData = pd.DataFrame(standardScaler.transform(data), index=indexes, columns=columns)
scaledData.head()

Unnamed: 0,housingMedianAge,totalRooms,population,medianIncome,medianHouseValue
0,0.982143,-0.804819,-0.974429,2.344766,2.129631
1,-0.607019,2.04589,0.861439,2.332238,1.314156
2,1.856182,-0.535746,-0.820777,1.782699,1.258693
3,1.856182,-0.624215,-0.766028,0.932968,1.1651
4,1.856182,-0.462404,-0.759847,-0.012881,1.1729


## Train Test Split

In [14]:
x_data = scaledData.drop(['medianHouseValue'], axis=1)
x_data.head()
y_label = scaledData['medianHouseValue']
y_label.head()

0    2.129631
1    1.314156
2    1.258693
3    1.165100
4    1.172900
Name: medianHouseValue, dtype: float64

In [15]:
scaledData.head()

Unnamed: 0,housingMedianAge,totalRooms,population,medianIncome,medianHouseValue
0,0.982143,-0.804819,-0.974429,2.344766,2.129631
1,-0.607019,2.04589,0.861439,2.332238,1.314156
2,1.856182,-0.535746,-0.820777,1.782699,1.258693
3,1.856182,-0.624215,-0.766028,0.932968,1.1651
4,1.856182,-0.462404,-0.759847,-0.012881,1.1729


In [16]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x_data, y_label, test_size=0.3, 
                                                    random_state = 0)

For DNNRegressor, we need

1. Feature column

2. Input Function

3. Model object

## Feature Columns

In [18]:
columns

Index(['housingMedianAge', 'totalRooms', 'population', 'medianIncome',
       'medianHouseValue'],
      dtype='object')

In [22]:
# I hope this can make feature_col vector wihtout manually typing all those names
feature_cols = []
for column in columns:
    if column!='medianHouseValue':
        feature_cols.append(tf.feature_column.numeric_column(column))
feature_cols

[NumericColumn(key='housingMedianAge', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='totalRooms', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='population', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='medianIncome', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None)]

## Input Function

In [23]:
train_input_func = tf.estimator.inputs.pandas_input_fn(x=X_train, y=y_train,
                                                num_epochs=1000, shuffle=True)
test_input_func = tf.estimator.inputs.pandas_input_fn(x=X_test, y=y_test,
                                                     num_epochs=1, shuffle=False)

## Dense Neural Network Regressor Model

In [24]:
dnnRegressor = tf.estimator.DNNRegressor(hidden_units=[4, 5, 5], 
                                         feature_columns=feature_cols)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmpvfs0dinw', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7ff08cbc62e8>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


## Training the DNN

In [25]:
dnnRegressor.train(input_fn=train_input_func, steps=1000)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.
INFO:tensorflow:Calling model_fn.
Instructions for updating:
Use tf.cast instead.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.
INFO:tensorflow:Saving checkpoints for 0 into /tmp/tmpvfs0dinw/model.ckpt.
INFO:tensorflow:loss = 103.70192, step = 1
INFO:tensorflow:global_step/sec: 99.5329
INFO:tensorflow:loss = 55.164307, step = 101 (1.009 sec)
INFO:tensorflow:global_step/sec: 182.245
INFO:tensorflow:loss = 50.98037, step = 201 (0.546 sec)
INFO:tensorflow:global_step/sec: 153.755
INFO:tensorflow:loss = 58.243065, step =

<tensorflow_estimator.python.estimator.canned.dnn.DNNRegressor at 0x7ff08cc72940>

## Evaluating the model

In [26]:
dnnRegressor.evaluate(input_fn=test_input_func, steps=None)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-06-18T10:38:34Z
INFO:tensorflow:Graph was finalized.
Instructions for updating:
Use standard file APIs to check for files with this prefix.
INFO:tensorflow:Restoring parameters from /tmp/tmpvfs0dinw/model.ckpt-1000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2019-06-18-10:38:35
INFO:tensorflow:Saving dict for global step 1000: average_loss = 0.44503897, global_step = 1000, label/mean = 0.0006307352, loss = 56.238396, prediction/mean = 0.00036508098
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 1000: /tmp/tmpvfs0dinw/model.ckpt-1000


{'average_loss': 0.44503897,
 'label/mean': 0.0006307352,
 'loss': 56.238396,
 'prediction/mean': 0.00036508098,
 'global_step': 1000}

## Making Predictions

In [30]:
predict_input_function = tf.estimator.inputs.pandas_input_fn(x=X_test, y=None,
                                                            num_epochs=1, shuffle=False)
predictions = dnnRegressor.predict(input_fn=predict_input_function)
predictionsList = list(predictions)
predictionsList

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmpvfs0dinw/model.ckpt-1000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


[{'predictions': array([-0.13668442], dtype=float32)},
 {'predictions': array([0.80127853], dtype=float32)},
 {'predictions': array([0.25624454], dtype=float32)},
 {'predictions': array([-0.7329405], dtype=float32)},
 {'predictions': array([0.49609095], dtype=float32)},
 {'predictions': array([-0.37940887], dtype=float32)},
 {'predictions': array([0.47986668], dtype=float32)},
 {'predictions': array([1.0987372], dtype=float32)},
 {'predictions': array([0.7192536], dtype=float32)},
 {'predictions': array([0.1441023], dtype=float32)},
 {'predictions': array([-0.08240128], dtype=float32)},
 {'predictions': array([-0.72281015], dtype=float32)},
 {'predictions': array([-0.6633817], dtype=float32)},
 {'predictions': array([-0.70895135], dtype=float32)},
 {'predictions': array([0.20220849], dtype=float32)},
 {'predictions': array([0.5281351], dtype=float32)},
 {'predictions': array([1.2723892], dtype=float32)},
 {'predictions': array([-0.5450579], dtype=float32)},
 {'predictions': array([-0.2

In [46]:
predictionsList[0]['predictions'][0]

IndexError: list index out of range

In [45]:
y_test[14740]

-0.606240636669762

In [38]:
assert(len(y_test) == len(predictionsList))
N = len(y_test)
sumSquaredError = 0
for i in range(N):
    sumSquaredError += 