In [1]:
from __future__ import print_function

import math

from IPython import display
from matplotlib import cm
from matplotlib import gridspec
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics
import tensorflow as tf
from tensorflow.python.data import Dataset
import copy

# Only surface TensorFlow log messages at ERROR level to keep cell output short.
tf.logging.set_verbosity(tf.logging.ERROR)

# Pandas display settings: truncate long frames to 10 rows and render floats
# with a single decimal place.
pd.set_option('display.max_rows', 10)
pd.set_option('display.float_format', '{:.1f}'.format)

# First Model: No Transformations, drop all NAs, only numerical columns

In [2]:
# Seed for the row shuffle below. Fixing it makes the row order (and therefore
# any train/validation split taken from that order) identical on every
# Restart & Run All.
SHUFFLE_SEED = 42

# Read the raw table and leave it untouched in `house_prices`; all further
# preprocessing happens on the independent copy `houses`.
house_prices = pd.read_csv('train.csv')
houses = house_prices.copy()
houses = houses.set_index('Id')

# Shuffle the rows with a private, seeded generator so the result does not
# depend on (or disturb) the global NumPy random state.
shuffle_rng = np.random.RandomState(SHUFFLE_SEED)
houses = houses.reindex(shuffle_rng.permutation(houses.index))

In [3]:
# Columns removed wholesale before the row-wise NA filter.
# NOTE(review): presumably these are dropped because they are mostly missing
# and would otherwise wipe out most rows in dropna() - confirm against the data.
dropped_columns = ['Alley', 'LotFrontage', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature']

# Remove those columns, then discard every row that still contains a missing value.
houses = houses.drop(dropped_columns, axis='columns').dropna()

In [4]:
# Show the first rows explicitly, then let the cell's last expression render
# the summary statistics of the numeric columns.
houses_preview = houses.head()
display.display(houses_preview)
houses.describe()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
167,20,RL,10708,Pave,IR1,Lvl,AllPub,Inside,Gtl,ClearCr,...,0,0,142,0,0,11,2009,COD,Normal,190000
93,30,RL,13360,Pave,IR1,HLS,AllPub,Inside,Gtl,Crawfor,...,44,0,0,0,0,8,2009,WD,Normal,163500
668,20,RL,8125,Pave,Reg,Lvl,AllPub,Inside,Gtl,SawyerW,...,0,0,0,0,0,10,2008,WD,Normal,193500
111,50,RL,9525,Pave,Reg,Lvl,AllPub,Inside,Gtl,Edwards,...,0,0,0,0,0,10,2006,WD,Normal,136900
459,70,RM,5100,Pave,Reg,Lvl,AllPub,Inside,Gtl,OldTown,...,0,0,0,0,0,6,2008,WD,Normal,161000


Unnamed: 0,MSSubClass,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0,...,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0
mean,56.1,10706.3,6.2,5.6,1973.0,1985.7,110.4,464.2,49.2,582.5,...,99.4,47.8,21.3,3.6,16.4,3.0,42.9,6.3,2007.8,186761.8
std,41.3,10336.6,1.3,1.1,29.6,20.3,185.6,458.8,166.2,440.0,...,127.5,65.4,60.8,30.2,58.1,42.0,508.1,2.7,1.3,78913.8
min,20.0,1300.0,2.0,2.0,1880.0,1950.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,35311.0
25%,20.0,7744.0,5.0,5.0,1956.0,1968.0,0.0,0.0,0.0,248.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,135000.0
50%,50.0,9600.0,6.0,5.0,1976.0,1994.5,0.0,413.0,0.0,489.0,...,6.0,28.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,168500.0
75%,70.0,11760.8,7.0,6.0,2001.0,2004.0,174.0,733.0,0.0,815.8,...,174.5,70.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,220000.0
max,190.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,1474.0,2336.0,...,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


In [5]:
# Use only a subset of the columns: the target (SalePrice) followed by the
# eight numeric columns used as model inputs.
selected_columns = [
    'SalePrice',
    'LotArea',
    'OverallQual',
    'YearBuilt',
    'TotalBsmtSF',
    '1stFlrSF',
    'FullBath',
    'HalfBath',
    'TotRmsAbvGrd',
]
houses = houses[selected_columns]

In [6]:
# Separate housing data into inputs and labels.
# `labels` is a one-column frame holding the target; `features` is everything
# else. Both are new objects, so `houses` itself is left unchanged.
labels = houses[['SalePrice']].copy()

features = houses.drop('SalePrice', axis=1)

In [7]:
# Separate data into training and validation sets.
# The split point is derived from the number of rows (first 75% -> training,
# remainder -> validation) instead of hard-coded row counts, so the two sets
# always stay disjoint and together cover every row, even if the
# preprocessing changes how many rows survive. With the 1338 rows reported by
# describe() earlier this yields 1003 training and 335 validation rows.
TRAIN_FRACTION = 0.75
n_train = int(len(features) * TRAIN_FRACTION)

train_examples = features.iloc[:n_train]
train_labels = labels.iloc[:n_train]

validation_examples = features.iloc[n_train:]
validation_labels = labels.iloc[n_train:]

# Check that we preprocessed our inputs correctly: print summary statistics
# for each of the four frames, then preview the first training rows.
summaries = [
    ("Training examples summary:", train_examples),
    ("Validation examples summary:", validation_examples),
    ("Training targets summary:", train_labels),
    ("Validation targets summary:", validation_labels),
]
for summary_title, summary_frame in summaries:
    print(summary_title)
    display.display(summary_frame.describe())
display.display(train_examples.head())

Training examples summary:


Unnamed: 0,LotArea,OverallQual,YearBuilt,TotalBsmtSF,1stFlrSF,FullBath,HalfBath,TotRmsAbvGrd
count,1003.0,1003.0,1003.0,1003.0,1003.0,1003.0,1003.0,1003.0
mean,10687.4,6.2,1972.2,1093.6,1170.2,1.6,0.4,6.6
std,9491.2,1.3,29.8,384.6,374.4,0.6,0.5,1.6
min,1300.0,2.0,1880.0,105.0,438.0,0.0,0.0,3.0
25%,7797.5,5.0,1954.0,821.0,894.0,1.0,0.0,6.0
50%,9600.0,6.0,1974.0,1012.0,1095.0,2.0,0.0,6.0
75%,11644.0,7.0,2001.0,1326.0,1398.0,2.0,1.0,7.0
max,164660.0,10.0,2010.0,3200.0,3228.0,3.0,2.0,12.0


Validation examples summary:


Unnamed: 0,LotArea,OverallQual,YearBuilt,TotalBsmtSF,1stFlrSF,FullBath,HalfBath,TotRmsAbvGrd
count,335.0,335.0,335.0,335.0,335.0,335.0,335.0,335.0
mean,10763.0,6.3,1975.4,1103.0,1194.3,1.6,0.4,6.5
std,12548.0,1.3,28.6,463.2,421.3,0.5,0.5,1.6
min,1526.0,3.0,1890.0,264.0,526.0,0.0,0.0,3.0
25%,7638.0,5.0,1958.0,820.5,893.5,1.0,0.0,5.0
50%,9350.0,6.0,1977.0,1032.0,1128.0,2.0,0.0,6.0
75%,11900.0,7.0,2003.0,1317.5,1435.0,2.0,1.0,7.0
max,215245.0,10.0,2009.0,6110.0,4692.0,3.0,1.0,12.0


Training targets summary:


Unnamed: 0,SalePrice
count,1003.0
mean,186585.3
std,80113.9
min,35311.0
25%,135000.0
50%,169990.0
75%,220500.0
max,755000.0


Validation targets summary:


Unnamed: 0,SalePrice
count,335.0
mean,187290.2
std,75320.1
min,58500.0
25%,136750.0
50%,165500.0
75%,216918.5
max,582933.0


Unnamed: 0_level_0,LotArea,OverallQual,YearBuilt,TotalBsmtSF,1stFlrSF,FullBath,HalfBath,TotRmsAbvGrd
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
167,10708,5,1955,1617,1867,1,0,7
93,13360,5,1921,876,964,1,0,5
668,8125,6,1994,1408,1679,2,0,7
111,9525,6,1954,994,1216,2,0,7
459,5100,8,1925,588,833,1,0,7


# TensorFlow

### graph

In [8]:
# NOTE(review): `g` is created here, but none of the cells shown enter it with
# `g.as_default()`; the FileWriters in train_model log tf.get_default_graph(),
# so the ops appear to be built in the default graph - confirm whether `g` is
# still needed.
g = tf.Graph()

# Number of input features. Derived from the training frame so the placeholder
# and weight shapes follow the selected feature columns automatically
# (8 columns with the current selection) instead of a hand-maintained constant.
input_dim = train_examples.shape[1]

#### Placeholders: inputs and hyperparameters

In [9]:
# Inputs
with tf.name_scope("Inputs"):
    # Feature matrix and label column fed on every run; the leading dimension
    # is left as None so any number of rows can be fed.
    x = tf.placeholder(dtype=tf.float32, shape=(None, input_dim), name="features")
    y_label = tf.placeholder(dtype=tf.float32, shape=(None, 1), name='labels')

    # Hyperparameters, also supplied at run time through feed_dict.
    learn_rate = tf.placeholder(dtype=tf.float32, name="learning_rate")
    batch_size = tf.placeholder(dtype=tf.int64, name="batch_size")

#### Parameters

In [10]:
with tf.name_scope("Parameters"):
    # Weights (one per input feature) and a single bias, both drawn uniformly
    # from [-1, 1) when the variables are initialized.
    W = tf.get_variable(
        name="W",
        shape=[input_dim, 1],
        initializer=tf.random_uniform_initializer(minval=-1, maxval=1),
    )
    b = tf.get_variable(
        name="b",
        shape=[1],
        initializer=tf.random_uniform_initializer(minval=-1, maxval=1),
    )

#### Datasets

In [11]:
# Every dataset yields (labels, features) pairs, in that order.
with tf.name_scope("Datasets"):
    with tf.name_scope("Train_dataset"):
        # Training stream: shuffled with a 5000-element buffer, repeated
        # indefinitely, and batched with the size fed via `batch_size`.
        train_data = (
            tf.data.Dataset.from_tensor_slices((train_labels, train_examples))
            .shuffle(5000)
            .repeat()
            .batch(batch_size)
        )
    with tf.name_scope("train_eval_dataset"):
        # Single, unshuffled pass over the training rows, batched with the
        # size fed via `batch_size`.
        train_eval_data = (
            tf.data.Dataset.from_tensor_slices((train_labels, train_examples))
            .batch(batch_size)
        )
    with tf.name_scope("validation_dataset"):
        # Single pass over the validation rows, one example per batch.
        validation_data = (
            tf.data.Dataset.from_tensor_slices((validation_labels, validation_examples))
            .batch(1)
            .repeat(1)
        )

#### Iterator

In [12]:
def _iterator_for(dataset):
    """Create an iterator with `dataset`'s structure plus its helper ops.

    Returns a tuple `(iterator, next_element, init_op)`: the iterator built
    from the dataset's output types and shapes, the tensor(s) produced by
    `get_next()`, and the op that (re)initializes the iterator on `dataset`.
    """
    iterator = tf.data.Iterator.from_structure(dataset.output_types, dataset.output_shapes)
    next_element = iterator.get_next()
    init_op = iterator.make_initializer(dataset)
    return iterator, next_element, init_op


# One iterator per dataset; each helper call runs inside the matching name
# scope so the created ops are grouped the same way in the graph.
with tf.name_scope("Iterators"):
    with tf.name_scope("Train_iterator"):
        train_iterator, next_element_train, train_init_op = _iterator_for(train_data)
    with tf.name_scope("train_eval_iterator"):
        eval_train_iterator, next_element_eval_train, eval_train_init_op = _iterator_for(train_eval_data)
    with tf.name_scope("validation_iterator"):
        validation_iterator, next_element_validation, validation_init_op = _iterator_for(validation_data)

#### predictions

In [13]:
with tf.name_scope("Predictions"):
    # Linear model: x @ W + b, giving one predicted value per input row.
    weighted_inputs = tf.matmul(x, W)
    predictions = tf.add(weighted_inputs, b, name="predictions")

#### Loss function and optimizer

In [14]:
with tf.name_scope("Loss"):
    # Mean squared error between the fed labels and the model's predictions.
    mean_squared_error = tf.losses.mean_squared_error(labels=y_label, predictions=predictions)

with tf.name_scope("optimizer"):
    # Plain gradient descent; the step size is fed at run time via `learn_rate`.
    optimizer = tf.train.GradientDescentOptimizer(learning_rate=learn_rate)

#### Train

In [15]:
with tf.name_scope("Training"):
    # Op that performs one gradient-descent update minimizing the MSE loss.
    train = optimizer.minimize(mean_squared_error, name="Training")

#### Tensorboard - Loss

In [16]:
with tf.name_scope("Loss_tensorboard"):
    # The loss value to log is fed through this placeholder rather than read
    # from the loss tensor directly; the scalar summary is tagged "Loss".
    loss = tf.placeholder(dtype=tf.float32)
    loss_summary = tf.summary.scalar(name="Loss", tensor=loss)

In [17]:
# Created after the model variables (W, b) have been defined: `saver` is used
# by train_model to write the checkpoint and `init` to initialize the
# variables at the start of each training session.
saver = tf.train.Saver()
init = tf.global_variables_initializer()

In [18]:
def train_model(batch=100, learning_rate=0.001, steps=10000, output_dir="./first_graph"):
    """Train the linear model with gradient descent and save a checkpoint.

    Opens a fresh session, initializes all variables, repeatedly pulls a
    batch from the training iterator and runs one optimizer update on it,
    then saves the variables under `output_dir`.

    Args:
        batch: Number of examples per training batch (fed to `batch_size`).
        learning_rate: Step size fed to the `learn_rate` placeholder.
        steps: Controls the number of updates; the loop runs `steps + 1`
            iterations (see the note on the loop below).
        output_dir: Directory that receives the TensorBoard event files
            (`/train`, `/validation`) and the checkpoint (`/model/final.ckpy`).

    Returns:
        None. Prints "Finished training" once the session has been closed.
    """
    with tf.Session() as sess:
        # initialize variables
        sess.run(init)

        # Writers are tracked in a list so that every writer that was
        # successfully opened is closed (and its events flushed) even if
        # training raises part-way through.
        writers = []
        try:
            # Filewriters for tensorboard
            train_writer = tf.summary.FileWriter(output_dir + "/train", graph=tf.get_default_graph())
            writers.append(train_writer)
            validation_writer = tf.summary.FileWriter(output_dir + "/validation", graph=tf.get_default_graph())
            writers.append(validation_writer)

            # initiate training iterator
            sess.run(train_init_op, feed_dict={batch_size: batch})

            # NOTE(review): this performs steps + 1 updates, not `steps`;
            # kept as in the original - confirm whether the extra step is intended.
            for _ in range(steps + 1):
                batch_labels, batch_features = sess.run(next_element_train)

                sess.run(
                    train,
                    feed_dict={
                        x: batch_features,
                        y_label: batch_labels,
                        learn_rate: learning_rate,
                    },
                )

            # NOTE(review): ".ckpy" looks like a typo for ".ckpt"; left unchanged
            # because code restoring the checkpoint may rely on this exact path.
            saver.save(sess, output_dir + "/model/final.ckpy")
        finally:
            for writer in writers:
                writer.close()
    print("Finished training")

In [19]:
# Train on single-example batches with learning rate 0.01; event files and the
# checkpoint go to the default output_dir ("./first_graph").
# NOTE(review): the features are fed unscaled (LotArea reaches six digits in
# the summaries above), so a learning rate of 0.01 is likely to make gradient
# descent diverge - verify that the loss and weights stay finite.
train_model(batch=1, learning_rate=0.01, steps=1000)

Finished training
