In [37]:
"""Example code for TensorFlow Wide & Deep Tutorial using TF.Learn API."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import sys
import tempfile

import pandas as pd
from six.moves import urllib
import tensorflow as tf

In [38]:
#data processing

train_df = pd.read_csv("../data/train_df.csv",index_col=0)
train_df.shape

(90275, 61)

In [39]:
train_df.head()

Unnamed: 0,parcelid,logerror,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,calculatedbathnbr,...,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock,transaction_month,transaction_year
0,11016594,0.0276,1.0,,,2.0,3.0,,4.0,2.0,...,122754.0,360170.0,2015.0,237416.0,6735.88,,,60371070000000.0,1,2016
1,14366692,-0.1684,,,,3.5,4.0,,,3.5,...,346458.0,585529.0,2015.0,239071.0,10153.02,,,,1,2016
2,12098116,-0.004,1.0,,,3.0,2.0,,4.0,3.0,...,61994.0,119906.0,2015.0,57912.0,11484.48,,,60374640000000.0,1,2016
3,12643413,0.0218,1.0,,,2.0,2.0,,4.0,2.0,...,171518.0,244880.0,2015.0,73362.0,3048.74,,,60372960000000.0,1,2016
4,14432541,-0.005,,,,2.5,4.0,,,2.5,...,169574.0,434551.0,2015.0,264977.0,5488.96,,,60590420000000.0,1,2016


In [40]:
pd.options.display.max_rows = 65

dtype_df = train_df.dtypes.reset_index()
dtype_df.columns = ["Count", "Column Type"]
d_object = dtype_df.ix[dtype_df["Column Type"] != "object"]
dtype_df 

Unnamed: 0,Count,Column Type
0,parcelid,int64
1,logerror,float64
2,airconditioningtypeid,float64
3,architecturalstyletypeid,float64
4,basementsqft,float64
5,bathroomcnt,float64
6,bedroomcnt,float64
7,buildingclasstypeid,float64
8,buildingqualitytypeid,float64
9,calculatedbathnbr,float64


In [60]:
# default-dic
default_dic = {}
for ind,row in dtype_df.iterrows():
    if row['Column Type'] == 'float64':
        default_dic[row['Count']] = -1
    else:
        default_dic[row['Count']] = 'missing'
#default_dic     
train_df.fillna(value = default_dic, inplace=True)


In [69]:
import numpy as np
wide_columns = []
deep_columns = []

hashottuborspa = tf.feature_column.categorical_column_with_hash_bucket(
    "hashottuborspa",2)

propertycountylandusecode = tf.feature_column.categorical_column_with_hash_bucket(
    "propertycountylandusecode",80)

propertyzoningdesc = tf.feature_column.categorical_column_with_hash_bucket(
    "propertyzoningdesc",2000)

fireplaceflag = tf.feature_column.categorical_column_with_hash_bucket(
    "fireplaceflag",2)

taxdelinquencyflag = tf.feature_column.categorical_column_with_hash_bucket(
    "taxdelinquencyflag",2)

hashottuborspa = tf.feature_column.indicator_column(hashottuborspa)
propertycountylandusecode = tf.feature_column.indicator_column(propertycountylandusecode)
propertyzoningdesc = tf.feature_column.indicator_column(propertyzoningdesc)
fireplaceflag = tf.feature_column.indicator_column(fireplaceflag)
taxdelinquencyflag = tf.feature_column.indicator_column(taxdelinquencyflag)

deep_columns=[hashottuborspa,propertycountylandusecode,
              propertyzoningdesc,fireplaceflag,taxdelinquencyflag]

# float and int

## parcelid
d_object = dtype_df.ix[dtype_df["Column Type"] != "object"]
d_object = d_object.ix[d_object.Count != 'parcelid']
parcelid =  tf.contrib.layers.sparse_column_with_integerized_feature(
    'parcelid', bucket_size=100000)
parcelid = tf.feature_column.embedding_column(parcelid,32) # embedding

deep_columns.append(parcelid)
wide_columns.append(parcelid)

## 
for col in d_object['Count'].values:
    
    category = train_df[col].value_counts().shape[0]
    if category < 1000:
        print("feature:{}, category: {} < 1000, hashing.".format(col,category))
       
        train_df[col] = train_df[col].astype(np.int32)
        
        layer = tf.contrib.layers.sparse_column_with_integerized_feature(
    col, bucket_size=category)
        wide_columns.append(layer)
        
        # embeding
        layer = tf.contrib.layers.embedding_column(
            layer, dimension=32, combiner="mean")
        deep_columns.append(layer)
        
    else:
        print("feature:{} ,category: {} > 1000, numneric.".format(col,category))
        layer = tf.feature_column.numeric_column(col)
        
        deep_columns.append(layer)
        
train_df.to_csv("../data/train_df_new.csv")    

feature:logerror, category: 873 < 1000, hashing.
feature:airconditioningtypeid, category: 7 < 1000, hashing.
feature:architecturalstyletypeid, category: 7 < 1000, hashing.
feature:basementsqft, category: 40 < 1000, hashing.
feature:bathroomcnt, category: 23 < 1000, hashing.
feature:bedroomcnt, category: 8 < 1000, hashing.
feature:buildingclasstypeid, category: 2 < 1000, hashing.
feature:buildingqualitytypeid, category: 9 < 1000, hashing.
feature:calculatedbathnbr, category: 23 < 1000, hashing.
feature:decktypeid, category: 2 < 1000, hashing.
feature:finishedfloor1squarefeet ,category: 1887 > 1000, numneric.
feature:calculatedfinishedsquarefeet ,category: 5103 > 1000, numneric.
feature:finishedsquarefeet12 ,category: 4983 > 1000, numneric.
feature:finishedsquarefeet13, category: 12 < 1000, hashing.
feature:finishedsquarefeet15 ,category: 1916 > 1000, numneric.
feature:finishedsquarefeet50 ,category: 1899 > 1000, numneric.
feature:finishedsquarefeet6, category: 361 < 1000, hashing.
featu

In [72]:
def build_estimator(model_dir, model_type):
    """Build an estimator."""
    if model_type == "wide":
        m = tf.estimator.LinearRegressor(
        model_dir=model_dir, feature_columns=wide_columns)
    elif model_type == "deep":
        m = tf.estimator.DNNRegressor(
        model_dir=model_dir,
        feature_columns=deep_columns,
        hidden_units=[100, 50])
    else:
        m = tf.estimator.DNNLinearCombinedRegressor(
        model_dir=model_dir,
        linear_feature_columns=wide_columns,
        dnn_feature_columns=deep_columns,
        dnn_hidden_units=[100, 50])
    return m


def input_fn(data_file, num_epochs, shuffle):
    """Input builder function."""
    df_data = pd.read_csv(data_file,engine="python",index_col=0)
    
    labels = train_df['logerror']
    return tf.estimator.inputs.pandas_input_fn(
      x=df_data,
      y=labels,
      batch_size=100,
      num_epochs=num_epochs,
      shuffle=shuffle,
      num_threads=5)


def train_and_eval(model_dir, model_type, train_steps, train_data, test_data):
    """Train and evaluate the model."""

    m = build_estimator(model_dir, model_type)
  # set num_epochs to None to get infinite stream of data.
    m.train(
      input_fn=input_fn(train_data, num_epochs=None, shuffle=True),
      steps=train_steps)
  # set steps to None to run evaluation until all data consumed.
    results = m.evaluate(
      input_fn=input_fn(test_data, num_epochs=1, shuffle=False),
      steps=None)
    print("model directory = %s" % model_dir)
    for key in sorted(results):
        print("%s: %s" % (key, results[key]))


FLAGS = None


def main(_):
    train_and_eval("/data/chenlongzhen/property/data/modedir", "wide_n_deep", 50,
                 "/data/chenlongzhen/property/data/train_df_new.csv", "/data/chenlongzhen/property/data/train_df_new.csv")


if __name__ == "__main__":
    main(1)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_tf_random_seed': 1, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_save_checkpoints_steps': None, '_model_dir': '/data/chenlongzhen/property/data/modedir', '_save_summary_steps': 100}
INFO:tensorflow:Create CheckpointSaverHook.


KeyboardInterrupt: 

In [70]:
df_data = pd.read_csv("../data/train_df_new.csv",engine="python",index_col=0)

In [71]:
df_data

Unnamed: 0,parcelid,logerror,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,calculatedbathnbr,...,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock,transaction_month,transaction_year
0,11016594,0,1,-1,-1,2,3,-1,4,2,...,122754.0,360170.0,2015,237416.0,6735.88,missing,-1,6.037107e+13,1,2016
1,14366692,0,-1,-1,-1,3,4,-1,-1,3,...,346458.0,585529.0,2015,239071.0,10153.02,missing,-1,-1.000000e+00,1,2016
2,12098116,0,1,-1,-1,3,2,-1,4,3,...,61994.0,119906.0,2015,57912.0,11484.48,missing,-1,6.037464e+13,1,2016
3,12643413,0,1,-1,-1,2,2,-1,4,2,...,171518.0,244880.0,2015,73362.0,3048.74,missing,-1,6.037296e+13,1,2016
4,14432541,0,-1,-1,-1,2,4,-1,-1,2,...,169574.0,434551.0,2015,264977.0,5488.96,missing,-1,6.059042e+13,1,2016
5,11509835,0,1,-1,-1,4,4,-1,1,4,...,880650.0,2447951.0,2015,1567301.0,27126.57,missing,-1,6.037621e+13,1,2016
6,12286022,0,-1,-1,-1,1,2,-1,7,1,...,64549.0,111521.0,2015,46972.0,2304.97,missing,-1,6.037542e+13,1,2016
7,17177301,0,-1,-1,-1,2,3,-1,-1,2,...,107000.0,306000.0,2015,199000.0,3745.50,missing,-1,6.111003e+13,1,2016
8,14739064,0,-1,-1,-1,1,2,-1,-1,1,...,66834.0,210064.0,2015,143230.0,2172.88,missing,-1,6.059042e+13,1,2016
9,14677559,0,-1,-1,-1,2,2,-1,-1,2,...,109977.0,190960.0,2015,80983.0,1940.26,missing,-1,6.059063e+13,1,2016
