# Lab - Rental Price 
Reference: The dataset was extracted from a popular property listing site (updated as of Sep 2018).

Exercise: 
1. Train on the dataset for all states, then predict the rental price based on the selected features.
2. Train on the Selangor-only dataset, then predict the rental price based on the selected features.



In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import shutil

## Read dataset


In [2]:
# Pull the raw rental listings from the hosted CSV and keep the column names for reference.
df_orig = pd.read_csv(
    "https://raw.githubusercontent.com/thenghui/trainerdata/master/RentIndexRaw.csv",
    low_memory=False
)
COLUMNS = df_orig.columns.tolist()
print(COLUMNS)

['ID', 'BuiltUpSize', 'City', 'CreatedDateTime', 'CreatedUser', 'DataSourceID', 'Furnishing', 'Latitude', 'Longitude', 'NoOfBathroom', 'NoOfBedroom', 'NoOfParking', 'Postcode', 'PostedDate', 'PropertyAddress', 'PropertyName', 'PropertyType', 'RentalPerMth', 'SourceUrl', 'State', 'UpdatedDateTime', 'UpdatedUser']


## Preparing dataset 


In [3]:
print('Original dataset: {}'.format(df_orig.shape))

# Upper-case the text columns so the equality filters below match regardless of casing.
for text_col in ('State', 'City', 'PropertyType'):
    df_orig[text_col] = df_orig[text_col].str.upper()

# Selangor subset
df_sel = df_orig[df_orig['State'] == 'SELANGOR']
print('Selangor dataset: {}'.format(df_sel.shape))

# Kuala Lumpur subset
df_kul = df_orig[df_orig['State'] == 'KUALA-LUMPUR']
print('KL dataset: {}'.format(df_kul.shape))

# Use the Selangor data and drop every row in which any of the checked columns is zero.
# NoOfBathroom and NoOfParking are currently not part of this check.
nonzero_mask = df_sel[['BuiltUpSize', 'RentalPerMth', 'NoOfBedroom']].ne(0).all(axis=1)
df = df_sel[nonzero_mask]

print('Wanted dataset: {}'.format(df.shape))

Original dataset: (54405, 22)
Selangor dataset: (30292, 22)
KL dataset: (13648, 22)
Wanted dataset: (20626, 22)


## Split the dataset into Train, Validation and Test sets (70%-20%-10%)


In [4]:
# Shuffle once with a fixed seed so that the Train/Validation/Test membership is
# identical after every Restart & Run All, then cut at 70% and 90% of the rows.
SPLIT_SEED = 42
FEATURED_COLUMNS = ['State', 'BuiltUpSize', 'RentalPerMth', 'NoOfBedroom', 'Furnishing', 'PropertyType']

df_shuffled = df.sample(frac=1, random_state=SPLIT_SEED)
train_end = int(.7 * len(df_shuffled))
valid_end = int(.9 * len(df_shuffled))

# Positional slicing yields the same three consecutive pieces as splitting at
# [train_end, valid_end], without routing the DataFrame through NumPy's array splitting.
df_train0 = df_shuffled.iloc[:train_end]
df_valid0 = df_shuffled.iloc[train_end:valid_end]
df_test0 = df_shuffled.iloc[valid_end:]

# Retrieve only the featured columns (the label column RentalPerMth is kept alongside them).
df_train = df_train0[FEATURED_COLUMNS]
df_valid = df_valid0[FEATURED_COLUMNS]
df_test = df_test0[FEATURED_COLUMNS]

# Every row must land in exactly one of the three splits.
assert len(df_train) + len(df_valid) + len(df_test) == len(df)

print('Train: {}'.format(df_train.shape))
print('Valid: {}'.format(df_valid.shape))
print('Test: {}'.format(df_test.shape))

Train: (14438, 6)
Valid: (4125, 6)
Test: (2063, 6)


## Categorize string value columns


In [5]:
# Distinct PropertyType values present in the training split.
df_train.PropertyType.unique()

array(['CONDOMINIUM', 'SERVICED RESIDENCE', 'TOWNHOUSE', 'FLAT',
       '2-STY TERRACE/LINK HOUSE', '1-STY TERRACE/LINK HOUSE',
       '2.5-STY TERRACE/LINK HOUSE', 'APARTMENT',
       '3-STY TERRACE/LINK HOUSE', 'SEMI-DETACHED HOUSE', 'BUNGALOW',
       '1.5-STY TERRACE/LINK HOUSE', 'CLUSTER HOUSE',
       '4-STY TERRACE/LINK HOUSE', '3.5-STY TERRACE/LINK HOUSE'],
      dtype=object)

In [6]:
# Distinct State values present in the training split.
df_train.State.unique()

array(['SELANGOR'], dtype=object)

## Determine metrics for validation - RMSE

In [7]:
def print_rmse(model, df):
  """Evaluate `model` on `df` and print the square root of its average loss.

  Args:
    model: object exposing `evaluate(input_fn=...)` whose result contains an
      'average_loss' entry.
    df: pandas DataFrame passed as the features; its LABEL column is used as
      the target.
  """
  eval_input_fn = tf.estimator.inputs.pandas_input_fn(
      x=df,
      y=df[LABEL],  # LABEL is read from module scope at call time
      batch_size=128,
      shuffle=False,  # evaluate the rows in their existing order
  )
  eval_metrics = model.evaluate(input_fn=eval_input_fn)
  rmse = np.sqrt(eval_metrics['average_loss'])
  print('RMSE on dataset = {}'.format(rmse))
    
#print_rmse(model, df_valid)

## Model: DNNRegressor 


In [22]:
# DNNRegressor: configure, train on the training split, report RMSE on the validation split.

LABEL = 'RentalPerMth'
OUTDIR = 'rentalDNNRegression_trained'
tf.logging.set_verbosity(tf.logging.INFO)
shutil.rmtree(OUTDIR, ignore_errors=True)  # start fresh each time


def _vocabulary_column(key, vocabulary):
    """Build a vocabulary-list categorical column for `key` with default_value=0."""
    return tf.feature_column.categorical_column_with_vocabulary_list(
        key=key,
        vocabulary_list=vocabulary,
        default_value=0,
    )


# Categorize string features
categorical_column1 = _vocabulary_column(
    "State",
    ['SELANGOR', 'JOHOR', 'KUALA-LUMPUR', 'PUTRAJAYA'],
)

categorical_column2 = _vocabulary_column(
    "Furnishing",
    ['Unfurnished', 'Fully Furnished', 'Partly Furnished', 'Unknown'],
)

categorical_column3 = _vocabulary_column(
    "PropertyType",
    [
        'CONDOMINIUM', 'SERVICED RESIDENCE', 'APARTMENT',
        '2-STY TERRACE/LINK HOUSE', '3-STY TERRACE/LINK HOUSE',
        '1-STY TERRACE/LINK HOUSE', 'TOWNHOUSE', 'SEMI-DETACHED HOUSE',
        'BUNGALOW', '1.5-STY TERRACE/LINK HOUSE',
        '3.5-STY TERRACE/LINK HOUSE', 'CLUSTER HOUSE', 'FLAT',
        '2.5-STY TERRACE/LINK HOUSE', '4-STY TERRACE/LINK HOUSE',
        'RESIDENTIAL LAND',
    ],
)

# Two numeric inputs plus one-hot encodings of Furnishing and PropertyType.
# categorical_column1 (State) is defined above but is not fed to the model.
model_feature_columns = [
    tf.feature_column.numeric_column('BuiltUpSize'),
    tf.feature_column.numeric_column('NoOfBedroom'),
    tf.feature_column.indicator_column(categorical_column2),
    tf.feature_column.indicator_column(categorical_column3),
]

adam_optimizer = tf.train.AdamOptimizer(learning_rate=0.001)

model = tf.estimator.DNNRegressor(
    hidden_units=[32, 512, 128],
    feature_columns=model_feature_columns,
    activation_fn=tf.nn.relu,
    optimizer=adam_optimizer,
    model_dir=OUTDIR,
)

# Five shuffled passes over the training split in batches of 512 rows.
train_input_fn = tf.estimator.inputs.pandas_input_fn(
    x=df_train,
    y=df_train[LABEL],
    batch_size=512,
    num_epochs=5,
    shuffle=True,
)
model.train(input_fn=train_input_fn)

print_rmse(model, df_valid)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'rentalDNNRegression_trained', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f97204c1e80>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorfl

## Prediction 

In [21]:
# Predict on the test split. A hand-written frame can be passed as `x` instead, e.g.:
#    pd.DataFrame({
#                  'BuiltUpSize':[560,1200,1600],
#                  'NoOfBedroom':[2,3,3],
#                  'PropertyType':['CONDOMINIUM', 'SERVICED RESIDENCE', 'APARTMENT'],
#                  'Furnishing':['Unfurnished', 'Fully Furnished', 'Partly Furnished']})
test_input_fn = tf.estimator.inputs.pandas_input_fn(
    x=df_test,
    y=None,  # no labels are supplied for prediction
    batch_size=128,
    shuffle=False,
)
predictions = model.predict(input_fn=test_input_fn)

# Print each prediction as it is produced.
for prediction in predictions:
  print(prediction)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from rentalDNNRegression_trained/model.ckpt-564
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
{'predictions': array([1381.7448], dtype=float32)}
{'predictions': array([947.01685], dtype=float32)}
{'predictions': array([1854.8107], dtype=float32)}
{'predictions': array([1636.5519], dtype=float32)}
{'predictions': array([947.01685], dtype=float32)}
{'predictions': array([2509.756], dtype=float32)}
{'predictions': array([2072.5413], dtype=float32)}
{'predictions': array([2104.7896], dtype=float32)}
{'predictions': array([1432.1954], dtype=float32)}
{'predictions': array([3570.908], dtype=float32)}
{'predictions': array([933.56335], dtype=float32)}
{'predictions': array([932.8098], dtype=float32)}
{'predictions': array([4007.3381], dtype=float32)}
{'predictions': array([1314.4769], dtype=float32)}
{'predictio