In [13]:
# Importing Libraries
import pandas as pd
from pandas_profiling import ProfileReport
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import numpy as np
import pandarallel
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
import keras
import tensorflow as tf
from kerastuner.engine.hyperparameters import HyperParameters
from keras.activations import relu
from keras.optimizers import RMSprop
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, ReLU
from kerastuner.tuners import RandomSearch, Sklearn
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard
import math
import os

In [14]:
filename = "./EDA Notebook.csv"

In [15]:
df = pd.read_csv(filename)

In [4]:
# Sorting DataFrame by our index before setting it as such
# df.sort_values(by=["host_since"], inplace=True, ascending=True)   # host_since is the closest thing to a date or date_time column we have
# df.set_index("host_since", inplace=True)

In [16]:
df.head()

Unnamed: 0,host_since,property_type,room_type,accommodates,bathrooms,bed_type,cancellation_policy,cleaning_fee,city,host_identity_verified,instant_bookable,review_scores_rating,bedrooms,beds,price
0,2008-03-03,14,0,2,1.0,4,0,1,5,1,0,97.0,1.0,1.0,175.0
1,2008-06-27,10,0,6,2.0,4,2,1,3,1,0,80.0,2.0,3.0,122.0
2,2008-07-31,0,0,2,1.0,4,2,1,3,1,0,100.0,1.0,1.0,120.0
3,2008-08-16,0,1,2,1.0,4,1,1,1,1,0,89.0,1.0,1.0,40.0
4,2008-08-16,20,0,10,1.0,4,2,1,1,1,0,71.0,4.0,4.0,150.0


In [17]:
df.shape

(38502, 15)

In [18]:
# Splitting data into target and feature matrix
target = 'price'
X = df.drop(columns=target)
y = df[target]

In [19]:
# Sanity Check
assert len(X) == len(y)

In [20]:
X.dtypes

host_since                 object
property_type               int64
room_type                   int64
accommodates                int64
bathrooms                 float64
bed_type                    int64
cancellation_policy         int64
cleaning_fee                int64
city                        int64
host_identity_verified      int64
instant_bookable            int64
review_scores_rating      float64
bedrooms                  float64
beds                      float64
dtype: object

In [21]:
print(X)

       host_since  property_type  room_type  accommodates  bathrooms  \
0      2008-03-03             14          0             2        1.0   
1      2008-06-27             10          0             6        2.0   
2      2008-07-31              0          0             2        1.0   
3      2008-08-16              0          1             2        1.0   
4      2008-08-16             20          0            10        1.0   
...           ...            ...        ...           ...        ...   
38497  2017-09-21             28          0             4        1.5   
38498  2017-09-21              0          0             4        1.0   
38499  2017-09-22              0          0             2        1.0   
38500  2017-09-22             20          1             1        1.0   
38501  2017-09-25             10          1             2        1.0   

       bed_type  cancellation_policy  cleaning_fee  city  \
0             4                    0             1     5   
1             4

In [22]:
# Standardizating data
X_scaled = StandardScaler().fit_transform(X)
### NEED TO 

ValueError: could not convert string to float: '2008-03-03'

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

In [12]:
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.38, random_state=42)

In [13]:
# Sanity Checks
len(X_train) == len(y_train)
len(X_val) == len(y_val)
len(X_test) == len(y_test)

True

In [14]:
print("Number of Rows for Training Set: ", len(X_train))
print("Number of Rows for Validation Set: ", len(X_val))
print("Number of Rows for Testing Set: ", len(X_test))
# The validation set is smaller than the testing set. This is why I have the second train_test_split in
# an odd arrangement, had to make sure val was smaller than test

Number of Rows for Training Set:  23101
Number of Rows for Validation Set:  5853
Number of Rows for Testing Set:  9548


In [15]:
# Regression Baseline
y_pred = [y_train.mean()] * len(y_train)

baseline_mae = mean_absolute_error(y_train, y_pred)
print('Baseline MAE:', "%.0f%%" % (baseline_mae))

Baseline MAE: 80%


Seems that even without amenities or zipcode our model manages a baseline MAE of 80%. Hope this doesn't end with >=95%. NNs tend to overfit after all.

## Building Our Model

In [16]:
# get dim of image row vectors and save to imput_dim
imput_dim = X.shape[1]

In [24]:
def build_regression_model(hp):
    
    """
    Returns a complied keras model ready for keras-tuner gridsearch algorithms 
    """
    
    model = Sequential()
    
    # 1st hidden layer
    model.add(Dense(input_dim=imput_dim,
                    units=hp.get('units'),
                    activation=hp.get("activation")))
    
    # 2nd hidden layer 
    model.add(Dense(units=hp.get('units'),
                    activation=hp.get("activation")))
    
    # 2nd hidden layer 
    model.add(Dense(units=hp.get('units'),
                    activation=hp.get("activation")))
    
    # output layer
    model.add(Dense(1, activation='relu'))
    
    model.compile(
        optimizer=tf.keras.optimizers.RMSprop(hp.get('learning_rate')),
        loss='mse',
        metrics=['mae'])
    
    return model

In [26]:
# build out our hyperparameter dictionary 
hp = HyperParameters()
hp.Int('units', min_value=32, max_value=512, step=32)
hp.Choice('learning_rate',values=[1e-1, 1e-2, 1e-3])
hp.Choice('activation',values=["linear", "relu"])

'linear'

In [27]:
n_unique_hparam_combos = len(range(32,512+32, 32)) * 3 *2
n_param_combos_to_sample = n_unique_hparam_combos * .25

In [33]:
random_tuner = RandomSearch(
            build_regression_model,
            objective='val_mae',
            max_trials=n_param_combos_to_sample, # number of times to sample the parameter set and build a model 
            seed=1234,
            hyperparameters=hp, # pass in our hyperparameter dictionary
            directory='./keras-tuner-trial',
            project_name='random_search')

In [34]:
# take note of Total elapsed time in print out
random_tuner.search(X_train,
                    y_train,
                    epochs=3,
                    validation_data=(X_val, y_val))

Trial 24 Complete [00h 00m 05s]
val_mae: 145.39039611816406

Best val_mae So Far: 76.60479736328125
Total elapsed time: 00h 02m 34s
INFO:tensorflow:Oracle triggered exit


In [35]:
# identify the best score and hyperparamter (should be at the top since scores are ranked)
random_tuner.results_summary()

Results summary
Results in ./keras-tuner-trial/random_search
Showing 10 best trials
Objective(name='val_mae', direction='min')
Trial summary
Hyperparameters:
units: 448
learning_rate: 0.001
activation: relu
Score: 76.60479736328125
Trial summary
Hyperparameters:
units: 416
learning_rate: 0.001
activation: relu
Score: 91.08583068847656
Trial summary
Hyperparameters:
units: 480
learning_rate: 0.001
activation: relu
Score: 145.39039611816406
Trial summary
Hyperparameters:
units: 96
learning_rate: 0.1
activation: linear
Score: 145.39039611816406
Trial summary
Hyperparameters:
units: 352
learning_rate: 0.001
activation: linear
Score: 145.39039611816406
Trial summary
Hyperparameters:
units: 384
learning_rate: 0.01
activation: linear
Score: 145.39039611816406
Trial summary
Hyperparameters:
units: 288
learning_rate: 0.1
activation: linear
Score: 145.39039611816406
Trial summary
Hyperparameters:
units: 224
learning_rate: 0.001
activation: relu
Score: 145.39039611816406
Trial summary
Hyperparame

# Trial summary


## Hyperparameters:

*units:* **416**

*learning_rate:* **0.001**

*activation:* **relu**

---
*Score:* **91.08583068847656**

In [51]:
## Plugging in best params

# Instantiating our model's class architecture
model = Sequential()
    
# hidden layer
model.add(Dense(10,
                input_dim=imput_dim,  # Input layer
                activation='relu'))

# 2nd hidden layer
model.add(Dense(416, activation='relu'))
    
# output layer
model.add(Dense(1,
                activation='relu'))

# Assigning learning rate to RMSprop optimizer
opt = tf.keras.optimizers.RMSprop(learning_rate=0.001)

# compiling our model architecture with loss function & corresponding metric
model.compile(optimizer=opt,
              loss='mse',
              metrics=['mae'])

In [52]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_6 (Dense)              (None, 10)                180       
_________________________________________________________________
dense_7 (Dense)              (None, 416)               4576      
_________________________________________________________________
dense_8 (Dense)              (None, 1)                 417       
Total params: 5,173
Trainable params: 5,173
Non-trainable params: 0
_________________________________________________________________


In [25]:
# Use TensorBoard if you're running this on Colab

# %load_ext tensorboard

# import os
# import datetime

# logdir = os.path.join("logs", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
# tensorboard_callback = tf.keras.callbacks.TensorBoard(logdir, histogram_freq=1)

In [39]:
# Readying our EarlyStop callback
stop_callback = EarlyStopping(monitor='val_loss', min_delta=0.001, patience=3, mode="min")

In [64]:
# Fitting model
model.fit(X_train,
          y_train,
          batch_size=42,
          epochs=20,
          validation_data=(X_val, y_val),
          callbacks=[stop_callback])

# Save the entire model as a SavedModel.
!mkdir -p saved_model
model.save('saved_model/my_model')

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
INFO:tensorflow:Assets written to: saved_model/my_model/assets


In [28]:
# %load_ext tensorboard
# %tensorboard --logdir logs

In [65]:
# Evaluating model's predictive power
predictions = model.predict(X_test, batch_size=32)
predictions

array([[332.69308],
       [565.41693],
       [ 79.00336],
       ...,
       [262.7436 ],
       [478.9129 ],
       [232.4342 ]], dtype=float32)

In [66]:
# Actual prices
y_test

host_since
2017-01-18    150.0
2014-08-08    225.0
2012-06-06     40.0
2011-12-26    100.0
2014-04-23     90.0
              ...  
2013-07-25    150.0
2013-10-12     41.0
2016-05-29     60.0
2014-10-02    140.0
2013-12-25    110.0
Name: price, Length: 9548, dtype: float64

Not a good model. Going to try a GridSearchCV & BayesianOptimization next. This will have to suffice considering its 4:27 AM.