In [2]:
# Importing Libraries
import pandas as pd
from pandas_profiling import ProfileReport
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import numpy as np
import pandarallel
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
import keras
import tensorflow as tf
from kerastuner.engine.hyperparameters import HyperParameters
from keras.activations import relu
from keras.optimizers import RMSprop
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, ReLU
from kerastuner.tuners import RandomSearch, Sklearn
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard
import math
import os

In [3]:
filename = "./EDA Notebook - Encoded.csv"

In [4]:
df = pd.read_csv(filename)

In [None]:
# Sorting DataFrame by our index before setting it as such
# df.sort_values(by=["host_since"], inplace=True, ascending=True)   # host_since is the closest thing to a date or date_time column we have
# df.set_index("host_since", inplace=True)

In [5]:
df.head()

Unnamed: 0.1,Unnamed: 0,property_type,room_type,accommodates,bathrooms,bed_type,cancellation_policy,cleaning_fee,city,host_identity_verified,host_since,instant_bookable,review_scores_rating,zipcode,bedrooms,beds,price
0,54511,0,0,8,2,4,2,1,0,1,16115,1,100,2108,4,4,259.0
1,20313,0,0,4,1,4,2,1,0,0,15786,1,89,2108,1,2,89.0
2,68172,0,0,2,1,4,0,1,0,0,16347,1,87,2108,1,1,225.0
3,62506,0,0,3,1,4,2,1,0,1,15554,1,87,2108,1,1,185.0
4,56709,0,0,2,1,4,1,0,0,1,16279,0,91,2108,1,1,199.0


In [14]:
df.drop(columns="Unnamed: 0", inplace=True)

In [6]:
df.shape

(38499, 17)

In [15]:
df.dtypes

property_type               int64
room_type                   int64
accommodates                int64
bathrooms                   int64
bed_type                    int64
cancellation_policy         int64
cleaning_fee                int64
city                        int64
host_identity_verified      int64
host_since                  int64
instant_bookable            int64
review_scores_rating        int64
zipcode                     int64
bedrooms                    int64
beds                        int64
price                     float64
dtype: object

In [16]:
# Splitting data into target and feature matrix
target = 'price'
X = df.drop(columns=target)
y = df[target]

In [17]:
# Sanity Check
assert len(X) == len(y)

In [18]:
X.dtypes

property_type             int64
room_type                 int64
accommodates              int64
bathrooms                 int64
bed_type                  int64
cancellation_policy       int64
cleaning_fee              int64
city                      int64
host_identity_verified    int64
host_since                int64
instant_bookable          int64
review_scores_rating      int64
zipcode                   int64
bedrooms                  int64
beds                      int64
dtype: object

In [19]:
print(X)

       property_type  room_type  accommodates  bathrooms  bed_type  \
0                  0          0             8          2         4   
1                  0          0             4          1         4   
2                  0          0             2          1         4   
3                  0          0             3          1         4   
4                  0          0             2          1         4   
...              ...        ...           ...        ...       ...   
38494              0          1             2          1         4   
38495              0          1             3          1         4   
38496              0          0             2          1         4   
38497              0          1             1          1         4   
38498              0          1             4          1         4   

       cancellation_policy  cleaning_fee  city  host_identity_verified  \
0                        2             1     0                       1   
1          

In [None]:
# Standardizating data
# X_scaled = StandardScaler().fit_transform(X)

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

In [21]:
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.38, random_state=42)

In [22]:
# Sanity Checks
len(X_train) == len(y_train)
len(X_val) == len(y_val)
len(X_test) == len(y_test)

True

In [23]:
print("Number of Rows for Training Set: ", len(X_train))
print("Number of Rows for Validation Set: ", len(X_val))
print("Number of Rows for Testing Set: ", len(X_test))
# The validation set is smaller than the testing set. This is why I have the second train_test_split in
# an odd arrangement, had to make sure val was smaller than test

Number of Rows for Training Set:  23099
Number of Rows for Validation Set:  5852
Number of Rows for Testing Set:  9548


In [24]:
# Regression Baseline
y_pred = [y_train.mean()] * len(y_train)

baseline_mae = mean_absolute_error(y_train, y_pred)
print('Baseline MAE:', baseline_mae)

Baseline MAE: 79.97292301694209


## Building Our Model

In [25]:
# get dim of image row vectors and save to imput_dim
imput_dim = X.shape[1]

In [26]:
def build_regression_model(hp):
    
    """
    Returns a complied keras model ready for keras-tuner gridsearch algorithms 
    """
    
    model = Sequential()
    
    # 1st hidden layer
    model.add(Dense(input_dim=imput_dim,
                    units=hp.get('units'),
                    activation=hp.get("activation")))
    
    # 2nd hidden layer 
    model.add(Dense(units=hp.get('units'),
                    activation=hp.get("activation")))
    
    # 3rd hidden layer 
    model.add(Dense(units=hp.get('units'),
                    activation=hp.get("activation")))
    
    # output layer
    model.add(Dense(1, activation='relu'))
    
    model.compile(
        optimizer=tf.keras.optimizers.RMSprop(hp.get('learning_rate')),
        loss='mse',
        metrics=['mae'])
    
    return model

In [27]:
# build out our hyperparameter dictionary 
hp = HyperParameters()
hp.Int('units', min_value=32, max_value=512, step=32)
hp.Choice('learning_rate',values=[1e-1, 1e-2, 1e-3])
hp.Choice('activation',values=["linear", "relu"])

'linear'

In [28]:
n_unique_hparam_combos = len(range(32,512+32, 32)) * 3 *2
n_param_combos_to_sample = n_unique_hparam_combos * .25

In [30]:
random_tuner = RandomSearch(
            build_regression_model,
            objective='val_mae',
            max_trials=n_param_combos_to_sample, # number of times to sample the parameter set and build a model 
            seed=1234,
            hyperparameters=hp, # pass in our hyperparameter dictionary
            directory='./keras-tuner-trial',
            project_name='random_search')

In [55]:
# take note of Total elapsed time in print out
random_tuner.search(X_train,
                    y_train,
                    epochs=10,
                    validation_data=(X_val, y_val))

INFO:tensorflow:Oracle triggered exit


In [56]:
# identify the best score and hyperparamter (should be at the top since scores are ranked)
random_tuner.results_summary()

Results summary
Results in ./keras-tuner-trial/random_search
Showing 10 best trials
Objective(name='val_mae', direction='min')
Trial summary
Hyperparameters:
units: 480
learning_rate: 0.001
activation: relu
Score: 146.27255249023438
Trial summary
Hyperparameters:
units: 96
learning_rate: 0.1
activation: linear
Score: 146.27255249023438
Trial summary
Hyperparameters:
units: 352
learning_rate: 0.001
activation: linear
Score: 146.27255249023438
Trial summary
Hyperparameters:
units: 384
learning_rate: 0.01
activation: linear
Score: 146.27255249023438
Trial summary
Hyperparameters:
units: 288
learning_rate: 0.1
activation: linear
Score: 146.27255249023438
Trial summary
Hyperparameters:
units: 416
learning_rate: 0.001
activation: relu
Score: 146.27255249023438
Trial summary
Hyperparameters:
units: 224
learning_rate: 0.001
activation: relu
Score: 146.27255249023438
Trial summary
Hyperparameters:
units: 96
learning_rate: 0.01
activation: linear
Score: 146.27255249023438
Trial summary
Hyperpara

# Trial summary


## Hyperparameters:

*units:* **480**

*learning_rate:* **0.001**

*activation:* **relu**

---
*Score:* **146.27255249023438**

In [62]:
## Plugging in best params

# Instantiating our model's class architecture
model = Sequential()
    
# hidden layer
model.add(Dense(140,
                input_dim=imput_dim,
                activation='relu'))

model.add(Dense(240,
                activation='relu'))

model.add(Dense(480,
                activation='relu'))

model.add(Dense(240,
                activation='relu'))
    
# output layer
model.add(Dense(1,
                activation='relu'))

# Assigning learning rate to RMSprop optimizer
opt = tf.keras.optimizers.RMSprop(learning_rate=0.001)

# compiling our model architecture with loss function & corresponding metric
model.compile(optimizer=opt,
              loss='mse',
              metrics=['mae'])

In [63]:
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_9 (Dense)              (None, 140)               2240      
_________________________________________________________________
dense_10 (Dense)             (None, 240)               33840     
_________________________________________________________________
dense_11 (Dense)             (None, 480)               115680    
_________________________________________________________________
dense_12 (Dense)             (None, 240)               115440    
_________________________________________________________________
dense_13 (Dense)             (None, 1)                 241       
Total params: 267,441
Trainable params: 267,441
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Use TensorBoard if you're running this on Colab

# %load_ext tensorboard

# import os
# import datetime

# logdir = os.path.join("logs", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
# tensorboard_callback = tf.keras.callbacks.TensorBoard(logdir, histogram_freq=1)

In [64]:
# Readying our EarlyStop callback
stop_callback = EarlyStopping(monitor='val_loss', min_delta=0.0001, patience=5, mode="min")

In [82]:
# Fitting model
model.fit(X_train,
          y_train,
          batch_size=32,
          epochs=20,
          validation_data=(X_val, y_val),
          callbacks=[stop_callback])

# Save the entire model as a Saved Model.
model.save('retrained_rs_model')

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
INFO:tensorflow:Assets written to: retrained_rs_model/assets


In [83]:
new_model = tf.keras.models.load_model('retrained_rs_model')

In [84]:
new_model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_9 (Dense)              (None, 140)               2240      
_________________________________________________________________
dense_10 (Dense)             (None, 240)               33840     
_________________________________________________________________
dense_11 (Dense)             (None, 480)               115680    
_________________________________________________________________
dense_12 (Dense)             (None, 240)               115440    
_________________________________________________________________
dense_13 (Dense)             (None, 1)                 241       
Total params: 267,441
Trainable params: 267,441
Non-trainable params: 0
_________________________________________________________________


In [None]:
# %load_ext tensorboard
# %tensorboard --logdir logs

In [70]:
# Evaluating model's predictive power
predictions = model.predict(X_test, batch_size=32)
predictions

array([[0.],
       [0.],
       [0.],
       ...,
       [0.],
       [0.],
       [0.]], dtype=float32)

In [71]:
# Actual prices
y_test

3494      99.0
24311    110.0
12761     50.0
9375     112.0
1465     119.0
         ...  
12135    140.0
13468    130.0
32132     89.0
20233     70.0
14657    140.0
Name: price, Length: 9548, dtype: float64