In [23]:
# Importing Libraries
import pandas as pd
from pandas_profiling import ProfileReport
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import numpy as np
import pandarallel
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
import keras
import tensorflow as tf
from kerastuner.engine.hyperparameters import HyperParameters
from keras.activations import relu
from keras.optimizers import RMSprop
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, ReLU
from kerastuner.tuners import RandomSearch, Sklearn
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard
import math
import os

In [24]:
filename = "./EDA Notebook - Encoded.csv"

In [25]:
df = pd.read_csv(filename)

In [26]:
# Sorting DataFrame by our index before setting it as such
# df.sort_values(by=["host_since"], inplace=True, ascending=True)   # host_since is the closest thing to a date or date_time column we have
# df.set_index("host_since", inplace=True)

In [27]:
df.head()

Unnamed: 0.1,Unnamed: 0,property_type,room_type,accommodates,bathrooms,bed_type,cancellation_policy,cleaning_fee,city,host_identity_verified,host_since,instant_bookable,review_scores_rating,zipcode,bedrooms,beds,price
0,1,0,0,7,1.0,4,2,1,4,0,2884,1,93.0,10019,3.0,3.0,169.0
1,2,0,0,5,1.0,4,1,1,4,1,2647,1,92.0,10027,1.0,3.0,145.0
2,5,0,1,2,1.0,4,2,1,5,1,2872,1,100.0,94131,1.0,1.0,85.0
3,7,10,0,2,1.0,4,1,1,3,1,1391,0,93.0,90015,1.0,1.0,120.0
4,8,16,1,2,1.0,4,1,1,5,0,2138,0,99.0,94121,1.0,1.0,120.0


In [28]:
df.shape

(38499, 17)

In [29]:
# Splitting data into target and feature matrix
target = 'price'
X = df.drop(columns=target)
y = df[target]

In [30]:
# Sanity Check
assert len(X) == len(y)

In [31]:
X.dtypes

Unnamed: 0                  int64
property_type               int64
room_type                   int64
accommodates                int64
bathrooms                 float64
bed_type                    int64
cancellation_policy         int64
cleaning_fee                int64
city                        int64
host_identity_verified      int64
host_since                  int64
instant_bookable            int64
review_scores_rating      float64
zipcode                     int64
bedrooms                  float64
beds                      float64
dtype: object

In [32]:
print(X)

       Unnamed: 0  property_type  room_type  accommodates  bathrooms  \
0               1              0          0             7        1.0   
1               2              0          0             5        1.0   
2               5              0          1             2        1.0   
3               7             10          0             2        1.0   
4               8             16          1             2        1.0   
...           ...            ...        ...           ...        ...   
38494       74102             29          2            10        3.0   
38495       74103              0          0             2        1.0   
38496       74107              0          0             4        2.0   
38497       74108              0          0             5        1.0   
38498       74110              2          0             4        1.0   

       bed_type  cancellation_policy  cleaning_fee  city  \
0             4                    2             1     4   
1             4

In [33]:
# Standardizating data
X_scaled = StandardScaler().fit_transform(X)

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

In [35]:
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.38, random_state=42)

In [36]:
# Sanity Checks
len(X_train) == len(y_train)
len(X_val) == len(y_val)
len(X_test) == len(y_test)

True

In [37]:
print("Number of Rows for Training Set: ", len(X_train))
print("Number of Rows for Validation Set: ", len(X_val))
print("Number of Rows for Testing Set: ", len(X_test))
# The validation set is smaller than the testing set. This is why I have the second train_test_split in
# an odd arrangement, had to make sure val was smaller than test

Number of Rows for Training Set:  23099
Number of Rows for Validation Set:  5852
Number of Rows for Testing Set:  9548


In [77]:
# Regression Baseline
y_pred = [y_train.mean()] * len(y_train)

baseline_mae = mean_absolute_error(y_train, y_pred)
print('Baseline MAE:', baseline_mae)

Baseline MAE: 79.53030841010894


## Building Our Model

In [39]:
# get dim of image row vectors and save to imput_dim
imput_dim = X.shape[1]

In [40]:
def build_regression_model(hp):
    
    """
    Returns a complied keras model ready for keras-tuner gridsearch algorithms 
    """
    
    model = Sequential()
    
    # 1st hidden layer
    model.add(Dense(input_dim=imput_dim,
                    units=hp.get('units'),
                    activation=hp.get("activation")))
    
    # 2nd hidden layer 
    model.add(Dense(units=hp.get('units'),
                    activation=hp.get("activation")))
    
    # 2nd hidden layer 
    model.add(Dense(units=hp.get('units'),
                    activation=hp.get("activation")))
    
    # output layer
    model.add(Dense(1, activation='relu'))
    
    model.compile(
        optimizer=tf.keras.optimizers.RMSprop(hp.get('learning_rate')),
        loss='mse',
        metrics=['mae'])
    
    return model

In [41]:
# build out our hyperparameter dictionary 
hp = HyperParameters()
hp.Int('units', min_value=32, max_value=512, step=32)
hp.Choice('learning_rate',values=[1e-1, 1e-2, 1e-3])
hp.Choice('activation',values=["linear", "relu"])

'linear'

In [42]:
n_unique_hparam_combos = len(range(32,512+32, 32)) * 3 *2
n_param_combos_to_sample = n_unique_hparam_combos * .25

In [54]:
random_tuner = RandomSearch(
            build_regression_model,
            objective='val_mae',
            max_trials=n_param_combos_to_sample, # number of times to sample the parameter set and build a model 
            seed=1234,
            hyperparameters=hp, # pass in our hyperparameter dictionary
            directory='./keras-tuner-trial',
            project_name='random_search')

In [57]:
# take note of Total elapsed time in print out
random_tuner.search(X_train,
                    y_train,
                    epochs=3,
                    validation_data=(X_val, y_val))

INFO:tensorflow:Oracle triggered exit


In [58]:
# identify the best score and hyperparamter (should be at the top since scores are ranked)
random_tuner.results_summary()

Results summary
Results in ./keras-tuner-trial/random_search
Showing 10 best trials
Objective(name='val_mae', direction='min')
Trial summary
Hyperparameters:
units: 480
learning_rate: 0.001
activation: relu
Score: 144.88278198242188
Trial summary
Hyperparameters:
units: 96
learning_rate: 0.1
activation: linear
Score: 144.88278198242188
Trial summary
Hyperparameters:
units: 352
learning_rate: 0.001
activation: linear
Score: 144.88278198242188
Trial summary
Hyperparameters:
units: 384
learning_rate: 0.01
activation: linear
Score: 144.88278198242188
Trial summary
Hyperparameters:
units: 288
learning_rate: 0.1
activation: linear
Score: 144.88278198242188
Trial summary
Hyperparameters:
units: 416
learning_rate: 0.001
activation: relu
Score: 144.88278198242188
Trial summary
Hyperparameters:
units: 224
learning_rate: 0.001
activation: relu
Score: 144.88278198242188
Trial summary
Hyperparameters:
units: 96
learning_rate: 0.01
activation: linear
Score: 144.88278198242188
Trial summary
Hyperpara

# Trial summary


## Hyperparameters:

*units:* **416**

*learning_rate:* **0.001**

*activation:* **relu**

---
*Score:* **91.08583068847656**

In [78]:
## Plugging in best params

# Instantiating our model's class architecture
model = Sequential()
    
# hidden layer
model.add(Dense(10,
                input_dim=imput_dim,  # Input layer
                activation='relu'))

# 2nd hidden layer
model.add(Dense(480, activation='relu'))
    
# output layer
model.add(Dense(1,
                activation='relu'))

# Assigning learning rate to RMSprop optimizer
opt = tf.keras.optimizers.RMSprop(learning_rate=0.001)

# compiling our model architecture with loss function & corresponding metric
model.compile(optimizer=opt,
              loss='mse',
              metrics=['mae'])

In [60]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_4 (Dense)              (None, 10)                170       
_________________________________________________________________
dense_5 (Dense)              (None, 480)               5280      
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 481       
Total params: 5,931
Trainable params: 5,931
Non-trainable params: 0
_________________________________________________________________


In [48]:
# Use TensorBoard if you're running this on Colab

# %load_ext tensorboard

# import os
# import datetime

# logdir = os.path.join("logs", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
# tensorboard_callback = tf.keras.callbacks.TensorBoard(logdir, histogram_freq=1)

In [74]:
# Readying our EarlyStop callback
# stop_callback = EarlyStopping(monitor='val_loss', min_delta=0.01, patience=3, mode="min")

In [76]:
# Fitting model
model.fit(X_train,
          y_train,
          batch_size=32,
          epochs=20,
          validation_data=(X_val, y_val))

# Save the entire model as a SavedModel.
!mkdir -p saved_model
model.save('saved_model/my_model')

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
INFO:tensorflow:Assets written to: saved_model/my_model/assets


In [64]:
# %load_ext tensorboard
# %tensorboard --logdir logs

In [70]:
# Evaluating model's predictive power
predictions = model.predict(X_test, batch_size=32)
predictions

array([[0.],
       [0.],
       [0.],
       ...,
       [0.],
       [0.],
       [0.]], dtype=float32)

In [53]:
# Actual prices
y_test

3494     375.0
24311     60.0
12761     72.0
9375     150.0
1465      42.0
         ...  
12135     78.0
13468     43.0
32132     80.0
20233     95.0
14657     75.0
Name: price, Length: 9548, dtype: float64

Not a good model. Going to try a GridSearchCV & BayesianOptimization next. This will have to suffice considering its 4:27 AM.