In [1]:
# The star import is kept first so that the explicit imports below take
# precedence over any names it brings in.
# NOTE(review): prefer importing the needed helpers (e.g. tokenize) by name.
from word2vec_functions import *

from math import sqrt

import h2o
import numpy as np
import pandas as pd
import sklearn
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.word2vec import H2OWord2vecEstimator
from sklearn import ensemble
from sklearn.compose import make_column_transformer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

# Connect to (or start) the local H2O cluster once all imports have succeeded.
h2o.init()


Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O_cluster_uptime:,17 mins 41 secs
H2O_cluster_timezone:,America/Chicago
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.36.1.1
H2O_cluster_version_age:,29 days
H2O_cluster_name:,H2O_from_python_abbyjohnson_jboy44
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,630 Mb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


In [2]:
# Initializing word2vec model
airbnb_path = "../data/newnh_airbnb_2021.csv"
airbnb_names = h2o.import_file(airbnb_path, destination_frame = "airbnbnames",
                             header = 1)
# Create word vectors
words = tokenize(airbnb_names["name"])
w2v_model = H2OWord2vecEstimator(sent_sample_rate = 0.0, epochs = 10)
w2v_model.train(training_frame = words)
airbnb_names_vecs = w2v_model.transform(words, aggregate_method = "AVERAGE")
valid_airbnb_names = ~ airbnb_names_vecs["C1"].isna()
data = airbnb_names[valid_airbnb_names,:].cbind(airbnb_names_vecs[valid_airbnb_names,:])
data_split = data.split_frame(ratios=[0.8])

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
word2vec Model Build progress: |█████████████████████████████████████████████████| (done) 100%


In [3]:
gbm_model = H2OGradientBoostingEstimator()
gbm_model.train(x = airbnb_names_vecs.names,
                y= "price", 
                training_frame = data_split[0], 
                validation_frame = data_split[1])

# We can't just rely on the names because RMSE is pretty high

varimp = gbm_model.varimp(use_pandas=True)
top_20 = list(varimp.head(20)["variable"])
print(top_20)

gbm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
['C83', 'C31', 'C48', 'C84', 'C62', 'C66', 'C45', 'C67', 'C20', 'C94', 'C53', 'C5', 'C70', 'C32', 'C88', 'C81', 'C16', 'C96', 'C49', 'C15']


In [4]:
## In word2vec.ipynb, we see the top 20 most important categorizations
# Let's filter just for those 20
top_20_list = list(set(airbnb_names_vecs.names)  & set(top_20))
airbnb_names_vecs = airbnb_names_vecs[valid_airbnb_names, :]
airbnb_names_vecs = airbnb_names_vecs[top_20_list]
airbnb_names_vecs
#data = airbnb_names[valid_airbnb_names,:].cbind(airbnb_names_vecs[valid_airbnb_names,:])
#data

C49,C5,C84,C67,C15,C32,C81,C83,C20,C16,C53,C48,C62,C45,C70,C88,C96,C66,C31,C94
-0.226829,-0.00331929,-0.100867,0.0348192,0.136442,0.0316466,-0.194046,0.129924,-0.0339271,-0.012946,0.070784,-0.0885248,0.0866059,-0.252656,0.0363556,-0.107741,0.0716062,-0.0564377,-0.10197,0.0225174
-0.385209,0.00725589,0.0308885,0.118868,-0.0596237,-0.0917521,-0.0989658,0.107034,0.146662,0.0453749,0.235285,-0.0668755,0.0787157,-0.180755,-0.0655932,-0.0301153,-0.135611,-0.237574,-0.0891468,0.100429
-0.227481,0.156333,-0.0934061,0.0687227,-0.0164939,0.154562,-0.159851,0.253024,-0.0579056,-0.0131639,0.184164,-0.246212,0.234722,-0.13871,0.0380687,-0.0180144,0.0809035,-0.0148315,0.130846,-0.0874411
-0.191128,-0.0149406,-0.0192827,0.149606,-0.0864978,0.0143837,-0.165046,-0.155068,0.0103688,0.230515,0.171771,-0.253718,0.199631,0.0251822,-0.0712964,0.16581,-0.121677,-0.133927,0.0305516,-0.174662
-0.222934,-0.00747541,0.203948,0.0994162,0.0134285,-0.0807283,-0.0581093,-0.068858,0.115496,0.198769,0.372385,0.0288504,0.0296374,-0.072911,-0.0770348,-0.0433355,-0.0762749,0.0210361,0.0903293,-0.239701
-0.12449,-0.0773138,-0.0413609,0.171925,0.0308794,0.0259176,-0.111597,-0.0650996,0.0150474,0.179538,0.300659,-0.137604,0.246534,-0.0716339,-0.0213,0.0907684,0.188557,0.0381672,-0.181886,-0.173324
-0.303421,-0.388135,-0.0609552,0.0526506,-0.110009,0.0479563,-0.0429476,-0.0264059,-0.119237,0.125561,0.171829,0.00539542,-0.0167616,-0.00501219,-0.0934137,-0.0870991,0.0407977,-0.154033,-0.0830255,-0.276597
-0.241772,0.244356,0.262874,0.125109,0.0207956,-0.0786425,-0.0763643,0.0137846,-0.0255451,-0.0366834,0.244543,-0.153159,0.207124,-0.198232,-0.0839708,0.0119481,-0.143629,-0.147384,0.161334,-0.202488
-0.11327,-0.0854524,0.117834,0.17582,0.0246111,-0.109778,-0.325837,-0.153545,0.272897,0.102445,0.063881,-0.192117,0.0938325,-0.0733746,0.0744583,0.104653,0.108306,-0.130916,-0.205027,-0.163951
-0.283534,0.132475,0.0529393,0.047105,-0.0216184,0.0191014,-0.110142,0.145835,-0.0683666,0.00160141,0.293288,-0.108489,0.0729414,-0.0967388,-0.195423,0.0700178,0.056304,0.00316367,0.0249952,-0.163202




In [5]:
data = airbnb_names[valid_airbnb_names,:].cbind(airbnb_names_vecs).as_data_frame()
data.dtypes

id                                  int64
name                               object
host_id                             int64
host_name                          object
neighbourhood_group                object
neighbourhood                      object
latitude                          float64
longitude                         float64
room_type                          object
price                               int64
minimum_nights                      int64
number_of_reviews                   int64
last_review                       float64
reviews_per_month                 float64
calculated_host_listings_count      int64
availability_365                    int64
number_of_reviews_ltm               int64
license                           float64
new_neighbourhood                  object
C49                               float64
C5                                float64
C84                               float64
C67                               float64
C15                               

In [6]:
not_used = ["id", "host_id", "name", "host_name", "neighbourhood", "latitude", "longitude", "price", "last_review"]
X = data.drop(not_used, axis = 1)
# We want numbers to be of same type
dict_ints = {'minimum_nights': float,
             'number_of_reviews': float,
             'calculated_host_listings_count': float,
             'availability_365': float,
             'number_of_reviews_ltm': float
}
X = X.astype(dict_ints)
dict_cats = { 
    "neighbourhood_group": "category",
    "room_type": "category",
    "new_neighbourhood": "category"
}
list_cats = list(dict_cats.keys())

X = X.astype(dict_cats)
X.dtypes


# Convert y to float as well
y = data["price"].astype(float)

# One hot encode room type
encoder = OneHotEncoder(handle_unknown = "ignore")
encoder_df = pd.DataFrame(encoder.fit_transform(X[["room_type"]]).toarray())
# Merge with original dataset
X_df = X.join(encoder_df)

# Filter for top neighborhoods

# One hot encode neighborhoods
encoder_n = pd.DataFrame(encoder.fit_transform(X[["neighbourhood_group"]]).toarray())
encoder_n.columns = [4, 5, 6, 7, 8]
# Merge with original dataset
X = X_df.join(encoder_n)
#Drop original categorical vartiables
X.dtypes
X = X.drop(list_cats, axis = 1)
X.dtypes # all variables are floats
X = X.fillna(0)


In [None]:
## Create gradient-boosted model
# All feature names must be strings
X.columns = X.columns.astype(str)
# Split into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)
params = {
    "n_estimators": 500,
    "max_depth": 10,
    "min_samples_split": 5,
    "learning_rate": 0.01,
    "loss": "squared_error"}
reg = ensemble.GradientBoostingRegressor(**params)
reg.fit(X_train, y_train)
print(X_train)

In [None]:
from sklearn.metrics import mean_squared_error
rmse = sqrt(mean_squared_error(y_test, reg.predict(X_test))) # manually extract rmse from mse
print(rmse) #out-of-sample rmse

In [None]:
# variable importance
