In [33]:
from word2vec_functions import *
import h2o
h2o.init()
import numpy as np
import pandas as pd
from h2o.estimators.word2vec import H2OWord2vecEstimator
from h2o.estimators.gbm import H2OGradientBoostingEstimator
import sklearn
from sklearn import ensemble
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O_cluster_uptime:,4 hours 46 mins
H2O_cluster_timezone:,America/Chicago
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.36.1.1
H2O_cluster_version_age:,27 days
H2O_cluster_name:,H2O_from_python_csridlen_enbzy1
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,1.618 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


In [2]:
# Initializing word2vec model
airbnb_path = "../data/newnh_airbnb_2021.csv"
airbnb_names = h2o.import_file(airbnb_path, destination_frame = "airbnbnames",
                             header = 1)
# Create word vectors
words = tokenize(airbnb_names["name"])
w2v_model = H2OWord2vecEstimator(sent_sample_rate = 0.0, epochs = 10)
w2v_model.train(training_frame = words)
airbnb_names_vecs = w2v_model.transform(words, aggregate_method = "AVERAGE")
valid_airbnb_names = ~ airbnb_names_vecs["C1"].isna()
data = airbnb_names[valid_airbnb_names,:].cbind(airbnb_names_vecs[valid_airbnb_names,:])
data_split = data.split_frame(ratios=[0.8])

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
word2vec Model Build progress: |█████████████████████████████████████████████████| (done) 100%


In [3]:
gbm_model = H2OGradientBoostingEstimator()
gbm_model.train(x = airbnb_names_vecs.names,
                y= "price", 
                training_frame = data_split[0], 
                validation_frame = data_split[1])

# We can't just rely on the names because RMSE is pretty high

varimp = gbm_model.varimp(use_pandas=True)
top_20 = list(varimp.head(20)["variable"])
print(top_20)

gbm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
['C92', 'C97', 'C33', 'C60', 'C30', 'C46', 'C62', 'C16', 'C53', 'C81', 'C9', 'C84', 'C49', 'C66', 'C29', 'C85', 'C17', 'C57', 'C65', 'C21']


In [4]:
## In word2vec.ipynb, we see the top 20 most important categorizations
# Let's filter just for those 20
top_20_list = list(set(airbnb_names_vecs.names)  & set(top_20))
airbnb_names_vecs = airbnb_names_vecs[valid_airbnb_names, :]
airbnb_names_vecs = airbnb_names_vecs[top_20_list]
airbnb_names_vecs
#data = airbnb_names[valid_airbnb_names,:].cbind(airbnb_names_vecs[valid_airbnb_names,:])
#data

C30,C65,C85,C17,C81,C46,C29,C21,C97,C49,C16,C33,C92,C9,C62,C57,C53,C66,C60,C84
0.138452,0.0882096,-0.109111,-0.0932079,-0.0281687,0.0839297,0.346719,0.0564687,-0.130399,-0.264949,0.00569484,0.158718,-0.0437643,-0.00786723,0.00894642,-0.111275,-0.0232239,-0.158035,-0.211503,-0.162331
0.10467,0.211296,-0.192658,-0.0459824,-0.145458,0.190162,0.186416,0.0864193,-0.239544,-0.241183,-0.0661186,-0.0618359,0.251197,0.0353954,0.106801,0.0756906,0.0167082,-0.277006,-0.131027,0.058207
0.0256332,-0.0220693,-0.256189,-0.0447542,0.0881627,0.0053105,0.231273,0.0596998,-0.136719,-0.281873,0.0993152,-0.173435,0.00481443,-0.00576418,0.12721,0.199854,-0.0347593,0.0589286,-0.23413,0.00216571
0.0915846,0.293155,-0.0973103,-0.190998,-0.213081,0.10819,0.181367,0.0348803,-0.159283,-0.204446,0.158485,0.0732135,-0.0690258,0.0393661,0.188906,0.149343,0.235994,0.00503856,-0.117875,0.0336975
0.162707,0.110302,-0.0989238,-0.0865106,-0.144345,0.267411,0.220156,-0.0935621,-0.202636,-0.271676,-0.0247078,0.0837265,0.00236096,0.142765,0.0088824,0.0231127,0.23223,0.0177517,-0.0252448,0.162048
0.0254488,0.0652406,-0.335152,0.0215001,-0.101563,0.119857,0.0325376,0.0254503,-0.108634,-0.299177,0.119306,-0.149501,0.114587,0.0832183,0.0430904,-0.0527559,0.183255,0.0427877,-0.0429539,0.116422
0.276262,0.070509,0.070852,0.000369769,-0.183965,0.379194,0.182335,-0.044723,0.0972133,-0.19825,-0.167795,-0.0412998,0.140137,0.174682,-0.0543366,-0.0327202,0.143864,-0.173741,-0.315762,-0.0957297
0.341587,0.0148213,-0.142423,-0.347568,0.0320922,0.147152,0.138503,-0.190036,-0.187483,-0.212368,-0.0448324,-0.0501488,0.168962,0.218561,0.199515,0.158083,0.0451653,-0.0449297,0.0137013,0.201687
0.317527,0.125857,-0.224148,-0.0568766,-0.15614,-0.0403699,0.375767,0.0274122,-0.180391,-0.372636,0.136539,-0.0340165,-0.208416,0.303544,0.161755,0.124959,0.0715356,0.0779732,-0.198293,0.115918
-0.0126726,0.12384,0.0443981,-0.143841,-0.0435344,0.218653,0.0859209,0.118677,-0.168895,-0.147903,0.0428687,-0.151905,-0.115328,0.0640788,0.0147373,-0.047455,0.127387,-0.109779,-0.0996156,0.0886643




In [47]:
data = airbnb_names[valid_airbnb_names,:].cbind(airbnb_names_vecs).as_data_frame()
data

Index([                            'id',                 'minimum_nights',
                    'number_of_reviews',              'reviews_per_month',
       'calculated_host_listings_count',               'availability_365',
                'number_of_reviews_ltm',                        'license',
                    'new_neighbourhood',                            'C30',
                                  'C65',                            'C85',
                                  'C17',                            'C81',
                                  'C46',                            'C29',
                                  'C21',                            'C97',
                                  'C49',                            'C16',
                                  'C33',                            'C92',
                                   'C9',                            'C62',
                                  'C57',                            'C53',
                         

In [83]:
not_used = ["name", "neighbourhood", "neighbourhood_group", "latitude", "longitude", "price", "last_review"]
X = data.drop(not_used, axis = 1)
X
## One-hot encoding for categorical variables
encoder = OneHotEncoder(handle_unknown = 'ignore')
# encode all categorical variables
X_encode = pd.DataFrame(encoder.fit_transform(X[["id", "host_id", "host_name", "room_type", "new_neighbourhood"]]))
X = pd.concat([X, X_encode], axis = 1, join = "outer")
# Drop original categorical variables that are now o-h-e
cat = ["id", "host_id", "host_name", "room_type", "new_neighbourhood"]
X = X.drop(cat, axis = 1)
y = data.loc[ :, data.columns == "price"]
X.dtypes
dict_ints = {'minimum_nights': float,
             'number_of_reviews': float,
             'calculated_host_listings_count': float,
             'availability_365': float,
             'number_of_reviews_ltm': float
}
X = X.astype(dict_ints)
y = y.astype(float)

In [84]:
## Create gradient-boosted model

# Split into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

params = {
    "n_estimators": 500,
    "max_depth": 4,
    "min_samples_split": 5,
    "learning_rate": 0.001,
    "loss": "squared_error"}
reg = ensemble.GradientBoostingRegressor(**params)
reg.fit(X_train, y_train)
rmse = root_mean_squared_error(y_test, reg.predict(X_test))



ValueError: setting an array element with a sequence.

In [69]:
type(X_train)

pandas.core.frame.DataFrame