In [18]:
import h2o
h2o.init()
from h2o.estimators.word2vec import H2OWord2vecEstimator
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from datetime import datetime
import numpy as np
import pandas as pd

Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O_cluster_uptime:,1 min 51 secs
H2O_cluster_timezone:,America/Chicago
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.36.1.1
H2O_cluster_version_age:,25 days
H2O_cluster_name:,abbyjohnson
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,1.761 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


In [19]:
airbnb_names_path = "../data/airbnb_listings_2021.csv"


In [23]:
airbnb_names = h2o.import_file(airbnb_names_path, destination_frame = "airbnbnames",
                             header = 1)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [41]:
STOP_WORDS = ["w/","at","from","in","to","/","*","-","w","+","and","&", "near", "next"]

In [42]:
def tokenize(sentences, stop_word = STOP_WORDS):
    tokenized = sentences.tokenize("\\W+")
    tokenized_lower = tokenized.tolower()
    tokenized_filtered = tokenized_lower[(tokenized_lower.nchar() >= 2) | (tokenized_lower.isna()),:]
    tokenized_words = tokenized_filtered[tokenized_filtered.grep("[0-9]",invert=True,output_logical=True),:]
    tokenized_words = tokenized_words[(tokenized_words.isna()) | (~ tokenized_words.isin(STOP_WORDS)),:]
    return tokenized_words

In [43]:
def predict(airbnb_names,w2v, gbm):
    words = tokenize(h2o.H2OFrame(airbnb_names).ascharacter())
    airbnb_names_vec = w2v.transform(words, aggregate_method="AVERAGE")
    print(gbm.predict(test_data=airbnb_names_vec))

In [44]:
print("Break airbnb names into sequence of words")
words = tokenize(airbnb_names["name"])

Break airbnb names into sequence of words


In [45]:
print("Build word2vec model")
w2v_model = H2OWord2vecEstimator(sent_sample_rate = 0.0, epochs = 10)
w2v_model.train(training_frame=words)

Build word2vec model
word2vec Model Build progress: |█████████████████████████████████████████████████| (done) 100%
Model Details
H2OWord2vecEstimator :  Word2Vec
Model Key:  Word2Vec_model_python_1652126024331_5

No model summary for this model




In [46]:
print("Sanity check - find synonyms for the word 'clean'")
w2v_model.find_synonyms("clean", count = 5)

Sanity check - find synonyms for the word 'clean'


OrderedDict([('convenient', 0.6348998546600342),
             ('comfortable', 0.5299177169799805),
             ('environment', 0.5138055682182312),
             ('cute', 0.5103007555007935),
             ('cheerful', 0.5038788318634033)])

In [47]:
print("Calculate a vector for each airbnb name")
airbnb_names_vecs = w2v_model.transform(words, aggregate_method = "AVERAGE")

Calculate a vector for each airbnb name


In [48]:
print("Prepare training&validation data (keep only names made of known words)")
valid_airbnb_names = ~ airbnb_names_vecs["C1"].isna()
data = airbnb_names[valid_airbnb_names,:].cbind(airbnb_names_vecs[valid_airbnb_names,:])
data_split = data.split_frame(ratios=[0.8])

Prepare training&validation data (keep only names made of known words)


In [49]:
print("Build a basic GBM model")
gbm_model = H2OGradientBoostingEstimator()
gbm_model.train(x = airbnb_names_vecs.names,
                y= "price", 
                training_frame = data_split[0], 
                validation_frame = data_split[1])

Build a basic GBM model
gbm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
Model Details
H2OGradientBoostingEstimator :  Gradient Boosting Machine
Model Key:  GBM_model_python_1652126024331_6


Model Summary: 


Unnamed: 0,Unnamed: 1,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
0,,50.0,50.0,9862.0,5.0,5.0,5.0,6.0,21.0,11.04




ModelMetricsRegression: gbm
** Reported on train data. **

MSE: 63548.62407433614
RMSE: 252.0885242813249
MAE: 96.8214342860207
RMSLE: NaN
Mean Residual Deviance: 63548.62407433614

ModelMetricsRegression: gbm
** Reported on validation data. **

MSE: 95966.48017692077
RMSE: 309.7845705920822
MAE: 101.73448832453057
RMSLE: NaN
Mean Residual Deviance: 95966.48017692077

Scoring History: 


Unnamed: 0,Unnamed: 1,timestamp,duration,number_of_trees,training_rmse,training_mae,training_deviance,validation_rmse,validation_mae,validation_deviance
0,,2022-05-09 15:10:04,0.042 sec,0.0,283.929606,111.576063,80616.021241,314.501139,113.196552,98910.966746
1,,2022-05-09 15:10:05,0.252 sec,1.0,282.134375,110.457293,79599.805407,313.009403,112.032782,97974.88621
2,,2022-05-09 15:10:05,0.368 sec,2.0,279.414355,109.25187,78072.381787,312.328964,110.975528,97549.381684
3,,2022-05-09 15:10:05,0.474 sec,3.0,277.933607,108.339127,77247.089864,311.197426,110.174469,96843.83816
4,,2022-05-09 15:10:05,0.603 sec,4.0,276.085004,107.506191,76222.929558,310.65519,109.397761,96506.647243
5,,2022-05-09 15:10:05,0.696 sec,5.0,275.086487,106.700746,75672.575311,309.749319,108.658169,95944.640513
6,,2022-05-09 15:10:05,0.798 sec,6.0,273.416167,106.124343,74756.400623,309.621665,108.316545,95865.575485
7,,2022-05-09 15:10:05,0.907 sec,7.0,271.894993,105.306321,73926.887045,309.212738,107.65242,95612.517346
8,,2022-05-09 15:10:05,1.060 sec,8.0,270.946528,104.749718,73412.021017,309.009172,107.178893,95486.668119
9,,2022-05-09 15:10:06,1.143 sec,9.0,270.042493,104.271112,72922.947876,308.394593,106.748778,95107.225209



See the whole table with table.as_data_frame()

Variable Importances: 


Unnamed: 0,variable,relative_importance,scaled_importance,percentage
0,C27,362290912.0,1.0,0.136884
1,C83,133559256.0,0.368652,0.050462
2,C32,131694104.0,0.363504,0.049758
3,C84,111575160.0,0.307971,0.042156
4,C51,101447688.0,0.280017,0.03833
5,C25,92763960.0,0.256048,0.035049
6,C40,86613544.0,0.239072,0.032725
7,C93,78999264.0,0.218055,0.029848
8,C10,72973032.0,0.201421,0.027571
9,C87,67833648.0,0.187235,0.025629



See the whole table with table.as_data_frame()




In [50]:
print("Predict!")
print(predict(["Cozy & Clean Apartment for Two"], w2v_model, gbm_model))
print(predict(["Private Room Near Central Park"], w2v_model, gbm_model))
print(predict(["Charming Apartment Walking Distance from Times Square"], w2v_model, gbm_model))

Predict!
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
gbm prediction progress: |███████████████████████████████████████████████████████| (done) 100%


predict
129.15



None
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
gbm prediction progress: |███████████████████████████████████████████████████████| (done) 100%


predict
101.527



None
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
gbm prediction progress: |███████████████████████████████████████████████████████| (done) 100%


predict
175.202



None
