In [1]:
# Word2vec for amenities
from readtable import * 
import h2o
h2o.init()
from h2o.estimators.word2vec import H2OWord2vecEstimator
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from datetime import datetime
import numpy as np
import pandas as pd

Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O_cluster_uptime:,17 hours 39 mins
H2O_cluster_timezone:,America/Chicago
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.36.1.1
H2O_cluster_version_age:,29 days
H2O_cluster_name:,H2O_from_python_csridlen_366ufw
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,1.672 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


In [2]:
airbnb_path = "../data/cleaned_data_updated.csv"
airbnb_words = h2o.import_file(airbnb_path, destination_frame = "airbnbwords",
                             header = 1)
STOP_WORDS = ["w/","at","from","in","to","/","*","-","w","+","and","&", "near", "next"]


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [3]:
def tokenize(sentences, stop_word = STOP_WORDS):
    tokenized = sentences.tokenize("\\W+")
    tokenized_lower = tokenized.tolower()
    tokenized_filtered = tokenized_lower[(tokenized_lower.nchar() >= 2) | (tokenized_lower.isna()),:]
    tokenized_words = tokenized_filtered[tokenized_filtered.grep("[0-9]",invert=True,output_logical=True),:]
    tokenized_words = tokenized_words[(tokenized_words.isna()) | (~ tokenized_words.isin(STOP_WORDS)),:]
    return tokenized_words

def predict(airbnb_names,w2v, gbm):
    words = tokenize(h2o.H2OFrame(airbnb_names).ascharacter())
    airbnb_words_vec = w2v.transform(words, aggregate_method="AVERAGE")
    print(gbm.predict(test_data=airbnb_words_vec))

In [9]:
words = tokenize(airbnb_words["amenities"])
w2v_model = H2OWord2vecEstimator(sent_sample_rate = 0.0, epochs = 10)
w2v_model.train(training_frame=words)

airbnb_words_vecs = w2v_model.transform(words, aggregate_method = "AVERAGE")

valid_airbnb_names = ~ airbnb_words_vecs["C1"].isna()
data = airbnb_words[valid_airbnb_names,:].cbind(airbnb_words_vecs[valid_airbnb_names,:])
data_split = data.split_frame(ratios=[0.8])

word2vec Model Build progress: |█████████████████████████████████████████████████| (done) 100%


In [10]:
# gbm model
gbm_model = H2OGradientBoostingEstimator()
gbm_model.train(x = airbnb_words_vecs.names,
                y= "price", 
                training_frame = data_split[0], 
                validation_frame = data_split[1])

gbm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
Model Details
H2OGradientBoostingEstimator :  Gradient Boosting Machine
Model Key:  GBM_model_python_1652364474688_7


Model Summary: 


Unnamed: 0,Unnamed: 1,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
0,,50.0,50.0,19351.0,5.0,5.0,5.0,18.0,31.0,26.12




ModelMetricsRegression: gbm
** Reported on train data. **

MSE: 29841.214598731458
RMSE: 172.74609864981454
MAE: 88.98475645147413
RMSLE: 0.6042677779341591
Mean Residual Deviance: 29841.214598731458

ModelMetricsRegression: gbm
** Reported on validation data. **

MSE: 36980.83325796875
RMSE: 192.30401258936004
MAE: 98.08044715464094
RMSLE: 0.6457739065795731
Mean Residual Deviance: 36980.83325796875

Scoring History: 


Unnamed: 0,Unnamed: 1,timestamp,duration,number_of_trees,training_rmse,training_mae,training_deviance,validation_rmse,validation_mae,validation_deviance
0,,2022-05-13 03:00:57,0.019 sec,0.0,229.046022,121.629054,52462.080371,233.043782,123.263174,54309.404273
1,,2022-05-13 03:00:57,0.218 sec,1.0,224.660189,118.800089,50472.200555,229.618774,121.496416,52724.781455
2,,2022-05-13 03:00:57,0.319 sec,2.0,220.976622,116.486101,48830.667477,226.126391,119.286267,51133.144812
3,,2022-05-13 03:00:57,0.406 sec,3.0,217.66254,114.346768,47376.981102,223.248396,117.366278,49839.846396
4,,2022-05-13 03:00:57,0.485 sec,4.0,214.357387,112.464602,45949.089569,220.833438,115.951605,48767.407444
5,,2022-05-13 03:00:57,0.567 sec,5.0,211.895069,110.95704,44899.520271,218.331126,114.510294,47668.480743
6,,2022-05-13 03:00:57,0.663 sec,6.0,209.460291,109.377726,43873.613706,216.583834,113.127618,46908.557325
7,,2022-05-13 03:00:57,0.732 sec,7.0,207.252923,107.99285,42953.774019,214.925978,112.293977,46193.176155
8,,2022-05-13 03:00:57,0.802 sec,8.0,205.130949,106.771324,42078.706222,213.289363,111.211888,45492.352335
9,,2022-05-13 03:00:57,0.873 sec,9.0,202.95507,105.525301,41190.760277,211.775442,110.193158,44848.837963



See the whole table with table.as_data_frame()

Variable Importances: 


Unnamed: 0,variable,relative_importance,scaled_importance,percentage
0,C44,264197760.0,1.0,0.07348
1,C9,244023264.0,0.923639,0.067869
2,C69,192357824.0,0.728083,0.053499
3,C38,101889944.0,0.385658,0.028338
4,C32,93737640.0,0.354801,0.026071
5,C7,89641112.0,0.339296,0.024931
6,C60,84039656.0,0.318094,0.023373
7,C87,82930880.0,0.313897,0.023065
8,C64,82716432.0,0.313085,0.023005
9,C24,78449920.0,0.296936,0.021819



See the whole table with table.as_data_frame()




In [30]:
w2v_model.wv.counts

AttributeError: type object 'ModelBase' has no attribute 'wv'

In [19]:
airbnb_og = get_dists()
x = airbnb_og['amenities']


In [26]:
x = x.astype("category")
counts = x.value_counts()
print(counts.head())
print(predict(["Dedicated workspace", "Wifi", "Long term stays allowed"], w2v_model, gbm_model))

["Hangers", "Long term stays allowed", "Iron", "TV", "Carbon monoxide alarm", "Fire extinguisher", "Elevator", "Hair dryer", "Wifi", "Heating", "Shampoo", "Smoke alarm", "First aid kit", "Air conditioning", "Essentials"]                                                                                                                                                         189
["Dedicated workspace", "Iron", "Hair dryer", "Fire extinguisher", "Building staff", "Essentials", "Heating", "Shower gel", "Body soap", "Air conditioning", "TV", "Smoke alarm", "Hot water", "Hangers", "Wifi", "Luggage dropoff allowed", "Lock on bedroom door", "Carbon monoxide alarm", "First aid kit", "Long term stays allowed", "Conditioner", "Bed linens", "Shampoo"]    150
["Fire extinguisher", "Lock on bedroom door", "Smoke alarm", "Shower gel", "Dedicated workspace", "Conditioner", "Building staff", "Body soap", "Hot water", "TV", "Hangers", "First aid kit", "Hair dryer", "Bed linens", "Long term stays allowed", 

predict
420.654
490.983
121.746
132.426
132.426



None
