# Airbnb New User Bookings - Gradient Boosting with H2O

Load the required libraries

In [1]:
import h2o
import pandas as pd
import numpy as np

Initialize H2O cluster:

In [2]:
# Connect to the H2O cluster (the recorded output shows a local one at 127.0.0.1:54321)
h2o.init()
# Remove everything stored in the cluster - presumably to start from a clean
# state, since the recorded uptime suggests the cluster was already running
h2o.remove_all() 

0,1
H2O cluster uptime:,1 hours 32 minutes 54 seconds 594 milliseconds
H2O cluster version:,3.6.0.3
H2O cluster name:,H2O_started_from_python
H2O cluster total nodes:,1
H2O cluster total memory:,1.72 GB
H2O cluster total cores:,4
H2O cluster allowed cores:,4
H2O cluster healthy:,True
H2O Connection ip:,127.0.0.1
H2O Connection port:,54321


## Data Loading

In [3]:
# Import the raw training users CSV into H2O (path is relative to the notebook's working directory)
train_users = h2o.import_file("../data/raw/train_users.csv")


Parse Progress: [##################################################] 100%


Remove the training users' IDs, since we don't need them to train our classifier.

## Preprocessing

In [4]:
# Rebind train_users to the frame without the `id` column.
# NOTE(review): re-running this cell on its own would try to drop `id` a second time.
train_users = train_users.drop('id')

In [5]:
# Split with ratio 0.8: the first frame is kept for training, the second for validation.
# NOTE(review): no seed is passed, so the split presumably differs between runs -
# confirm, and pin one if the installed H2O version supports it.
train_users, validation_users = train_users.split_frame(ratios=[0.8])

In [6]:
# Target: the last column of the training frame.
y = train_users.col_names[-1]

# Features: every other column except date_first_booking.
# The column is excluded by name rather than by position (`del X[2]`), so the
# right column is removed even if the column order of the frame changes.
X = [column for column in train_users.col_names[:-1]
     if column != 'date_first_booking']

### Gradient Boosting Model

In [7]:
from h2o.estimators.gbm import H2OGradientBoostingEstimator

# Hyperparameters of the multinomial gradient boosting classifier, kept in one
# place so they are easy to read and tweak.
# NOTE(review): stopping_metric is set but no stopping_rounds is passed -
# confirm whether the metric has any effect on its own.
gbm_params = dict(
    ntrees=43,
    max_depth=6,
    distribution='multinomial',
    stopping_metric="logloss",
    learn_rate=0.25,
    sample_rate=0.6,
    balance_classes=True,
    nfolds=0,
)

gbe = H2OGradientBoostingEstimator(**gbm_params)

In [8]:
# Fit on the training split using features X and target y; the validation split
# is supplied as validation_frame.
gbe.train(X, y, training_frame=train_users, validation_frame=validation_users)


gbm Model Build Progress: [##################################################] 100%


In [9]:
# Report the model's R2. r2() is called without a train/valid selector -
# presumably this is the training-set value; confirm before comparing models.
# A single pre-formatted string is printed so the line runs under both the
# Python 2 print statement and the Python 3 print function.
print("R2: {}".format(gbe.r2()))

R2: 0.932094308166


### Predictions

In [10]:
# Import the raw test users CSV into H2O (path is relative to the notebook's working directory)
test_users = h2o.import_file("../data/raw/test_users.csv")


Parse Progress: [##################################################] 100%


In [11]:
# Score the test users. `id` is dropped only for the frame passed to predict();
# test_users itself is not rebound here and keeps the column.
predictions = gbe.predict(test_users.drop('id'))

### Make CSV for submission

In [12]:
# Load the predictions into a DataFrame
country_predictions = predictions.as_data_frame()
country_predictions = pd.DataFrame(country_predictions).transpose()

# Use first row as column names
country_predictions.columns = country_predictions.iloc[0]
country_predictions = country_predictions.reindex(country_predictions.index.drop(0))

# Drop labeled prediction
country_predictions.drop('predict', axis=1, inplace=True)

In [13]:
# Load the test users into a DataFrame
test_users = pd.DataFrame(test_users.as_data_frame()).transpose()

# Use first row as column names
test_users.columns = test_users.iloc[0]
test_users = test_users.reindex(test_users.index.drop(0))

In [14]:
# Build the submission rows: for each test user, the most probable destination
# countries, most probable first. Rows of country_predictions and test_users
# are matched by position.
MAX_DESTINATIONS = 5  # number of destinations listed per user

submission = []
number_of_users = country_predictions.shape[0]

# Select the id column once instead of materialising a whole row per lookup
user_ids = test_users['id']

# Iterate over each user
for user in range(number_of_users):
    user_id = user_ids.iloc[user]

    # Per-country probabilities for this user, sorted in descending order
    user_prediction = country_predictions.iloc[user].astype(float)
    user_prediction = user_prediction.sort_values(ascending=False)

    # Append the destinations with the highest probability. Slicing (instead
    # of indexing positions 0..4) also copes with fewer than MAX_DESTINATIONS
    # prediction columns.
    for destination in user_prediction.index[:MAX_DESTINATIONS]:
        submission.append([user_id, destination])

In [15]:
# Assemble the [user id, destination] pairs into a two-column DataFrame
sub_file = pd.DataFrame(submission, columns = ['id', 'country'])

Write CSV file:

In [16]:
# Write the submission without the pandas index column.
# NOTE(review): assumes ../data/submissions/ already exists - confirm.
sub_file.to_csv('../data/submissions/h2o-gb.csv', index=False)

In [17]:
# Shut the H2O cluster down without the interactive Y/N confirmation, so that
# "Restart & Run All" can finish unattended.
h2o.shutdown(prompt=False)

Are you sure you want to shutdown the H2O instance running at localhost:54321 (Y/N)? y
