In [176]:
# Libraries
import pandas as pd
from readtable import *
import numpy as np
from sklearn import ensemble
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.metrics import mean_squared_error

In [177]:
# Data
# Load the listings frame used by every later cell.
# NOTE(review): get_dists is not defined in this notebook; presumably it is
# exported by the `from readtable import *` above — confirm, and prefer an
# explicit `from readtable import get_dists`.
airbnb = get_dists()

In [178]:
# Convert host_since from a date into host tenure in days as of the scrape date.
# The conversion overwrites the column on `airbnb`, so it is guarded: on a
# re-run the column is already numeric, and parsing day counts as dates again
# would silently corrupt it.
if not pd.api.types.is_numeric_dtype(airbnb['host_since']):
    airbnb['host_since'] = (
        pd.to_datetime(airbnb['last_scraped']) - pd.to_datetime(airbnb['host_since'])
    ) / np.timedelta64(1, 'D')

# columns we don't need for ML model
drop = ['id', 'month', 'last_scraped', 'host_id', 'host_name', 'neighbourhood',
        'latitude', 'longitude',
        'first_review', 'last_review',
        'scrape_batch', 'batch_YRMO',
        'instant_bookable',
        'host_location',
        'host_is_superhost',
        'cum_sum',
        'has_availability',
        'amenities',
        'neighbourhood_cleansed',
        'neighbourhood_group_cleansed',
        'property_type']
data = airbnb.drop(drop, axis = 1)

# Cast the count / dummy columns to float so every numeric feature shares one dtype.
dict_ints = {"List_month": float,
             'host_is_superhost_dum': float,
             'room_type_dum': float,
             'instant_bookable_dum': float,
             'List_month_byhost_month': float,
             'List_month_host_overall': float,
             'List_month_id_overall': float,
             'hotel_dum': float,
             'List_month_byneigh': float,
             'bathrooms': float}
data = data.astype(dict_ints)

# Keep only rows with a known price, then split features from target out of the
# SAME filtered frame so X and y always share an index. `drop` returns a new
# frame, so later column assignments on X do not touch `data`.
priced = data.dropna(subset = ['price'])
X = priced.drop(columns = 'price')
y = priced['price']



In [179]:
# One hot encoding
# which variables are categorical?
dict_cats = {"host_response_time": "category",
             "room_type": "category"}

# drop amenities for now
# host_response_rate needs to be numeric. The raw values are presumably
# percentage strings such as "95%" (TODO confirm), so strip a trailing '%'
# rather than blindly chopping the last character.
# The dtype guard makes the cell safe to re-run: once the column is numeric the
# .str accessor would raise. `assign` returns a new frame instead of writing
# into a frame that was derived by slicing.
if not pd.api.types.is_numeric_dtype(X['host_response_rate']):
    X = X.assign(
        host_response_rate = X['host_response_rate'].str.rstrip('%').astype(float)
    )


In [180]:
# Replace each categorical column in dict_cats with one indicator column per category.
X = X.astype(dict_cats)
encoder = OneHotEncoder(handle_unknown = "ignore")


def one_hot_frame(frame, column):
    """Return the one-hot indicator columns for ``frame[column]``.

    The result is row-aligned with ``frame`` (it reuses ``frame.index``) and its
    columns are labelled from the fitted ``encoder.categories_``.
    """
    encoded = encoder.fit_transform(frame[[column]]).toarray()
    # Label columns from categories_: that is the order the encoder emits them
    # in. Series.unique() returns order of first appearance, which does not
    # match and would attach the wrong label to each indicator.
    # Reuse frame.index: a default RangeIndex would make the index-based
    # DataFrame.join pair indicators with the wrong rows whenever frame's index
    # has gaps (e.g. after dropping rows with a missing price).
    return pd.DataFrame(encoded,
                        index = frame.index,
                        columns = list(encoder.categories_[0]))


encoder_hrt = one_hot_frame(X, "host_response_time")

# Do same for room type
encoder_rt = one_hot_frame(X, "room_type")

# Merge with original dataset
X = X.join(encoder_hrt).join(encoder_rt)

## Drop original variables
original = list(dict_cats.keys())
X = X.drop(original, axis = 1)

In [181]:
## GB Model
# Fixed seed so the train/test split and the fitted model are reproducible
# across Restart & Run All; without it every run reports a different RMSE.
SEED = 42

# Make every column label a string before fitting (the one-hot step can
# produce non-string labels, e.g. a NaN category).
X.columns = X.columns.astype(str)
print(X.dtypes)
# Missing feature values are filled with 0 before fitting.
X = X.fillna(0)

# Split into train and test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.2, random_state = SEED)
params = {
    "n_estimators": 100,
    "max_depth": 5,
    "min_samples_split": 5,
    "learning_rate": 0.01,
    "loss": "squared_error",
    "random_state": SEED}
reg = ensemble.GradientBoostingRegressor(**params)
reg.fit(X_train, y_train)


List_month                        float64
host_since                        float64
host_response_rate                float64
host_listings_count               float64
host_total_listings_count         float64
accommodates                      float64
bathrooms                         float64
bedrooms                          float64
beds                              float64
minimum_nights                    float64
maximum_nights                    float64
availability_30                   float64
availability_60                   float64
availability_90                   float64
availability_365                  float64
number_of_reviews                 float64
review_scores_rating              float64
review_scores_accuracy            float64
review_scores_cleanliness         float64
review_scores_checkin             float64
review_scores_communication       float64
review_scores_location            float64
review_scores_value               float64
calculated_host_listings_count    

GradientBoostingRegressor(learning_rate=0.01, max_depth=5, min_samples_split=5,
                          n_estimators=500)

In [68]:
# Out-of-sample RMSE: square root of the test-set mean squared error.
# Use np.sqrt (numpy is imported at the top of the notebook); a bare `sqrt` is
# never imported here and only resolves if a star import happens to leak it.
rmse = np.sqrt(mean_squared_error(y_test, reg.predict(X_test)))
print(rmse) #out-of-sample rmse

Index([                    'List_month',                     'host_since',
                   'host_response_time',             'host_response_rate',
                  'host_listings_count',      'host_total_listings_count',
               'neighbourhood_cleansed',   'neighbourhood_group_cleansed',
                        'property_type',                      'room_type',
                         'accommodates',                      'bathrooms',
                             'bedrooms',                           'beds',
                       'minimum_nights',                 'maximum_nights',
                      'availability_30',                'availability_60',
                      'availability_90',               'availability_365',
                    'number_of_reviews',           'review_scores_rating',
               'review_scores_accuracy',      'review_scores_cleanliness',
                'review_scores_checkin',    'review_scores_communication',
               'review_sc

             id  station_dist  station_dist2  park_dist  park_dist2
0          3831      0.249637       0.301503   0.676109    1.021501
1          5136      0.210077       0.306295   0.638516    1.018294
2          5178      0.127743       0.168968   0.230626    0.296666
3          5803      0.186458       0.227987   0.617910    0.963935
4          7064      0.287763       0.350759   0.556234    0.814657
...         ...           ...            ...        ...         ...
50416  53662330      5.655837       7.857410   0.797285    1.293766
50417  53662542      5.604813       7.698872   0.722559    1.135228
50418  53662772      0.074870       0.103195   0.774796    0.948325
50419  53663081      0.120837       0.121759   0.714898    0.844179
50420  53665099      0.177626       0.262641   0.347464    0.473793

[50421 rows x 5 columns]
