# NYC AirBnB 

### Loading data

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('AB_NYC_2019.csv')

Let's look at the different columns

In [3]:
for c in df: print(c)

id
name
host_id
host_name
neighbourhood_group
neighbourhood
latitude
longitude
room_type
price
minimum_nights
number_of_reviews
last_review
reviews_per_month
calculated_host_listings_count
availability_365


In [3]:
df.dtypes

id                                  int64
name                               object
host_id                             int64
host_name                          object
neighbourhood_group                object
neighbourhood                      object
latitude                          float64
longitude                         float64
room_type                          object
price                               int64
minimum_nights                      int64
number_of_reviews                   int64
last_review                        object
reviews_per_month                 float64
calculated_host_listings_count      int64
availability_365                    int64
dtype: object

In [4]:
df_id = df.loc[:,'id']
def id_sanity():
    A = len(df_id.unique())
    B = len(df_id)
    C = ('is' if A==B else 'is not')
    print("There are {} unique ID's and {} entries. Data {} sane".format(A,B,C))
    
id_sanity()

There are 48895 unique ID's and 48895 entries. Data is sane


Beelow we count the number of hosts

In [5]:
def count_hosts():
    A = len(df.loc[:,'host_id'].unique())
    B = len(df)
    print('Unique hosts: {} \nTotal bookings: {}'.format(A,B))
count_hosts()

Unique hosts: 37457 
Total bookings: 48895


We clean the dataframe by removing columns and replacing NaN values

In [3]:
def clean_nyairbnb(df):
    df_clean = df
    #for now ignore neighbourhoods
    df_clean = df_clean.drop('neighbourhood',axis=1)
   #name is irrelevant. id and host_id number should not be weighted
    df_clean = df_clean.drop(['name','id','host_id'],axis=1)
    #zero prices are illogical
    df_clean = df_clean[df.price>0]
    #host name is unimportant
    df_clean = df_clean.drop(['host_name'],axis=1)
    #last review date is dropped
    df_clean = df_clean.drop(['last_review'],axis=1)
    #replace @reviews_per_month NaN values with 25% of mean
    col = df_clean.reviews_per_month
    filler = round(col.mean()*0.25,2)
    col.fillna(filler,inplace=True)
    df_clean = df_clean.drop('reviews_per_month',axis=1)
    df_clean = pd.concat([df_clean,col],axis=1)
    return df_clean

df_cleaned = clean_nyairbnb(df)

Encode categorical columns using one-hot

In [4]:
df_cleaned = pd.get_dummies(df_cleaned,dummy_na=False)

### Training and test data

In [5]:
#shuffle the data first
df_cleaned = df_cleaned.sample(frac=1).reset_index(drop=True)

In [6]:
x_cols = list(filter(lambda a: a != 'price', df_cleaned.columns))
x = df_cleaned[x_cols]
train_num = int(0.75*len(df_cleaned)) 
x_train = x.iloc[0:train_num]
x_test = x.iloc[train_num:]
y = df_cleaned['price'].astype('float32')
y_train = y.iloc[0:train_num]
y_test = y.iloc[train_num:]

### Check multiple correlation  
latitude + longitude ~ price

In [None]:
import pandas
import statsmodels.api as sm
from statsmodels.formula.api import ols

In [None]:
latitude = df.loc[:,'latitude']
longitude = df.loc[:,'longitude']
price = df.loc[:,'price']
df_dict = {"latitude": latitude, "longitude" : longitude, "price" : price}
location_price_data = pandas.DataFrame(df_dict)     
                                

In [None]:
model = ols("price ~ longitude + latitude", data=location_price_data).fit()
print(model.rsquared**.5)
#print(model.params)
#print(model.summary())

### See result for linear regression

In [7]:
import numpy as np
from sklearn.linear_model import LinearRegression

In [8]:
linreg = LinearRegression()
linreg.fit(x_train,y_train)
y_pred = linreg.predict(x_test)

In [9]:
RMSE = (sum((y_pred - np.array(y_test))**2)/len(x_test))**(0.5)
print(RMSE)

229.219706644271


So we see the root mean squared error (RMSE) is very high. Indeed as a percentage of the average value:

In [10]:
ratio = (RMSE/np.mean(y_test))*100
print("RMSE is {0:.2f}% of average price".format(ratio))

RMSE is 149.44% of average price


### Try random forest regression

In [136]:
from sklearn.ensemble import RandomForestRegressor

In [146]:
rfreg = RandomForestRegressor(n_estimators=50)
rfreg.fit(x_train,y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [158]:
y_pred = rfreg.predict(x_test)
RMSE_forest = (sum((y_test - y_pred)**2))**(0.5) 
print(RMSE_forest/len(y_test))

2.079942916387713


A pretty good error margin one would think

### Building a neural network

In [11]:
import tensorflow as tf

In [56]:
#put things in np arrays, which tf understands
#X_data = np.array(x)
#y_data = np.array(y)
#X_data = tf.placeholder(dtype = tf.float32,shape=[1,16])
#y_data = tf.placeholder(dtype = tf.float32,shape=[1])
x = np.array(x).astype('float32')
y = np.array(y).astype('float32')

dataset = tf.data.Dataset.from_tensor_slices((x, y))

iterator = dataset.make_one_shot_iterator()
X_data, y_data = iterator.get_next()

The neural network will have 1 input layer, 1 hidden layer and 1 output layer

In [124]:
input_dim = x.shape[1]
num_nodes_1 = 10 
num_nodes_2 = 1
X_data = tf.reshape(X_data,[1,input_dim])

The first hidden layer is defined in terms of the input layer

In [125]:
W_1 = tf.Variable(tf.random_uniform([input_dim,num_nodes_1]),dtype=tf.float32)
b_1 = tf.Variable(tf.zeros([num_nodes_1]),dtype=tf.float32)
layer_1 = tf.add(tf.matmul(X_data,W_1), b_1)
layer_1 = tf.nn.relu(layer_1)

In [126]:
W_2 = tf.Variable(tf.random_uniform([num_nodes_1,num_nodes_2]),dtype=tf.float32)
b_2 = tf.Variable(tf.zeros([num_nodes_2]),dtype=tf.float32)
layer_2 = tf.add(tf.matmul(layer_1,W_2), b_2)
layer_2 = tf.nn.relu(layer_2)

Finally the output layer is defined

In [127]:
W_out = tf.Variable(tf.random_uniform([num_nodes_2,1]))
b_out = tf.Variable(tf.zeros([1]))
output = tf.add(tf.matmul(layer_2,W_out), b_out)

finally, the output layer

In [130]:
cost = tf.reduce_mean(tf.square(output - y_data))
optimizer = tf.train.GradientDescentOptimizer(0.0001).minimize(cost)

In [134]:
num_iterations = 30
n_samples = x.shape[0]

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    tot_loss = 0
    for i in range(100):
        _, l = sess.run([optimizer,cost])
        tot_loss += l
        print('Epoch {0}: {1}'.format(i, tot_loss/n_samples))
    #print('Loss: {}'.format(tot_loss/n_samples))

Epoch 0: 2.746934388041077
Epoch 1: 5.388221510361263
Epoch 2: 5.4884628921432626
Epoch 3: 5.577537831908191
Epoch 4: 5.923117178550242
Epoch 5: 6.740957227825056
Epoch 6: 6.865122473051637
Epoch 7: 7.159170627867119
Epoch 8: 8.385761575314136
Epoch 9: 8.669729425224128
Epoch 10: 8.89438844341643
Epoch 11: 10.782109566381
Epoch 12: 10.985508792578477
Epoch 13: 11.099641568160722
Epoch 14: 11.621234698709573
Epoch 15: 11.89011767054724
Epoch 16: 12.093150915544324
Epoch 17: 13.580073560223566
Epoch 18: 14.394666045750526
Epoch 19: 15.053862753773602
Epoch 20: 15.101932226914098
Epoch 21: 15.167154656583378
Epoch 22: 16.641411475020902
Epoch 23: 17.161182171018062
Epoch 24: 17.887036841002104
Epoch 25: 18.34321239019765
Epoch 26: 18.36455577153951
Epoch 27: 18.72801649616311
Epoch 28: 18.826167819640627
Epoch 29: 18.96122641938377
Epoch 30: 19.059292440474515
Epoch 31: 19.260695211255857
Epoch 32: 19.38902504086524
Epoch 33: 19.70452095258354
Epoch 34: 20.159673951903358
Epoch 35: 21.753

### Using an out-of-the-box neural network

In [148]:
from sklearn.neural_network import MLPRegressor

In [149]:
num_neurons = (10,)
mlpreg = MLPRegressor(hidden_layer_sizes = num_neurons, activation = 'relu')

In [150]:
mlpreg.fit(x_train,y_train)



MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(10,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [152]:
y_pred_nn = mlpreg.predict(x_test)

In [157]:
RMSE_nn = (sum((y_pred_nn - y_test)**2))**(0.5)
RMSE_nn/len(y_test)
print(RMSE_nn)

25186.91837434137
