In [19]:
import pandas as pd
from pandas.io import sql
import numpy as np

from sklearn import datasets
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error


In [11]:
airbnb_file="data/singapore-airbnb/listings.csv"
o_df = pd.read_csv(airbnb_file)
o_df = o_df.fillna(0)

In [16]:

def prep_price_preds(o_df):
    higher_b = o_df["price"].quantile(0.99)
    lower_b = o_df["price"].quantile(0.005)

    m_df = o_df[o_df['price'] <=higher_b]
    df = m_df[m_df['price'] >= lower_b].copy()
    df.drop("id", axis=1,inplace=True)
    df.drop("host_name", axis=1,inplace=True)
    df.drop("last_review", axis=1,inplace=True)
    df.drop("name", axis=1,inplace=True)
    
    neighbours = list(df["neighbourhood"].unique())
    neighbours_dict = {}
    for i in range(len(neighbours)):
        neighbours_dict[neighbours[i]] = i
    
    
    room_types = []
    for index, row in df.iterrows():
        if row['room_type'] == "Private room":
            room_types.append(0)
        elif row['room_type'] == "Entire home/apt":
            room_types.append(1)
        else:
            room_types.append(2)
    df["room_type"] = room_types
    
    neighbourhood_groups = []
    for index, row in df.iterrows():
        if row['neighbourhood_group'] == "North Region":
            neighbourhood_groups.append(0)
        elif row['neighbourhood_group'] == "Central Region":
            neighbourhood_groups.append(1)
        elif row['neighbourhood_group'] == "East Region":
            neighbourhood_groups.append(2)
        elif row['neighbourhood_group'] == "West Region":
            neighbourhood_groups.append(3)
        else:
            neighbourhood_groups.append(4)
    df["neighbourhood_group"] = neighbourhood_groups
    
    
    neighbour_types = []
    for index, row in df.iterrows():
            neighbour_types.append(neighbours_dict[row["neighbourhood"]])
    df["neighbourhood"] = neighbour_types
    
    
    for column in df.columns:
        df[column] = pd.to_numeric(df[column])
    return df

In [17]:
df = prep_price_preds(o_df)

In [18]:
df

Unnamed: 0,host_id,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
0,266763,0,0,1.44255,103.79580,0,83,180,1,0.01,2,365
1,227796,1,1,1.33235,103.78521,0,81,90,18,0.28,1,365
2,266763,0,0,1.44246,103.79667,0,69,6,20,0.20,2,365
3,367042,2,2,1.34541,103.95712,0,206,1,14,0.15,9,353
4,367042,2,2,1.34567,103.95963,0,94,1,22,0.22,9,355
5,367042,2,2,1.34702,103.96103,0,104,1,39,0.38,9,346
6,367042,2,2,1.34348,103.96337,0,208,1,25,0.25,9,172
7,1017645,2,3,1.32304,103.91363,0,50,90,174,1.88,4,59
8,1017645,2,3,1.32458,103.91163,0,54,90,198,2.08,4,133
9,1017645,2,3,1.32461,103.91191,0,42,90,236,2.53,4,147


In [23]:
def price_prediction(df,queries=[]):
    X = df.drop("price",axis=1).values
    y = df["price"].values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=66)
    
    model = RandomForestRegressor(n_estimators=110, max_depth=10)
    model.fit(X_train,y_train)
    preds = model.predict(X_test)
    print(X_test)
    return mean_squared_error(y_test,preds)
    

In [24]:
price_prediction(df)

[[2.60100811e+08 1.00000000e+00 6.00000000e+00 ... 0.00000000e+00
  2.40000000e+01 2.99000000e+02]
 [2.08726450e+07 3.00000000e+00 2.60000000e+01 ... 1.20000000e-01
  2.00000000e+00 3.31000000e+02]
 [2.75025685e+08 1.00000000e+00 2.40000000e+01 ... 1.00000000e+00
  2.00000000e+00 1.72000000e+02]
 ...
 [6.34489120e+07 1.00000000e+00 2.30000000e+01 ... 0.00000000e+00
  4.30000000e+01 2.20000000e+01]
 [1.56564680e+07 1.00000000e+00 2.00000000e+01 ... 1.60000000e-01
  1.00000000e+00 0.00000000e+00]
 [3.24101680e+07 1.00000000e+00 4.00000000e+00 ... 4.45000000e+00
  1.00000000e+00 1.17000000e+02]]


5663.6807038545385