In [None]:
import pandas as pd
from pandas.io import sql
import numpy as np

from sklearn import datasets
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error


In [None]:
# Read the Singapore Airbnb listings and replace every missing value with 0
# (this applies to text columns as well as numeric ones).
airbnb_file = "data/singapore-airbnb/listings.csv"
o_df = pd.read_csv(airbnb_file).fillna(0)
o_df.head()

In [23]:
# Distinct minimum-stay values, in order of first appearance.
o_df["minimum_nights"].unique()

array([ 180,   90,    6,    1,   15,   30,    2,    4,   92,    3,   60,
        365,    5,   18,   28,   32,    9,    7,  108,   14,  183,   21,
         10,   13,   12,   84,   70,   29,    8,   25,   24,   93,   20,
         80,   50,  200,  120,  100,  150,  190,   91,   45,   31,  500,
         11,   26,  360,   17,  188,   88,  185,  240,   75,  356,  210,
         59,   89, 1000,   85,  186,   19,   48,  181,   40,   22,  700,
         96,   55,   62,   65,   35,   16,   27])

In [None]:

class ML_model:
    """Random-forest price model for Airbnb listings plus neighbourhood centroids.

    Typical use: ``prep_price_preds(raw_listings)`` to build the numeric
    feature table, ``build_model()`` to fit and score the regressor, then
    ``price_prediction(prepared_frame)`` to obtain predicted prices.
    """

    # Identifier / free-text columns that are removed before modelling.
    _DROPPED_COLUMNS = [
        "id",
        "host_name",
        "host_id",
        "last_review",
        "name",
        "calculated_host_listings_count",
    ]
    # Integer codes for the categorical columns; anything not listed gets the
    # corresponding "other" code.
    _ROOM_TYPE_CODES = {"Private room": 0, "Entire home/apt": 1}
    _OTHER_ROOM_TYPE_CODE = 2
    _REGION_CODES = {
        "North Region": 0,
        "Central Region": 1,
        "East Region": 2,
        "West Region": 3,
    }
    _OTHER_REGION_CODE = 4

    def __init__(self):
        # Seeded so that repeated fits give the same model and the same MSE
        # (the train/test split in build_model is seeded with the same value).
        self.price_model = RandomForestRegressor(
            n_estimators=110, max_depth=10, random_state=66
        )
        # Prepared, fully numeric feature table (filled by prep_price_preds).
        self.data = pd.DataFrame(data={})
        # {region code: {neighbourhood code: [mean latitude, mean longitude]}}
        self.geo_dict = {}

    def prep_price_preds(self, o_df):
        """Build the numeric modelling table and the neighbourhood centroids.

        Steps:
          1. keep rows whose ``price`` lies between the 0.5% and 99% quantiles
             (both bounds inclusive);
          2. drop identifier / free-text columns;
          3. encode ``room_type``, ``neighbourhood_group`` and
             ``neighbourhood`` as integers (``neighbourhood`` codes follow the
             order of first appearance in the filtered rows);
          4. coerce every remaining column with ``pd.to_numeric``.

        The result is stored in ``self.data``; ``self.geo_dict`` is rebuilt
        from it. ``o_df`` itself is not modified. Nothing is returned.

        Raises:
            KeyError: if an expected column is missing from ``o_df``.
            ValueError: if a remaining column cannot be converted to numbers.
        """
        upper_bound = o_df["price"].quantile(0.99)
        lower_bound = o_df["price"].quantile(0.005)
        in_range = o_df["price"].between(lower_bound, upper_bound)

        # .drop returns a new frame, so the caller's frame is left untouched.
        df = o_df.loc[in_range].drop(columns=self._DROPPED_COLUMNS)

        df["room_type"] = df["room_type"].map(
            lambda value: self._ROOM_TYPE_CODES.get(value, self._OTHER_ROOM_TYPE_CODE)
        )
        df["neighbourhood_group"] = df["neighbourhood_group"].map(
            lambda value: self._REGION_CODES.get(value, self._OTHER_REGION_CODE)
        )
        # factorize numbers the labels 0, 1, 2, ... in order of first appearance.
        neighbourhood_codes, _labels = pd.factorize(df["neighbourhood"])
        df["neighbourhood"] = neighbourhood_codes

        for column in df.columns:
            df[column] = pd.to_numeric(df[column])

        self.data = df
        # Use this instance's data (not a notebook-level variable) so the
        # method works for any instance, whatever name it is bound to.
        self.geo_dict = self._neighbourhood_centroids(df)

    @staticmethod
    def _neighbourhood_centroids(data):
        """Return {region code: {neighbourhood code: [mean lat, mean lon]}}."""
        means = data.groupby(
            ["neighbourhood_group", "neighbourhood"], sort=False
        )[["latitude", "longitude"]].mean()

        centroids = {}
        for (region, neighbourhood), lat, lon in zip(
            means.index, means["latitude"], means["longitude"]
        ):
            centroids.setdefault(int(region), {})[int(neighbourhood)] = [lat, lon]
        return centroids

    def build_model(self):
        """Fit the regressor on 90% of ``self.data`` and score it on the rest.

        Every column except ``price`` is a feature; ``price`` is the target.

        Returns:
            Mean squared error of the predictions on the held-out 10%.
        """
        X = self.data.drop("price", axis=1).values
        y = self.data["price"].values
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.1, random_state=66
        )

        self.price_model.fit(X_train, y_train)
        preds = self.price_model.predict(X_test)
        return mean_squared_error(y_test, preds)

    def price_prediction(self, df, queries=None):
        """Predict prices for already-prepared listings.

        Args:
            df: frame holding the same numeric feature columns as
                ``self.data``; a ``price`` column, if present, is ignored.
                Columns are re-ordered to match the training layout.
            queries: currently unused; kept so existing calls keep working.

        Returns:
            Array of predicted prices, one per row of ``df``.

        Raises:
            RuntimeError: if ``prep_price_preds`` has not been run yet.
            ValueError: if ``df`` lacks one of the feature columns.
        """
        feature_columns = [name for name in self.data.columns if name != "price"]
        if not feature_columns:
            raise RuntimeError(
                "No prepared data: call prep_price_preds() and build_model() first."
            )

        missing = [name for name in feature_columns if name not in df.columns]
        if missing:
            raise ValueError("Missing feature columns: {}".format(missing))

        return self.price_model.predict(df[feature_columns].values)

    

In [None]:
# Create the model wrapper, prepare the numeric feature table from the raw
# listings, and preview the first rows of the prepared data.
ml_model = ML_model()
ml_model.prep_price_preds(o_df)
ml_model.data.head()

In [None]:
# Average review count over the prepared listings (missing values are skipped by default).
ml_model.data["number_of_reviews"].mean()

In [None]:
# Show the nested mapping filled by prep_price_preds:
# neighbourhood_group code -> neighbourhood code -> [mean latitude, mean longitude].
ml_model.geo_dict

In [None]:
# Skeleton of the lookup: one entry per (neighbourhood_group, neighbourhood)
# pair seen in the prepared data, with the coordinates zeroed for now.
geo_dict = {}
for index, row in ml_model.data.iterrows():
    region = geo_dict.setdefault(row['neighbourhood_group'], {})
    region.setdefault(row['neighbourhood'], [0, 0])
geo_dict
    


In [None]:
# Fill every [lat, lon] placeholder with the mean coordinates of the listings
# in that neighbourhood_group / neighbourhood combination.
for key, neighbourhoods in geo_dict.items():
    t_df = ml_model.data[ml_model.data['neighbourhood_group'] == key]
    for neigh, centroid in neighbourhoods.items():
        df = t_df[t_df['neighbourhood'] == neigh]
        centroid[0] = df['latitude'].mean(skipna=True)
        centroid[1] = df['longitude'].mean(skipna=True)

geo_dict

In [25]:
# Listings that require a stay of more than 900 nights.
o_df.loc[o_df["minimum_nights"] > 900]

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
3304,21855881,Where Luxury City Living Reaches New Heights.,28954164,Michelle,Central Region,Tanglin,1.31743,103.82931,Private room,81,1000,0,0,0.0,3,365


In [None]:
# prep_price_preds is a method of ML_model (there is no module-level function
# of that name) and it stores its result on the instance instead of returning
# it, so call it on ml_model and read the prepared table from .data.
ml_model.prep_price_preds(o_df)
df = ml_model.data

In [None]:
# Fit the random forest on 90% of the prepared data; the displayed value is
# the mean squared error on the held-out 10%.
ml_model.build_model()

In [None]:
# Preview the first five rows of the raw (zero-filled) listings.
o_df.head()