In [1]:
import pandas as pd
from pandas.io import sql
import numpy as np

from sklearn import datasets
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import NearestNeighbors


  from numpy.core.umath_tests import inner1d


In [2]:
airbnb_file="data/singapore-airbnb/listings.csv"
o_df = pd.read_csv(airbnb_file)
o_df = o_df.fillna(0)
o_df.head(5)

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,49091,COZICOMFORT LONG TERM STAY ROOM 2,266763,Francesca,North Region,Woodlands,1.44255,103.7958,Private room,83,180,1,2013-10-21,0.01,2,365
1,50646,Pleasant Room along Bukit Timah,227796,Sujatha,Central Region,Bukit Timah,1.33235,103.78521,Private room,81,90,18,2014-12-26,0.28,1,365
2,56334,COZICOMFORT,266763,Francesca,North Region,Woodlands,1.44246,103.79667,Private room,69,6,20,2015-10-01,0.2,2,365
3,71609,Ensuite Room (Room 1 & 2) near EXPO,367042,Belinda,East Region,Tampines,1.34541,103.95712,Private room,206,1,14,2019-08-11,0.15,9,353
4,71896,B&B Room 1 near Airport & EXPO,367042,Belinda,East Region,Tampines,1.34567,103.95963,Private room,94,1,22,2019-07-28,0.22,9,355


In [3]:
o_df['neighbourhood_group'].unique()

array(['North Region', 'Central Region', 'East Region', 'West Region',
       'North-East Region'], dtype=object)

In [7]:

class ML_model:
    def __init__(self):
        
        self.price_model = RandomForestRegressor(n_estimators=110, max_depth=10)
        self.knn_model = NearestNeighbors(10,100, metric='manhattan')
        self.data = pd.DataFrame(data={})
        self.geo_dict = {}
        self.ng_dict = {'North Region':0, 'Central Region':1, 
                        'East Region':2, 'West Region':3,
                        'North-East Region':4 }
        self.n_dict = {}
        
        self.knn_df = pd.DataFrame(data={})
        self.train_df = pd.DataFrame(data={})
        
    def prep_price_preds(self, o_df):
        higher_b = o_df["price"].quantile(0.99)
        lower_b = o_df["price"].quantile(0.005)

        m_df = o_df[o_df['price'] <=higher_b]
        df = m_df[m_df['price'] >= lower_b].copy()
        df.drop("id", axis=1,inplace=True)
        df.drop("host_name", axis=1,inplace=True)
        df.drop("host_id", axis=1,inplace=True)
        df.drop("last_review", axis=1,inplace=True)
        df.drop("name", axis=1,inplace=True)
        #df.drop("calculated_host_listings_count", axis=1,inplace=True)

        neighbours = list(df["neighbourhood"].unique())
        neighbours_dict = {}
        for i in range(len(neighbours)):
            neighbours_dict[neighbours[i]] = i
        self.n_dict = neighbours_dict

        room_types = []
        for index, row in df.iterrows():
            if row['room_type'] == "Private room":
                room_types.append(0)
            elif row['room_type'] == "Entire home/apt":
                room_types.append(1)
            else:
                room_types.append(2)
        df["room_type"] = room_types

        neighbourhood_groups = []
        for index, row in df.iterrows():
            if row['neighbourhood_group'] == "North Region":
                neighbourhood_groups.append(0)
            elif row['neighbourhood_group'] == "Central Region":
                neighbourhood_groups.append(1)
            elif row['neighbourhood_group'] == "East Region":
                neighbourhood_groups.append(2)
            elif row['neighbourhood_group'] == "West Region":
                neighbourhood_groups.append(3)
            else:
                neighbourhood_groups.append(4)
        df["neighbourhood_group"] = neighbourhood_groups


        neighbour_types = []
        for index, row in df.iterrows():
                neighbour_types.append(neighbours_dict[row["neighbourhood"]])
        df["neighbourhood"] = neighbour_types


        for column in df.columns:
            df[column] = pd.to_numeric(df[column])
        self.data =  df
    
    
        for index, row in ml_model.data.iterrows(): 
            if row['neighbourhood_group'] not in self.geo_dict:
                self.geo_dict[row['neighbourhood_group']]={}
            if row['neighbourhood'] not in self.geo_dict[row['neighbourhood_group']]:
                self.geo_dict[row['neighbourhood_group']][row['neighbourhood']] = [0,0]

        for key in self.geo_dict.keys():

            t_df = ml_model.data[ml_model.data['neighbourhood_group'] == key]
            for neigh in self.geo_dict[key].keys():

                df = t_df[t_df['neighbourhood'] == neigh]
                self.geo_dict[key][neigh][0] = df['latitude'].mean(skipna = True) 
                self.geo_dict[key][neigh][1] = df['longitude'].mean(skipna = True)     
        
    
    
    def prep_knn_preds(self):
        
        k_df = ml_model.data[["latitude","longitude","room_type","price","minimum_nights"]].copy()
        rt =k_df['room_type'] ==0
        
        self.knn_df = k_df[rt].copy()
        new_id = list(range(0, len(self.knn_df)))
        self.knn_df['new_id'] = new_id
        self.knn_df['latitude'] = self.knn_df['latitude']*1000
        self.knn_df['longitude'] = self.knn_df['longitude']*1000

        self.train_df = self.knn_df.copy()
        self.train_df = self.train_df.reset_index()
        self.train_df.drop("index", axis=1,inplace=True)
        self.train_df.drop("new_id", axis=1,inplace=True)
    
    
    
    
    
    
    def build_price_model(self):
        X = self.data.drop("price",axis=1).values
        y = self.data["price"].values
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=66)

        #model = RandomForestRegressor(n_estimators=110, max_depth=10)
        self.price_model.fit(X_train,y_train)
        
        '''
       # neighbourhood_group', 'neighbourhood', 'latitude', 'longitude',
       #'room_type', 'price', 'minimum_nights', 'number_of_reviews',
       #'reviews_per_month', 'calculated_host_listings_count',
       #'availability_365'
        ng = self.ng_dict[]
        n  = self.n_dict[]
        lat = self.geo_dict[ng][n][0]
        lng = self.geo_dict[ng][n][1]
        rt = []
        mn = self.data["price"].mean(skipna = True) 
        nr = self.data["number_of_reviews"].mean(skipna = True) 
        rpm = self.data["reviews_per_month"].mean(skipna = True) 
        chlc = self.data["calculated_host_listings_count"].mean(skipna = True)
        avai = self.data["availability_365"].mean(skipna = True)
        
        
        query = [[ng,b,lat,lng,rt,mn,nr,rpm,chlc,avai]]
        '''
        preds = self.price_model.predict(X_test)
        #print(X_test)
        return mean_squared_error(y_test,preds)
        
    def build_knn_model(self):
        
        X = self.train_df.values
        self.knn_model.fit(X)
        
        latitude = 1.33235
        longitude = 103.78521
        room_type = 0
        price = 81
        minumum_nights = 90
        query = [[latitude*1000, longitude*1000,room_type, price, minumum_nights]]
        preds = self.knn_model.kneighbors(query, 10, return_distance=False)
        
        # for each query
        for item in preds:
            #for each result
            for ele in item[1:]:
                
                ind = self.knn_df[self.knn_df['new_id']==ele].index[0]
        
                #train_df.loc[2264]
                row = self.data.loc[ind]
                print(row.to_string)
        
        
        
    def price_prediction(self, df,queries=[]):
        
        preds = self.price_model.predict(X_test)
        print(X_test)
        return mean_squared_error(y_test,preds)

    
    

In [8]:
ml_model = ML_model()
ml_model.prep_price_preds(o_df)
ml_model.data.head(5)

Unnamed: 0,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
0,0,0,1.44255,103.7958,0,83,180,1,0.01,2,365
1,1,1,1.33235,103.78521,0,81,90,18,0.28,1,365
2,0,0,1.44246,103.79667,0,69,6,20,0.2,2,365
3,2,2,1.34541,103.95712,0,206,1,14,0.15,9,353
4,2,2,1.34567,103.95963,0,94,1,22,0.22,9,355


In [109]:
ml_model.n_dict

{'Woodlands': 0,
 'Bukit Timah': 1,
 'Tampines': 2,
 'Bedok': 3,
 'Bukit Merah': 4,
 'Newton': 5,
 'Geylang': 6,
 'River Valley': 7,
 'Jurong West': 8,
 'Rochor': 9,
 'Queenstown': 10,
 'Serangoon': 11,
 'Marine Parade': 12,
 'Pasir Ris': 13,
 'Toa Payoh': 14,
 'Outram': 15,
 'Punggol': 16,
 'Tanglin': 17,
 'Hougang': 18,
 'Kallang': 19,
 'Novena': 20,
 'Downtown Core': 21,
 'Bukit Panjang': 22,
 'Singapore River': 23,
 'Orchard': 24,
 'Ang Mo Kio': 25,
 'Bukit Batok': 26,
 'Museum': 27,
 'Sembawang': 28,
 'Choa Chu Kang': 29,
 'Central Water Catchment': 30,
 'Sengkang': 31,
 'Clementi': 32,
 'Jurong East': 33,
 'Bishan': 34,
 'Yishun': 35,
 'Mandai': 36,
 'Southern Islands': 37,
 'Sungei Kadut': 38,
 'Western Water Catchment': 39,
 'Marina South': 40,
 'Lim Chu Kang': 41}

In [None]:
ml_model.geo_dict

In [None]:
geo_dict = {}
for index, row in ml_model.data.iterrows(): 
    if row['neighbourhood_group'] not in geo_dict:
        geo_dict[row['neighbourhood_group']]={}
    if row['neighbourhood'] not in geo_dict[row['neighbourhood_group']]:
        geo_dict[row['neighbourhood_group']][row['neighbourhood']] = [0,0]
geo_dict
    


In [None]:
for key in geo_dict.keys():
    
    t_df = ml_model.data[ml_model.data['neighbourhood_group'] == key]
    for neigh in geo_dict[key].keys():
        
        df = t_df[t_df['neighbourhood'] == neigh]
        geo_dict[key][neigh][0] = df['latitude'].mean(skipna = True) 
        geo_dict[key][neigh][1] = df['longitude'].mean(skipna = True) 

geo_dict

In [25]:
o_df[o_df['minimum_nights']>900]

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
3304,21855881,Where Luxury City Living Reaches New Heights.,28954164,Michelle,Central Region,Tanglin,1.31743,103.82931,Private room,81,1000,0,0,0.0,3,365


In [None]:
df = prep_price_preds(o_df)

In [94]:
ml_model.build_price_model()

6564.959569303197

In [95]:
ml_model.prep_knn_preds()

In [96]:
ml_model.build_knn_model()

<bound method Series.to_string of neighbourhood_group                 1.00000
neighbourhood                       1.00000
latitude                            1.33255
longitude                         103.79093
room_type                           0.00000
price                             100.00000
minimum_nights                     90.00000
number_of_reviews                   0.00000
reviews_per_month                   0.00000
calculated_host_listings_count      1.00000
availability_365                  177.00000
Name: 4391, dtype: float64>
<bound method Series.to_string of neighbourhood_group                 1.00000
neighbourhood                       1.00000
latitude                            1.31818
longitude                         103.78670
room_type                           0.00000
price                              99.00000
minimum_nights                     90.00000
number_of_reviews                   1.00000
reviews_per_month                   0.05000
calculated_host_listings

In [None]:
o_df.head(5)

In [36]:
o_df[o_df['price']>4000]

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
1319,11544732,P,60916030,Yin,Central Region,Outram,1.28348,103.84135,Private room,7000,2,5,2017-12-12,0.18,1,365
2100,16395877,The Club Residences - Contemporary Manor,84411185,Darren,Central Region,Southern Islands,1.25284,103.82225,Entire home/apt,8900,2,0,0,0.0,6,361
2392,17831833,Comfortable & Quiet Master Bedroom,68223771,Yolivia,West Region,Bukit Panjang,1.34775,103.77181,Private room,10000,1,1,2017-05-27,0.04,1,0
2764,19739630,Lakeside Master room of condo 裕廊湖畔公寓主人房,139204582,X-Roy,West Region,Jurong East,1.33633,103.72569,Private room,7000,2,0,0,0.0,1,0
2813,19986946,旅行家,34732327,Xia,East Region,Bedok,1.32184,103.93501,Private room,5000,3,0,0,0.0,1,79
2978,20791161,YOUR entire PRIVATE LUXURY PENTHOUSE condo unit,122991242,Jj,West Region,Tuas,1.31909,103.64656,Entire home/apt,10000,2,5,2017-10-31,0.21,1,89
3485,22617828,The Club Residences - Contemporary Manor (A),84411185,Darren,Central Region,Southern Islands,1.25054,103.82551,Entire home/apt,8900,2,0,0,0.0,6,0
4399,27115768,Hotel style master bedroom,188629774,Jo,Central Region,Kallang,1.30004,103.85992,Private room,6000,1,0,0,0.0,1,0
5125,29549880,2-bedroom luxury penthouse + Jacuzzi + BBQ deck,800558,Kim (金),Central Region,Rochor,1.31313,103.85394,Entire home/apt,4162,2,0,0,0.0,6,364
5823,32026675,Testing,20307016,David,Central Region,Kallang,1.2961,103.86613,Private room,10000,1,0,0,0.0,1,0


In [62]:
knn_df = ml_model.data[["latitude","longitude","room_type","price","minimum_nights"]].copy()
rt =knn_df['room_type'] ==0
new_id = list(range(0, len(k_df)))
k_df = knn_df[rt].copy()
k_df['new_id'] = new_id
k_df['latitude'] = k_df['latitude']*1000
k_df['longitude'] = k_df['longitude']*1000

train_df = k_df.copy()
train_df = train_df.reset_index()
train_df.drop("index", axis=1,inplace=True)
train_df.drop("new_id", axis=1,inplace=True)

In [63]:
train_df.head(10)

Unnamed: 0,latitude,longitude,room_type,price,minimum_nights
0,1442.55,103795.8,0,83,180
1,1332.35,103785.21,0,81,90
2,1442.46,103796.67,0,69,6
3,1345.41,103957.12,0,206,1
4,1345.67,103959.63,0,94,1
5,1347.02,103961.03,0,104,1
6,1343.48,103963.37,0,208,1
7,1323.04,103913.63,0,50,90
8,1324.58,103911.63,0,54,90
9,1324.61,103911.91,0,42,90


In [74]:
k_df.head(10)

Unnamed: 0,latitude,longitude,room_type,price,minimum_nights,new_id
0,1442.55,103795.8,0,83,180,0
1,1332.35,103785.21,0,81,90,1
2,1442.46,103796.67,0,69,6,2
3,1345.41,103957.12,0,206,1,3
4,1345.67,103959.63,0,94,1,4
5,1347.02,103961.03,0,104,1,5
6,1343.48,103963.37,0,208,1,6
7,1323.04,103913.63,0,50,90,7
8,1324.58,103911.63,0,54,90,8
9,1324.61,103911.91,0,42,90,9


In [67]:
X = train_df.values
model = NearestNeighbors(10,100, metric='manhattan')
model.fit(X) 

NearestNeighbors(algorithm='auto', leaf_size=30, metric='manhattan',
         metric_params=None, n_jobs=1, n_neighbors=10, p=2, radius=100)

In [68]:
latitude = 1.33235
longitude = 103.78521
room_type = 0
price = 81
minumum_nights = 90
query = [[latitude*1000, longitude*1000,room_type, price, minumum_nights]]

In [79]:
preds = model.kneighbors(query, 10, return_distance=False)

In [70]:
train_df.loc[2264]

latitude            1332.55
longitude         103790.93
room_type              0.00
price                100.00
minimum_nights        90.00
Name: 2264, dtype: float64

In [77]:
k_df[k_df['new_id']==2264].index[0]

4391

In [84]:
for item in preds:
    for ele in item[1:]:
        print(ele)

2264
1740
314
282
1021
741
1563
2799
1926


In [82]:
preds

array([[   1, 2264, 1740,  314,  282, 1021,  741, 1563, 2799, 1926]])