### Importing the libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import pickle
import folium

import haversine as hs
from haversine import Unit

# Show every column when displaying wide DataFrames (None removes the column limit).
# Call the documented top-level pd.set_option directly instead of going through pd.pandas.
pd.set_option('display.max_columns', None)

c:\Users\edmun\anaconda3\lib\site-packages\numpy\.libs\libopenblas.FB5AE2TYXYH2IJRDKGDGQ3XBKLKTF43H.gfortran-win_amd64.dll
c:\Users\edmun\anaconda3\lib\site-packages\numpy\.libs\libopenblas.GK7GX5KEQ4F6UYO3P26ULGBQYHGQO7J4.gfortran-win_amd64.dll


### Importing the dataset 

In [2]:
# Load the listings DataFrame from a local pickle file.
# NOTE: unpickling can execute arbitrary code, so only load pickle files from a trusted source.
listings = pd.read_pickle('./data/listings_cleaned.pkl')

In [3]:
# Check the dimensions (rows, columns) of the loaded listings data
listings.shape

(2901, 41)

In [4]:
# Preview the first rows to see which columns are available
listings.head()

Unnamed: 0,id,name,description,host_id,host_name,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_has_profile_pic,host_identity_verified,neighbourhood,neighbourhood_group_cleansed,latitude,longitude,availability_30,availability_60,availability_90,availability_365,property_type,room_type,accommodates,bedrooms,beds,amenities,instant_bookable,price,number_of_reviews,reviews_per_month,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,bathroom_qty,bathroom_type
0,50646,Pleasant Room along Bukit Timah,Fully furnished bedroom with a nice view on th...,227796,Sujatha,a few days or more,0.0,0.730118,f,t,t,Bukit Timah,Central Region,1.33432,103.78521,30,60,90,365,Apartment,Private room,2,1.0,1.0,"[""Gym"", ""Washer"", ""Kitchen"", ""TV with standard...",f,80.0,18,0.18,4.56,4.72,4.78,4.78,4.94,4.72,4.5,1,0,1,1.0,baths
1,71609,Ensuite Room (Room 1 & 2) near EXPO,For 3 rooms.Book room 1&2 and room 4<br /><br ...,367042,Belinda,within an hour,1.0,1.0,f,t,t,Tampines,East Region,1.34537,103.95887,5,35,65,340,House,Private room,6,2.0,3.0,"[""Children\u2019s books and toys"", ""Iron"", ""Ke...",f,145.0,20,0.15,4.44,4.37,4.0,4.63,4.78,4.26,4.32,6,0,6,1.0,private
2,289234,Booking for 3 bedrooms,This whole place can accomodate 8 pax with own...,367042,Belinda,within an hour,1.0,1.0,f,t,t,Tampines,East Region,1.3449,103.95979,0,0,10,285,Apartment,Private room,4,3.0,5.0,"[""Children\u2019s books and toys"", ""Iron"", ""Pa...",t,184.0,12,0.1,4.83,4.67,4.75,4.58,4.67,4.33,4.45,6,0,6,3.0,baths
3,294281,5 mins walk from Newton subway,I have 3 bedrooms in a charming British style...,1521514,Elizabeth,within a day,0.8,0.21,f,t,t,Newton,Central Region,1.31142,103.83924,30,60,90,365,Apartment,Private room,2,1.0,1.0,"[""Private patio or balcony"", ""Iron"", ""Air cond...",f,79.0,133,1.03,4.43,4.33,4.16,4.5,4.66,4.52,4.39,7,1,6,1.0,shared
4,324945,Cozy Blue Room with large window!,"<b>The space</b><br />Great Location, Great pe...",1439258,Kay,within an hour,0.99,0.82,f,t,t,Bukit Merah,Central Region,1.28828,103.8102,5,5,5,181,Apartment,Private room,1,1.0,1.0,"[""Hot water"", ""Washer"", ""Kitchen"", ""Smart lock...",f,49.0,17,0.14,3.62,3.63,4.0,4.5,4.06,4.0,3.88,44,2,42,0.0,baths


We are going to build an application where users can input their Airbnb listing information to generate a price prediction. \
We will therefore focus on training the model to predict the listing price using only the information that users can provide.

In [5]:
# Columns kept for modelling: the target variable first, followed by the
# listing attributes that require user input
user_inputs = [
    'price',                            # target variable
    'latitude', 'longitude',            # location
    'property_type', 'room_type',       # kind of listing
    'bedrooms', 'beds',                 # sleeping capacity
    'bathroom_qty', 'bathroom_type',    # bathrooms
]

In [6]:
# Keep only the target and user-input columns. The explicit .copy() makes the
# result an independent DataFrame, so adding feature columns to it later is
# never treated by pandas as a write to a slice of the original frame.
listings = listings[user_inputs].copy()

In [7]:
# Confirm that only the target and the user-input columns remain
listings.head()

Unnamed: 0,price,latitude,longitude,property_type,room_type,bedrooms,beds,bathroom_qty,bathroom_type
0,80.0,1.33432,103.78521,Apartment,Private room,1.0,1.0,1.0,baths
1,145.0,1.34537,103.95887,House,Private room,2.0,3.0,1.0,private
2,184.0,1.3449,103.95979,Apartment,Private room,3.0,5.0,3.0,baths
3,79.0,1.31142,103.83924,Apartment,Private room,1.0,1.0,1.0,shared
4,49.0,1.28828,103.8102,Apartment,Private room,1.0,1.0,0.0,baths


In [8]:
# Save the listings dataframe (target and user-input columns) to a csv file.
# index=False leaves the DataFrame's row index out of the file.
listings.to_csv('./data/listings_user_inputs.csv', index=False)

### Feature Engineering

#### Proximity to MRT Stations and City Centre

As with other properties in Singapore, listings are likely to be higher in value if they are close to MRT stations.

We can retrieve the location of the MRT stations from the public dataset:
- https://www.kaggle.com/datasets/yxlee245/singapore-train-station-coordinates

#### MRT Stations

In [9]:
# Load the train station coordinates from the local CSV file
mrt = pd.read_csv('./data/mrt_lrt_data.csv')

In [10]:
# Preview the first rows of the station data
mrt.head()

Unnamed: 0,station_name,type,lat,lng
0,Jurong East,MRT,1.333207,103.742308
1,Bukit Batok,MRT,1.349069,103.749596
2,Bukit Gombak,MRT,1.359043,103.751863
3,Choa Chu Kang,MRT,1.385417,103.744316
4,Yew Tee,MRT,1.397383,103.747523


In [11]:
# The station dataset contains LRT services as well as MRT services.
# Only MRT services are used in this project, so keep the rows typed 'MRT'.

isMrt = mrt['type'].eq('MRT')
mrt = mrt.loc[isMrt]

In [12]:
# Save the MRT-only station dataframe to a csv file.
# index=False leaves the DataFrame's row index out of the file.
mrt.to_csv('./data/mrt_data.csv', index=False)

In [13]:
# Inspect the column dtypes and non-null counts of the filtered station data
mrt.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 119 entries, 0 to 118
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   station_name  119 non-null    object 
 1   type          119 non-null    object 
 2   lat           119 non-null    float64
 3   lng           119 non-null    float64
dtypes: float64(2), object(2)
memory usage: 4.6+ KB


In [14]:
# Find the distance for the first listing to its nearest MRT station using Haversine formula
mrtCoord = mrt[['lat', 'lng']]
firstListings = listings[['latitude', 'longitude']].iloc[0]
# firstListings is a Series labelled 'latitude'/'longitude', so read it by label.
# Integer keys such as firstListings[0] depend on a positional fallback that
# pandas has deprecated for label-indexed Series.
firstPoint = (firstListings['latitude'], firstListings['longitude'])
# Distance in kilometres from the first listing to every MRT station, in station order
Disp = [
    hs.haversine(firstPoint, (stationLat, stationLng), unit=Unit.KILOMETERS)
    for stationLat, stationLng in mrtCoord.values.tolist()
]
# Position of the closest station; ties resolve to the first occurrence
mrtIdx = int(np.argmin(Disp))
minDisp = Disp[mrtIdx]
mrtName = mrt['station_name'].iloc[mrtIdx]
print(f'The nearest MRT station to the first listing is {mrtName} with a distance of {round((minDisp),3)} km')

The nearest MRT station to the first listing is King Albert Park with a distance of 0.272 km


In [15]:
# Find the distance from each listing to the nearest MRT station using the Haversine formula
mrtCoord = mrt[['lat', 'lng']]
listingsCoord = listings[['latitude', 'longitude']]

# Build the station (lat, lng) pairs once instead of once per listing
stationPoints = [(stationLat, stationLng) for stationLat, stationLng in mrtCoord.values.tolist()]

mrtDisp = []  # distance in kilometres to the closest station, one entry per listing
mrtIdx = []   # position (within mrt) of that closest station, one entry per listing
for listingLat, listingLng in listingsCoord.values.tolist():
    # Compute the distances to every station a single time and derive both the
    # minimum distance and its position from the same list
    stationDisp = [
        hs.haversine((listingLat, listingLng), station, unit=Unit.KILOMETERS)
        for station in stationPoints
    ]
    nearestIdx = np.argmin(stationDisp)  # first occurrence wins on ties
    mrtIdx.append(nearestIdx)
    mrtDisp.append(stationDisp[nearestIdx])

mrtName = [mrt['station_name'].iloc[idx] for idx in mrtIdx]
listings["mrtDisp"] = mrtDisp
listings["nearestMRT"] = mrtName

In [16]:
# Check the newly added mrtDisp and nearestMRT columns
listings.head()

Unnamed: 0,price,latitude,longitude,property_type,room_type,bedrooms,beds,bathroom_qty,bathroom_type,mrtDisp,nearestMRT
0,80.0,1.33432,103.78521,Apartment,Private room,1.0,1.0,1.0,baths,0.272114,King Albert Park
1,145.0,1.34537,103.95887,House,Private room,2.0,3.0,1.0,private,0.456774,Upper Changi
2,184.0,1.3449,103.95979,Apartment,Private room,3.0,5.0,3.0,baths,0.353956,Upper Changi
3,79.0,1.31142,103.83924,Apartment,Private room,1.0,1.0,1.0,shared,0.300294,Newton
4,49.0,1.28828,103.8102,Apartment,Private room,1.0,1.0,0.0,baths,0.748483,Redhill


#### City Centre

From Wikipedia, we can obtain the coordinates of the Singapore city centre: latitude 1.291667, longitude 103.85. This is approximately the location of Funan Shopping Centre.

In [17]:
# Single-row dataframe holding the city centre reference coordinate
# NOTE(review): the accompanying markdown quotes latitude 1.291667 for the city
# centre, while 1.2833 is the value used here — confirm which one is intended.
city_centre = pd.DataFrame(
    [{'city_centre': 'City Centre', 'lat': 1.2833, 'lng': 103.8500}]
)

In [18]:
# Display the city centre reference dataframe
city_centre

Unnamed: 0,city_centre,lat,lng
0,City Centre,1.2833,103.85


In [19]:
# Distance in kilometres from every listing to the city centre (Haversine formula)
cityCoord = city_centre[["lat", "lng"]]
listingCoord = listings[["latitude", "longitude"]]

cityPoints = [(cityLat, cityLng) for cityLat, cityLng in cityCoord.values.tolist()]

cityDisp = []
for listingLat, listingLng in listingCoord.values.tolist():
    # Keep the smallest distance over the rows of city_centre
    nearestCity = min(
        hs.haversine((listingLat, listingLng), cityPoint, unit=Unit.KILOMETERS)
        for cityPoint in cityPoints
    )
    cityDisp.append(nearestCity)

listings["cityDisp"] = cityDisp

In [20]:
# Check the newly added cityDisp column
listings.head()

Unnamed: 0,price,latitude,longitude,property_type,room_type,bedrooms,beds,bathroom_qty,bathroom_type,mrtDisp,nearestMRT,cityDisp
0,80.0,1.33432,103.78521,Apartment,Private room,1.0,1.0,1.0,baths,0.272114,King Albert Park,9.168433
1,145.0,1.34537,103.95887,House,Private room,2.0,3.0,1.0,private,0.456774,Upper Changi,13.932315
2,184.0,1.3449,103.95979,Apartment,Private room,3.0,5.0,3.0,baths,0.353956,Upper Changi,13.995598
3,79.0,1.31142,103.83924,Apartment,Private room,1.0,1.0,1.0,shared,0.300294,Newton,3.347789
4,49.0,1.28828,103.8102,Apartment,Private room,1.0,1.0,0.0,baths,0.748483,Redhill,4.458968


In [21]:
# Save the final listings dataframe (user inputs plus the mrtDisp, nearestMRT
# and cityDisp features) to a csv file, leaving out the row index
listings.to_csv('./data/listings_user_final.csv', index=False)