In [1]:
import numpy as np
import pandas as pd
import os
print(os.listdir("seattle"))
from re import sub
from decimal import Decimal

['reviews.csv', 'listings.csv', 'calendar.csv']


# Grabbing Airbnb Listings Data

We chose specific data points from the listings data that we thought were easy for us to categorize during our data cleaning process, and, intuitively, seemed most relevant to predicting the pricing.

Data points:
- host_is_superhost
- neighbourhood_group_cleansed
- property_type
- room_type
- latitude
- longitude
- guests_included
- bathrooms
- bedrooms
- beds
- bed_type
- amenities
- price
- cleaning_fee
- instant_bookable
- cancellation_policy

In [2]:
listings = pd.read_csv('seattle/listings.csv')
ld = listings.loc[:,['host_is_superhost','neighbourhood_group_cleansed', 'property_type', 'room_type', 'latitude', 'longitude', 'guests_included', 'bathrooms', 'bedrooms', 'beds', 'bed_type', 'amenities', 'price', 'cleaning_fee', 'instant_bookable', 'cancellation_policy']] 

In [3]:
display(listings.head())

Unnamed: 0,id,listing_url,scrape_id,last_scraped,name,summary,space,description,experiences_offered,neighborhood_overview,...,review_scores_value,requires_license,license,jurisdiction_names,instant_bookable,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,reviews_per_month
0,241032,https://www.airbnb.com/rooms/241032,20160104002432,2016-01-04,Stylish Queen Anne Apartment,,Make your self at home in this charming one-be...,Make your self at home in this charming one-be...,none,,...,10.0,f,,WASHINGTON,f,moderate,f,f,2,4.07
1,953595,https://www.airbnb.com/rooms/953595,20160104002432,2016-01-04,Bright & Airy Queen Anne Apartment,Chemically sensitive? We've removed the irrita...,"Beautiful, hypoallergenic apartment in an extr...",Chemically sensitive? We've removed the irrita...,none,"Queen Anne is a wonderful, truly functional vi...",...,10.0,f,,WASHINGTON,f,strict,t,t,6,1.48
2,3308979,https://www.airbnb.com/rooms/3308979,20160104002432,2016-01-04,New Modern House-Amazing water view,New modern house built in 2013. Spectacular s...,"Our house is modern, light and fresh with a wa...",New modern house built in 2013. Spectacular s...,none,Upper Queen Anne is a charming neighborhood fu...,...,10.0,f,,WASHINGTON,f,strict,f,f,2,1.15
3,7421966,https://www.airbnb.com/rooms/7421966,20160104002432,2016-01-04,Queen Anne Chateau,A charming apartment that sits atop Queen Anne...,,A charming apartment that sits atop Queen Anne...,none,,...,,f,,WASHINGTON,f,flexible,f,f,1,
4,278830,https://www.airbnb.com/rooms/278830,20160104002432,2016-01-04,Charming craftsman 3 bdm house,Cozy family craftman house in beautiful neighb...,Cozy family craftman house in beautiful neighb...,Cozy family craftman house in beautiful neighb...,none,We are in the beautiful neighborhood of Queen ...,...,9.0,f,,WASHINGTON,f,strict,f,f,1,0.89


In [4]:
ld.count()

host_is_superhost               3816
neighbourhood_group_cleansed    3818
property_type                   3817
room_type                       3818
latitude                        3818
longitude                       3818
guests_included                 3818
bathrooms                       3802
bedrooms                        3812
beds                            3817
bed_type                        3818
amenities                       3818
price                           3818
cleaning_fee                    2788
instant_bookable                3818
cancellation_policy             3818
dtype: int64

# Removing all the listings with missing values

In this step, we are looking for all the listings containing missing values.  We will remove them, and store them in another dataframe. 

In [5]:
ld = ld.dropna(subset=['host_is_superhost','neighbourhood_group_cleansed', 'property_type', 'room_type', 'latitude', 'longitude', 'guests_included', 'bathrooms', 'bedrooms', 'beds', 'bed_type', 'amenities', 'price', 'instant_bookable', 'cancellation_policy']) 
ld.count()

host_is_superhost               3793
neighbourhood_group_cleansed    3793
property_type                   3793
room_type                       3793
latitude                        3793
longitude                       3793
guests_included                 3793
bathrooms                       3793
bedrooms                        3793
beds                            3793
bed_type                        3793
amenities                       3793
price                           3793
cleaning_fee                    2771
instant_bookable                3793
cancellation_policy             3793
dtype: int64

In [6]:
ld['cleaning_fee'] = ld['cleaning_fee'].fillna(0)

In [7]:
ld['cleaning_fee'].head()

0          0
1     $40.00
2    $300.00
3          0
4    $125.00
Name: cleaning_fee, dtype: object

In [8]:
ld.head()

Unnamed: 0,host_is_superhost,neighbourhood_group_cleansed,property_type,room_type,latitude,longitude,guests_included,bathrooms,bedrooms,beds,bed_type,amenities,price,cleaning_fee,instant_bookable,cancellation_policy
0,f,Queen Anne,Apartment,Entire home/apt,47.636289,-122.371025,2,1.0,1.0,1.0,Real Bed,"{TV,""Cable TV"",Internet,""Wireless Internet"",""A...",$85.00,0,f,moderate
1,t,Queen Anne,Apartment,Entire home/apt,47.639123,-122.365666,1,1.0,1.0,1.0,Real Bed,"{TV,Internet,""Wireless Internet"",Kitchen,""Free...",$150.00,$40.00,f,strict
2,f,Queen Anne,House,Entire home/apt,47.629724,-122.369483,10,4.5,5.0,7.0,Real Bed,"{TV,""Cable TV"",Internet,""Wireless Internet"",""A...",$975.00,$300.00,f,strict
3,f,Queen Anne,Apartment,Entire home/apt,47.638473,-122.369279,1,1.0,0.0,2.0,Real Bed,"{Internet,""Wireless Internet"",Kitchen,""Indoor ...",$100.00,0,f,flexible
4,f,Queen Anne,House,Entire home/apt,47.632918,-122.372471,6,2.0,3.0,3.0,Real Bed,"{TV,""Cable TV"",Internet,""Wireless Internet"",Ki...",$450.00,$125.00,f,strict


## Column 1: host_is_superhost
- Boolean declaring whether host fulfills Airbnb's superhost requirements: https://www.airbnb.ca/help/article/829/how-do-i-become-a-superhost
- Convert `True = 1` and `False = 0`

In [9]:
ld.loc[ld.loc[:, 'host_is_superhost'] == 't', 'host_is_superhost'] = 1
ld.loc[ld.loc[:, 'host_is_superhost'] == 'f', 'host_is_superhost'] = 0

In [10]:
ld.head()

Unnamed: 0,host_is_superhost,neighbourhood_group_cleansed,property_type,room_type,latitude,longitude,guests_included,bathrooms,bedrooms,beds,bed_type,amenities,price,cleaning_fee,instant_bookable,cancellation_policy
0,0,Queen Anne,Apartment,Entire home/apt,47.636289,-122.371025,2,1.0,1.0,1.0,Real Bed,"{TV,""Cable TV"",Internet,""Wireless Internet"",""A...",$85.00,0,f,moderate
1,1,Queen Anne,Apartment,Entire home/apt,47.639123,-122.365666,1,1.0,1.0,1.0,Real Bed,"{TV,Internet,""Wireless Internet"",Kitchen,""Free...",$150.00,$40.00,f,strict
2,0,Queen Anne,House,Entire home/apt,47.629724,-122.369483,10,4.5,5.0,7.0,Real Bed,"{TV,""Cable TV"",Internet,""Wireless Internet"",""A...",$975.00,$300.00,f,strict
3,0,Queen Anne,Apartment,Entire home/apt,47.638473,-122.369279,1,1.0,0.0,2.0,Real Bed,"{Internet,""Wireless Internet"",Kitchen,""Indoor ...",$100.00,0,f,flexible
4,0,Queen Anne,House,Entire home/apt,47.632918,-122.372471,6,2.0,3.0,3.0,Real Bed,"{TV,""Cable TV"",Internet,""Wireless Internet"",Ki...",$450.00,$125.00,f,strict


## Column 2: neighbourhood_group_cleansed
- neighborhood category string that states which neighbourhood it belongs to

In [11]:
# all the possible values
set(ld['neighbourhood_group_cleansed'])

{'Ballard',
 'Beacon Hill',
 'Capitol Hill',
 'Cascade',
 'Central Area',
 'Delridge',
 'Downtown',
 'Interbay',
 'Lake City',
 'Magnolia',
 'Northgate',
 'Other neighborhoods',
 'Queen Anne',
 'Rainier Valley',
 'Seward Park',
 'University District',
 'West Seattle'}

In [12]:
neighbourhood = pd.get_dummies(ld['neighbourhood_group_cleansed'].str.lower().str.replace(' ', '_'))

In [13]:
neighbourhood.head()

Unnamed: 0,ballard,beacon_hill,capitol_hill,cascade,central_area,delridge,downtown,interbay,lake_city,magnolia,northgate,other_neighborhoods,queen_anne,rainier_valley,seward_park,university_district,west_seattle
0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0


In [14]:
ld_1 = pd.merge(ld, neighbourhood, left_index=True, right_index=True)
ld_1 = ld_1.drop('neighbourhood_group_cleansed', 1)

In [15]:
ld_1.head()

Unnamed: 0,host_is_superhost,property_type,room_type,latitude,longitude,guests_included,bathrooms,bedrooms,beds,bed_type,...,interbay,lake_city,magnolia,northgate,other_neighborhoods,queen_anne,rainier_valley,seward_park,university_district,west_seattle
0,0,Apartment,Entire home/apt,47.636289,-122.371025,2,1.0,1.0,1.0,Real Bed,...,0,0,0,0,0,1,0,0,0,0
1,1,Apartment,Entire home/apt,47.639123,-122.365666,1,1.0,1.0,1.0,Real Bed,...,0,0,0,0,0,1,0,0,0,0
2,0,House,Entire home/apt,47.629724,-122.369483,10,4.5,5.0,7.0,Real Bed,...,0,0,0,0,0,1,0,0,0,0
3,0,Apartment,Entire home/apt,47.638473,-122.369279,1,1.0,0.0,2.0,Real Bed,...,0,0,0,0,0,1,0,0,0,0
4,0,House,Entire home/apt,47.632918,-122.372471,6,2.0,3.0,3.0,Real Bed,...,0,0,0,0,0,1,0,0,0,0


 ## Column 3: Property type
 
 - Column indicates which property type it is(ex. house, apartment, etc)

In [16]:
property_type = pd.get_dummies(ld_1['property_type'])

In [17]:
property_type.head()

Unnamed: 0,Apartment,Bed & Breakfast,Boat,Bungalow,Cabin,Camper/RV,Chalet,Condominium,Dorm,House,Loft,Other,Tent,Townhouse,Treehouse,Yurt
0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0


In [18]:
ld_2 = pd.merge(ld_1, property_type, left_index=True, right_index=True)
ld_2 = ld_2.drop('property_type', 1)

In [19]:
ld_2.head()

Unnamed: 0,host_is_superhost,room_type,latitude,longitude,guests_included,bathrooms,bedrooms,beds,bed_type,amenities,...,Chalet,Condominium,Dorm,House,Loft,Other,Tent,Townhouse,Treehouse,Yurt
0,0,Entire home/apt,47.636289,-122.371025,2,1.0,1.0,1.0,Real Bed,"{TV,""Cable TV"",Internet,""Wireless Internet"",""A...",...,0,0,0,0,0,0,0,0,0,0
1,1,Entire home/apt,47.639123,-122.365666,1,1.0,1.0,1.0,Real Bed,"{TV,Internet,""Wireless Internet"",Kitchen,""Free...",...,0,0,0,0,0,0,0,0,0,0
2,0,Entire home/apt,47.629724,-122.369483,10,4.5,5.0,7.0,Real Bed,"{TV,""Cable TV"",Internet,""Wireless Internet"",""A...",...,0,0,0,1,0,0,0,0,0,0
3,0,Entire home/apt,47.638473,-122.369279,1,1.0,0.0,2.0,Real Bed,"{Internet,""Wireless Internet"",Kitchen,""Indoor ...",...,0,0,0,0,0,0,0,0,0,0
4,0,Entire home/apt,47.632918,-122.372471,6,2.0,3.0,3.0,Real Bed,"{TV,""Cable TV"",Internet,""Wireless Internet"",Ki...",...,0,0,0,1,0,0,0,0,0,0


## Column 4: Room type
 
 - Column indicates which room type it is(ex. Entire home/apt )

In [20]:
room_type = pd.get_dummies(ld_2['room_type'])

In [21]:
room_type.head()

Unnamed: 0,Entire home/apt,Private room,Shared room
0,1,0,0
1,1,0,0
2,1,0,0
3,1,0,0
4,1,0,0


In [22]:
ld_3= pd.merge(ld_2, room_type, left_index=True, right_index=True)
ld_3 = ld_3.drop('room_type', 1)

## Column 5: Bed type
 
 - Column states what kind of bed the listing has(ex. Real Bed, Futon, etc )

In [23]:
bed_type = pd.get_dummies(ld_3['bed_type'])

In [24]:
bed_type.head()

Unnamed: 0,Airbed,Couch,Futon,Pull-out Sofa,Real Bed
0,0,0,0,0,1
1,0,0,0,0,1
2,0,0,0,0,1
3,0,0,0,0,1
4,0,0,0,0,1


In [25]:
ld_4= pd.merge(ld_3, bed_type, left_index=True, right_index=True)
ld_4 = ld_4.drop('bed_type', 1)

## Column 6: Instant Bookable
 
- Boolean declaring whether or not the listing can be instant booked. 
- Convert `True = 1` and `False = 0`

In [26]:
ld_4.loc[ld_4.loc[:, 'instant_bookable'] == 't', 'instant_bookable'] = 1
ld_4.loc[ld_4.loc[:, 'instant_bookable'] == 'f', 'instant_bookable'] = 0

In [27]:
ld_4.head()

Unnamed: 0,host_is_superhost,latitude,longitude,guests_included,bathrooms,bedrooms,beds,amenities,price,cleaning_fee,...,Treehouse,Yurt,Entire home/apt,Private room,Shared room,Airbed,Couch,Futon,Pull-out Sofa,Real Bed
0,0,47.636289,-122.371025,2,1.0,1.0,1.0,"{TV,""Cable TV"",Internet,""Wireless Internet"",""A...",$85.00,0,...,0,0,1,0,0,0,0,0,0,1
1,1,47.639123,-122.365666,1,1.0,1.0,1.0,"{TV,Internet,""Wireless Internet"",Kitchen,""Free...",$150.00,$40.00,...,0,0,1,0,0,0,0,0,0,1
2,0,47.629724,-122.369483,10,4.5,5.0,7.0,"{TV,""Cable TV"",Internet,""Wireless Internet"",""A...",$975.00,$300.00,...,0,0,1,0,0,0,0,0,0,1
3,0,47.638473,-122.369279,1,1.0,0.0,2.0,"{Internet,""Wireless Internet"",Kitchen,""Indoor ...",$100.00,0,...,0,0,1,0,0,0,0,0,0,1
4,0,47.632918,-122.372471,6,2.0,3.0,3.0,"{TV,""Cable TV"",Internet,""Wireless Internet"",Ki...",$450.00,$125.00,...,0,0,1,0,0,0,0,0,0,1


## Column 7: Cancellation policy
- Column indicates which kind of standardlized cancellation policy the host chooses.
- There are three cancellation policies - flexible, moderate and strict 

In [28]:
cancellation = pd.get_dummies(ld_4['cancellation_policy'])

In [29]:
cancellation.head()

Unnamed: 0,flexible,moderate,strict
0,0,1,0
1,0,0,1
2,0,0,1
3,1,0,0
4,0,0,1


In [30]:
ld_5= pd.merge(ld_4, cancellation, left_index=True, right_index=True)
ld_5 = ld_5.drop('cancellation_policy', 1)

# Column 8: Guest included 
- column states the number of guests can be accomodated for each listing
- we need to normalize the value to 0-1

In [31]:
ld_5['guests_included'].max()

15

In [32]:
def normalizing(column):
    new_column = (column - column.min()) / (column.max() - column.min())
    return new_column

In [33]:
ld_5['guests_included'] = normalizing(ld_5['guests_included'])

In [34]:
ld_5.head()

Unnamed: 0,host_is_superhost,latitude,longitude,guests_included,bathrooms,bedrooms,beds,amenities,price,cleaning_fee,...,Private room,Shared room,Airbed,Couch,Futon,Pull-out Sofa,Real Bed,flexible,moderate,strict
0,0,47.636289,-122.371025,0.133333,1.0,1.0,1.0,"{TV,""Cable TV"",Internet,""Wireless Internet"",""A...",$85.00,0,...,0,0,0,0,0,0,1,0,1,0
1,1,47.639123,-122.365666,0.066667,1.0,1.0,1.0,"{TV,Internet,""Wireless Internet"",Kitchen,""Free...",$150.00,$40.00,...,0,0,0,0,0,0,1,0,0,1
2,0,47.629724,-122.369483,0.666667,4.5,5.0,7.0,"{TV,""Cable TV"",Internet,""Wireless Internet"",""A...",$975.00,$300.00,...,0,0,0,0,0,0,1,0,0,1
3,0,47.638473,-122.369279,0.066667,1.0,0.0,2.0,"{Internet,""Wireless Internet"",Kitchen,""Indoor ...",$100.00,0,...,0,0,0,0,0,0,1,1,0,0
4,0,47.632918,-122.372471,0.4,2.0,3.0,3.0,"{TV,""Cable TV"",Internet,""Wireless Internet"",Ki...",$450.00,$125.00,...,0,0,0,0,0,0,1,0,0,1


# Column 8, 9, 10: bathrooms, bedrooms, beds
- column states the number of bathrooms, bedrooms, and beds in each listing
- normalize the value to 0-1

In [35]:
ld_5['bathrooms'] = normalizing(ld_5['bathrooms'])

In [36]:
ld_5['bedrooms'] = normalizing(ld_5['bedrooms'])

In [37]:
ld_5['beds'] = normalizing(ld_5['beds'])

In [38]:
ld_5.head()

Unnamed: 0,host_is_superhost,latitude,longitude,guests_included,bathrooms,bedrooms,beds,amenities,price,cleaning_fee,...,Private room,Shared room,Airbed,Couch,Futon,Pull-out Sofa,Real Bed,flexible,moderate,strict
0,0,47.636289,-122.371025,0.133333,0.125,0.142857,0.0,"{TV,""Cable TV"",Internet,""Wireless Internet"",""A...",$85.00,0,...,0,0,0,0,0,0,1,0,1,0
1,1,47.639123,-122.365666,0.066667,0.125,0.142857,0.0,"{TV,Internet,""Wireless Internet"",Kitchen,""Free...",$150.00,$40.00,...,0,0,0,0,0,0,1,0,0,1
2,0,47.629724,-122.369483,0.666667,0.5625,0.714286,0.428571,"{TV,""Cable TV"",Internet,""Wireless Internet"",""A...",$975.00,$300.00,...,0,0,0,0,0,0,1,0,0,1
3,0,47.638473,-122.369279,0.066667,0.125,0.0,0.071429,"{Internet,""Wireless Internet"",Kitchen,""Indoor ...",$100.00,0,...,0,0,0,0,0,0,1,1,0,0
4,0,47.632918,-122.372471,0.4,0.25,0.428571,0.142857,"{TV,""Cable TV"",Internet,""Wireless Internet"",Ki...",$450.00,$125.00,...,0,0,0,0,0,0,1,0,0,1


# Column 11, 12: Longitude and Latitude 
- column states the longitude and latitude of each listing 
- we can use these two values, and map them to x, y and z coordinates. In this way we can make sure close points in the 3D space are close to each other. 
- x = cos(lat) * cos(lon)
- y = cos(lat) * sin(lon), 
- z = sin(lat) 

In [39]:
## seattle airport: 47.4502° N, 122.3088° W
airport_lat = 47.4502
airport_lon = -122.3088

## downtown: 47.6050° N, 122.3344° W
dt_lat = 47.6050
dt_lon = -122.3344

## pike place: 47.6101° N, 122.3421° W
pp_lat = 47.6101
pp_lon = -122.3421

## seattle amazon headquarter: 47.6062° N, 122.3321° W
amazon_lat = 47.6062
amazon_lon = -122.3321

## longitude and latitude in datasets
lat_data = ld_5['latitude']
lon_data = ld_5['longitude']

In [40]:
lat_data[1]

47.63912312136253

In [41]:
lon_data[1]

-122.36566646439582

In [42]:
airport_lat

47.4502

In [45]:
AVG_EARTH_RADIUS = 6371

In [48]:
def haversine_array(lat1, lng1, lat2, lng2):
    lat1, lng1, lat2, lng2 = map(np.radians, (lat1, lng1, lat2, lng2))
    lat = lat2 - lat1
    lng = lng2 - lng1
    d = np.sin(lat * 0.5) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(lng * 0.5) ** 2
    h = 2 * AVG_EARTH_RADIUS * np.arctan2(np.sqrt(d), np.sqrt(1-d))
    return h

In [49]:
haversine_array(airport_lat, airport_lon, lat_data[1], lon_data[1])

21.436525877765494

In [65]:
for i, s in enumerate(ld_5):
    ld_5['d_airport'][i] = haversine_array(airport_lat, airport_lon, lat_data[i], lon_data[i])
    ld_5['d_pikeplace'][i] = haversine_array(pp_lat, pp_lon, lat_data[i], lon_data[i])
    ld_5['d_amazon'][i] = haversine_array(amazon_lat, amazon_lon, lat_data[i], lon_data[i])
    ld_5['d_downtown'][i] = haversine_array(dt_lat, dt_lon, lat_data[i], lon_data[i])
    

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [66]:
ld_5.head()

Unnamed: 0,host_is_superhost,latitude,longitude,guests_included,bathrooms,bedrooms,beds,amenities,price,cleaning_fee,...,Futon,Pull-out Sofa,Real Bed,flexible,moderate,strict,d_airport,d_downtown,d_pikeplace,d_amazon
0,0,47.636289,-122.371025,0.133333,0.125,0.142857,0.0,"{TV,""Cable TV"",Internet,""Wireless Internet"",""A...",$85.00,0,...,0,0,1,0,1,0,21.212736,4.431697,3.630386,4.439052
1,1,47.639123,-122.365666,0.066667,0.125,0.142857,0.0,"{TV,Internet,""Wireless Internet"",Kitchen,""Free...",$150.00,$40.00,...,0,0,1,0,0,1,21.436526,4.459606,3.678898,4.441933
2,0,47.629724,-122.369483,0.666667,0.5625,0.714286,0.428571,"{TV,""Cable TV"",Internet,""Wireless Internet"",""A...",$975.00,$300.00,...,0,0,1,0,0,1,20.475301,3.804344,2.995638,3.833194
3,0,47.638473,-122.369279,0.066667,0.125,0.0,0.071429,"{Internet,""Wireless Internet"",Kitchen,""Indoor ...",$100.00,0,...,0,0,1,1,0,0,21.421534,4.548324,3.755367,4.543407
4,0,47.632918,-122.372471,0.4,0.25,0.428571,0.142857,"{TV,""Cable TV"",Internet,""Wireless Internet"",Ki...",$450.00,$125.00,...,0,0,1,0,0,1,20.87191,4.21657,3.408671,4.240533


In [68]:
ld_5['d_airport'] = normalizing(ld_5['d_airport'])
ld_5['d_downtown'] = normalizing(ld_5['d_downtown'])
ld_5['d_pikeplace'] = normalizing(ld_5['d_pikeplace'])
ld_5['d_amazon'] = normalizing(ld_5['d_amazon'])


In [70]:
ld_5.head()

Unnamed: 0,host_is_superhost,latitude,longitude,guests_included,bathrooms,bedrooms,beds,amenities,price,cleaning_fee,...,Futon,Pull-out Sofa,Real Bed,flexible,moderate,strict,d_airport,d_downtown,d_pikeplace,d_amazon
0,0,47.636289,-122.371025,0.133333,0.125,0.142857,0.0,"{TV,""Cable TV"",Internet,""Wireless Internet"",""A...",$85.00,0,...,0,0,1,0,1,0,0.699555,0.65296,0.654888,0.646445
1,1,47.639123,-122.365666,0.066667,0.125,0.142857,0.0,"{TV,Internet,""Wireless Internet"",Kitchen,""Free...",$150.00,$40.00,...,0,0,1,0,0,1,0.833689,0.670189,0.6847,0.648256
2,0,47.629724,-122.369483,0.666667,0.5625,0.714286,0.428571,"{TV,""Cable TV"",Internet,""Wireless Internet"",""A...",$975.00,$300.00,...,0,0,1,0,0,1,0.257556,0.265696,0.264812,0.265532
3,0,47.638473,-122.369279,0.066667,0.125,0.0,0.071429,"{Internet,""Wireless Internet"",Kitchen,""Indoor ...",$100.00,0,...,0,0,1,1,0,0,0.824703,0.724954,0.731693,0.712055
4,0,47.632918,-122.372471,0.4,0.25,0.428571,0.142857,"{TV,""Cable TV"",Internet,""Wireless Internet"",Ki...",$450.00,$125.00,...,0,0,1,0,0,1,0.495273,0.520162,0.518636,0.521633


In [71]:
ld_6 = ld_5.drop('latitude', 1)
ld_7 = ld_6.drop('longitude', 1)

In [72]:
ld_7.head()

Unnamed: 0,host_is_superhost,guests_included,bathrooms,bedrooms,beds,amenities,price,cleaning_fee,instant_bookable,ballard,...,Futon,Pull-out Sofa,Real Bed,flexible,moderate,strict,d_airport,d_downtown,d_pikeplace,d_amazon
0,0,0.133333,0.125,0.142857,0.0,"{TV,""Cable TV"",Internet,""Wireless Internet"",""A...",$85.00,0,0,0,...,0,0,1,0,1,0,0.699555,0.65296,0.654888,0.646445
1,1,0.066667,0.125,0.142857,0.0,"{TV,Internet,""Wireless Internet"",Kitchen,""Free...",$150.00,$40.00,0,0,...,0,0,1,0,0,1,0.833689,0.670189,0.6847,0.648256
2,0,0.666667,0.5625,0.714286,0.428571,"{TV,""Cable TV"",Internet,""Wireless Internet"",""A...",$975.00,$300.00,0,0,...,0,0,1,0,0,1,0.257556,0.265696,0.264812,0.265532
3,0,0.066667,0.125,0.0,0.071429,"{Internet,""Wireless Internet"",Kitchen,""Indoor ...",$100.00,0,0,0,...,0,0,1,1,0,0,0.824703,0.724954,0.731693,0.712055
4,0,0.4,0.25,0.428571,0.142857,"{TV,""Cable TV"",Internet,""Wireless Internet"",Ki...",$450.00,$125.00,0,0,...,0,0,1,0,0,1,0.495273,0.520162,0.518636,0.521633


# Column 13: price, cleaning fee
- column includes the avrage price and cleaning_fee per night for each listing

In [None]:
ld_7['price'] = ld_7['price'].replace('[\$,]','',regex=True).astype(float)
ld_7['cleaning_fee'] = ld_7['cleaning_fee'].replace('[\$,]','',regex=True).astype(float)
ld_7['price'] = normalizing(ld_7['price'])
ld_7['cleaning_fee'] = normalizing(ld_7['cleaning_fee'] )

In [None]:
ld_7.head()

# Column 14: Amenities 
- column indudes all the amentities for each listing

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

ld_7["amenities"] = ld_7["amenities"].str.lower().str.replace('{','').str.replace('}','').str.replace('"','').str.replace(' ','_').str.split(',')
ld_7.head()
mlb = MultiLabelBinarizer()
final_df = ld_7.join(pd.DataFrame(mlb.fit_transform(ld_7.pop('amenities')),
                          columns=mlb.classes_,
                          index=ld_7.index))



In [None]:
final_df.head()

In [None]:
# Exporting as csv for later use
final_df.to_csv('aps360_airbnb1.csv')