# Boston Airbnb Analysis

In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

## Data input and cleaning

In [260]:
# Load the raw Boston Airbnb listings export and preview the first rows.
# NOTE(review): the path is absolute and specific to one machine; the notebook
# will not run elsewhere until it points at a local copy of listings.csv.
listings = pd.read_csv("/Users/yumengxiao/Documents/7374/data_science_blog/boston/listings.csv")
listings.head()

Unnamed: 0,id,listing_url,scrape_id,last_scraped,name,summary,space,description,experiences_offered,neighborhood_overview,...,review_scores_value,requires_license,license,jurisdiction_names,instant_bookable,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,reviews_per_month
0,12147973,https://www.airbnb.com/rooms/12147973,20160906204935,2016-09-07,Sunny Bungalow in the City,"Cozy, sunny, family home. Master bedroom high...",The house has an open and cozy feel at the sam...,"Cozy, sunny, family home. Master bedroom high...",none,"Roslindale is quiet, convenient and friendly. ...",...,,f,,,f,moderate,f,f,1,
1,3075044,https://www.airbnb.com/rooms/3075044,20160906204935,2016-09-07,Charming room in pet friendly apt,Charming and quiet room in a second floor 1910...,Small but cozy and quite room with a full size...,Charming and quiet room in a second floor 1910...,none,"The room is in Roslindale, a diverse and prima...",...,9.0,f,,,t,moderate,f,f,1,1.3
2,6976,https://www.airbnb.com/rooms/6976,20160906204935,2016-09-07,Mexican Folk Art Haven in Boston,"Come stay with a friendly, middle-aged guy in ...","Come stay with a friendly, middle-aged guy in ...","Come stay with a friendly, middle-aged guy in ...",none,The LOCATION: Roslindale is a safe and diverse...,...,10.0,f,,,f,moderate,t,f,1,0.47
3,1436513,https://www.airbnb.com/rooms/1436513,20160906204935,2016-09-07,Spacious Sunny Bedroom Suite in Historic Home,Come experience the comforts of home away from...,Most places you find in Boston are small howev...,Come experience the comforts of home away from...,none,Roslindale is a lovely little neighborhood loc...,...,10.0,f,,,f,moderate,f,f,1,1.0
4,7651065,https://www.airbnb.com/rooms/7651065,20160906204935,2016-09-07,Come Home to Boston,"My comfy, clean and relaxing home is one block...","Clean, attractive, private room, one block fro...","My comfy, clean and relaxing home is one block...",none,"I love the proximity to downtown, the neighbor...",...,10.0,f,,,f,flexible,f,f,1,2.25


In [261]:
listings.shape

(3585, 95)

In [262]:
listings.columns

Index(['id', 'listing_url', 'scrape_id', 'last_scraped', 'name', 'summary',
       'space', 'description', 'experiences_offered', 'neighborhood_overview',
       'notes', 'transit', 'access', 'interaction', 'house_rules',
       'thumbnail_url', 'medium_url', 'picture_url', 'xl_picture_url',
       'host_id', 'host_url', 'host_name', 'host_since', 'host_location',
       'host_about', 'host_response_time', 'host_response_rate',
       'host_acceptance_rate', 'host_is_superhost', 'host_thumbnail_url',
       'host_picture_url', 'host_neighbourhood', 'host_listings_count',
       'host_total_listings_count', 'host_verifications',
       'host_has_profile_pic', 'host_identity_verified', 'street',
       'neighbourhood', 'neighbourhood_cleansed',
       'neighbourhood_group_cleansed', 'city', 'state', 'zipcode', 'market',
       'smart_location', 'country_code', 'country', 'latitude', 'longitude',
       'is_location_exact', 'property_type', 'room_type', 'accommodates',
       'bathrooms',

In [263]:
# Keep only the columns used in the revenue analysis.
# .copy() makes the result an independent frame rather than a slice of the raw
# table, so the column assignments in the cleaning cells that follow modify it
# directly and do not trigger SettingWithCopyWarning.
listings = listings[['id','host_response_time', 'host_response_rate','host_is_superhost','host_neighbourhood','property_type', 'room_type', 'accommodates','price', 'security_deposit',
       'cleaning_fee','minimum_nights','review_scores_rating','cancellation_policy']].copy()

In [264]:
listings.head()

Unnamed: 0,id,host_response_time,host_response_rate,host_is_superhost,host_neighbourhood,property_type,room_type,accommodates,price,security_deposit,cleaning_fee,minimum_nights,review_scores_rating,cancellation_policy
0,12147973,,,f,Roslindale,House,Entire home/apt,4,$250.00,,$35.00,2,,moderate
1,3075044,within an hour,100%,f,Roslindale,Apartment,Private room,2,$65.00,$95.00,$10.00,2,94.0,moderate
2,6976,within a few hours,100%,t,Roslindale,Apartment,Private room,2,$65.00,,,3,98.0,moderate
3,1436513,within a few hours,100%,f,,House,Private room,4,$75.00,$100.00,$50.00,1,100.0,moderate
4,7651065,within an hour,100%,t,Roslindale,House,Private room,2,$79.00,,$15.00,2,99.0,flexible


In [265]:
# Turn percentage strings such as "100%" into float fractions such as 1.0
listings['host_response_rate'] = (
    listings['host_response_rate']
    .str.replace('%','')
    .astype(float)
    / 100
)
listings['host_response_rate'].head()

0    NaN
1    1.0
2    1.0
3    1.0
4    1.0
Name: host_response_rate, dtype: float64

In [266]:
# Convert currency strings such as "$1,250.00" to floats.
# '$' is a regex metacharacter, and the default of `regex` in Series.str.replace
# differs between pandas versions, so the symbols to strip are given as an
# explicit character class with regex=True (inside [...] the '$' is literal).
# Missing values stay NaN.
for money_col in ['price', 'security_deposit', 'cleaning_fee']:
    listings[money_col] = (
        listings[money_col]
        .str.replace(r'[$,]', '', regex=True)
        .astype(float)
    )

listings.price.head()

0    250.0
1     65.0
2     65.0
3     75.0
4     79.0
Name: price, dtype: float64

In [267]:
listings.isnull().any() 

id                      False
host_response_time       True
host_response_rate       True
host_is_superhost       False
host_neighbourhood       True
property_type            True
room_type               False
accommodates            False
price                   False
security_deposit         True
cleaning_fee             True
minimum_nights          False
review_scores_rating     True
cancellation_policy     False
dtype: bool

In [268]:
# Replace missing values in each numeric column with that column's mean
for numeric_col in ['host_response_rate', 'security_deposit', 'cleaning_fee', 'review_scores_rating']:
    column_mean = listings[numeric_col].mean()
    listings[numeric_col] = listings[numeric_col].fillna(column_mean)

In [269]:
# Replace missing values in each categorical column with its most frequent
# observed value (the first one returned by mode() if several tie)
for categorical_col in ['host_response_time', 'host_neighbourhood', 'property_type']:
    most_common = listings[categorical_col].dropna().mode()[0]
    listings[categorical_col] = listings[categorical_col].fillna(most_common)

In [270]:
# type(listings['host_since'][0])

In [271]:
# listings['host_since'] = pd.to_datetime(listings['host_since'], format='%Y-%m-%d')

In [272]:
listings.isnull().any() 

id                      False
host_response_time      False
host_response_rate      False
host_is_superhost       False
host_neighbourhood      False
property_type           False
room_type               False
accommodates            False
price                   False
security_deposit        False
cleaning_fee            False
minimum_nights          False
review_scores_rating    False
cancellation_policy     False
dtype: bool

In [289]:
# Load the per-listing, per-day availability calendar and preview it.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
calendar = pd.read_csv("/Users/yumengxiao/Documents/7374/data_science_blog/boston/calendar.csv")
calendar.head()

Unnamed: 0,listing_id,date,available,price
0,12147973,2017-09-05,f,
1,12147973,2017-09-04,f,
2,12147973,2017-09-03,f,
3,12147973,2017-09-02,f,
4,12147973,2017-09-01,f,


In [290]:
# Convert currency strings such as "$1,250.00" to floats; missing prices stay NaN.
# The pattern is an explicit character class with regex=True because '$' is a
# regex metacharacter and the default of `regex` differs between pandas versions.
calendar['price'] = (
    calendar['price']
    .str.replace(r'[$,]', '', regex=True)
    .astype(float)
)

# Parse ISO dates (YYYY-MM-DD) into datetime64 values
calendar['date'] = pd.to_datetime(calendar['date'], format='%Y-%m-%d')
calendar['date'].head()

0   2017-09-05
1   2017-09-04
2   2017-09-03
3   2017-09-02
4   2017-09-01
Name: date, dtype: datetime64[ns]

In [291]:
calendar.isnull().any() 

listing_id    False
date          False
available     False
price          True
dtype: bool

In [292]:
calendar.date.min()

Timestamp('2016-09-06 00:00:00')

In [293]:
calendar.date.max()

Timestamp('2017-09-05 00:00:00')

In [294]:
def fill_price(calendar,listings):
    """Fill unset calendar prices with the price of the matching listing.

    A calendar price that is missing (NaN) or equal to 0 is treated as unset
    and replaced by the `price` of the row in `listings` whose `id` equals the
    calendar row's `listing_id`. Rows whose listing has no match keep a price
    of 0, so the column contains no NaN afterwards.

    Parameters
    ----------
    calendar : pandas.DataFrame
        Must contain `listing_id` and a numeric `price` column. Its `price`
        column is overwritten in place.
    listings : pandas.DataFrame
        Must contain `id` and a numeric `price` column.

    Returns
    -------
    pandas.DataFrame
        The same `calendar` object that was passed in.
    """
    # One price per listing id; the first occurrence wins if an id repeats.
    price_by_id = listings.drop_duplicates(subset='id').set_index('id')['price']

    current = calendar['price'].fillna(0)
    needs_fill = current == 0

    # A single vectorised lookup replaces the per-row Python loop, which
    # scanned all of `listings` for every calendar row and discarded the result.
    looked_up = calendar['listing_id'].map(price_by_id).fillna(0)

    calendar['price'] = current.where(~needs_fill, looked_up)
    return calendar

In [282]:
calendar.isnull().any() 

listing_id    False
date          False
available     False
price          True
dtype: bool

In [152]:
# Load the guest reviews table and preview it.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
reviews = pd.read_csv("/Users/yumengxiao/Documents/7374/data_science_blog/boston/reviews.csv")
reviews.head()

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
0,1178162,4724140,2013-05-21,4298113,Olivier,My stay at islam's place was really cool! Good...
1,1178162,4869189,2013-05-29,6452964,Charlotte,Great location for both airport and city - gre...
2,1178162,5003196,2013-06-06,6449554,Sebastian,We really enjoyed our stay at Islams house. Fr...
3,1178162,5150351,2013-06-15,2215611,Marine,The room was nice and clean and so were the co...
4,1178162,5171140,2013-06-16,6848427,Andrew,Great location. Just 5 mins walk from the Airp...


In [153]:
# Parse the review date strings (YYYY-MM-DD) into datetime64 values
reviews['date'] = pd.to_datetime(reviews['date'], format='%Y-%m-%d')

In [154]:
reviews.isnull().any()

listing_id       False
id               False
date             False
reviewer_id      False
reviewer_name    False
comments          True
dtype: bool

In [204]:
reviews.shape

(68275, 6)

In [210]:
reviews.date.min()

'2009-03-21'

In [209]:
reviews.date.max()

'2016-09-06'

## Revenue

In [214]:
# Reviews dated within calendar year 2015 (start inclusive, end exclusive)
reviews_2015 = reviews[(reviews.date >= '2015-01-01') & (reviews.date < '2016-01-01')]

# Number of 2015 reviews per listing, one row per listing_id
reviews_2015_agg = (
    reviews_2015
    .groupby(['listing_id'], as_index=False)
    .agg({'id': 'count'})
    .rename(columns={"id": "number_of_reviews_2015"})
)
reviews_2015_agg.head()

Unnamed: 0,listing_id,number_of_reviews_2015
0,3353,8
1,5506,3
2,6695,2
3,6976,7
4,8792,7


In [301]:
# Attach the 2015 review counts to the listings. The inner join keeps only
# listings that have at least one 2015 review; the duplicate key column from
# the right-hand table is dropped afterwards.
listing_agg = (
    listings
    .merge(reviews_2015_agg, how='inner', left_on='id', right_on='listing_id')
    .drop(columns='listing_id')
)
listing_agg.head()

Unnamed: 0,id,host_response_time,host_response_rate,host_is_superhost,host_neighbourhood,property_type,room_type,accommodates,price,security_deposit,cleaning_fee,minimum_nights,review_scores_rating,cancellation_policy,listing_id,number_of_reviews_2015,revenue
0,3075044,within an hour,1.0,f,Roslindale,Apartment,Private room,2,65.0,95.0,10.0,2,94.0,moderate,3075044,17,2210.0
1,6976,within a few hours,1.0,t,Roslindale,Apartment,Private room,2,65.0,324.698212,68.380145,3,98.0,moderate,6976,7,1365.0
2,7651065,within an hour,1.0,t,Roslindale,House,Private room,2,79.0,324.698212,15.0,2,99.0,flexible,7651065,14,2212.0
3,5706985,within an hour,0.98,f,Roslindale,Apartment,Entire home/apt,3,100.0,324.698212,68.380145,1,90.0,strict,5706985,17,1700.0
4,2843445,within a few hours,1.0,t,Roslindale,House,Private room,2,75.0,324.698212,10.0,1,96.0,moderate,2843445,41,3075.0


In [300]:
# Estimated 2015 revenue per listing: price x minimum_nights x number of 2015 reviews.
# NOTE(review): presumably each review stands in for one stay of exactly
# `minimum_nights` nights at the listed price — confirm this proxy is intended.
listing_agg['revenue'] = listing_agg['price']*listing_agg['minimum_nights']*listing_agg['number_of_reviews_2015']

## Regression

In [317]:
listing_agg.head()

Unnamed: 0,id,host_response_time,host_response_rate,host_is_superhost,host_neighbourhood,property_type,room_type,accommodates,price,security_deposit,cleaning_fee,minimum_nights,review_scores_rating,cancellation_policy,listing_id,number_of_reviews_2015,revenue
0,3075044,within an hour,1.0,f,Roslindale,Apartment,Private room,2,65.0,95.0,10.0,2,94.0,moderate,3075044,17,2210.0
1,6976,within a few hours,1.0,t,Roslindale,Apartment,Private room,2,65.0,324.698212,68.380145,3,98.0,moderate,6976,7,1365.0
2,7651065,within an hour,1.0,t,Roslindale,House,Private room,2,79.0,324.698212,15.0,2,99.0,flexible,7651065,14,2212.0
3,5706985,within an hour,0.98,f,Roslindale,Apartment,Entire home/apt,3,100.0,324.698212,68.380145,1,90.0,strict,5706985,17,1700.0
4,2843445,within a few hours,1.0,t,Roslindale,House,Private room,2,75.0,324.698212,10.0,1,96.0,moderate,2843445,41,3075.0


In [353]:
# Work on an independent deep copy so the regression prep leaves listing_agg untouched
df_regress = listing_agg.copy(deep = True)

In [354]:
# Remove the listing identifier and the review count that revenue was computed from
df_regress = df_regress.drop(columns=['id', 'number_of_reviews_2015'])

In [355]:
# One-hot encode each categorical predictor into its own indicator frame
(
    host_response_time,
    host_is_superhost,
    property_type,
    room_type,
    host_neighbourhood,
    cancellation_policy,
) = (
    pd.get_dummies(df_regress[categorical_col])
    for categorical_col in [
        'host_response_time',
        'host_is_superhost',
        'property_type',
        'room_type',
        'host_neighbourhood',
        'cancellation_policy',
    ]
)

In [356]:
# Drop the raw categorical columns; their one-hot encodings are held in separate frames
df_regress = df_regress.drop(columns=[
    'host_response_time',
    'host_is_superhost',
    'property_type',
    'room_type',
    'host_neighbourhood',
    'cancellation_policy',
])


def _standardize(column):
    """Centre a column on its mean and scale it by its standard deviation."""
    return (column - np.mean(column)) / np.std(column)


# Z-score every remaining column
df_regress = df_regress.apply(_standardize)
df_regress.head()

Unnamed: 0,host_response_rate,accommodates,price,security_deposit,cleaning_fee,minimum_nights,review_scores_rating,revenue
0,0.412207,-0.576188,-0.937159,-0.991328,-1.299933,-0.112751,0.270739,-0.274143
1,0.412207,-0.576188,-0.937159,-0.025385,-0.02546,-0.024582,0.756541,-0.355183
2,0.412207,-0.576188,-0.812475,-0.025385,-1.19078,-0.112751,0.877992,-0.273952
3,0.210649,-0.034482,-0.625447,-0.025385,-0.02546,-0.200921,-0.215064,-0.323055
4,0.412207,-0.576188,-0.848099,-0.025385,-1.299933,-0.200921,0.51364,-0.191186


In [357]:
# Put the one-hot indicator columns in front of the standardised numeric columns.
# join='inner' keeps only row labels present in every frame.
df_regress = pd.concat([host_response_time,host_is_superhost,property_type,room_type,host_neighbourhood,cancellation_policy,df_regress], axis=1, join='inner', ignore_index=False)

# The superhost dummies are labelled by the raw flag values 'f' / 't'; give
# them readable names (the previous label "is_spuer_host" was misspelled).
df_regress = df_regress.rename(columns={"f": "not_superhost", "t": "is_superhost"})
df_regress.head()

Unnamed: 0,a few days or more,within a day,within a few hours,within an hour,not_superhost,is_spuer_host,Apartment,Bed & Breakfast,Boat,Condominium,...,strict,super_strict_30,host_response_rate,accommodates,price,security_deposit,cleaning_fee,minimum_nights,review_scores_rating,revenue
0,0,0,0,1,1,0,1,0,0,0,...,0,0,0.412207,-0.576188,-0.937159,-0.991328,-1.299933,-0.112751,0.270739,-0.274143
1,0,0,1,0,0,1,1,0,0,0,...,0,0,0.412207,-0.576188,-0.937159,-0.025385,-0.02546,-0.024582,0.756541,-0.355183
2,0,0,0,1,0,1,0,0,0,0,...,0,0,0.412207,-0.576188,-0.812475,-0.025385,-1.19078,-0.112751,0.877992,-0.273952
3,0,0,0,1,1,0,1,0,0,0,...,1,0,0.210649,-0.034482,-0.625447,-0.025385,-0.02546,-0.200921,-0.215064,-0.323055
4,0,0,1,0,0,1,0,0,0,0,...,0,0,0.412207,-0.576188,-0.848099,-0.025385,-1.299933,-0.200921,0.51364,-0.191186


In [386]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split,cross_val_score

In [387]:
# Split features and label
df_x = df_regress.iloc[:,0:65]
df_y = df_regress.iloc[:,65]

In [391]:
# Split dataset into training set and testing set
X_train, X_test, y_train, y_test = train_test_split(df_x, df_y, test_size=0.2)

# Choose random forest regressor with 10 tress
rf = RandomForestRegressor(n_estimators= 10)

# Train the model on training data
rf.fit(X_train,y_train)

# Assess the model
rf.score(X_test, y_test)

0.6544388733686042

In [395]:
# Rank the features by the fitted forest's importance scores, highest first.
importance = rf.feature_importances_
# Take the names from the matrix the model was trained on, so they always
# line up with `importance` instead of relying on a hard-coded column count.
features = X_train.columns
indices = np.argsort(importance)[::-1]
for rank, feature_pos in enumerate(indices, start=1):
    print("%2d) %-*s %f" % (rank, 30, features[feature_pos], importance[feature_pos]))

 1) minimum_nights                 0.233704
 2) price                          0.183471
 3) Jamaica Plain                  0.112181
 4) review_scores_rating           0.080357
 5) moderate                       0.057344
 6) cleaning_fee                   0.041436
 7) security_deposit               0.037519
 8) is_spuer_host                  0.029207
 9) host_response_rate             0.024953
10) accommodates                   0.020549
11) Entire home/apt                0.019337
12) Back Bay                       0.019139
13) not_superhost                  0.018635
14) East Boston                    0.014830
15) Apartment                      0.011306
16) strict                         0.010197
17) Beacon Hill                    0.010112
18) North End                      0.010010
19) within an hour                 0.008648
20) within a few hours             0.008634
21) South End                      0.007406
22) Roxbury                        0.006809
23) Charlestown                 

## User-based collaborative recommendation

In [None]:
# Pairwise cosine similarity between the rows of the user ratings matrix.
# NOTE(review): `user_raings` is not defined anywhere in the visible notebook
# (possibly a typo for a ratings matrix built elsewhere) — as written this
# cell raises NameError. Confirm the intended input before running.
from sklearn.metrics.pairwise import cosine_similarity
user_similarity = cosine_similarity(user_raings,dense_output=True)
user_similarity

In [None]:
def cosineSimilarity(list1, list2):
    """Return the cosine similarity of two numeric sequences.

    The sequences are paired element by element with zip(), so if their
    lengths differ the extra items of the longer one are ignored.

    Parameters
    ----------
    list1, list2 : iterable of numbers

    Returns
    -------
    float
        dot(list1, list2) / (|list1| * |list2|), or 0.0 when either vector
        has zero magnitude (including empty input), for which the ratio is
        undefined.
    """
    import math  # local import: `math` is not imported at the top of the notebook

    x_y = 0
    x_x = 0
    y_y = 0
    for (x, y) in zip(list1, list2):
        x_y += x * y
        x_x += x ** 2
        y_y += y ** 2

    denominator = math.sqrt(x_x) * math.sqrt(y_y)
    if denominator == 0:
        # Avoid ZeroDivisionError for all-zero or empty vectors.
        return 0.0
    return x_y / denominator

In [None]:
def find_top5_similar_users(user_id, user_similarity):
    """Return the five highest-valued entries of `user_id`'s similarity row.

    Parameters
    ----------
    user_id
        Identifier of the target user.
    user_similarity : pandas.DataFrame
        Must contain a `user_id` column. After selecting the target user's
        row and transposing, a column labelled `user_id` must exist, i.e.
        the selected row's index label has to equal `user_id`.
        NOTE(review): presumably the remaining columns are keyed by user id
        and hold similarity scores — confirm against the code that builds it.

    Returns
    -------
    pandas.DataFrame
        The first five rows of the transposed row, sorted in descending order.
        NOTE(review): every column of the selected row is ranked, including
        the `user_id` column and the user's own entry; filter them out
        upstream if they should not count.
    """
    similarity = user_similarity[user_similarity['user_id']== user_id]
    similarity = similarity.T.sort_values(by = user_id, ascending= False)
    # iloc[0:4] yielded only four rows; five are needed (positions 0-4).
    top5 = similarity.iloc[0:5]
    return top5

In [None]:
# Identify the most similar users and collect the listings each one reviewed.
# NOTE(review): `user_id` must be defined before this cell runs; it is not set
# anywhere in the visible notebook. `user_similarity` must be the DataFrame
# that find_top5_similar_users expects.
# The function returns a transposed row, so the similar users' ids are its
# index labels, not its columns.
similar_users = find_top5_similar_users(user_id, user_similarity).index


def _reviewed_listings(reviewer_id):
    """Return the listing ids reviewed by `reviewer_id` as a plain list."""
    return reviews.loc[reviews['reviewer_id'] == reviewer_id, 'listing_id'].tolist()


# Plain lists (not arrays) so that `+` in the following cell concatenates
# instead of adding element-wise.
user = _reviewed_listings(user_id)
user_1 = _reviewed_listings(similar_users[0])
user_2 = _reviewed_listings(similar_users[1])
user_3 = _reviewed_listings(similar_users[2])
user_4 = _reviewed_listings(similar_users[3])
user_5 = _reviewed_listings(similar_users[4])

In [None]:
# Pool the listings reviewed by each of the five similar users.
# Iterating explicitly works for lists and arrays alike.
union = [
    listing
    for reviewed in (user_1, user_2, user_3, user_4, user_5)
    for listing in reviewed
]

# Drop listings the target user has already reviewed, but keep duplicates:
# value_counts() then reports how many reviews by similar users each
# candidate listing received (de-duplicating first would make every count 1).
already_reviewed = set(user)
rest = [listing for listing in union if listing not in already_reviewed]
rest = pd.Series(rest)
rest.value_counts()