# Capstone: Airbnb Price Listing Prediction
## Part 2 Full EDA and Time Series Analysis

_Authors: Evonne Tham_

##  1. Import Necessary Libraries & Load Data 

In [None]:
import pandas as pd
import numpy as np

# Plots
import matplotlib.pyplot as plt
import seaborn as sns   
%matplotlib inline

# Geographical Analysis
import geopandas as gpd

#Hide warnings
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the listings dataset from disk
final = pd.read_csv('../datasets/final.csv')

# Report the frame's dimensions (rows = listings, columns = features)
n_listings, n_features = final.shape
print(f"Total Number of Listing: {n_listings} | Total Number of Features: {n_features}")

# Transposed preview: one column per listing, one row per feature
final.head().T

---
##  2. Full Exploratory Analysis

##### Price (Target Variable)

In [None]:
# Use the Series' own min/max: vectorised and NaN-aware, unlike the Python builtins
lowest_price, highest_price = final.price.min(), final.price.max()
print(f"Nightly advertised prices range from {lowest_price}¥ to {highest_price}¥.")

In [None]:
# Histogram of nightly prices in the 0 to 50,000 yen range
fig, ax = plt.subplots(figsize=(15, 5))

final.price.hist(bins=100, range=(0, 50_000), color='#FF5A5F', ax=ax)

ax.margins(x=0)  # no horizontal padding around the bars
ax.set_title("Airbnb advertised nightly prices in Tokyo up to 50000¥", fontsize=20)
ax.set_xlabel("Price (¥)", fontsize=15)
ax.set_ylabel("Number of listings", fontsize=15);

In [None]:
# Distribution of prices above 50_000 yen (the long right tail)
plt.figure(figsize=(15,5))

# Series.max() skips NaN, so the histogram range stays valid even with missing prices
final.price.hist(bins=100,
                 range=(50_000, final.price.max()),
                 color ='#FF5A5F')

plt.title("Airbnb advertised nightly prices in Tokyo above 50000¥", fontsize=20)
plt.xlabel("Price (¥)", fontsize=15)
plt.ylabel("Number of listings", fontsize=15);

In [None]:
plt.figure(figsize =(15,5))

# Pass the prices as `x=` explicitly: recent seaborn releases no longer treat a
# positional vector as x, and the horizontal layout is what the x-label describes.
sns.boxplot(x=final.price, color = '#FF5A5F')
plt.title('Price Boxplot With Extreme Outliers', fontsize=20)
plt.xlabel('Price (¥)', fontsize = 15);

_There are quite a number of listings priced up to 100000 yen, so I will leave those untouched and drop only the listings priced above that, which occur far less frequently._


<img src="../image/listing_over_100000.png"
	title="Listing over 100000¥" width="1050" height="1000" />

In [None]:
# Remove every listing whose nightly price exceeds 100,000 yen
overpriced_idx = final.index[final['price'] > 100_000]
final.drop(index=overpriced_idx, inplace=True)

In [None]:
plt.figure(figsize =(15,5))

# Pass the prices as `x=` explicitly: recent seaborn releases no longer treat a
# positional vector as x, and the horizontal layout is what the x-label describes.
sns.boxplot(x=final.price, color = '#FF5A5F')
plt.title('Price Boxplot Without Extreme Outliers', fontsize=20)
plt.xlabel('Price (¥)', fontsize = 15);

<div class="alert alert-block alert-info">

<b>Observation:</b> Listing prices have a right skewed distribution
</div>

---
##### Host

In [None]:
def binary_count_and_price_plot(col, figsize=(15,4)):
    """
    Plot two side-by-side bar charts for a binary column of `final`.

    The left chart shows the number of listings in each category and the right
    chart shows the median nightly price (in yen) of each category.

    Parameters
    ----------
    col : str
        Name of the column of the module-level `final` DataFrame to group by.
    figsize : tuple, optional
        Figure size forwarded to `plt.subplots`. Defaults to (15, 4).

    Notes
    -----
    The tick labels are hard-coded as 'false' / 'true', so the column is assumed
    to hold exactly two categories whose sorted order is false-then-true
    (TODO confirm for every column this is called with).
    """
    palette = ['#FF5A5F', '#00A699']
    grouped = final.groupby(col)

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=figsize)
    fig.suptitle(col, fontsize=16, y=1)
    plt.subplots_adjust(top=0.80) # So that the suptitle does not overlap with the ax plot titles

    # Left panel: number of listings per category
    grouped.size().plot(kind='bar', ax=ax1, color=palette)
    ax1.set_xticklabels(labels=['false', 'true'], rotation=0)
    ax1.set_title('Category count')
    ax1.set_xlabel('')

    # Right panel: median nightly price per category; prices are in yen, not pounds
    grouped.price.median().plot(kind='bar', ax=ax2, color=palette)
    ax2.set_xticklabels(labels=['false', 'true'], rotation=0)
    ax2.set_title('Median price (¥)')
    ax2.set_xlabel('')

    plt.show()

# Counts and median price for superhost status, then the share of each category
binary_count_and_price_plot('host_is_superhost')
superhost_share = final.host_is_superhost.value_counts(normalize=True)
print(superhost_share)

In [None]:
# Counts and median price for identity verification, then the share of each category
binary_count_and_price_plot('host_identity_verified')
verified_share = final.host_identity_verified.value_counts(normalize=True)
print(verified_share)

<div class="alert alert-block alert-info">

<b>Observation:</b> About 37% of hosts are superhosts and about 20% of hosts are verified. However, it does not seem to improve the median price per night of their Airbnb listings.
</div>

---
##### Host Listing Count

In [None]:
# Histogram of the host_listings_count column, 30 bins
fig, ax = plt.subplots(figsize=(16, 5))
ax.hist(final['host_listings_count'], bins=30, color='#FF5A5F')
ax.set_title('Distribution of Host Listings Count', fontsize=20)
ax.set_xlabel('Number of Listing a Host has', fontsize=15)
ax.set_ylabel('Number of Host', fontsize=15);

In [None]:
# Inspect the rows whose host reports zero listings
final.loc[final['host_listings_count'] == 0]

In [None]:
# Remove the rows whose host reports zero listings, then show the new dimensions
zero_listing_idx = final.index[final['host_listings_count'] == 0]
final.drop(index=zero_listing_idx, inplace=True)
final.shape

In [None]:
# Summary statistics of how many listings each row's host has
listings_per_host = final.host_listings_count
median_listings = int(listings_per_host.median())
mean_listings = int(round(listings_per_host.mean()))

# Percentage of rows whose host has exactly one listing
n_single = int((listings_per_host == 1).sum())
single_pct = int(round(100 * n_single / len(final)))

print("Median number of listings per host:", median_listings)
print("Mean number of listings per host:", mean_listings)
print(f"{single_pct}% of listings are from hosts with one listing.")

<div class="alert alert-block alert-info">

<b>Assumption:</b> Listings whose host has a listing count of zero are dropped, on the assumption that these users signed up as hosts but have yet to list any property. Hence, dropping them will not affect the prediction. 

<b>Observation:</b> According to the numbers above, it seems that hosts in Tokyo manage quite a number of properties. 

</div>

In [None]:
# Ten host ids that appear most often in `final`
top_host = final.host_id.value_counts().head(10)

# Styling and text used by the chart
transparency = 0.7
color = ['#FF5A5F', '#00A699', '#FC642D']
title = 'Hosts with the most listings in Tokyo'
xlab = 'Hosts'
ylab = 'Listing Count'

fig = plt.figure(1, figsize=(16,5))
ax = fig.add_subplot(111)

# Draw onto the axes created above
top_host.plot(kind='bar', ax=ax, color=color, width=0.9, title=title)

# Strip the grid and frame
ax.grid(False)
ax.set_frame_on(False)

# Left-aligned title, lifted above the x-axis label that sits on top
ax.set_title(ax.get_title(), fontsize=20, alpha=transparency, ha='left')
plt.subplots_adjust(top=0.9)
ax.title.set_position((0,1.1))

# x-axis label goes on top, left-aligned; tick labels stay horizontal
ax.xaxis.set_label_position('top')
ax.set_xlabel(xlab, fontsize=15, alpha=transparency, ha='left')
ax.xaxis.set_label_coords(0, 1.01)
plt.xticks(rotation=0)

ax.set_ylabel(ylab, fontsize=15, alpha=transparency);

<div class="alert alert-block alert-info">

<b>Observation:</b> These are the hosts with the most listings in Tokyo. Host 75820284 has over 80 listings. 


</div>

---
##### Neighbourhood

In [None]:
# Number of rows of `final` per neighbourhood, held in a one-column frame.
# The column is called 'total_hosts' although each row of `final` is a listing.
neighbourhood_count = (
    final['neighbourhood']
    .value_counts()
    .to_frame('total_hosts')
    # Keep only neighbourhoods with more than 20 listings
    .loc[lambda counts: counts['total_hosts'] > 20]
    # Alphabetical order by neighbourhood name
    .sort_index(ascending=True)
)

In [None]:
# Styling and text used by the chart
transparency = 0.7
cmap = sns.color_palette("autumn")
title = "Distribution of listings based on neighbourhood in Tokyo"
xlab = 'Total number of listings'
ylab = 'Neighbourhood in Tokyo'

fig = plt.figure(1, figsize=(20,25))
ax = fig.add_subplot(111)

# One horizontal bar per neighbourhood
neighbourhood_count['total_hosts'].plot(kind='barh', ax=ax, color=cmap,
                                        title=title, width=0.9)

# Strip the grid and frame
ax.grid(False)
ax.set_frame_on(False)

# Left-aligned title, lifted above the top x-axis
ax.set_title(ax.get_title(), fontsize=26, alpha=transparency, ha='left')
plt.subplots_adjust(top=0.9)
ax.title.set_position((0,1.04))

# x-axis label and ticks go on top of the chart
ax.xaxis.set_label_position('top')
ax.set_xlabel(xlab, fontsize=20, alpha=transparency, ha='left')
ax.xaxis.set_label_coords(0, 1.02)
ax.xaxis.tick_top()

ax.set_ylabel(ylab, fontsize=20, alpha=transparency);

<div class="alert alert-block alert-info">

<b>Observation:</b> In the chart above, Shinjuku Ku, Taito Ku, Toshima Ku, Sumida Ku, and Shibuya Ku have the highest number of listings, with about 2000 listings in Shinjuku Ku alone. This is not surprising as Shinjuku Ku is one of the most popular spots to visit as a tourist.


</div>

In [None]:
# Price distribution per neighbourhood
plt.figure(1, figsize=(15,5))
# x and y must be given by keyword: recent seaborn releases reject positional x/y
sns.violinplot(x='neighbourhood', y='price', data=final)
plt.xticks(rotation=90);  # neighbourhood names are long, so print them vertically

---
##### Property Type

In [None]:
# Styling and text used by the chart
transparency = 0.7
color = sns.color_palette("autumn")
title = 'Property Type'
ylab = 'Listing Count'

fig = plt.figure(1, figsize=(16,5))
ax = fig.add_subplot(111)

# Number of rows per property type, most frequent first
property_counts = final['property_type'].value_counts()
property_counts.plot(kind='bar', ax=ax, color=color, width=0.9, title=title)

# Strip the grid and frame
ax.grid(False)
ax.set_frame_on(False)

# Left-aligned title, nudged upwards
ax.set_title(ax.get_title(), fontsize=20, alpha=transparency, ha='left')
plt.subplots_adjust(top=0.9)
ax.title.set_position((0,1.04))

ax.set_ylabel(ylab, fontsize=15, alpha=transparency);

In [None]:
# Fraction of rows falling into each property type
property_share = final['property_type'].value_counts(normalize=True)
print(property_share)

<div class="alert alert-block alert-info">

<b>Observation:</b> Most common listings property type: Apartment (More than 61%)


</div>

---
##### Room Type

In [None]:
fig = plt.figure(1, figsize=(15,5))
ax = fig.add_subplot(111)

title = 'Room Type'

transparency = 0.7
color = sns.color_palette("autumn")

# Number of rows per room type, drawn explicitly onto the axes created above
final['room_type'].value_counts().plot(kind='bar',
                                       ax = ax,
                                       color = color,
                                       width = 0.9,
                                       title = title)

# Strip the grid and frame
ax.grid(False)
ax.set_frame_on(False)

# Left-aligned title, nudged upwards; tick labels stay horizontal
ax.set_title(ax.get_title(), fontsize=20, alpha=transparency, ha='left')
plt.subplots_adjust(top=0.9)
ax.title.set_position((0,1.04))
plt.xticks(rotation=0)

# The trailing semicolon suppresses the text repr; the stray print() is gone
ylab = 'Listing Count'
ax.set_ylabel(ylab, fontsize=15, alpha=transparency);

In [None]:
# Fraction of rows falling into each room type
room_share = final['room_type'].value_counts(normalize=True)
print(room_share)

<div class="alert alert-block alert-info">

<b>Observation:</b> Out of more than 11000 listings, about 8000 are entire_home_apt and only 20% are private rooms. 

</div>

---
##### Property Vs Room Type

In [None]:
# Count listings for every (property_type, room_type) pair; room types become columns
property_room = final.groupby(['property_type','room_type']).room_type.count().unstack()

# Total listings per property type, summed over every room-type column
# (not just the first four, so any number of room types is handled)
row_totals = property_room.sum(axis=1)

# Keep property types with at least 100 listings, ordered by ascending total
kept_types = row_totals[row_totals >= 100].sort_values().index
property_room = property_room.loc[kept_types]

In [None]:
color = sns.color_palette("autumn")

# Stacked horizontal bars: one bar per property type, one segment per room type
ax = property_room.plot(kind='barh', stacked=True, color=color,
                        width=0.9, figsize=(13,5), alpha=0.8)

ax.set_title('Property vs Room Type', fontsize=20)
ax.set_xlabel('Number of listings', fontsize=14)
ax.set_ylabel("Property Type", fontsize=15);

<div class="alert alert-block alert-info">

<b>Observation:</b> As shown in the figure above, most listings are categorised as apartments which comes in 3 top room types which are entire home, hotel room, and private room


</div>

---
##### Accommodates

In [None]:
fig = plt.figure(1, figsize=(16,5))
ax = fig.add_subplot(111)

# Spelling corrected: "Accommodates", matching the column name
title = 'Accommodates (Number of People)'

transparency = 0.7
color = sns.color_palette("autumn")

# Number of rows per guest capacity, ordered by capacity and drawn onto `ax` explicitly
final['accommodates'].value_counts().sort_index().plot(kind='bar',
                                             ax = ax,
                                             color = color,
                                             width = 0.9,
                                             title = title)

# Strip the grid and frame
ax.grid(False)
ax.set_frame_on(False)

# Left-aligned title, nudged upwards; tick labels stay horizontal
ax.set_title(ax.get_title(), fontsize=20, alpha=transparency, ha='left')
plt.subplots_adjust(top=0.9)
ax.title.set_position((0,1.04))
plt.xticks(rotation=0)

ylab = 'Listing Count'
ax.set_ylabel(ylab, fontsize=15, alpha=transparency);

<div class="alert alert-block alert-info">

<b>Observation:</b> Most listings in Tokyo are for 2-4 people

</div>

In [None]:
plt.figure(figsize=(15, 10))

color = '#FF5A5F'

# Mean nightly price for each value of `accommodates`, one horizontal bar per value
final.groupby('accommodates').price.mean().plot.barh(color = color,
                                                     width=0.9)

plt.title("Average price for accommodating different number of guests", fontsize=20)
plt.xlabel('Average price (¥)', fontsize=15)
# The bars are grouped by guest capacity, not by neighbourhood
plt.ylabel('Number of guests accommodated', fontsize=15);

<div class="alert alert-block alert-info">

<b>Observation:</b>


</div>

---
##### Rooms

In [None]:
# One histogram per room-related column
room_cols = ['bathrooms', 'bedrooms', 'beds']
final[room_cols].hist(figsize=(8, 6));

##### Review_Scores

In [None]:
# Every column whose name begins with "review_scores"
is_review_col = final.columns.str.startswith("review_scores")
review_col = list(final.columns[is_review_col])

# One histogram per review-score column, laid out on a 3x3 grid
fig = plt.figure(figsize=(12,8))
for position, var_name in enumerate(review_col, start=1):
    ax = fig.add_subplot(3, 3, position)
    final[var_name].hist(bins=10, ax=ax, color='#FF5A5F')
    ax.set_title(var_name)

fig.tight_layout()

In [None]:
# Ten rows with the highest review counts, alongside the id of their host.
# The earlier version grouped by number_of_reviews and *summed* host_id, which adds
# identifiers together whenever two listings share a review count and aggregates
# every other column for no reason.
most_reviewed = (
    final[['host_id', 'number_of_reviews']]
    .nlargest(10, 'number_of_reviews')
    .rename(columns={'number_of_reviews': 'total_reviews'})
    .reset_index(drop=True)
)
most_reviewed

---
## 3. Geographical Analysis

In [None]:
# Read the neighbourhood geojson into a GeoDataFrame and preview it
geojson_path = '../datasets/japan/neighbourhoods.geojson'
tokyo_map = gpd.read_file(geojson_path)
tokyo_map.head()

In [None]:
# Discard the neighbourhood_group column (noted as None by the original author)
tokyo_map = tokyo_map.drop(columns='neighbourhood_group')

# Per-neighbourhood listing count and median nightly price
price_by_area = final.groupby('neighbourhood').price
neighbourhood_df = pd.DataFrame({
    'total_listings': price_by_area.size(),
    'median_price': price_by_area.median(),
})

# Attach the statistics to the map, matching on neighbourhood name
tokyo_map = tokyo_map.set_index('neighbourhood').join(neighbourhood_df)
tokyo_map.head()

In [None]:
# Map shaded by the total_listings column of each neighbourhood
fig1, ax1 = plt.subplots(figsize=(15, 6))
tokyo_map.plot(column='total_listings', cmap='Blues', ax=ax1)
ax1.set_title('Number of Airbnb listings in neighbourhood', fontsize=14);

In [None]:
# Longitude goes on the x-axis and latitude on the y-axis so the points are laid out
# like a map (north up, east right) instead of being transposed.
plt.figure(figsize=(15,8))
sns.scatterplot(x = 'longitude',
                y = 'latitude',
                hue = 'property_type',
                data = final,
                alpha = 0.7)
plt.title("Property Type", fontsize=20);

<div class="alert alert-block alert-info">

<b>Observation:</b> Judging from this scatter plot, there are some listings that are well outside the norm (based on latitude and longitude). Hence I will be dropping those.

</div>

In [None]:
# Remove the rows whose latitude is below 35
too_far_south_idx = final.index[final.latitude < 35]
final.drop(index=too_far_south_idx, inplace=True)

In [None]:
# Scatter of listing locations coloured by property type.
# Longitude goes on the x-axis and latitude on the y-axis so the points are laid out
# like a map (north up, east right) instead of being transposed.
plt.figure(figsize=(15,8))
sns.scatterplot(x = 'longitude',
                y = 'latitude',
                hue = 'property_type',
                data = final,
                alpha = 0.7)
plt.title("Property Type", fontsize=20);

In [None]:
# Scatter of listing locations coloured by room type.
# Longitude goes on the x-axis and latitude on the y-axis so the points are laid out
# like a map (north up, east right) instead of being transposed.
plt.figure(figsize=(15,9))
sns.scatterplot(x = 'longitude',
                y = 'latitude',
                hue = 'room_type',
                data = final)
plt.title('Room Type', fontsize=20);

In [None]:
# Scatter of listing locations coloured by availability_365.
# Longitude goes on the x-axis and latitude on the y-axis so the points are laid out
# like a map (north up, east right) instead of being transposed.
plt.figure(figsize=(15,8))
sns.scatterplot(x = 'longitude',
                y = 'latitude',
                hue = 'availability_365',
                data = final,
                alpha = 0.7)
plt.title("Availability 365", fontsize=20);

---
##  4. Time Series Analysis

##### a. Time Since First Review

In [None]:
# NOTE(review): this step is currently disabled, so the section produces no output.
# Subtracting the two columns presumably requires them to be parsed as datetimes
# first (they are loaded from a CSV) - TODO confirm before re-enabling.
# final['time_since_first_review'] = final.last_review - final.first_review
# final.time_since_first_review.hist(figsize=(11,9), bins=30);

---
##### Save Final Dataframe

In [None]:
# Write the filtered frame to disk without the index column
output_path = "../datasets/final_df.csv"
final.to_csv(output_path, index=False)

----> Proceed to the next notebook for Feature Engineering and Model Benchmarking