# Data visualization of Boston and Seattle Airbnb Open Data

### Data visualization using Plotly library
### This notebook is built to test Plotly libraries 

In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
import re

Collecting Data - Reading csv files

We will first load and analyse data from Boston collected in 2020-02-13 <br>
The dataset was downloaded from the link https://www.kaggle.com/airbnb/boston and <br> 
https://www.kaggle.com/airbnb/seattle/data 


In [2]:
def clean_data(df):
    """ Clean original data

    Arguments: 
    df: Pandas DataFrame

    Return:
    df: Pandas DataFrame after data cleaning
    """
    # Transform monetary columns that is originaly a string to float. <br>
    df['price'] = df['price'].astype(str) \
                             .apply(lambda x: re.sub('[$,]', '', x) if x is not None else x) \
                             .astype(float)


    # drop columns with less than 25% of the data filled
    thresh=df.shape[0]*.25
    df = df.dropna(thresh=thresh, axis=1)

    # remove the upper limit quantile
    upper_1_5_IQR = df.groupby('neighbourhood_cleansed')['price'].agg(
            lambda x: np.quantile(x, 0.75) + (np.quantile(x, 0.75) - np.quantile(x, 0.25))*1.5
            )
    upper_1_5_IQR = upper_1_5_IQR.to_dict()

    df['upper_1_5_IQR'] = df['neighbourhood_cleansed'].map(upper_1_5_IQR)

    df = df[df['price']<=df['upper_1_5_IQR']]
    df = df.drop(columns='upper_1_5_IQR')
    
    return df

In [3]:
df_boston_listings = pd.read_csv('../data/boston/old/listings.csv')
df_boston_listings.shape[0]

3585

In [4]:
df_boston_listings = clean_data(df_boston_listings)
df_boston_listings.shape[0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['upper_1_5_IQR'] = df['neighbourhood_cleansed'].map(upper_1_5_IQR)


3383

In [5]:
hue='require_guest_phone_verification'
prices_by_accommodates = df_boston_listings.groupby(['accommodates', hue])['price'].mean().reset_index()

title='Average Price by total people the house accommodates'
fig = px.line(prices_by_accommodates, x="accommodates", y="price", color=hue,
              title=title,labels={'price':'Average Price','require_guest_phone_verification':'guest_phone_verification'}
              )
fig.show()

In [6]:
title='Average Price by Neighbourhood'
mean_by_neighbourhood = df_boston_listings.groupby('neighbourhood_cleansed')['price'].mean().sort_values().reset_index()
px.bar(mean_by_neighbourhood, x='price', y='neighbourhood_cleansed', 
       orientation='h', title=title,labels={'price':'Average Price', 'neighbourhood_cleansed':''})

In [7]:
title = 'Charactiristics that influence price'
price_correlation = df_boston_listings.corr()['price'].sort_values().iloc[:-2].reset_index()
px.bar(price_correlation, x='price', y='index', 
       orientation='h', title=title,
       labels={'price':'Correlation Rate', 'index':''})

### Loading Seattle dataset

In [8]:
# https://www.kaggle.com/airbnb/seattle/data

df_seattle_listings = pd.read_csv('../data/seattle/old/listings.csv')

In [9]:
df_seattle_listings = clean_data(df_seattle_listings)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



### Plot comparative graphics

In [10]:
df_boston_listings['city_cleansed'] = 'Boston'
df_seattle_listings['city_cleansed'] = 'Seattle'

seattle_boston = df_boston_listings[['price', 'city_cleansed']].append(df_seattle_listings[['price', 'city_cleansed']])


In [11]:
title = 'Listings counts by price <br>Each color is one city'

fig = px.histogram(seattle_boston, x="price", color="city_cleansed", nbins=45,
                   color_discrete_sequence=px.colors.qualitative.Plotly,
                   title=title,labels={'price':'Price','city_cleansed':'City'}
                   )
fig.update_layout(barmode='overlay')
fig.update_traces(opacity=0.85)
fig.show()