# Does the neighborhood contribute to Airbnb ratings?

In Hong Kong, Airbnb is becoming popular. According to Inside Airbnb, there are over eight thousand listings in business. Guests can leave ratings after their stay. Newcomers to Airbnb should understand the market before investing in a property, and they should be aware of the neighborhood around that property.

This report studies the existing listings and recommends neighborhoods that are worth investing in, with a focus on the relationship between neighborhood and ratings. The Foursquare API contributes to the research through its broad database of locations.

K-means clustering is used to group the listings into clusters for further study.

The Foursquare API then provides the popular venues around each cluster for further study.

In [None]:
import pandas as pd
import numpy as np
# Raise the pandas display limits so long and wide frames are printed in full.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 200)

# Root folder of the notebook's working files (the trailing separators are
# part of the value).
basePath = r"E:\JUPYTER_NOTEBOOK\week 6 ffinal assignment machine learning of python\\"
print(basePath)

#### Define Foursquare Credentials and Version

In [None]:
# Foursquare API credentials; replace the placeholders with real values
# before running the venue-lookup cells.
CLIENT_ID = 'xxx'  # Foursquare ID
CLIENT_SECRET = 'xxx'  # Foursquare Secret
VERSION = '20190401'  # Foursquare API version

# Confirm the credentials are set without writing the secret itself into the
# notebook output, which is saved and shared together with the code.
print('Credentials:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET: ' + '*' * len(CLIENT_SECRET))

#### Inside Airbnb data

In [None]:
# http://data.insideairbnb.com/china/hk/hong-kong/2019-03-11/visualisations/listings.csv
# Load the Inside Airbnb listings export, then show its size and first row.
listings_csv = r"E:\JUPYTER_NOTEBOOK\week 6 ffinal assignment machine learning of python\Airbnb\listings.csv"
df_list = pd.read_csv(listings_csv)
print(df_list.shape)
df_list.head(1)

In [None]:
# Drop unnecessary columns
# Columns kept for the analysis; everything else in the export is dropped.
keep_columns = [
    'id', 'host_since', 'host_response_rate', 'host_acceptance_rate',
    'host_neighbourhood', 'host_listings_count', 'host_total_listings_count',
    'street', 'neighbourhood', 'neighbourhood_cleansed', 'smart_location',
    'latitude', 'longitude', 'property_type', 'room_type', 'accommodates',
    'bathrooms', 'bedrooms', 'beds', 'bed_type', 'price', 'security_deposit',
    'cleaning_fee', 'guests_included', 'extra_people', 'minimum_nights',
    'maximum_nights', 'number_of_reviews', 'first_review', 'last_review',
    'review_scores_rating', 'review_scores_accuracy',
    'review_scores_cleanliness', 'review_scores_checkin',
    'review_scores_communication', 'review_scores_location',
    'review_scores_value', 'reviews_per_month',
]
# .copy() makes df_list an independent frame, so the column assignments done
# by the cleaning cells do not operate on a slice of the raw CSV frame
# (which triggers pandas' SettingWithCopyWarning).
df_list = df_list[keep_columns].copy()

In [None]:
# Turn text into numbers
def cleanMoney(col):
    """Convert a money-formatted text column of the global ``df_list`` to int.

    Thousands separators and dollar signs are stripped from the text values,
    the remainder is parsed as a float and then cast to an integer (the
    fractional part is dropped). Missing values become -1.

    Args:
        col: Name of the ``df_list`` column to convert; the column is
            replaced in ``df_list``.
    """
    # regex=False: '$' is a regex anchor, so it has to be matched literally.
    amounts = (
        df_list[col]
        .str.replace(',', '', regex=False)
        .str.replace('$', '', regex=False)
        .astype(float)
    )
    # Assign the filled result back instead of calling
    # df_list[col].fillna(..., inplace=True): that acts on a temporary Series
    # and is not guaranteed to update the frame, which would leave NaN in
    # place and make the integer cast fail.
    df_list[col] = amounts.fillna(-1).astype(int)

# Convert every money-formatted column, in the same order as before.
for money_col in ('price', 'security_deposit', 'cleaning_fee', 'extra_people'):
    cleanMoney(money_col)

In [None]:
# Replace missing review scores with -1 and cast the scores to integers
def floatToInt(col):
    """Cast a numeric column of the global ``df_list`` to int.

    Missing values are replaced with -1 first, because NaN cannot be cast to
    an integer.

    Args:
        col: Name of the ``df_list`` column to convert; the column is
            replaced in ``df_list``.
    """
    # Assign the filled result back instead of calling
    # df_list[col].fillna(..., inplace=True): that acts on a temporary Series
    # and is not guaranteed to update the frame.
    df_list[col] = df_list[col].fillna(-1).astype(int)

# Cast every review-score column to int, in the same order as before.
review_score_columns = (
    'review_scores_rating',
    'review_scores_accuracy',
    'review_scores_cleanliness',
    'review_scores_checkin',
    'review_scores_communication',
    'review_scores_location',
    'review_scores_value',
)
for score_col in review_score_columns:
    floatToInt(score_col)

In [None]:
# Remove listings without ratings
# Report the frame size before and after keeping only rows whose overall
# rating is positive.
print(df_list.shape)
has_rating = df_list['review_scores_rating'] > 0
df_list = df_list[has_rating]
print(df_list.shape)

In [None]:
# Per-neighbourhood overview: listing counts, then mean and minimum rating
# (each sorted ascending).
ratings_by_area = df_list.groupby('neighbourhood_cleansed')['review_scores_rating']

print(df_list.neighbourhood_cleansed.value_counts(dropna=False))
print("=======================================================")
print(ratings_by_area.mean().sort_values())
print("=======================================================")
print(ratings_by_area.min().sort_values())

===
In this example, when there is no correlation between two variables (the correlation is 0 or near 0) the color is gray. The darkest red means there is a perfect positive correlation, while the darkest blue means there is a perfect negative correlation.

In [None]:
# Heatmap showing correlation of dependent variables against independent variable (rating)
import seaborn as sns
import matplotlib.pyplot as plt

# Correlate the numeric columns only: df_list still carries non-numeric
# columns (e.g. 'street', 'room_type'), and DataFrame.corr() raises on those
# in pandas >= 2.0 instead of silently skipping them.
corr_matrix = df_list.select_dtypes(include=[np.number]).corr()
# Pairs with an undefined correlation come back as NaN; plot them as 0.
corr_matrix = corr_matrix.fillna(0)

plt.figure(figsize=(20, 20))

# Diverging palette centred on 0 over the full [-1, 1] range, matching the
# scale described in the notebook text: neutral at 0, red towards +1 and
# blue towards -1.
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt=".1f",
    annot_kws={'size': 11},
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    center=0,
)

plt.xlabel('x', fontsize=17)
plt.ylabel('y', fontsize=17)

plt.show()

##### === Location coordinates ===

### Cluster Neighborhoods

Run k-means to group the neighborhoods into 16 clusters.

In [None]:
from pandas.io.json import json_normalize
from sklearn.cluster import KMeans
# Number of clusters to fit.
kclusters = 16

# Build the estimator first, then fit it; fit() returns the estimator itself,
# so `kmeans` ends up as the fitted model. random_state pins the seed.
# NOTE(review): df_location_lat_lng is not defined in the visible cells --
# presumably the listings' latitude/longitude columns; confirm.
kmeans = KMeans(n_clusters=kclusters, random_state=0, max_iter=1200)
kmeans.fit(df_location_lat_lng)

In [None]:
# One row per fitted cluster centre, with its label moved from the index into
# a 'ClusterLabel' column.
# NOTE(review): the column names assume the model was fitted on
# (latitude, longitude) in that order, and arr_labels is defined outside the
# visible cells -- confirm both.
centers = pd.DataFrame(
    data=kmeans.cluster_centers_,
    columns=['Latitude', 'Longitude'],
    index=arr_labels,
)
df_clus_center = centers.rename_axis('ClusterLabel').reset_index()
print(df_clus_center.shape)
df_clus_center

In [None]:
# create map for cluster center
# NOTE: `fol`, `cm`, `colors`, `latitude` and `longitude` are defined by the
# folium/geocoding cell that appears later in this notebook; that cell has to
# be run before this one.
map_clusters = fol.Map(location=[latitude, longitude], zoom_start=11)

# one distinct colour per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add one marker per cluster centre; the centres live in df_clus_center,
# which is the frame that has the Latitude/Longitude/ClusterLabel columns
for lat, lon, cluster in zip(df_clus_center['Latitude'],
                             df_clus_center['Longitude'],
                             df_clus_center['ClusterLabel']):
    label = fol.Popup('Cluster ' + str(cluster), parse_html=True)
    fol.Circle(
        [lat, lon],
        radius=5,
        popup=label,
        # cluster-1 is kept from the original colour mapping; for label 0 it
        # wraps around to the last colour
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

#### Display coordinates of the 16 clusters on the map

In [None]:
import folium as fol
import matplotlib.cm as cm
import matplotlib.colors as colors

from geopy.geocoders import Nominatim

# Look up the coordinates used to centre the folium maps.
address = 'Hong Kong, HK'

geolocator = Nominatim(user_agent="To_explorer")
location = geolocator.geocode(address)
# geocode() returns None when the service finds no match; fail with a clear
# message instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geographical coordinate of HK are {}, {}.'.format(latitude, longitude))

In [None]:
# create map
map_clusters = fol.Map(location=[latitude, longitude], zoom_start=11)

# one distinct colour per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add one marker per row of df_a, coloured by its cluster and labelled with
# its neighbourhood
# NOTE(review): df_a is not defined in the visible cells -- presumably the
# listings frame with the k-means labels attached; confirm.
for lat, lon, poi, cluster in zip(df_a['latitude'], df_a['longitude'],
                                  df_a['neighbourhood_cleansed'],
                                  df_a['Cluster Labels']):
    label = fol.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    fol.Circle(
        [lat, lon],
        radius=5,
        popup=label,
        # cluster-1 is kept from the original colour mapping; for label 0 it
        # wraps around to the last colour
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

## Get Foursquare popular places for each cluster

In [None]:
import requests

def getNearbyVenues(names, latitudes, longitudes, radius=2000, LIMIT=1000):
    """Fetch Foursquare "explore" venues around each given coordinate.

    One request is sent per (name, latitude, longitude) triple, using the
    module-level CLIENT_ID, CLIENT_SECRET and VERSION.

    Args:
        names: Labels identifying each location, aligned with the coordinates.
        latitudes: Latitude of each location.
        longitudes: Longitude of each location.
        radius: Value sent as the API's ``radius`` parameter.
        LIMIT: Value sent as the API's ``limit`` parameter.

    Returns:
        A DataFrame with one row per venue and the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'. The frame
        is empty, but still has these columns, when no venue was found.

    Raises:
        requests.HTTPError: If the API answers with an error status.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # Let requests build and escape the query string.
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        }
        # The timeout keeps a stalled connection from hanging the notebook.
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params=params,
            timeout=30,
        )
        # Surface API errors (bad credentials, quota) as an HTTPError rather
        # than as a KeyError while digging through the payload.
        response.raise_for_status()

        groups = response.json().get('response', {}).get('groups', [])
        items = groups[0]['items'] if groups else []

        # Keep only the relevant information for each nearby venue.
        for item in items:
            venue = item['venue']
            # A venue may come without any category; record None then.
            categories = venue.get('categories') or []
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None,
            ))

    # Passing the columns to the constructor keeps the schema even when no
    # rows were collected; assigning .columns afterwards fails on an empty
    # frame.
    return pd.DataFrame(rows, columns=columns)

In [None]:
# Look up the venues around every cluster centre.
hk_venues = getNearbyVenues(
    df_clus_center['ClusterLabel'],
    df_clus_center['Latitude'],
    df_clus_center['Longitude'],
)

In [None]:
# create map
map_clusters = fol.Map(location=[latitude, longitude], zoom_start=11)

# one distinct colour per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add one marker per venue, coloured by the cluster it was fetched for
# (hk_venues['Neighborhood'] holds the cluster label passed to
# getNearbyVenues)
for lat, lon, poi, cluster in zip(hk_venues['Venue Latitude'],
                                  hk_venues['Venue Longitude'],
                                  hk_venues['Venue'],
                                  hk_venues['Neighborhood']):
    label = fol.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    fol.Circle(
        [lat, lon],
        radius=5,
        popup=label,
        # cluster-1 is kept from the original colour mapping; for label 0 it
        # wraps around to the last colour
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

In [None]:
# Report how many distinct venue categories were returned.
category_count = len(hk_venues['Venue Category'].unique())
print('There are {} uniques categories.'.format(category_count))