## Data Science Capstone Week 4

Goal: find the most suitable neighbourhoods to live in, in New York and in Toronto, based on the characteristics of each neighbourhood (the kinds of venues found nearby).

### Import packages

In [78]:
import json # library to handle JSON files
import os # read API credentials from the environment
from urllib.request import urlopen

import folium # map rendering library
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np # library to handle data in a vectorized manner
import pandas as pd # library for data analysis
import requests # library to handle requests
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim
from pandas.io.json import json_normalize # transform JSON file into a pandas dataframe
from sklearn.cluster import KMeans

### Data Preparation - New York

In [79]:
# Download the New York neighbourhood JSON file into the working directory as 'newyork_data.json'.
!curl https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DS0701EN-SkillsNetwork/labs/newyork_data.json --output 'newyork_data.json'

# Parse the downloaded JSON file; each entry under 'features' describes one neighbourhood.
with open('newyork_data.json', 'r') as json_data:
    newyork_data = json.load(json_data)

neighborhoods_data = newyork_data['features']

# Collect every row first and build the frame in a single call.
# Growing a frame with DataFrame.append inside a loop copies the whole frame on
# each iteration, and DataFrame.append no longer exists in pandas >= 2.0.
neighborhoods = pd.DataFrame(
    [
        {
            'Borough': data['properties']['borough'],
            'Neighbourhood': data['properties']['name'],
            # 'coordinates' is read as [longitude, latitude]
            'Latitude': data['geometry']['coordinates'][1],
            'Longitude': data['geometry']['coordinates'][0],
        }
        for data in neighborhoods_data
    ],
    columns=['Borough', 'Neighbourhood', 'Latitude', 'Longitude'],
)



  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  113k  100  113k    0     0  43361      0  0:00:02  0:00:02 --:--:-- 43361


In [80]:
# Preview the first rows of the New York neighbourhood table.
neighborhoods.head()

Unnamed: 0,Borough,Neighbourhood,Latitude,Longitude
0,Bronx,Wakefield,40.894705,-73.847201
1,Bronx,Co-op City,40.874294,-73.829939
2,Bronx,Eastchester,40.887556,-73.827806
3,Bronx,Fieldston,40.895437,-73.905643
4,Bronx,Riverdale,40.890834,-73.912585


### Data Preparation - Toronto

In [81]:
# Scrape the first table of the Wikipedia page listing Toronto ("M") postal codes.
pc_url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# Use the response as a context manager so the HTTP connection is always closed.
with urlopen(pc_url) as pc_html:
    pc_bs = BeautifulSoup(pc_html, 'html.parser')
pc_tbl = pc_bs.find_all('table')
pc_tbl2 = pc_tbl[0]

header = [i.text.strip() for i in pc_tbl2.find_all('th')]
cell = pc_tbl2.find_all('td')

# The <td> cells are consumed three at a time: postal code, borough, neighbourhood(s).
CELLS_PER_ROW = 3

# Accumulate plain dicts and build the frame once at the end
# (DataFrame.append in a loop is quadratic and was removed in pandas 2.0).
pc_records = []
for i in range(len(cell) // CELLS_PER_ROW):
    row = [j.text.strip() for j in cell[CELLS_PER_ROW*i:CELLS_PER_ROW*i+CELLS_PER_ROW]]
    if row[1] == "Not assigned":
        continue  # postal codes without a borough are skipped
    # A cell may list several comma-separated neighbourhoods; emit one record per
    # neighbourhood. A cell without a comma yields exactly one record.
    for nei in row[2].split(','):
        pc_records.append({header[0]: row[0], header[1]: row[1], header[2]: nei.strip()})

pc_df = pd.DataFrame(pc_records, columns=header)

def toronto_geocoder(row):
    """Refine one row's coordinates by geocoding its neighbourhood name.

    Intended for ``DataFrame.apply(..., axis=1)``. The neighbourhood name is
    printed as a progress trace, then looked up through Nominatim as
    "<Neighbourhood>, Toronto, Canada".

    Parameters
    ----------
    row : pandas.Series
        Must contain "Neighbourhood"; "Latitude" and "Longitude" are overwritten
        when the lookup succeeds.

    Returns
    -------
    pandas.Series
        The same row. If the lookup fails or finds nothing, the row is returned
        unchanged so any coordinates it already carried are kept (best-effort).
    """
    geolocator = Nominatim(user_agent="ny_explorer")
    print(row["Neighbourhood"])
    try:
        location = geolocator.geocode(row["Neighbourhood"] + ", Toronto, Canada")
    except Exception:
        # Best-effort lookup: service/timeout errors fall back to the existing
        # coordinates. Unlike a `return` inside `finally`, this does not swallow
        # KeyboardInterrupt/SystemExit, so the long-running apply can be stopped.
        location = None

    # No match is an expected outcome, handled explicitly rather than through an
    # AttributeError on `None`.
    if location is not None:
        row["Latitude"] = location.latitude
        row["Longitude"] = location.longitude
    return row

# Attach coordinates per postal code, then let the geocoder refine them row by
# row (rows whose lookup fails keep the postal-code coordinates).
geo_df = pd.read_csv("https://cocl.us/Geospatial_data")
pc_df2 = (
    pc_df
    .merge(geo_df, on="Postal Code")
    .apply(toronto_geocoder, axis=1)
)

Parkwoods
Victoria Village
Regent Park
Harbourfront
Lawrence Manor
Lawrence Heights
Queen's Park
Ontario Provincial Government
Islington Avenue
Humber Valley Village
Malvern
Rouge
Don Mills
Parkview Hill
Woodbine Gardens
Garden District
Ryerson
Glencairn
West Deane Park
Princess Gardens
Martin Grove
Islington
Cloverdale
Rouge Hill
Port Union
Highland Creek
Don Mills
Woodbine Heights
St. James Town
Humewood-Cedarvale
Eringate
Bloordale Gardens
Old Burnhamthorpe
Markland Wood
Guildwood
Morningside
West Hill
The Beaches
Berczy Park
Caledonia-Fairbanks
Woburn
Leaside
Central Bay Street
Christie
Cedarbrae
Hillcrest Village
Bathurst Manor
Wilson Heights
Downsview North
Thorncliffe Park
Richmond
Adelaide
King
Dufferin
Dovercourt Village
Scarborough Village
Fairview
Henry Farm
Oriole
Northwood Park
York University
East Toronto
Broadview North (Old East York)
Harbourfront East
Union Station
Toronto Islands
Little Portugal
Trinity
Kennedy Park
Ionview
East Birchmount Park
Bayview Village
Downsvi

In [82]:
# Preview the first rows of the Toronto neighbourhood table.
pc_df2.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.7588,-79.320197
1,M4A,North York,Victoria Village,43.732658,-79.311189
2,M5A,Downtown Toronto,Regent Park,43.660706,-79.360457
3,M5A,Downtown Toronto,Harbourfront,43.64008,-79.38015
4,M6A,North York,Lawrence Manor,43.722079,-79.437507


In [95]:
# Row/column counts of the Toronto table, then of the New York table.
print(pc_df2.shape)
print(neighborhoods.shape)

(217, 5)
(306, 4)


In [355]:
# Foursquare credentials are taken from the environment so real keys are never
# saved inside the notebook; '-' remains the placeholder when they are not set.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '-') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '-') # your Foursquare Secret
VERSION = '20210201' # Foursquare API version
LIMIT = 200 # value sent as the `limit` query parameter of each explore request

### Function for Foursquare API

In [97]:
# function that extracts the category of the venue
def get_category_type(row):
    """Return the name of the first category of a venue record.

    Parameters
    ----------
    row : mapping-like (dict or pandas.Series)
        Holds the category list under 'categories' or, failing that, under
        'venue.categories'.

    Returns
    -------
    str or None
        The 'name' of the first category, or None when the list is empty.

    Raises
    ------
    KeyError
        If neither key is present.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # Only a missing key triggers the fallback; any other error is a real
        # problem and must not be hidden by a bare `except`.
        categories_list = row['venue.categories']

    if len(categories_list) == 0:
        return None
    return categories_list[0]['name']

def getNearbyVenues(names, latitudes, longitudes, radius=1000):
    """Collect the venues Foursquare returns around each given location.

    One request to the Foursquare ``venues/explore`` endpoint is made per
    (name, latitude, longitude) triple, using the module-level ``CLIENT_ID``,
    ``CLIENT_SECRET``, ``VERSION`` and ``LIMIT`` settings.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Iterated in parallel with ``zip``; extra items in a longer iterable are
        ignored.
    radius : int, default 1000
        Sent as the ``radius`` query parameter (presumably metres -- confirm
        against the Foursquare documentation).

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighbourhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude', 'Venue Category'. The frame is
        empty (but still has these columns) when no venue is returned.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    requests.Timeout
        If a request takes longer than the timeout.
    """
    columns = ['Neighbourhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']
    explore_url = 'https://api.foursquare.com/v2/venues/explore'

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # Let requests build and escape the query string instead of formatting it by hand.
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        }
        response = requests.get(explore_url, params=params, timeout=30)
        # Fail with a clear HTTP error rather than an obscure KeyError further down.
        response.raise_for_status()
        results = response.json()["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for v in results:
            venue = v['venue']
            categories = venue['categories']
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # a venue may carry no category at all
                categories[0]['name'] if categories else None))

    # Passing `columns=` keeps the schema even when `rows` is empty.
    nearby_venues = pd.DataFrame(rows, columns=columns)

    return(nearby_venues)

### Explore neighbourhood using Foursquare API

In [99]:
# Fetch nearby venues for every New York neighbourhood row (one API request per row).
ny_nearby = getNearbyVenues(
    names=neighborhoods["Neighbourhood"],
    latitudes=neighborhoods["Latitude"],
    longitudes=neighborhoods["Longitude"],
    radius=1000,
)

In [100]:
# Fetch nearby venues for every Toronto neighbourhood row (one API request per row).
tr_nearby = getNearbyVenues(
    names=pc_df2["Neighbourhood"],
    latitudes=pc_df2["Latitude"],
    longitudes=pc_df2["Longitude"],
    radius=1000,
)

In [101]:
# (number of venue rows, number of columns) collected for New York.
ny_nearby.shape

(20577, 7)

In [102]:
# (number of venue rows, number of columns) collected for Toronto.
tr_nearby.shape

(11622, 7)

In [103]:
# Venues per neighbourhood *name*. Rows that share a name are pooled into one group.
ny_nearby.groupby('Neighbourhood')["Venue"].count()

Neighbourhood
Allerton          68
Annadale          17
Arden Heights     24
Arlington         23
Arrochar          23
                ... 
Woodhaven         64
Woodlawn          50
Woodrow           17
Woodside         100
Yorkville        100
Name: Venue, Length: 302, dtype: int64

In [104]:
# Venues per neighbourhood *name*, smallest first. Rows that share a name are
# pooled into one group, so a repeated name can exceed a single request's limit.
tr_venue_counts = tr_nearby.groupby('Neighbourhood')["Venue"].count()
tr_venue_counts.sort_values()

Neighbourhood
Humber Bay           4
Humber Summit        4
Maple Leaf Park      4
Highland Creek       5
Oakridge             8
                  ... 
Lawrence Park      104
Don Mills          146
Runnymede          160
St. James Town     200
Willowdale         282
Name: Venue, Length: 208, dtype: int64

### Function to calculate a neighbourhood score based on my own preferences
#### (+1 for every nearby venue whose category is in `pos_list`, -1 for every nearby venue whose category is in `neg_list`)

In [105]:
def neibourhood_score(df, pos_list=None, neg_list=None):
    """Score each neighbourhood from the categories of its nearby venues.

    Every venue whose category is in ``pos_list`` adds 1 to its neighbourhood's
    score and every venue whose category is in ``neg_list`` subtracts 1; all
    other venues contribute 0.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per venue, with the columns 'Neighbourhood' and 'Venue Category'.
    pos_list : iterable of str, optional
        Desirable categories. Defaults to the author's preference list.
    neg_list : iterable of str, optional
        Undesirable categories. Defaults to the author's preference list.

    Returns
    -------
    pandas.Series
        Integer score indexed by 'Neighbourhood' (sorted by name), with
        ``name=None``.
    """
    if pos_list is None:
        pos_list = ["Light Rail Station", "Metro Station", "Tram Station", "Train Station",
                    "Bus Stop", "Bus Station", "Police Station", "Fire Station", "Hospital",
                    "Medical Center", "Pier", "Shopping Plaza", "Shopping Mall", "Supermarket",
                    "Grocery Store", "Department Store", "Convenience Store", "Waterfront", "Park",
                    "National Park", "Harbor / Marina", "Beach", "Bay", "Art Museum", "Museum"]
    if neg_list is None:
        neg_list = ["Prison", "Military Base", "Power Plant", "Waste Facility", "Airport",
                    "Gas Station", "Funeral Home", "Cemetery"]
    pos_set, neg_set = set(pos_list), set(neg_list)

    def venue_weight(category):
        # +1 / -1 / 0; a category present in both lists cancels out.
        return int(category in pos_set) - int(category in neg_set)

    # Sum per-venue weights directly instead of one-hot encoding every category
    # and selecting columns with a `set` (pandas >= 2.0 rejects set indexers).
    weights = df['Venue Category'].map(venue_weight)
    score = weights.groupby(df['Neighbourhood']).sum().astype(int)
    # Callers wrap the result with pd.DataFrame(score, columns=["Score"]), which
    # only picks up the values while the Series is unnamed.
    score.name = None
    return score

### Score for neighbourhood in New York

In [106]:
# Score every New York neighbourhood name and list the scores in ascending order.
ny_score = neibourhood_score(ny_nearby)
ny_score.sort_values()

Neighbourhood
Chinatown          0
Todt Hill          0
Butler Manor       0
Rossville          1
West Brighton      1
                  ..
Marble Hill       16
Rockaway Beach    17
Melrose           18
Baychester        18
Starrett City     19
Length: 302, dtype: int64

### Score for neighbourhood in Toronto

In [107]:
# Score every Toronto neighbourhood name and list the scores in ascending order.
tr_score = neibourhood_score(tr_nearby)
tr_score.sort_values()

Neighbourhood
Downsview           -4
Downsview North     -1
York University     -1
Parkwoods            0
Albion Gardens       0
                    ..
Bathurst Quay       12
The Beaches         12
The Beaches West    12
Yorkville           12
Willowdale          21
Length: 208, dtype: int64

#### Add the score column to dataframe

In [342]:
# Turn the score Series into a two-column frame ('Neighbourhood', 'Score').
# rename() + reset_index() works whether or not the Series carries a name, and
# avoids mutating a throwaway frame in place.
ny_score_df = ny_score.rename("Score").reset_index()

# NOTE(review): the join key is the neighbourhood name only. Rows sharing a name
# get the same pooled score, and drop_duplicates keeps just the first of them.
ny_df2 = neighborhoods.merge(ny_score_df, on="Neighbourhood").drop_duplicates("Neighbourhood")
ny_df2.head()

Unnamed: 0,Borough,Neighbourhood,Latitude,Longitude,Score
0,Bronx,Wakefield,40.894705,-73.847201,3
1,Bronx,Co-op City,40.874294,-73.829939,16
2,Bronx,Eastchester,40.887556,-73.827806,8
3,Bronx,Fieldston,40.895437,-73.905643,7
4,Bronx,Riverdale,40.890834,-73.912585,4


In [343]:
# Turn the score Series into a two-column frame ('Neighbourhood', 'Score').
# rename() + reset_index() works whether or not the Series carries a name, and
# avoids mutating a throwaway frame in place.
tr_score_df = tr_score.rename("Score").reset_index()

# NOTE(review): the join key is the neighbourhood name only. Rows sharing a name
# get the same pooled score, and drop_duplicates keeps just the first of them.
tr_df2 = pc_df2.merge(tr_score_df, on="Neighbourhood").drop_duplicates("Neighbourhood")
tr_df2.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude,Score
0,M3A,North York,Parkwoods,43.7588,-79.320197,0
1,M4A,North York,Victoria Village,43.732658,-79.311189,2
2,M5A,Downtown Toronto,Regent Park,43.660706,-79.360457,5
3,M5A,Downtown Toronto,Harbourfront,43.64008,-79.38015,7
4,M6A,North York,Lawrence Manor,43.722079,-79.437507,6


## Plot result onto map using Folium

In [344]:
import folium
from folium.plugins import HeatMap

def heatmap_helper(data, map_obj):
    """Add a HeatMap layer built from ``data`` to ``map_obj``.

    ``data`` is handed to ``folium.plugins.HeatMap`` unchanged; the callers in
    this notebook pass Latitude, Longitude and Score columns. The layer uses a
    radius of 13 and a colour ramp running from dark blue to black.
    """
    colour_stops = {
        0.1: 'darkblue',
        0.4: 'green',
        0.6: 'yellow',
        0.7: 'orange',
        0.8: 'red',
        0.9: 'darkred',
        1.0: 'black',
    }
    heat_layer = HeatMap(data, radius=13, gradient=colour_stops)
    map_obj.add_child(heat_layer)

def circlemarker(row, map_obj):
    """Draw a translucent black circle for one neighbourhood row on ``map_obj``.

    Intended for ``DataFrame.apply(..., axis=1)``.

    Parameters
    ----------
    row : pandas.Series
        Needs "Latitude", "Longitude" and "Neighbourhood"; ``row.name`` (the
        row's index label) must be an integer, since it is shown 1-based in
        the popup.
    map_obj : folium.Map
        Map (or other folium container) that receives the marker.
    """
    # The former `row["Neighbourhood"].values[0]` lookup was removed: on a row
    # the value is a plain string (no `.values`), and its result was never used.
    folium.vector_layers.CircleMarker(
        location=[row["Latitude"], row["Longitude"]],
        radius=10, popup="#{} - {}".format(str(row.name + 1), row["Neighbourhood"]),
        stroke=False,
        fill_color='black',
        fill_opacity=0.4).add_to(map_obj)

def marker(row, map_obj):
    """Pin one neighbourhood row on ``map_obj`` with a "Top N - name" popup.

    ``row.name`` (the row's index label) is shown 1-based, so a frame whose
    index was reset after sorting yields ranks 1, 2, 3, ...
    """
    rank = str(row.name + 1)
    label = "Top {} - {}".format(rank, row["Neighbourhood"])
    position = [row["Latitude"], row["Longitude"]]
    pin = folium.Marker(location=position, popup=label)
    pin.add_to(map_obj)

### Create Map for New York

#### Heatmap colour: the darker the colour, the higher the score

In [351]:
# Base map plus a heat layer of every scored New York neighbourhood.
ny_map = folium.Map(location=[40.7306, -73.9352], zoom_start=10)
heatmap_helper(ny_df2[["Latitude", "Longitude", "Score"]], ny_map)

# Five highest-scoring neighbourhoods, re-indexed 0..4 so the popups read "Top 1".."Top 5".
ny_df_sorted = (
    ny_df2
    .sort_values(by="Score", ascending=False)
    .head(5)
    .reset_index(drop=True)
)
for ny_rank, ny_top_row in ny_df_sorted.iterrows():
    marker(ny_top_row, ny_map)
ny_df_sorted

Unnamed: 0,Borough,Neighbourhood,Latitude,Longitude,Score
0,Brooklyn,Starrett City,40.647589,-73.87937,19
1,Bronx,Melrose,40.819754,-73.909422,18
2,Bronx,Baychester,40.866858,-73.835798,18
3,Queens,Rockaway Beach,40.582802,-73.822361,17
4,Bronx,Mount Hope,40.848842,-73.908299,16


In [354]:
# Render the New York map (heat layer plus the top-5 markers) inline.
ny_map

### Create Map for Toronto

#### Heatmap colour: the darker the colour, the higher the score

In [352]:
# Base map plus a heat layer of every scored Toronto neighbourhood.
tr_map = folium.Map(location=[43.6011, -79.3470], zoom_start=10)
heatmap_helper(tr_df2[["Latitude", "Longitude", "Score"]], tr_map)

# Five highest-scoring neighbourhoods, re-indexed 0..4 so the popups read "Top 1".."Top 5".
tr_df_sorted = (
    tr_df2
    .sort_values(by="Score", ascending=False)
    .head(5)
    .reset_index(drop=True)
)
for tr_rank, tr_top_row in tr_df_sorted.iterrows():
    marker(tr_top_row, tr_map)
tr_df_sorted

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude,Score
0,M2M,North York,Willowdale,43.76151,-79.410923,21
1,M5R,Central Toronto,Yorkville,43.671386,-79.390168,12
2,M5V,Downtown Toronto,Bathurst Quay,43.63579,-79.398329,12
3,M4E,East Toronto,The Beaches,43.671024,-79.296712,12
4,M4L,East Toronto,The Beaches West,43.671024,-79.296712,12


In [356]:
# Render the Toronto map (heat layer plus the top-5 markers) inline.
tr_map

##### Top 5 in New York: ["Starrett City", "Melrose", "Baychester", "Rockaway Beach", "Mount Hope"]

##### Top 5 in Toronto: ["Willowdale", "Yorkville", "Bathurst Quay", "The Beaches", "The Beaches West"]