# Applied Data Science Capstone Week 3 -- City of Toronto

## PART I -- scrape data from Wiki page, then create a dataframe contains the neighborhoods of Toronto

``` step 1: scrape Wiki page using pd.read.html ```

In [175]:
import pandas as pd
import numpy as np

In [28]:
l = pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')
df = l[0]
df.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


``` step 2: drop rows where 'borough'='Not assigned' ```

In [29]:
df = df[df['Borough'] != 'Not assigned']
df.head()

Unnamed: 0,Postcode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor


``` step 3: merge neighborhoods which has the same postal code ```

In [30]:
df = df.groupby(['Postcode','Borough'], sort = False).agg(lambda x: ",".join(x)).reset_index()
df.head(20)

Unnamed: 0,Postcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights,Lawrence Manor"
4,M7A,Downtown Toronto,Queen's Park
5,M9A,Queen's Park,Not assigned
6,M1B,Scarborough,"Rouge,Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens,Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson,Garden District"


``` display the number of rows of this dataframe ```

In [31]:
df.shape

(103, 3)

## PART II -- get the latitude and the longitude coordinates of each neighborhood

``` step 1: read geocoder data from csv file ```

In [32]:
df1 = pd.read_csv('http://cocl.us/Geospatial_data')
df1.head(10)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


``` step 2: sort df (neighborhoods) and df1 (locations) in same order, and append 'Latitude' and 'Longitude' in df1 to df ```

In [45]:
df = df.sort_values('Postcode')
df.head()

Unnamed: 0,Postcode,Borough,Neighborhood
6,M1B,Scarborough,"Rouge,Malvern"
12,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
18,M1E,Scarborough,"Guildwood,Morningside,West Hill"
22,M1G,Scarborough,Woburn
26,M1H,Scarborough,Cedarbrae


In [46]:
df1 = df1.sort_values('Postal Code')
df1.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [53]:
df['Latitude'] = df1['Latitude']
df['Longitude'] = df1['Longitude']
df.head(20)

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
6,M1B,Scarborough,"Rouge,Malvern",43.727929,-79.262029
12,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.7942,-79.262029
18,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.778517,-79.346556
22,M1G,Scarborough,Woburn,43.77012,-79.408493
26,M1H,Scarborough,Cedarbrae,43.745906,-79.352188
32,M1J,Scarborough,Scarborough Village,43.728496,-79.495697
38,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park",43.70906,-79.363452
44,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge",43.72802,-79.38879
51,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West",43.667967,-79.367675
58,M1N,Scarborough,"Birch Cliff,Cliffside West",43.650571,-79.384568


## PART III -- segment and clustering the neighborhoods in Downtown Toronto

In [61]:
import matplotlib as mpl
from sklearn.cluster import KMeans
import json
from pandas.io.json import json_normalize

!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim

!conda install -c conda-forge folium=0.5.0 --yes
import folium

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    geopy-1.20.0               |             py_0          57 KB  conda-forge
    ------------------------------------------------------------
                                           Total:          91 KB

The following NEW packages will be INSTALLED:

    geographiclib: 1.50-py_0   conda-forge
    geopy:         1.20.0-py_0 conda-forge


Downloading and Extracting Packages
geographiclib-1.50   | 34 KB     | ##################################### | 100% 
geopy-1.20.0         | 57 KB     | ##################################### | 100% 
Preparing transaction: done
Verifying transaction: done
Executing transaction: done
Solving environ

``` step 1: slice df from part 2 and create a new dataframe where Borough=Downtown Toronto ```

In [59]:
df2 = df[df['Borough'] == 'Downtown Toronto']
df2

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
91,M4W,Downtown Toronto,Rosedale,43.636258,-79.498509
96,M4X,Downtown Toronto,"Cabbagetown,St. James Town",43.756303,-79.565963
99,M4Y,Downtown Toronto,Church and Wellesley,43.696319,-79.532242
2,M5A,Downtown Toronto,Harbourfront,43.763573,-79.188711
9,M5B,Downtown Toronto,"Ryerson,Garden District",43.692657,-79.264848
15,M5C,Downtown Toronto,St. James Town,43.799525,-79.318389
20,M5E,Downtown Toronto,Berczy Park,43.75749,-79.374714
24,M5G,Downtown Toronto,Central Bay Street,43.782736,-79.442259
30,M5H,Downtown Toronto,"Adelaide,King,Richmond",43.737473,-79.464763
36,M5J,Downtown Toronto,"Harbourfront East,Toronto Islands,Union Station",43.695344,-79.318389


``` step 2: create a map of Downtown Toronto, add markers to map ```

In [76]:
map_dtt=folium.Map(location=[43.636258, -79.498509], zoom_start=10)

for lat, lng, label in zip(df2['Latitude'], df2['Longitude'], df2['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_dtt)  

map_dtt

``` step 3: utilizing Foursquare API```

In [77]:
CLIENT_ID = 'ZNSID3JACAT4524JYYIWEF3UYOAGQ1CJFGKOQVFGWTXPK1L3'
CLIENT_SECRET = '5E0LASVEW5N50XVEP4MKU0FN4YZ5LYKFVPITQDJBIYG1H4S1'
VERSION = '20180605'

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: ZNSID3JACAT4524JYYIWEF3UYOAGQ1CJFGKOQVFGWTXPK1L3
CLIENT_SECRET:5E0LASVEW5N50XVEP4MKU0FN4YZ5LYKFVPITQDJBIYG1H4S1


``` step 4: get venues within 1000 meters from CN Tower in Downtown Toronto```

In [110]:
df2.loc[df2['Postcode'] == 'M5V']

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
87,M5V,Downtown Toronto,"CN Tower,Bathurst Quay,Island airport,Harbourf...",43.662744,-79.321558


In [128]:
LIMIT = 100
radius = 1000
m5v_lati = 43.662744
m5v_longi = -79.321558

url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    m5v_lati,
    m5v_longi,
    radius, 
    LIMIT)

results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5e1c06a9542890001b643e21'},
 'response': {'headerLocation': 'Leslieville',
  'headerFullLocation': 'Leslieville, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 49,
  'suggestedBounds': {'ne': {'lat': 43.671744009000015,
    'lng': -79.30914024580542},
   'sw': {'lat': 43.65374399099999, 'lng': -79.33397575419457}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '58d4650e2f91cb5ccec03447',
       'name': 'Rorschach Brewing Co.',
       'location': {'address': '1001 Eastern Ave',
        'crossStreet': 'Woodfield Rd',
        'lat': 43.6634831695922,
        'lng': -79.31982368639481,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.6634831695922,
          'lng': -79.31982368639481}],
    

In [129]:
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [134]:
venues = results['response']['groups'][0]['items']
    
m5v_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
m5v_venues = m5v_venues.loc[:, filtered_columns]

# filter the category for each row
m5v_venues['venue.categories'] = m5v_venues.apply(get_category_type, axis=1)

# clean columns
m5v_venues.columns = [col.split(".")[-1] for col in m5v_venues.columns]

m5v_venues.head(20)

Unnamed: 0,name,categories,lat,lng
0,Rorschach Brewing Co.,Brewery,43.663483,-79.319824
1,Leslieville Farmers Market,Farmers Market,43.664901,-79.319784
2,The Sidekick,Comic Shop,43.664484,-79.325162
3,Chino Locos,Burrito Place,43.664653,-79.325584
4,Queen Margherita Pizza,Pizza Place,43.664685,-79.324164
5,Black Lab Brewing,Brewery,43.661839,-79.329137
6,Hastings Snack Bar,Snack Place,43.663697,-79.328994
7,Woodbine Park,Park,43.66486,-79.315109
8,Hitch Bar,Bar,43.66325,-79.330649
9,Descendant Detroit Style Pizza,Pizza Place,43.662802,-79.33238


In [131]:
print('{} venues were returned by Foursquare.'.format(m5v_venues.shape[0]))

49 venues were returned by Foursquare.


``` step 5: group venues by their categories, and take means of the frequency of occurance of each category ```

In [143]:
m5v_venues.rename(columns={'name':'Venue Name', 'categories':'Venue Category', 'lat':'Venue Latitude', 'lng':'Venue Longitude'}, inplace=True)
m5v_venues['Neighborhood']
m5v_venues.head()

Unnamed: 0,Venue Name,Venue Category,Venue Latitude,Venue Longitude
0,Rorschach Brewing Co.,Brewery,43.663483,-79.319824
1,Leslieville Farmers Market,Farmers Market,43.664901,-79.319784
2,The Sidekick,Comic Shop,43.664484,-79.325162
3,Chino Locos,Burrito Place,43.664653,-79.325584
4,Queen Margherita Pizza,Pizza Place,43.664685,-79.324164


In [150]:
m5v_onehot = pd.get_dummies(m5v_venues[['Venue Category']], prefix="", prefix_sep="")

m5v_onehot['Neighborhood'] = df2['Neighborhood']

m5v_onehot.head()

Unnamed: 0,Arts & Crafts Store,BBQ Joint,Bakery,Bar,Beach,Bistro,Breakfast Spot,Brewery,Burger Joint,Burrito Place,...,Pet Store,Pizza Place,Pub,Restaurant,Skate Park,Snack Place,Steakhouse,Sushi Restaurant,Thai Restaurant,Neighborhood
0,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Harbourfront
3,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,
4,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,Queen's Park


In [151]:
m5v_onehot.shape

(49, 35)

In [152]:
m5v_grouped = m5v_onehot.groupby('Neighborhood').mean().reset_index()
m5v_grouped

Unnamed: 0,Neighborhood,Arts & Crafts Store,BBQ Joint,Bakery,Bar,Beach,Bistro,Breakfast Spot,Brewery,Burger Joint,...,Park,Pet Store,Pizza Place,Pub,Restaurant,Skate Park,Snack Place,Steakhouse,Sushi Restaurant,Thai Restaurant
0,"Adelaide,King,Richmond",0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,Berczy Park,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,Central Bay Street,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Christie,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Commerce Court,Victoria Hotel",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,"Design Exchange,Toronto Dominion Centre",0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
6,Harbourfront,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,"Harbourfront East,Toronto Islands,Union Station",0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
8,Queen's Park,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
9,"Ryerson,Garden District",0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [194]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [195]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = m5v_grouped['Neighborhood']

for ind in np.arange(m5v_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(m5v_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

TypeError: '<' not supported between instances of 'numpy.ndarray' and 'str'

``` step 6: clustering neighborhoods in M5V area ```

In [154]:
# set number of clusters
kclusters = 5

m5v_grouped_clustering = m5v_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(m5v_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([1, 3, 0, 0, 0, 0, 2, 0, 1, 1], dtype=int32)

In [193]:
# create map
map_clusters = folium.Map(location=[m5v_venues['Venue Latitude'].tolist(), m5v_venues['Venue Longitude'].tolist()], zoom_start=10)
       
# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi in zip(m5v_venues['Venue Latitude'], m5v_venues['Venue Longitude'], m5v_grouped['Neighborhood']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

UndefinedError: 'None' has no attribute 'replace'

<folium.folium.Map at 0x7f10457c2f98>