# Webscraping of Toronto districts

## Part 1 : data on boroughs

### Importation of libraries

In [91]:
import requests 
from bs4 import BeautifulSoup
import pandas as pd

### Scraping of the relevant Wikipedia page

In [92]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')

### Data formatting

In [93]:
table_contents=[]
table=soup.find('table')
for row in table.findAll('td'):
    cell = {}
    if row.span.text=='Not assigned':
        pass
    else:
        cell['PostalCode'] = row.p.text[:3]
        cell['Borough'] = (row.span.text).split('(')[0]
        cell['Neighborhood'] = (((((row.span.text).split('(')[1]).strip(')')).replace(' /',',')).replace(')',' ')).strip(' ')
        table_contents.append(cell)

# print(table_contents)
df=pd.DataFrame(table_contents)
df['Borough']=df['Borough'].replace({'Downtown TorontoStn A PO Boxes25 The Esplanade':'Downtown Toronto Stn A',
                                             'East TorontoBusiness reply mail Processing Centre969 Eastern':'East Toronto Business',
                                             'EtobicokeNorthwest':'Etobicoke Northwest','East YorkEast Toronto':'East York/East Toronto',
                                             'MississaugaCanada Post Gateway Processing Centre':'Mississauga'})

df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government


In [94]:
# Size of the data
df.shape

(103, 3)

## Part 2 - Find longitude and latitude 

In [95]:
#Ajout des colonnes à compléter
df['Latitude'] = 0
df['Longitude'] = 0
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,0,0
1,M4A,North York,Victoria Village,0,0
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",0,0
3,M6A,North York,"Lawrence Manor, Lawrence Heights",0,0
4,M7A,Queen's Park,Ontario Provincial Government,0,0


On fait la requête en utilisant l'API google pour localiser (les exemples donnés dans le cours ne semblent pas convenir). Cela nécessite une clef API qui n'est pas rendue publique car donne le droit à un nombre limité de requêtes.

In [96]:
import requests

API_KEY_GOOGLE = 'AIzaSyAMQtbchHLkdagOt1uc_mGtecOLE7p6lIo'

for cp in df['PostalCode']:
    req = 'https://maps.googleapis.com/maps/api/geocode/json?address='+cp+', Toronto&key='+API_KEY_GOOGLE
    r = requests.get(req)
    r = r.json()
    df.loc[df['PostalCode'] == cp, 'Latitude'] = r['results'][0]['geometry']['location']['lat']
    df.loc[df['PostalCode'] == cp, 'Longitude'] = r['results'][0]['geometry']['location']['lng']
df.head()


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Queen's Park,Ontario Provincial Government,43.662301,-79.389494


## Part 3 : Clustering the neighborhoods in Toronto

In [97]:
# Installation de la librairie Folium
!conda install -c conda-forge folium --yes

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.



On crée une carte de Toronto où on localise les différents codes postaux

In [98]:
import folium 
req = 'https://maps.googleapis.com/maps/api/geocode/json?address=Toronto&key='+API_KEY_GOOGLE
r = requests.get(req).json()
map_toronto = folium.Map(location=[r['results'][0]['geometry']['location']['lat'], r['results'][0]['geometry']['location']['lng']], zoom_start=10)

for lat, lng, borough, pc in zip(df['Latitude'], df['Longitude'], df['Borough'], df['PostalCode']):
    label = '{} : {}'.format(pc, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  

map_toronto


In [99]:
# Foursquare API

CLIENT_ID = 'SGUEN0WZBU3P34MKPYOQA2TSTWHIBEZ5PVNXSOYZD0MK2HXZ' # your Foursquare ID
CLIENT_SECRET = '4WWZK1JEOUFDSJPHB2CIKDOUK1JSQCN01GBMBHV1KJIUWVVE' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)


Your credentails:
CLIENT_ID: SGUEN0WZBU3P34MKPYOQA2TSTWHIBEZ5PVNXSOYZD0MK2HXZ
CLIENT_SECRET:4WWZK1JEOUFDSJPHB2CIKDOUK1JSQCN01GBMBHV1KJIUWVVE


In [100]:
df.loc[0, 'Neighborhood']
neighborhood_latitude = df.loc[0, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = df.loc[0, 'Longitude'] # neighborhood longitude value

neighborhood_name = df.loc[0, 'Neighborhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

# type your answer here
url = 'https://api.foursquare.com/v2/venues/explore?&client_id='+CLIENT_ID+'&client_secret='+CLIENT_SECRET+'&v='+VERSION+'&ll='+str(neighborhood_latitude)+','+str(neighborhood_longitude)+'&radius=5000&limit=100'
url

results = requests.get(url).json()
results


Latitude and longitude values of Parkwoods are 43.7532586, -79.3296565.


{'meta': {'code': 200, 'requestId': '61085a5e24aae1771b8a5571'},
 'response': {'suggestedFilters': {'header': 'Tap to show:',
   'filters': [{'name': 'Open now', 'key': 'openNow'}]},
  'headerLocation': 'Toronto',
  'headerFullLocation': 'Toronto',
  'headerLocationGranularity': 'city',
  'totalResults': 219,
  'suggestedBounds': {'ne': {'lat': 43.79825864500005,
    'lng': -79.26747389849278},
   'sw': {'lat': 43.70825855499996, 'lng': -79.39183910150722}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4b8991cbf964a520814232e3',
       'name': "Allwyn's Bakery",
       'location': {'address': '81 Underhill drive',
        'lat': 43.75984035203157,
        'lng': -79.32471879917513,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.75984035203157,
   

In [101]:
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe


# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']
    
venues = results['response']['groups'][0]['items']


nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()



Unnamed: 0,name,categories,lat,lng
0,Allwyn's Bakery,Caribbean Restaurant,43.75984,-79.324719
1,Donalda Golf & Country Club,Golf Course,43.752816,-79.342741
2,Graydon Hall Manor,Event Space,43.763923,-79.342961
3,Island Foods,Caribbean Restaurant,43.745866,-79.346035
4,Galleria Supermarket,Supermarket,43.75352,-79.349518


In [102]:
# On écrit une fonction pour "génériquer" ce que l'on vient de faire sur un exemple

def getNearbyVenues(names, latitudes, longitudes, radius=5000):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        #print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [None]:
# On exécute la fonction pour avoir la liste des points d'intérêt
toronto_venues = getNearbyVenues(df['Neighborhood'], 
                df['Latitude'],
                df['Longitude'], 
                radius=5000)
toronto_venues.head()

In [None]:
# On transforme les données pour mettre les types de lieux en colonnes
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 
toronto_onehot.head()

In [None]:
# On groupe par neighborhood pour avoir la répartition des lieux
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped.columns

In [None]:
import numpy as np

# On regarde les lieux les plus fréquents de chaque zone

def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()




In [None]:
# On passe au clustering
from sklearn.cluster import KMeans

# On définit un nombre de clusters : arbitrairement 5
kclusters = 5

# On enlève la première colonne de la table (qui contient des noms)
toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# On fait tourner les clusters
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)
print(min(kmeans.labels_))
print(max(kmeans.labels_))

In [None]:
# Mise en forme des résultats 
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)


In [None]:
toronto_merged = df

# merge manhattan_grouped with manhattan_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head() 



In [None]:
import matplotlib.cm as cm
import matplotlib.colors as colors

# On représente les clusters sur une carte

# create map
req = 'https://maps.googleapis.com/maps/api/geocode/json?address=Toronto&key='+API_KEY_GOOGLE
r = requests.get(req).json()
map_clusters = folium.Map(location=[r['results'][0]['geometry']['location']['lat'], r['results'][0]['geometry']['location']['lng']], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters