# Data Scraping

In [55]:
import pandas as pd
import numpy as np
import requests

from bs4 import BeautifulSoup

In [66]:
source = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text

soup = BeautifulSoup(source, 'html5lib')

postal_codes_dict = {} # initialize an empty dictionary to save the data in
for table_cell in soup.find_all('td'):
    try:
        postal_code = table_cell.p.b.text # get the postal code
        postal_code_investigate = table_cell.span.text
        neighborhoods_data = table_cell.span.text # get the rest of the data in the cell
        borough = neighborhoods_data.split('(')[0] # get the borough in the cell
        
        # if the cell is not assigned then ignore it
        if neighborhoods_data == 'Not assigned':
            neighborhoods = []
        # else process the data and add it to the dictionary
        else:
            postal_codes_dict[postal_code] = {}
            
            try:
                neighborhoods = neighborhoods_data.split('(')[1]
            
                # remove parantheses from neighborhoods string
                neighborhoods = neighborhoods.replace('(', ' ')
                neighborhoods = neighborhoods.replace(')', ' ')

                neighborhoods_names = neighborhoods.split('/')
                neighborhoods_clean = ', '.join([name.strip() for name in neighborhoods_names])
            except:
                borough = borough.strip('\n')
                neighborhoods_clean = borough
 
            # add borough and neighborhood to dictionary
            postal_codes_dict[postal_code]['borough'] = borough
            postal_codes_dict[postal_code]['neighborhoods'] = neighborhoods_clean
    except:
        pass
    
# create an empty dataframe
columns = ['PostalCode', 'Borough', 'Neighborhood']
toronto_data = pd.DataFrame(columns=columns)
toronto_data

# populate dataframe with data from dictionary
for ind, postal_code in enumerate(postal_codes_dict):
    borough = postal_codes_dict[postal_code]['borough']
    neighborhood = postal_codes_dict[postal_code]['neighborhoods']
    toronto_data = toronto_data.append({"PostalCode": postal_code, 
                                        "Borough": borough, 
                                        "Neighborhood": neighborhood},
                                        ignore_index=True)

# print number of rows of dataframe
toronto_data.shape[0]

103

# GeoSpacial

In [68]:
import pandas as pd
from bs4 import BeautifulSoup
import urllib.request,urllib.parse , requests
import re 
import geocoder

In [69]:
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
data = urllib.request.urlopen(url).read()
soup = BeautifulSoup(data,'lxml')
soup_data = soup.find_all('table')

df = pd.read_html(str(soup_data))[0]

data_dict = {
    'PostalCode':[],
    'Borough':[],
    'Neighborhood':[]
    }

for i in range(len(df.columns)):
    for j in range(len(df)):
        postal_code = df[i][j][0:3]
        borough = re.findall('^[^\(]+',df[i][j][3:] )[0] 
        if borough != 'Not assigned':
            neighborhoods = re.findall('(?<=[(])[^\)]+',df[i][j][3:])[0].split(' / ')
            data_dict['PostalCode'].append(postal_code)
            data_dict['Borough'].append(borough)
            data_dict['Neighborhood'].append(','.join(neighborhoods)) 

dataframe = pd.DataFrame(data_dict)

In [70]:
url = 'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DS0701EN-SkillsNetwork/labs_v1/Geospatial_Coordinates.csv'

r = requests.get(url, allow_redirects=True)
open('GeoSpatialDataset.csv', 'wb').write(r.content)

geodata = pd.read_csv('GeoSpatialDataset.csv')
geodata = geodata.rename(columns = {'Postal Code':'PostalCode'})
full_data = pd.merge(dataframe,geodata ,on='PostalCode')
full_data.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern,Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill,Port Union,Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


# Exploration in Toronto

In [71]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import urllib.request,urllib.parse , requests
import re 
import geocoder
import folium
import matplotlib
import matplotlib.cm as cm
from geopy.geocoders import Nominatim
from sklearn.cluster import KMeans

In [72]:
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
data = urllib.request.urlopen(url).read()
soup = BeautifulSoup(data,'lxml')
soup_data = soup.find_all('table')

df = pd.read_html(str(soup_data))[0]

data_dict = {
    'PostalCode':[],
    'Borough':[],
    'Neighborhood':[]
    }

for i in range(len(df.columns)):
    for j in range(len(df)):
        postal_code = df[i][j][0:3]
        borough = re.findall('^[^\(]+',df[i][j][3:] )[0] 
        if borough != 'Not assigned':
            neighborhoods = re.findall('(?<=[(])[^\)]+',df[i][j][3:])[0].split(' / ')
            data_dict['PostalCode'].append(postal_code)# it is a string
            data_dict['Borough'].append(borough)
            data_dict['Neighborhood'].append(','.join(neighborhoods)) 

dataframe = pd.DataFrame(data_dict)

In [73]:
url = 'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DS0701EN-SkillsNetwork/labs_v1/Geospatial_Coordinates.csv'

r = requests.get(url, allow_redirects=True)
open('GeoSpatialDataset.csv', 'wb').write(r.content)

geodata = pd.read_csv('GeoSpatialDataset.csv')
geodata = geodata.rename(columns = {'Postal Code':'PostalCode'})
full_data = pd.merge(dataframe,geodata ,on='PostalCode')
full_data.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern,Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill,Port Union,Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [74]:
geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode('toronto')
toronto_latitude = location.latitude
toronto_longitude = location.longitude

toronto_map = folium.Map(location=[toronto_latitude, toronto_longitude], zoom_start=11)

# add markers to map
for lat, lng, borough, neighborhood in zip(full_data['Latitude'], full_data['Longitude'], full_data['Borough'], full_data['Neighborhood'] ):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng], radius=5,  popup=label,  fill=True, fill_opacity=0.7 , parse_html=False).add_to(toronto_map)  
    
toronto_map

In [75]:
scarborough_df = full_data[full_data['Borough'] == 'Scarborough'].reset_index(drop=True)
scarborough_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern,Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill,Port Union,Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [76]:
scarborough_df = full_data[full_data['Borough'] == 'Scarborough'].reset_index(drop=True)
scarborough_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern,Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill,Port Union,Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [77]:
eolocator = Nominatim(user_agent="scarborough_explorer")
location = geolocator.geocode('Scarborough,Toronto')
borough_latitude = location.latitude
borough_longitude = location.longitude

scarborough_map = folium.Map(location=[borough_latitude, borough_longitude], zoom_start=11)

# add markers to map
for lat,lng, neighborhood in zip(scarborough_df['Latitude'], scarborough_df['Longitude'], scarborough_df['Neighborhood'] ):
    label = '{}'.format(neighborhood)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng], radius=5,  popup=label,  fill=True, fill_opacity=0.7 , parse_html=False).add_to(scarborough_map)  
    
scarborough_map

In [78]:
CLIENT_ID = ' WHPBHJ1EY1I1XCD3K3LW3GOJMZQDLR2MMMLKLCNPLJKWUWUL'
CLIENT_SECRET = 'Y0QYGPJOUJTXDZTIVC5FKYABFFQ02YSALB15QSW35EZLALVK ' # your Foursquare Secret
VERSION = '20210429' 
LIMIT = 100
radius = 500

In [79]:
postal_code, neighborhood_lat, neighborhood_long, neighborhood_name = scarborough_df.iloc[10,:][0],scarborough_df.iloc[10,:][3], scarborough_df.iloc[10,:][4],scarborough_df.iloc[10,:][2]

url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format( CLIENT_ID, CLIENT_SECRET, VERSION, neighborhood_lat, neighborhood_long, radius, LIMIT )

venues = requests.get(url).json()['response']['groups'][0]['items']
venue_data = pd.json_normalize(venues)[['venue.id','venue.name','venue.categories','venue.location.lat','venue.location.lng']]
venue_data['venue.categories'] = venue_data['venue.categories'].apply(lambda x:x[0]['name'])

venue_data

Unnamed: 0,venue.id,venue.name,venue.categories,venue.location.lat,venue.location.lng
0,4b6475aef964a520eab42ae3,Kim Kim restaurant,Chinese Restaurant,43.753833,-79.276611
1,5226562611d2cd49d83ef03b,Kairali,Indian Restaurant,43.754915,-79.276945
2,4bf96c435317a593a23a017f,Karaikudi Chettinad South Indian Restaurant,Indian Restaurant,43.756042,-79.276276
3,4bc75a3c6501c9b630433e29,Pho Vietnam,Vietnamese Restaurant,43.75777,-79.278572
4,4bda1b6c63c5c9b641c32268,Scarborough LRT,Light Rail Station,43.756465,-79.272194
5,4b0dae6af964a520ce4d23e3,Big Al's Pet Supercentre,Pet Store,43.759279,-79.278325
6,587cf49a75e13712977d9849,Omescape Scarborough,Gaming Cafe,43.754158,-79.27623


In [80]:
columns_to_pick = ['venue.id','venue.name','venue.categories','venue.location.lat','venue.location.lng']
venue_data = pd.DataFrame(columns=['PostalCode']+columns_to_pick)
for postal_code,lat,longt in zip(scarborough_df['PostalCode'],scarborough_df['Latitude'],scarborough_df['Longitude']):
    url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format( 
        CLIENT_ID, CLIENT_SECRET, VERSION, lat, longt, radius, LIMIT )
    try:
        venues = requests.get(url).json()['response']['groups'][0]['items']
        norm_df = pd.json_normalize(venues)[columns_to_pick]
        venue_data = pd.concat([venue_data , norm_df ],axis=0)
        venue_data.iloc[-len(norm_df):,0]  = postal_code
    except:
        pass


venue_data['venue.categories'] = venue_data['venue.categories'].apply(lambda x:x[0]['name'])
venue_data = venue_data.reset_index(drop=True)
print(venue_data.shape)
venue_data.head()

(94, 6)


Unnamed: 0,PostalCode,venue.id,venue.name,venue.categories,venue.location.lat,venue.location.lng
0,M1B,4bb6b9446edc76b0d771311c,Wendy’s,Fast Food Restaurant,43.807448,-79.199056
1,M1C,4c23d3aaf7ced13a5ed7216d,Royal Canadian Legion,Bar,43.782533,-79.163085
2,M1E,4beee041e24d20a1cd857314,RBC Royal Bank,Bank,43.76679,-79.191151
3,M1E,4c62f34bde1b2d7fec89e370,G & G Electronics,Electronics Store,43.765309,-79.191537
4,M1E,57fd24f6cd1083addfd77bf9,Sail Sushi,Restaurant,43.765951,-79.191275


In [81]:
columns_to_pick = ['venue.id','venue.name','venue.categories','venue.location.lat','venue.location.lng']
venue_data = pd.DataFrame(columns=['PostalCode']+columns_to_pick)
for postal_code,lat,longt in zip(full_data['PostalCode'],full_data['Latitude'],full_data['Longitude']):
    url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format( 
        CLIENT_ID, CLIENT_SECRET, VERSION, lat, longt, radius, LIMIT )
    try:
        venues = requests.get(url).json()['response']['groups'][0]['items']
        norm_df = pd.json_normalize(venues)[columns_to_pick]
        venue_data = pd.concat([venue_data , norm_df ],axis=0)
        venue_data.iloc[-len(norm_df):,0]  = postal_code
    except:
        pass


venue_data['venue.categories'] = venue_data['venue.categories'].apply(lambda x:x[0]['name'])
venue_data = venue_data.reset_index(drop=True)
print(venue_data.shape)
venue_data.head()

(2106, 6)


Unnamed: 0,PostalCode,venue.id,venue.name,venue.categories,venue.location.lat,venue.location.lng
0,M1B,4bb6b9446edc76b0d771311c,Wendy’s,Fast Food Restaurant,43.807448,-79.199056
1,M1C,4c23d3aaf7ced13a5ed7216d,Royal Canadian Legion,Bar,43.782533,-79.163085
2,M1E,4beee041e24d20a1cd857314,RBC Royal Bank,Bank,43.76679,-79.191151
3,M1E,4c62f34bde1b2d7fec89e370,G & G Electronics,Electronics Store,43.765309,-79.191537
4,M1E,57fd24f6cd1083addfd77bf9,Sail Sushi,Restaurant,43.765951,-79.191275


In [82]:
len(venue_data['venue.categories'].unique())

272

In [83]:
toronto_data_grouped = venue_data.groupby(['PostalCode','venue.categories'])['venue.categories'].count().reset_index(name="count")
toronto_data_grouped.head(10)

Unnamed: 0,PostalCode,venue.categories,count
0,M1B,Fast Food Restaurant,1
1,M1C,Bar,1
2,M1E,Bank,1
3,M1E,Breakfast Spot,1
4,M1E,Donut Shop,1
5,M1E,Electronics Store,1
6,M1E,Intersection,1
7,M1E,Medical Center,1
8,M1E,Mexican Restaurant,1
9,M1E,Rental Car Location,1


In [84]:
toronto_filled = toronto_data_grouped.pivot(values='count',columns='venue.categories').fillna(0)

kclusters = 7

kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_filled)

toronto_data_grouped['Cluster labels'] = kmeans.labels_

toronto_full = pd.merge(toronto_data_grouped,full_data[['PostalCode','Neighborhood','Latitude','Longitude']] , on = 'PostalCode')

toronto_full.head()

Unnamed: 0,PostalCode,venue.categories,count,Cluster labels,Neighborhood,Latitude,Longitude
0,M1B,Fast Food Restaurant,1,0,"Malvern,Rouge",43.806686,-79.194353
1,M1C,Bar,1,0,"Rouge Hill,Port Union,Highland Creek",43.784535,-79.160497
2,M1E,Bank,1,4,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1E,Breakfast Spot,1,0,"Guildwood,Morningside,West Hill",43.763573,-79.188711
4,M1E,Donut Shop,1,0,"Guildwood,Morningside,West Hill",43.763573,-79.188711


In [85]:
# View clusters and their counts
print(np.unique(toronto_full['Cluster labels'], return_counts=True))

# Use this to see those not classified as class/cluster zero , you can comment it out
# I will ignore cluster zero since it has too many values to plot
toronto_full = toronto_full[toronto_full['Cluster labels']!=0]

(array([0, 1, 2, 3, 4, 5, 6], dtype=int32), array([1424,    2,    8,   13,   27,   17,   26]))


In [86]:
# create map
map_clusters = folium.Map(location=[toronto_latitude, toronto_longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [matplotlib.colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_full['Latitude'], toronto_full['Longitude'],toronto_full['Neighborhood'], toronto_full['Cluster labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon], radius=5, popup=label, color=rainbow[cluster-1], fill=True, fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [87]:
toronto_full.loc[toronto_full['Cluster labels'] == 1, toronto_full.columns[[1] + list(range(2, toronto_full.shape[1]))]]

Unnamed: 0,venue.categories,count,Cluster labels,Neighborhood,Latitude,Longitude
104,Clothing Store,8,1,"Fairview,Henry Farm,Oriole",43.778517,-79.346556
580,Clothing Store,9,1,"Garden District, Ryerson",43.657162,-79.378937


In [88]:
toronto_full.loc[toronto_full['Cluster labels'] == 2, toronto_full.columns[[1] + list(range(2, toronto_full.shape[1]))]]

Unnamed: 0,venue.categories,count,Cluster labels,Neighborhood,Latitude,Longitude
581,Coffee Shop,9,2,"Garden District, Ryerson",43.657162,-79.378937
736,Coffee Shop,11,2,Central Bay Street,43.657952,-79.387383
787,Coffee Shop,10,2,"Richmond,Adelaide,King",43.650571,-79.384568
844,Coffee Shop,13,2,"Harbourfront East,Union Station,Toronto Islands",43.640816,-79.381752
904,Coffee Shop,11,2,"Toronto Dominion Centre,Design Exchange",43.647177,-79.381576
959,Coffee Shop,14,2,"Commerce Court,Victoria Hotel",43.648198,-79.379817
1137,Coffee Shop,12,2,Enclave of M5E,43.646435,-79.374846
1193,Coffee Shop,11,2,"First Canadian Place,Underground city",43.648429,-79.38228


In [89]:
toronto_full.loc[toronto_full['Cluster labels'] == 3, toronto_full.columns[[1] + list(range(2, toronto_full.shape[1]))]]

Unnamed: 0,venue.categories,count,Cluster labels,Neighborhood,Latitude,Longitude
105,Coffee Shop,5,3,"Fairview,Henry Farm,Oriole",43.778517,-79.346556
142,Coffee Shop,3,3,Willowdale,43.77012,-79.408493
266,Coffee Shop,4,3,Leaside,43.70906,-79.363452
314,Coffee Shop,4,3,"The Danforth West,Riverdale",43.679557,-79.352188
361,Coffee Shop,3,3,Studio District,43.659526,-79.340923
463,Coffee Shop,3,3,"St. James Town,Cabbagetown",43.667967,-79.367675
500,Coffee Shop,6,3,Church and Wellesley,43.66586,-79.38316
551,Coffee Shop,7,3,"Regent Park,Harbourfront",43.65426,-79.360636
649,Coffee Shop,6,3,St. James Town,43.651494,-79.375418
699,Coffee Shop,6,3,Berczy Park,43.644771,-79.373306


In [90]:
toronto_full.loc[toronto_full['Cluster labels'] == 4, toronto_full.columns[[1] + list(range(2, toronto_full.shape[1]))]]

Unnamed: 0,venue.categories,count,Cluster labels,Neighborhood,Latitude,Longitude
2,Bank,1,4,"Guildwood,Morningside,West Hill",43.763573,-79.188711
16,Bank,1,4,Cedarbrae,43.773136,-79.239476
61,Bank,1,4,"Clarks Corners,Tam O'Shanter,Sullivan",43.781638,-79.304302
77,Bank,1,4,"Steeles West,L'Amoreaux West",43.799525,-79.318389
95,Bank,2,4,"Fairview,Henry Farm,Oriole",43.778517,-79.346556
133,Bank,1,4,Bayview Village,43.786947,-79.385975
139,Bank,1,4,Willowdale,43.77012,-79.408493
195,Bank,2,4,"Bathurst Manor,Wilson Heights,Downsview North",43.754328,-79.442259
223,Bank,1,4,Downsview,43.739015,-79.506944
240,Bank,1,4,"Parkview Hill,Woodbine Gardens",43.706397,-79.309937


In [91]:
toronto_full.loc[toronto_full['Cluster labels'] == 5, toronto_full.columns[[1] + list(range(2, toronto_full.shape[1]))]]

Unnamed: 0,venue.categories,count,Cluster labels,Neighborhood,Latitude,Longitude
460,Café,3,5,"St. James Town,Cabbagetown",43.667967,-79.367675
549,Café,3,5,"Regent Park,Harbourfront",43.65426,-79.360636
578,Café,3,5,"Garden District, Ryerson",43.657162,-79.378937
646,Café,5,5,St. James Town,43.651494,-79.375418
735,Café,3,5,Central Bay Street,43.657952,-79.387383
785,Café,5,5,"Richmond,Adelaide,King",43.650571,-79.384568
842,Café,4,5,"Harbourfront East,Union Station,Toronto Islands",43.640816,-79.381752
901,Café,6,5,"Toronto Dominion Centre,Design Exchange",43.647177,-79.381576
957,Café,6,5,"Commerce Court,Victoria Hotel",43.648198,-79.379817
1031,Café,3,5,"The Annex,North Midtown,Yorkville",43.67271,-79.405678


In [92]:
toronto_full.loc[toronto_full['Cluster labels'] == 6, toronto_full.columns[[1] + list(range(2, toronto_full.shape[1]))]]

Unnamed: 0,venue.categories,count,Cluster labels,Neighborhood,Latitude,Longitude
68,Italian Restaurant,1,6,"Clarks Corners,Tam O'Shanter,Sullivan",43.781638,-79.304302
190,Italian Restaurant,1,6,Don Mills,43.7259,-79.340923
324,Italian Restaurant,3,6,"The Danforth West,Riverdale",43.679557,-79.352188
341,Italian Restaurant,1,6,"India Bazaar,The Beaches West",43.668999,-79.315572
372,Italian Restaurant,1,6,Studio District,43.659526,-79.340923
405,Italian Restaurant,1,6,North Toronto West,43.715383,-79.405678
428,Italian Restaurant,2,6,Davisville,43.704324,-79.38879
472,Italian Restaurant,2,6,"St. James Town,Cabbagetown",43.667967,-79.367675
519,Italian Restaurant,1,6,Church and Wellesley,43.66586,-79.38316
599,Italian Restaurant,2,6,"Garden District, Ryerson",43.657162,-79.378937
