Visualization with Python Assignment in Toronto Project

In [453]:
#importing required libraries
from bs4 import BeautifulSoup  # BeautifulSoup library is for getting data from webpages
import requests
import pandas as pd
import numpy as np

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

Let's get data from Wikipedia and create a JSON file

In [52]:
#retrieving data from wikipedia
URL='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
response =requests.get(URL)
soup=BeautifulSoup(response.text, 'html.parser')
table = soup.find('table',{'class':'wikitable sortable'}).tbody

#converting the data to dataframe
rows =table.find_all('tr')
columns =[v.text.replace('\n','') for v in rows[0].find_all('th')]

df = pd.DataFrame(columns=columns)

for i in range(1,len(rows)):
    tds = rows[i].find_all('td')
    
    if len(tds)==4:
       values=[tds[0].text , tds[1].text,'' , tds[2].text,'' ,tds[3].text.replace('\n','').replace('\xa0','')]
    else:
        values= [td.text.replace('\n', '').replace('\xa0','') for td in tds]
    
    df=df.append(pd.Series(values, index=columns), ignore_index=True)

#convert dataframe to json
df.to_json(r'C:\Users\xxx\ExtractionWikifediaTorontoToJSON.json', orient='split' ,index=False)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
8,M8A,Not assigned,Not assigned
9,M9A,Queen's Park,Not assigned


In [365]:
#filtered Borough column for Not Assigned value
df_filtered = df[df.Borough!='Not assigned']

Adding index is neccesary before grouping for that example

In [366]:
#conver Pstcode and Borough column to index data for using groupby function
df_filtered_index=df_filtered.set_index(['Postcode', 'Borough'])
df_filtered_index

Unnamed: 0_level_0,Unnamed: 1_level_0,Neighbourhood
Postcode,Borough,Unnamed: 2_level_1
M3A,North York,Parkwoods
M4A,North York,Victoria Village
M5A,Downtown Toronto,Harbourfront
M6A,North York,Lawrence Heights
M6A,North York,Lawrence Manor
M7A,Downtown Toronto,Queen's Park
M9A,Queen's Park,Not assigned
M1B,Scarborough,Rouge
M1B,Scarborough,Malvern
M3B,North York,Don Mills North


We grouped Borough and Neigburhood column according to Postcode, and collected Neigbourhood values from different rows to the same data cell by using ' , '

In [367]:
#to do groupby by using Postcode and prepare Neigbourhood column like 'Lawrence Heights,Lawrence Manor'
df_filtered_groupby = df_filtered_index.groupby(level=['Postcode', 'Borough'], sort=False).agg( ','.join)
df_filtered_groupby

Unnamed: 0_level_0,Unnamed: 1_level_0,Neighbourhood
Postcode,Borough,Unnamed: 2_level_1
M3A,North York,Parkwoods
M4A,North York,Victoria Village
M5A,Downtown Toronto,Harbourfront
M6A,North York,"Lawrence Heights,Lawrence Manor"
M7A,Downtown Toronto,Queen's Park
M9A,Queen's Park,Not assigned
M1B,Scarborough,"Rouge,Malvern"
M3B,North York,Don Mills North
M4B,East York,"Woodbine Gardens,Parkview Hill"
M5B,Downtown Toronto,"Ryerson,Garden District"


In [368]:
#Checking result with some examples
df_filtered_groupby.filter(like='M1R', axis=0)

Unnamed: 0_level_0,Unnamed: 1_level_0,Neighbourhood
Postcode,Borough,Unnamed: 2_level_1
M1R,Scarborough,"Maryvale,Wexford"


In [369]:
df_filtered_groupby[df_filtered_groupby.Neighbourhood=='Not assigned']
df_filtered_groupby

Unnamed: 0_level_0,Unnamed: 1_level_0,Neighbourhood
Postcode,Borough,Unnamed: 2_level_1
M3A,North York,Parkwoods
M4A,North York,Victoria Village
M5A,Downtown Toronto,Harbourfront
M6A,North York,"Lawrence Heights,Lawrence Manor"
M7A,Downtown Toronto,Queen's Park
M9A,Queen's Park,Not assigned
M1B,Scarborough,"Rouge,Malvern"
M3B,North York,Don Mills North
M4B,East York,"Woodbine Gardens,Parkview Hill"
M5B,Downtown Toronto,"Ryerson,Garden District"


In [370]:
#removing index
df_filtered_groupby=df_filtered_groupby.reset_index()
df_filtered_groupby

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights,Lawrence Manor"
4,M7A,Downtown Toronto,Queen's Park
5,M9A,Queen's Park,Not assigned
6,M1B,Scarborough,"Rouge,Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens,Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson,Garden District"


We found Neighbourhood column if it has 'Not assigned' data, and update it with Borough value by using numpy where function

In [372]:
#Updating neigbourhood column if equals to Not assign with Borough value 
df_filtered_groupby["Neighbourhood"] = np.where(df_filtered_groupby["Neighbourhood"] == 'Not assigned', df_filtered_groupby['Borough'], df_filtered_groupby["Neighbourhood"])
df_filtered_groupby

In [375]:
#We printed that how many row data we have in dataframe by using .shape method
print('{} rows exist in dataframe'.format(df_filtered_groupby.shape[0]))

103 rows exist in dataframe


Importing geocoder library

In [None]:
import geocoder # import geocoder

# initialize your variable to None
lat_lng_coords = None

# loop until you get the coordinates
while(lat_lng_coords is None):
  g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
  lat_lng_coords = g.latlng

latitude = lat_lng_coords[0]
longitude = lat_lng_coords[1]

Having Geo data of Toronto from csv to dataframe

In [383]:
Toronto_geo_data = pd.read_csv(r'C:\Users\xxx\Desktop\IBM Coursera-Visualization with Python\assignment4-wikipedia\Geospatial_Coordinates.csv')
Toronto_geo_data.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


#Merged 2 different tables (df_filtered_groupby , Toronto_geo_data)

In [444]:
df_filtered_groupby_merged=df_filtered_groupby
df_filtered_groupby_merged = df_filtered_groupby_merged.join(Toronto_geo_data.set_index('Postal Code'), on='Postcode')
df_filtered_groupby_merged

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,43.654260,-79.360636
3,M6A,North York,"Lawrence Heights,Lawrence Manor",43.718518,-79.464763
4,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494
5,M9A,Queen's Park,Queen's Park,43.667856,-79.532242
6,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
7,M3B,North York,Don Mills North,43.745906,-79.352188
8,M4B,East York,"Woodbine Gardens,Parkview Hill",43.706397,-79.309937
9,M5B,Downtown Toronto,"Ryerson,Garden District",43.657162,-79.378937


Normalize the dataset and making clusters

In [450]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering2 = df_filtered_groupby_merged.drop(['Neighbourhood','Borough','Postcode'], 1)
toronto_grouped_clustering2

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering2)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([4, 4, 2, 3, 2, 1, 0, 4, 4, 2])

Having Toronto location as starting point in our data

In [469]:
address = 'Toronto'

geolocator = Nominatim(user_agent="Can_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Canada are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Canada are 43.653963, -79.387207.


Puttin all locations to the Toronto map by using folium library

In [471]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=8)

# add markers to map
for lat, lng, label in zip(df_filtered_groupby_merged['Latitude'], df_filtered_groupby_merged['Longitude'], df_filtered_groupby_merged['Neighbourhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto