# Creating a pandas dataframe from the Wikipedia page

In [108]:
# Imported Libraries

!pip install folium
import pandas as pd
import numpy as np
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
import json
from geopy.geocoders import Nominatim
import requests
from bs4 import BeautifulSoup
from pandas.io.json import json_normalize
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import matplotlib.colors as colors
from sklearn.cluster import KMeans
import folium
print("Libaries imported")

Libaries imported


# Scraping the data from the Wikipedia webpage into a pandas Dataframe

In [109]:
tordata = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text

#Parsing the html data into a BeautifulSoup object

In [110]:
soup = BeautifulSoup(tordata, 'html.parser')

#Creating 3 lists to store the data

In [111]:
postalCodeList = []
boroughList = []
neighborhoodList = []

#Created Dataframe, from the three lists

In [113]:
soup.find('table').find_all('tr')
for row in soup.find('table').find_all('tr'):
    cells = row.find_all('td')
    if (len(cells)>0):
        postalCodeList.append(cells[0].text)
        boroughList.append(cells[1].text)
        neighborhoodList.append(cells[2].text.rstrip('\\n')) 

In [114]:
soup
df_toronto = pd.DataFrame({"PostalCode":postalCodeList, "Borough":boroughList, "Neighborhood":neighborhoodList})
df_toronto.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned\n
1,M2A,Not assigned,Not assigned\n
2,M3A,North York,Parkwoods\n
3,M4A,North York,Victoria Village\n
4,M5A,Downtown Toronto,Harbourfront\n


#Cells not assigned Borough dropped

In [115]:
df_torbordrop = df_toronto[df_toronto.Borough != "Not assigned"].reset_index(drop = True)
df_torbordrop.head(12)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods\n
1,M4A,North York,Victoria Village\n
2,M5A,Downtown Toronto,Harbourfront\n
3,M5A,Downtown Toronto,Regent Park\n
4,M6A,North York,Lawrence Heights\n
5,M6A,North York,Lawrence Manor\n
6,M7A,Queen's Park,Not assigned\n
7,M9A,Etobicoke,Islington Avenue\n
8,M1B,Scarborough,Rouge\n
9,M1B,Scarborough,Malvern\n


#Neighborhoods with the same Borough grouped together

In [116]:
df_toronto_borgp = df_torbordrop.groupby(["PostalCode", "Borough"], as_index = False).agg(lambda x:",".join(x))
df_toronto_borgp.head(25)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge\n,Malvern\n,Rouge\n,Malvern\n"
1,M1C,Scarborough,"Highland Creek\n,Rouge Hill\n,Port Union\n,Hig..."
2,M1E,Scarborough,"Guildwood\n,Morningside\n,West Hill\n,Guildwoo..."
3,M1G,Scarborough,"Woburn\n,Woburn\n"
4,M1H,Scarborough,"Cedarbrae\n,Cedarbrae\n"
5,M1J,Scarborough,"Scarborough Village\n,Scarborough Village\n"
6,M1K,Scarborough,"East Birchmount Park\n,Ionview\n,Kennedy Park\..."
7,M1L,Scarborough,"Clairlea\n,Golden Mile\n,Oakridge\n,Clairlea\n..."
8,M1M,Scarborough,"Cliffcrest\n,Cliffside\n,Scarborough Village W..."
9,M1N,Scarborough,"Birch Cliff\n,Cliffside West\n,Birch Cliff\n,C..."


#Replacing Neighborhoods with 'Not assigned' as equal to value of Borough

In [118]:
df_toronto_borgp['Neighborhood'].replace("Not assigned", df_toronto_borgp['Borough'], inplace = True)
df_toronto_borgp.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge\n,Malvern\n,Rouge\n,Malvern\n"
1,M1C,Scarborough,"Highland Creek\n,Rouge Hill\n,Port Union\n,Hig..."
2,M1E,Scarborough,"Guildwood\n,Morningside\n,West Hill\n,Guildwoo..."
3,M1G,Scarborough,"Woburn\n,Woburn\n"
4,M1H,Scarborough,"Cedarbrae\n,Cedarbrae\n"


#Using the .shape to print the number of rows in the dataframe

In [119]:
print("The number of rows in the data frame:", df_toronto_borgp.shape)

The number of rows in the data frame: (103, 3)


# Obtaining geospatial data after importing the csv file

In [121]:
geosp_df = pd.read_csv('http://cocl.us/Geospatial_data')
geosp_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
