In [1]:
import numpy as np  # useful for many scientific computing in Python
import pandas as pd # primary data structure library

<b> Installing Beautifulsoup4</b>

In [2]:
!conda install -c anaconda beautifulsoup4

Solving environment: done

# All requested packages already installed.



Need to get data from this address: https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M

In [3]:
from bs4 import BeautifulSoup

url="https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

import requests
response = requests.get(url)
#print (response.status_code) #Should be 200 for a good request
#print (response.content)

soup = BeautifulSoup(response.content, 'html.parser')


OK now we have soup! Let's pase what we got

In [4]:
gdp_table = soup.find("table", attrs={"class": "wikitable"})
gdp_table_data = gdp_table.tbody.find_all("tr") 
print("Number of rows = " + str(len(gdp_table_data)))

#let's get the headers
headings = []
for th in gdp_table_data[0].find_all("th"):
    # remove any newlines and extra spaces from left and right
    headings.append(th.string.replace('\n', ' ').replace('Postal code', 'Postal Code').strip())

print("The Table Headers are:")
print(headings)



Number of rows = 181
The Table Headers are:
['Postal Code', 'Borough', 'Neighborhood']


In [5]:
df_q1 = pd.DataFrame(columns=headings)
contents = []

for row in gdp_table_data[1:100]:
    myrow = []
    for td in row.find_all("td"):
        # remove any newlines and extra spaces from left and right
        myrow.append(td.string.replace('\n', ' ').strip())
    #If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.
    if myrow[2] == "":
        myrow[2] = myrow[1]
    df_q1.loc[len(df_q1)] = myrow
    
df_q1["Borough"].replace("Not assigned", np.nan, inplace=True)
df_q1.dropna(subset=["Borough"], inplace=True)
df_q1.reset_index(drop=True, inplace=True)
print("Number of rows = ", str(len(df_q1)))
df_q1.tail()


Number of rows =  65


Unnamed: 0,Postal Code,Borough,Neighborhood
60,M3N,North York,Downsview
61,M4N,Central Toronto,Lawrence Park
62,M5N,Central Toronto,Roselawn
63,M6N,York,Runnymede / The Junction North
64,M9N,York,Weston


More than one neighborhood can exist in one postal code area. For example, in the table on the Wikipedia page, you will notice that M5A is listed twice and has two neighborhoods: Harbourfront and Regent Park. These two rows will be combined into one row with the neighborhoods separated with a comma


In [6]:
#Show the issue the question is talking about:
df_q1.loc[df_q1['Postal Code'] == "M5A"]

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M5A,Downtown Toronto,Regent Park / Harbourfront


Hmmm, looks like they are already combined in the raw dataset. We can replace the "/" with a ",". Seems pointless, So here is code that would combine text from same rows with a ",", if they existed in seperate rows.

In [7]:
df_q1 = df_q1.groupby(['Postal Code','Borough'])['Neighborhood'].apply(', '.join).reset_index()
df_q1.tail()

Unnamed: 0,Postal Code,Borough,Neighborhood
60,M9B,Etobicoke,West Deane Park / Princess Gardens / Martin Gr...
61,M9C,Etobicoke,Eringate / Bloordale Gardens / Old Burnhamthor...
62,M9L,North York,Humber Summit
63,M9M,North York,Humberlea / Emery
64,M9N,York,Weston


In [8]:
print("As you can see the number of rows is still " + str(len(df_q1)))

As you can see the number of rows is still 65


So we ansered the question of combining like zip code neighborhoods, but in this dataset it is unneccesary.

Finnaly, they want us to do this: 
In the last cell of your notebook, use the .shape method to print the number of rows of your dataframe.

In [9]:
df_q1.shape

(65, 3)

<b>Question 2:</b>

In [10]:
df_latlon = pd.read_csv("https://cocl.us/Geospatial_data")


In [11]:
df_q2 = pd.merge(df_q1, df_latlon, on=['Postal Code'], how='inner')

Here are the top 12 rows fromt the dataset.

In [12]:
df_q2.head(12)

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,Malvern / Rouge,43.806686,-79.194353
1,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek,43.784535,-79.160497
2,M1E,Scarborough,Guildwood / Morningside / West Hill,43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,Kennedy Park / Ionview / East Birchmount Park,43.727929,-79.262029
7,M1L,Scarborough,Golden Mile / Clairlea / Oakridge,43.711112,-79.284577
8,M1M,Scarborough,Cliffside / Cliffcrest / Scarborough Village West,43.716316,-79.239476
9,M1N,Scarborough,Birch Cliff / Cliffside West,43.692657,-79.264848
