## Part 1

In [1]:
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Scrape the Wikipedia list of Toronto ("M") postal codes.
# The timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() fails loudly on an HTTP error instead of parsing an error page.
res = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M", timeout=30)
res.raise_for_status()
soup = BeautifulSoup(res.content, 'lxml')

# The postal-code table is taken to be the first <table> element on the page.
table = soup.find_all('table')[0]

# read_html returns a list of DataFrames, so `df` is a list and later cells use df[0].
# The markup is wrapped in StringIO because passing a literal HTML string to
# read_html is deprecated in recent pandas releases.
df = pd.read_html(StringIO(str(table)))

In [2]:
# Turn the parsed postal-code table into the working dataframe.
# An explicit copy is taken so that renaming the columns below cannot leak back
# into the table object still held in `df`.
df_Toronto = df[0].copy()

# Assumes the scraped table has exactly three columns in this order; a different
# column count makes this assignment raise ValueError.
df_Toronto.columns = ['Postcode', 'Borough', 'Neighbourhood']

In [3]:
# Keep only the rows whose borough has been assigned: any row where Borough is
# the literal "Not assigned" is dropped.
df_Toronto = df_Toronto.loc[df_Toronto["Borough"].ne("Not assigned")]

In [4]:
# More than one neighbourhood can share a postal code: on the Wikipedia page M5A is
# listed twice, once for Harbourfront and once for Regent Park. Such rows are
# collapsed into one row per postal code, with the neighbourhood names joined by ", ".

# Group on both keys so that Borough is carried along as a column next to Postcode.
# The deprecated `squeeze` argument is not passed; it has been removed from newer pandas.
Toronto_grouped = df_Toronto.groupby(by=["Postcode", "Borough"], sort=True)

# Aggregate in a single pass instead of growing a frame row by row with
# DataFrame.append (quadratic, and no longer available in pandas 2.x).
# Row order inside each group is preserved, so names are joined in table order.
# reset_index() turns the group keys back into columns and gives a 0..n-1 index,
# leaving the columns in the order Postcode, Borough, Neighbourhood.
Toronto_grouped_1 = Toronto_grouped["Neighbourhood"].agg(", ".join).reset_index()


In [5]:
# Spot-check: M5A should appear once, with its neighbourhoods combined in one row.
Toronto_grouped_1.loc[Toronto_grouped_1["Postcode"].eq("M5A")]

Unnamed: 0,Postcode,Borough,Neighbourhood
53,M5A,Downtown Toronto,"Harbourfront, Regent Park"


In [6]:
# If a row has a borough but its neighbourhood is "Not assigned", the borough name
# is used as the neighbourhood (e.g. Queen's Park ends up in both columns).
#
# The update is one boolean-mask .loc assignment. Chained indexing of the form
# frame.iloc[i]["col"] = value may write to a temporary copy and leave the frame
# unchanged, and feeding index labels to .iloc is only right while the index is 0..n-1.
unnamed_neighbourhood = Toronto_grouped_1["Neighbourhood"] == "Not assigned"
Toronto_grouped_1.loc[unnamed_neighbourhood, "Neighbourhood"] = Toronto_grouped_1.loc[unnamed_neighbourhood, "Borough"]

In [7]:
#Toronto_grouped_1.iloc[85]

### Clean your Notebook and add Markdown cells to explain your work and any assumptions you are making.

Step 1) Scrape the following Wikipedia page
https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M

Step 2) Read the website content to find 'table' tag and return the first 'table' tag

Step 3) Read the table tag content into a dataframe

Step 4) Assign column names to the dataframe and filter out the rows where "Borough" = "Not assigned"

Step 5) Construct another dataframe by grouping Step 4's dataframe by "Postcode" and "Borough"

Step 6) For rows where "Neighbourhood" = "Not assigned", set "Neighbourhood" to the row's "Borough" value.

In [8]:
# Report the final dataframe's dimensions as a (rows, columns) tuple.
# Note that .shape is an attribute, not a method, so it is read without parentheses.
Toronto_grouped_1.shape

(103, 3)

## Part 2

In [None]:
# import geocoder

# # initialize your variable to None
# lat_lng_coords = None

# postal_code = "M5G"

# # loop until you get the coordinates
# while(lat_lng_coords is None):
#     g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
#     lat_lng_coords = g.latlng

# latitude = lat_lng_coords[0]
# longitude = lat_lng_coords[1]

In [15]:
# Get the latitude, longitude from csv file
#!wget -q -O Geospatial_Coordinates.csv "http://cocl.us/Geospatial_data"

In [16]:
# Load the coordinates lookup from a local CSV file. The file must already exist in
# the working directory: the wget download in the previous cell is commented out.
# NOTE(review): later cells use the "Postal Code", "Latitude" and "Longitude"
# columns of this file — confirm against the downloaded CSV.
geo_coord = pd.read_csv("Geospatial_Coordinates.csv")

In [26]:
# Attach coordinates to each postal code. merge() defaults to an inner join, so only
# postcodes present in both frames are kept. The key column brought in from the
# coordinates file ("Postal Code") duplicates "Postcode" and is dropped afterwards.
Toronto_grouped_lat_lon = (
    Toronto_grouped_1
    .merge(geo_coord, left_on="Postcode", right_on="Postal Code")
    .drop("Postal Code", axis=1)
)

In [28]:
# Preview the first rows of the merged frame (head() shows 5 rows by default).
Toronto_grouped_lat_lon.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
