# The following notebook is for the second assignment of the Capstone Project

## First we import the useful libraries, then we point to the wikipedia url and webscrap the page by making use of BeautifulSoup

## Then we transform the lines we want into a pandas DataFrame and reject those rows with Not assigned Borough

In [61]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

wiki_url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
wiki_source = requests.get(wiki_url).text

wiki_html  = BeautifulSoup(wiki_source, 'xml')
wiki_html = wiki_html.find('table')

cols = ['Postalcode', 'Borough', 'Neighborhood']
df = pd.DataFrame(columns = cols)

for tr in wiki_html.find_all('tr'):
    row_data=[]
    for td in tr.find_all('td'):
        row_data.append(td.text.strip())
    if len(row_data)==3:
        df.loc[len(df)] = row_data

mask1 = (df['Borough']!="Not assigned") 
df1 = df[mask1]

## The following is to assign the same Borough value for those postal codes with Not assigned Neighborhood

In [62]:
mask2 = df1['Neighborhood']=="Not assigned"
df1['Neighborhood'][mask2]=df1['Borough'][mask2]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)


## Let's have a look at the clean DataFrame

In [63]:
df1.head()

Unnamed: 0,Postalcode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


## Now we merge the duplicated postalcodes in a single row

In [64]:
t1 = df1.groupby('Postalcode')['Neighborhood'].apply(lambda x: "%s" % ', '.join(x))
t1 = t1.reset_index(drop=False)
t1.rename(columns={'Neighborhood':'Neighborhood1'}, inplace=True)

df2 = pd.merge(df1, t1, on='Postalcode')
df2.drop(['Neighborhood'], axis=1, inplace=True)
df2.drop_duplicates(inplace=True)
df2.rename(columns={'Neighborhood1':'Neighborhood'}, inplace=True)


## Let's have a look at the DataFrame

In [65]:
df2.head()

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


## Last line is to get the shape of the DataFrame

In [66]:
df2.shape

(103, 3)

## Retrieving DataFRame from Geospatial_data and merge it to the existing df3 we created before.

## We also cut a DataFrame containing only the information of the Toronto Borough

In [67]:
geo_df = pd.read_csv('https://cocl.us/Geospatial_data')
geo_df.rename(columns={'Postal Code':'Postalcode'}, inplace=True)
geo_merge = pd.merge(geo_df, df2, on='Postalcode')
df3 = geo_merge[geo_merge['Borough'].str.contains("Toronto")]
df3

Unnamed: 0,Postalcode,Latitude,Longitude,Borough,Neighborhood
37,M4E,43.676357,-79.293031,East Toronto,The Beaches
41,M4K,43.679557,-79.352188,East Toronto,"The Danforth West, Riverdale"
42,M4L,43.668999,-79.315572,East Toronto,"India Bazaar, The Beaches West"
43,M4M,43.659526,-79.340923,East Toronto,Studio District
44,M4N,43.72802,-79.38879,Central Toronto,Lawrence Park
45,M4P,43.712751,-79.390197,Central Toronto,Davisville North
46,M4R,43.715383,-79.405678,Central Toronto,"North Toronto West, Lawrence Park"
47,M4S,43.704324,-79.38879,Central Toronto,Davisville
48,M4T,43.689574,-79.38316,Central Toronto,"Moore Park, Summerhill East"
49,M4V,43.686412,-79.400049,Central Toronto,"Summerhill West, Rathnelly, South Hill, Forest..."


## Here we reset the index and have a quick look at the new DataFrame

In [68]:
df3.reset_index(drop=True, inplace=True)
df3.head()

Unnamed: 0,Postalcode,Latitude,Longitude,Borough,Neighborhood
0,M4E,43.676357,-79.293031,East Toronto,The Beaches
1,M4K,43.679557,-79.352188,East Toronto,"The Danforth West, Riverdale"
2,M4L,43.668999,-79.315572,East Toronto,"India Bazaar, The Beaches West"
3,M4M,43.659526,-79.340923,East Toronto,Studio District
4,M4N,43.72802,-79.38879,Central Toronto,Lawrence Park


## Here we create a map to visualize by making use of folium and the coordinates we retrieved from the last line

In [69]:
#!pip install folium
import folium
map = folium.Map(location=[43.65, -79.38], zoom_start=10)

for lat, lng, borough, neighborhood in zip(df3['Latitude'], df3['Longitude'], df3['Borough'], df3['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='red',
        fill=False,
        fill_color='yellow',
        fill_opacity=0.2,
        parse_html=False).add_to(map)  
    
map