### Create a new Notebook for this assignment and import needed libraries

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

### Assign wikipedia page link to a variable  

In [2]:
wikipedia_link='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

### Start the work by 
- scrape a Wikipedia page, https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M,
- Transform the data into a pandas dataframe as shown above
- Make the dataframe will consist of three columns: PostalCode, Borough, and Neighborhood

In [3]:
wikipedia_page = requests.get(wikipedia_link).text

soup = BeautifulSoup(wikipedia_page,'lxml')

#print(soup.prettify)

my_table = soup.find('table', class_="wikitable sortable")
my_tr = my_table.tr.text
my_columns = my_tr.splitlines()[1:4]
#print(my_columns)

rows =[]
all_tr = my_table.findAll('tr')
for tr in all_tr:
    td = tr.find_all('td')
    row = [tr.text.strip() for tr in td if tr.text.strip()] #tr.text.strip()
    if row:
        rows.append(row)

#print(rows)
df = pd.DataFrame (rows, columns = my_columns)
df.head(5)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### Process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned

In [4]:
df = df[df.Borough !='Not assigned']
df.head(11)

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


###  Combined rows with the neighborhoods separated with a comma that are exist in one postal code area

In [5]:
new_df = df.groupby(['Postcode','Borough'])['Neighbourhood'].apply(','.join).reset_index()

In [6]:
new_df

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park"
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge"
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff,Cliffside West"


### Make the value of the Neighborhood the same as the Borough for cell that has a borough but a Not assigned neighborhood

In [7]:
NA = df[df.Neighbourhood == 'Not assigned']
print(NA)

df.loc[8, 'Neighbourhood'] = "Queen's Park"

new_df.shape

  Postcode       Borough Neighbourhood
8      M7A  Queen's Park  Not assigned


(103, 3)

### To summary: this the first part of the assignment where I have tried to:
- Create a new Notebook for this assignment
- Scrape a Wikipedia page, https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M, 
- Transform the data into a pandas dataframe as shown above
- make the dataframe will consist of three columns: PostalCode, Borough, and Neighborhood
- Process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned
- Combined rows with the neighborhoods separated with a comma that are exist in one postal code area
- Make the value of the Neighborhood the same as the Borough for cell that has a borough but a Not assigned neighborhood
- Clean my Notebook and add the Markdown cell to explain your work and any assumptions you are making

## Second part of the Assignment
### Utilizing the Foursquare location data
- First, we install geocoder 
- Second, we import it and get the longt and lat by address using google 

In [8]:
#!conda install -c conda-forge geocoder --yes

In [9]:
#import geocoder # import geocoder

# initialize your variable to None
lat_lng_coords = None

# loop until you get the coordinates
while(lat_lng_coords is None):
  g = geocoder.google('{}, Toronto, Ontario'.format(new_df.Postcode))
  lat_lng_coords = g.latlng

latitude = lat_lng_coords[0]
longitude = lat_lng_coords[1]

NameError: name 'geocoder' is not defined

### We can use the information in the Geospatial_Coordinates.csv file rather than geocoder.google

In [16]:
coord_df = pd.read_csv('Geospatial_Coordinates.csv')
coord_df.head(5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Make the index name the same in both dataframes, which will make the merging process easier

In [17]:
new_df = new_df.rename(columns= {'Postcode': 'PostalCode'})
new_df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [18]:
coord_df = coord_df.rename(columns= {'Postal Code': 'PostalCode'})
coord_df.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Merge the two dataframe into one as required 

In [19]:
merged_df = pd.merge(new_df, coord_df, on =['PostalCode'])
merged_df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


## Part 3: Explore and cluster the neighborhoods in Toronto

### First we need to import some libraries as follows:

In [20]:
import numpy as np # library to handle data in a vectorized manner

import json # library to handle JSON files

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Libraries imported.


### Use geopy library to get the latitude and longitude values of Toronto.

In [21]:
address = 'Toronto'

geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))



The geograpical coordinate of Toronto are 43.653963, -79.387207.


### Create map of Toronto using latitude and longitude values

In [24]:
map_Toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(merged_df['Latitude'], merged_df['Longitude'], merged_df['Borough'], merged_df['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_Toronto)  
    
map_Toronto