## Capstone Week 3

## First, we make the appropriate imports

In [1]:
import requests
import pandas as pd
from geopy.geocoders import Nominatim 
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
import numpy as np 

!conda install -c anaconda beautifulsoup4 --yes 
from bs4 import BeautifulSoup



Fetching package metadata .............
Solving package specifications: .

Package plan for installation in environment /opt/conda/envs/DSX-Python35:

The following packages will be UPDATED:

    beautifulsoup4: 4.6.0-py35h442a8c9_1 --> 4.6.3-py35_0 anaconda

beautifulsoup4 100% |################################| Time: 0:00:00  38.82 MB/s


## We invoke Beautiful Soup and parse the table into a DataFrame. Then we clean it up.

In [22]:
wikipedia_link = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Fetch the page. The timeout prevents the cell from hanging forever, and
# raise_for_status() stops us from silently parsing an HTTP error page.
raw_random_wikipedia_page = requests.get(wikipedia_link, timeout=30)
raw_random_wikipedia_page.raise_for_status()

# Parse the HTML and load the first <table> on the page into a DataFrame,
# using its first row as the header.
soup = BeautifulSoup(raw_random_wikipedia_page.content, 'lxml')
table = soup.find_all('table')[0]
df = pd.read_html(str(table), header=0)[0]

# The dataframe will consist of three columns: Postalcode, Borough, and Neighbourhood.
# Only 'Postcode' actually needs renaming; the other two already have their final names.
df = df.rename(columns={'Postcode': 'Postalcode'})

# Make sure every column name is a plain string.
df.columns = list(map(str, df.columns))

# Only process the cells that have an assigned borough; drop "Not assigned" boroughs.
# .copy() gives us an independent frame so the assignment below does not write
# into a view of the unfiltered table (avoids SettingWithCopyWarning).
df = df[df["Borough"] != "Not assigned"].copy()

# If a cell has a borough but a "Not assigned" neighbourhood, the neighbourhood
# becomes the borough (the right-hand side is aligned on the row index).
df.loc[df["Neighbourhood"] == "Not assigned", "Neighbourhood"] = df["Borough"]

# More than one neighbourhood can exist in one postal code area: collapse them
# into a single comma-separated string per (Postalcode, Borough) pair.
df = (
    df.groupby(['Postalcode', 'Borough'])['Neighbourhood']
    .apply(lambda x: ", ".join(x.astype(str)))
    .reset_index()
)

# Shuffle the rows. A fixed random_state keeps the order (and every output that
# depends on it) identical across Restart & Run All.
df = df.sample(frac=1, random_state=0).reset_index(drop=True)

df.head()

Unnamed: 0,Postalcode,Borough,Neighbourhood
0,M5W,Downtown Toronto,Stn A PO Boxes 25 The Esplanade
1,M9W,Etobicoke,Northwest
2,M6C,York,Humewood-Cedarvale
3,M9R,Etobicoke,"Kingsview Village, Martin Grove Gardens, Richv..."
4,M5T,Downtown Toronto,"Chinatown, Grange Park, Kensington Market"


In [10]:
# Sanity check: (number of rows, number of columns) of the cleaned frame.
df.shape

(103, 5)

## We load the geospatial data into a DataFrame

In [18]:
# Download the CSV at this URL straight into a DataFrame and preview
# its first five rows.
df_geo = pd.read_csv(filepath_or_buffer="http://cocl.us/Geospatial_data")
df_geo.head(n=5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [30]:
# Attach the coordinates to each postal code: index both frames by their
# postal-code column, left-join the geo data onto the neighbourhood table,
# then turn the postal code back into an ordinary column.
df_final = (
    df.set_index('Postalcode')
    .join(df_geo.set_index('Postal Code'))
    .reset_index()
)
df_final.head(n=20)

Unnamed: 0,Postalcode,Borough,Neighbourhood,Latitude,Longitude
0,M5W,Downtown Toronto,Stn A PO Boxes 25 The Esplanade,43.646435,-79.374846
1,M9W,Etobicoke,Northwest,43.706748,-79.594054
2,M6C,York,Humewood-Cedarvale,43.693781,-79.428191
3,M9R,Etobicoke,"Kingsview Village, Martin Grove Gardens, Richv...",43.688905,-79.554724
4,M5T,Downtown Toronto,"Chinatown, Grange Park, Kensington Market",43.653206,-79.400049
5,M4R,Central Toronto,North Toronto West,43.715383,-79.405678
6,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
7,M4S,Central Toronto,Davisville,43.704324,-79.38879
8,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
9,M9M,North York,"Emery, Humberlea",43.724766,-79.532242


## We need to install Folium

In [34]:
!conda install -c conda-forge folium --yes
import folium

Fetching package metadata .............
Solving package specifications: .

Package plan for installation in environment /opt/conda/envs/DSX-Python35:

The following NEW packages will be INSTALLED:

    altair:  2.2.2-py35_1 conda-forge
    branca:  0.3.1-py_0   conda-forge
    folium:  0.6.0-py_0   conda-forge
    vincent: 0.4.4-py_1   conda-forge

altair-2.2.2-p 100% |################################| Time: 0:00:00   2.90 MB/s
branca-0.3.1-p 100% |################################| Time: 0:00:00  15.84 MB/s
vincent-0.4.4- 100% |################################| Time: 0:00:00  36.22 MB/s
folium-0.6.0-p 100% |################################| Time: 0:00:00  41.16 MB/s


## We can now make the GeoMap

In [39]:
address = 'Toronto'

# Pass an explicit user agent that identifies this notebook: geopy deprecated
# the default agent for Nominatim and newer releases refuse to run without one.
geolocator = Nominatim(user_agent="capstone_week3_toronto_map")
location = geolocator.geocode(address)

# geocode() returns None when nothing matches; fail with a clear message
# instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError("Could not geocode address: {!r}".format(address))

# Coordinates used to centre the map.
latitude = location.latitude
longitude = location.longitude

In [38]:
# Base map centred on the geocoded coordinates.
map_final = folium.Map(location=[latitude, longitude], zoom_start=11)

# Drop one circle marker per row of df_final, with the neighbourhood
# name(s) as the popup text.
marker_rows = zip(
    df_final['Latitude'],
    df_final['Longitude'],
    df_final['Neighbourhood'],
)
for lat, lng, neighbourhood in marker_rows:
    marker = folium.CircleMarker(
        location=[lat, lng],
        radius=5,
        popup=folium.Popup(neighbourhood, parse_html=True),
        color='red',
        fill=True,
        fill_color='blue',
        fill_opacity=0.7,
    )
    marker.add_to(map_final)

# Last expression of the cell: renders the map inline.
map_final