In [125]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    geopy-1.21.0               |             py_0          58 KB  conda-forge
    ------------------------------------------------------------
                                           Total:          92 KB

The following NEW packages will be INSTALLED:

    geographiclib: 1.50-py_0   conda-forge
    geopy:         1.21.0-py_0 conda-forge


Downloading and Extracting Packages
geographiclib-1.50   | 34 KB     | ##################################### | 100% 
geopy-1.21.0         | 58 KB     | ##################################### | 100% 
Preparing transaction: done
Verifying transaction: done
Executing transaction: done
Solving environ

## Installing BeautifulSoup and an HTML parser

In [175]:
 # Shell escape: install beautifulsoup4 with easy_install.
 # NOTE(review): the following cell installs the same package again with pip,
 # so one of the two installs is presumably redundant - confirm before removing.
 ! easy_install beautifulsoup4

Searching for beautifulsoup4
Best match: beautifulsoup4 4.7.1
Adding beautifulsoup4 4.7.1 to easy-install.pth file

Using /opt/conda/envs/Python36/lib/python3.6/site-packages
Processing dependencies for beautifulsoup4
Finished processing dependencies for beautifulsoup4


In [176]:
# Shell escapes: install the scraping dependencies with pip.
#   beautifulsoup4 - provides the `bs4` package imported in the next cell
#   lxml           - an HTML/XML parser that BeautifulSoup can use as a backend
#   requests       - HTTP client (already imported in the first cell)
! pip install beautifulsoup4
! pip install lxml
! pip install requests



In [177]:
from bs4 import BeautifulSoup

# Download the Wikipedia page that lists the Canadian postal codes starting with "M".
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() fails here instead of silently parsing an HTTP error page.
response = requests.get(
    "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M",
    timeout=30,
)
response.raise_for_status()
source = response.text

# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed (and warns), so results can differ between machines.
# lxml is the parser installed by the pip cell above.
soup = BeautifulSoup(source, "lxml")

# The postcode listing is taken from the first <table> element of the page.
table = soup.table
if table is None:
    raise ValueError("No <table> element found in the downloaded page")


The function below extracts the table from the HTML source and converts it into a DataFrame.

In [178]:
def parse_html_table(table):
    """Convert an HTML table element into a DataFrame of raw cell text.

    Parameters
    ----------
    table : object
        A parsed ``<table>`` element (e.g. a BeautifulSoup tag). It must
        provide ``find_all('tr')``; every row must provide ``find_all('td')``
        and ``find_all('th')``; every cell must provide ``get_text()``.

    Returns
    -------
    pandas.DataFrame
        One row per ``<tr>`` that contains at least one ``<td>``, holding the
        unmodified ``get_text()`` of each cell (object dtype). The column
        labels come from the first row that contains ``<th>`` cells; when the
        table has no such row the columns are labelled ``0..n-1``. The number
        of columns is the number of ``<td>`` cells in the first data row, and
        shorter rows are padded with NaN.

    Raises
    ------
    ValueError
        If header cells were found but their count differs from the number
        of columns.
    IndexError
        If a data row has more cells than the first data row.
    """
    column_names = []
    body_rows = []

    # Single pass over the rows: collect the text of every data row and pick
    # up the header from the first row that has <th> cells.
    for row in table.find_all('tr'):
        data_cells = row.find_all('td')
        if len(data_cells) > 0:
            body_rows.append([cell.get_text() for cell in data_cells])

        header_cells = row.find_all('th')
        if len(header_cells) > 0 and len(column_names) == 0:
            column_names = [cell.get_text() for cell in header_cells]

    # The first data row fixes the width of the table.
    n_columns = len(body_rows[0]) if body_rows else 0

    # Safeguard on column titles.
    if len(column_names) > 0 and len(column_names) != n_columns:
        raise ValueError("Column titles do not match the number of columns")
    columns = column_names if len(column_names) > 0 else range(0, n_columns)

    if not body_rows:
        # No data rows at all: return an empty frame.
        return pd.DataFrame(columns=columns, index=range(0, 0))

    for position, cells in enumerate(body_rows):
        if len(cells) > n_columns:
            raise IndexError(
                "Data row %d has %d cells but the table has %d columns"
                % (position, len(cells), n_columns)
            )
        # Pad short rows so every row has exactly n_columns entries.
        cells.extend([float('nan')] * (n_columns - len(cells)))

    # Build the frame once instead of writing it cell by cell; object dtype
    # keeps the cell text exactly as extracted.
    return pd.DataFrame(body_rows, columns=columns, dtype=object)

## Create the dataframe and clean it

In [179]:
# Turn the scraped <table> into a DataFrame of raw cell text.
df = parse_html_table(table)

# The cell text still contains the newlines from the HTML; remove them everywhere.
df = df.replace('\n', '', regex=True)
df.columns = ['Postcode', 'Borough', 'Neighbourhood']

# Drop postcodes that have no borough. The .copy() makes the result an
# independent frame, so the column assignment below does not write into a
# slice of the unfiltered frame (SettingWithCopyWarning).
df = df[df['Borough'] != 'Not assigned'].copy()

# A neighbourhood marked 'Not assigned' takes the name of its borough.
# mask() substitutes row by row (aligned on the index); replace() with a
# Series as the value is not a supported way to do this.
df['Neighbourhood'] = df['Neighbourhood'].mask(
    df['Neighbourhood'] == 'Not assigned', df['Borough']
)
df.head()


Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor


## Grouping neighbourhoods
After removing and replacing the 'Not assigned' values, we group the data by postcode and borough and join the names of the neighbourhoods that fall in the same postcode. To remove the resulting hierarchical index, we then reset the index.

In [180]:
# One row per (Postcode, Borough) pair, in order of first appearance, with the
# neighbourhood names of that pair joined by commas. reset_index() turns the
# two grouping keys back into ordinary columns.
df = (
    df.groupby(['Postcode', 'Borough'], sort=False)['Neighbourhood']
      .agg(','.join)
      .reset_index()
)
df.head()


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights,Lawrence Manor"
4,M7A,Downtown Toronto,Queen's Park


In [181]:
# Dimensions of the grouped frame as a (rows, columns) tuple.
df.shape

(103, 3)

## Part two
Adding latitude and longitude data to the dataframe.

In [182]:
# Load the coordinates published for each postal code.
geo_df = pd.read_csv('http://cocl.us/Geospatial_data')

# Match the rows on the postcode (merge defaults to an inner join), then drop
# the key column brought in from the CSV side, which duplicates 'Postcode'.
df = (
    df.merge(geo_df, left_on='Postcode', right_on='Postal Code')
      .drop(columns='Postal Code')
)
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
3,M6A,North York,"Lawrence Heights,Lawrence Manor",43.718518,-79.464763
4,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494
