# Please note that I used one notebook for all parts of the assignment

In [41]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

pd.set_option('display.max_columns', None)  
pd.set_option('display.max_rows', 500)

# Part 1 of the assignment  

#### Import Wikipedia Page as a text file and then use Beautiful Soup LXML to parse the text file

In [2]:
website_url = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup = BeautifulSoup(website_url, 'lxml')

#### Search through the file to locate only the table of data that we need and then extract the column headers

In [3]:
extract_table = soup.find('table',{'class':'wikitable sortable'})
get_columns = extract_table.findAll('th')
titles = []
for c in get_columns:
    titles.append(c.get_text().strip('\n'))
titles

['Postcode', 'Borough', 'Neighbourhood']

#### Extract Data Row by Row

In [4]:
entries = []

for r in extract_table.findAll('tr'):
    get_entry = r.findAll('td')
    entries.append([td.text.strip() for td in get_entry])

#### Remove all rows where value of Bourough is equal to 'Not Assigned'

In [5]:
na_index = []

df = pd.DataFrame(columns = titles, data = entries)
df = df.drop(0)
ndf = df[df.Borough != 'Not assigned']

#### Join Rows together where Postcode and Bourough match. Join the Neighbourhood values together and seperate by comma.
#### Then find the index value of all rows where a Borough is assigned but has a Neighbourhood value of 'Not Assigned'. Use this index value to locate all such rows and then reassign the Neighbourhood to match the value of the rows Borough

In [6]:
ndf = ndf.reset_index(drop = True)
ndf = ndf.groupby(['Postcode', 'Borough'])['Neighbourhood'].apply(lambda x: ', '.join(x)).reset_index()
na_index = ndf.loc[ndf['Neighbourhood'] == 'Not assigned'].index
print(na_index)
for idx in na_index:
    ndf.iloc[idx]['Neighbourhood'] = ndf.iloc[idx]['Borough']

Int64Index([85], dtype='int64')


#### Print Shape

In [7]:
ndf.shape

(103, 3)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


# Part 2 of the Assignment

In [8]:
! pip install geocoder



In [9]:
import geocoder as gc

In [10]:
lat_lng = []


for index, row in ndf.iterrows():
    g = gc.arcgis(row['Borough'] + ', ' + row['Neighbourhood'])
    lat_lng.append(row['Postcode'] + ', ' + str(g.latlng[0]) + ', ' + str(g.latlng[1]))


In [49]:
column_headers2 = ['Postcode' , 'Latitude', 'Longitude']
lat_lng_fixed = []
for entry in lat_lng:
    lat_lng_fixed.append(entry.split(','))

df2 = pd.DataFrame(columns = column_headers2, data = lat_lng_fixed)
merged_df = pd.merge(ndf, df2, on = 'Postcode')
merged_df


Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.80977000000007,-79.22083999999995
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.789480000000026,-79.17613999999998
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.76601504543756,-79.18538274241669
3,M1G,Scarborough,Woburn,43.767300000000034,-79.22822999999994
4,M1H,Scarborough,Cedarbrae,43.74772769666329,-79.23517417124673
5,M1J,Scarborough,Scarborough Village,43.73852000000005,-79.21691999999996
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.72587000000004,-79.26231999999999
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.72599000000008,-79.28326999999996
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.73852000000005,-79.21691999999996
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.69472000000008,-79.26459999999997


In [None]:
merged_df.loc[ndf['Postcode'] == 'M2H']

# Part 3 of the Assignment

In [14]:
!conda install -c conda-forge geopy --yes
!conda install -c conda-forge folium=0.5.0 --yes

import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as colors
%matplotlib inline 

from sklearn.cluster import KMeans 
from sklearn.datasets.samples_generator import make_blobs
from geopy.geocoders import Nominatim

import folium

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    ca-certificates-2019.9.11  |       hecc5488_0         144 KB  conda-forge
    altair-3.2.0               |           py36_0         770 KB  conda-forge
    branca-0.3.1               |             py_0          25 KB  conda-forge
    certifi-2019.9.11          |           py36_0         147 KB  conda-forge
    openssl-1.1.1c             |       h516909a_0         2.1 MB  conda-forge
    vincent-0.4.4              |             py_1          28 KB  conda-forge
    folium-0.5.0               |             py_0          45 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         3.3 MB

The following NEW packages will be 

In [15]:
address = 'Toronto, Canada'
geolocator = Nominatim(user_agent='toronto_explorer')
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude

map_of_toronto = folium.Map(location=[latitude, longitude], zoom_start = 10)

for lat, lng, borough, neighbourhood in zip(merged_df['Latitude'], merged_df['Longitude'], merged_df['Borough'], merged_df['Neighbourhood']):
    label = '{}, {}'.format(neighbourhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [float(lat),float(lng)],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_of_toronto)
    
map_of_toronto


# K Means Clustering

In [50]:
k_clusters = 7
merged_df_clustering = merged_df.drop('Neighbourhood', 1)
merged_df_clustering = merged_df_clustering.drop('Borough', 1)
merged_df_clustering = merged_df_clustering.drop('Postcode', 1)
merged_df_clustering.head()
k_means = KMeans(n_clusters=k_clusters, random_state = 0).fit(merged_df_clustering)

In [51]:
merged_df.insert(0, 'Cluster', k_means.labels_)

In [55]:
merged_df


Unnamed: 0,Cluster,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,0,M1B,Scarborough,"Rouge, Malvern",43.80977000000007,-79.22083999999995
1,0,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.789480000000026,-79.17613999999998
2,0,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.76601504543756,-79.18538274241669
3,0,M1G,Scarborough,Woburn,43.767300000000034,-79.22822999999994
4,0,M1H,Scarborough,Cedarbrae,43.74772769666329,-79.23517417124673
5,0,M1J,Scarborough,Scarborough Village,43.73852000000005,-79.21691999999996
6,0,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.72587000000004,-79.26231999999999
7,0,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.72599000000008,-79.28326999999996
8,0,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.73852000000005,-79.21691999999996
9,0,M1N,Scarborough,"Birch Cliff, Cliffside West",43.69472000000008,-79.26459999999997


In [57]:
cluster_map = folium.Map(location = [latitude,longitude], zoom_start = 10)

x = np.arange(k_clusters)
ys = [i + x + (i*x)**2 for i in range(k_clusters)]
colors_array = cm.rainbow(np.linspace(0,1,len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

for lat, lng, cluster in zip(merged_df['Latitude'], merged_df['Longitude'], merged_df['Cluster']):
    label = folium.Popup(' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [float(lat),float(lng)],
        radius=5,
        popup=label,
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7,
        parse_html=False).add_to(cluster_map)
    
cluster_map

['#8000ff', '#2c7ef7', '#2adddd', '#80ffb4', '#d4dd80', '#ff7e41', '#ff0000']
