# Segmenting and clustering Part 1


In [42]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

import folium # map rendering library

print("Libraries imported.")

Libraries imported.


# 2 Scrape Wikipedia for the postal code data

In [43]:

# Send the GET request. The timeout keeps the cell from hanging indefinitely, and
# raise_for_status() stops the notebook on an HTTP error instead of letting an
# error page be parsed as if it were the article.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()
url = response.text  # NOTE: despite its name, `url` holds the page's HTML text

In [44]:
# Turn the downloaded HTML text into a navigable BeautifulSoup tree
soup = BeautifulSoup(markup=url, features='html.parser')

In [45]:
# Empty accumulators, one per table column, that the scraping loop appends to
postalCodeList, boroughList, neighborhoodList = [], [], []

In [46]:
# Walk the rows of the first <table> on the page and append each row's first three
# <td> cells (postal code, borough, neighborhood) to the respective lists.
table = soup.find('table')
if table is None:
    # soup.find returns None when nothing matches; fail with a clear message
    # instead of an AttributeError on None.
    raise ValueError("No <table> element found in the parsed page")

for row in table.find_all('tr'):
    cells = row.find_all('td')
    # The body indexes cells[0..2], so require at least three <td> cells.
    # Rows with none (presumably the header row) or with fewer than three
    # are skipped rather than raising IndexError.
    if len(cells) >= 3:
        postalCodeList.append(cells[0].text)
        boroughList.append(cells[1].text)
        neighborhoodList.append(cells[2].text.rstrip('\n'))  # avoid new lines in neighborhood cell

In [47]:
# Assemble the three scraped lists into a DataFrame, then remove every newline
# character left in the string cells
tn_df = pd.DataFrame(
    {
        "PostalCode": postalCodeList,
        "Borough": boroughList,
        "Neighborhood": neighborhoodList,
    }
).replace('\n', '', regex=True)
tn_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


# 3 Drop rows whose Borough is "Not assigned"


In [48]:
# Keep only the rows whose Borough is something other than 'Not assigned'.
# The original row labels are kept (no reset_index), so the index has gaps.
has_borough = tn_df['Borough'].ne('Not assigned')
tn_df = tn_df[has_borough]

tn_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


# 4. Group neighborhoods that share a postal code

In [49]:
# Join the neighborhoods that share a postal code into one comma-separated string.
# The earlier version computed this with groupby(...).apply(...) and then
# immediately overwrote the result with a plain copy of tn_df, so the joined
# values were never used.
# transform() returns one value per original row, so tn_df1 keeps the columns the
# following cells rely on: index, PostalCode, Borough, Neighborhood_joined.
tn_df1 = tn_df.reset_index(drop=False)
tn_df1['Neighborhood'] = (
    tn_df1.groupby('PostalCode')['Neighborhood']
    .transform(lambda names: ', '.join(names))
)
tn_df1 = tn_df1.rename(columns={'Neighborhood': 'Neighborhood_joined'})

In [50]:
# Preview the first rows of tn_df1
tn_df1.head()

Unnamed: 0,index,PostalCode,Borough,Neighborhood_joined
0,2,M3A,North York,Parkwoods
1,3,M4A,North York,Victoria Village
2,4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,5,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [51]:
# Join tn_df with tn_df1 on PostalCode; the Borough column exists in both
# frames and therefore comes back as Borough_x / Borough_y
df_merge = tn_df.merge(tn_df1, on='PostalCode')

In [52]:
# Keep the Neighborhood_joined values (from tn_df1) and discard the per-row
# Neighborhood column, then give the kept column the name Neighborhood that the
# later cells select. The earlier version dropped Neighborhood_joined instead,
# which threw the joined values away.
# The guard makes the cell a no-op when re-run after the rename has happened.
if 'Neighborhood_joined' in df_merge.columns:
    df_merge = (
        df_merge
        .drop(columns=['Neighborhood'])
        .rename(columns={'Neighborhood_joined': 'Neighborhood'})
    )

In [53]:
# Drop duplicate rows from the merged frame.
# The `index` column (tn_df's original row labels, carried in through tn_df1) is
# still present here and is unique per tn_df1 row, so rows that differ only by it
# would never be recognised as duplicates. It is therefore left out of the
# comparison; the column itself is removed in a later cell.
dedup_columns = [col for col in df_merge.columns if col != 'index']
df_merge = df_merge.drop_duplicates(subset=dedup_columns)

In [54]:
# Remove Borough_y, the second copy of Borough produced by the merge
df_merge = df_merge.drop(columns=['Borough_y'])

In [55]:
# Remove the `index` column, which came in through tn_df1
# (built with reset_index(drop=False))
df_merge = df_merge.drop(columns=['index'])

In [56]:
# Give the remaining Borough_x column its plain name, Borough, again
df_merge = df_merge.rename(columns={'Borough_x': 'Borough'})

In [57]:
# Preview the cleaned frame after the column drops and rename
df_merge.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [58]:
# (number of rows, number of columns) of df_merge
df_merge.shape

(103, 3)

In [59]:
# Load the per-postal-code latitude/longitude table from a remote CSV
# (this cell needs network access)
geo_df=pd.read_csv('http://cocl.us/Geospatial_data')

In [60]:
# Preview the coordinate table as loaded
geo_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [61]:
# Use the same key column name as df_merge so the two frames can be joined on it
geo_df = geo_df.rename(columns={'Postal Code': 'PostalCode'})


In [62]:
# Confirm the key column is now called PostalCode
geo_df.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [63]:
# Attach Borough and Neighborhood to each coordinate row by joining on PostalCode
geo_merged = geo_df.merge(df_merge, on='PostalCode')

In [64]:
# Preview the joined coordinates + borough/neighborhood frame
geo_merged.head()

Unnamed: 0,PostalCode,Latitude,Longitude,Borough,Neighborhood
0,M1B,43.806686,-79.194353,Scarborough,"Malvern, Rouge"
1,M1C,43.784535,-79.160497,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,43.763573,-79.188711,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,43.770992,-79.216917,Scarborough,Woburn
4,M1H,43.773136,-79.239476,Scarborough,Cedarbrae


In [66]:
# Reorder the columns: descriptive fields first, coordinates last
column_order = ['PostalCode', 'Borough', 'Neighborhood', 'Latitude', 'Longitude']
geo_data = geo_merged[column_order]

In [67]:
# Preview the final frame with the reordered columns
geo_data.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
