# Segmenting and Clustering Neighborhoods in Toronto - Postal codes

### Import libraries

In [9]:
import pandas as pd

### Scrape the data from Wikipedia

In [10]:
# Download the Wikipedia page and parse its HTML tables.
# pd.read_html returns a list of DataFrames, one per table it finds.
dfHTML = pd.read_html(
    io='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
)

### Prepare data
First, all rows whose 'Borough' is 'Not assigned' are removed.  
Then the columns are renamed to 'PostalCode', 'Borough' and 'Neighborhood' (sorting by postal code is not applied at this step).  
Row 0 is dropped and the index is reset, discarding the old index.  
As required, 'Neighborhood' is set to the value of 'Borough' wherever 'Neighborhood' is 'Not assigned'.

In [11]:
# Prepare data
# The first parsed table has integer column labels (0, 1, 2). Each step below
# returns a new frame, so re-running this cell always starts from dfHTML[0].
dfCAN = (
    dfHTML[0]
    # keep only rows whose borough (column 1) is assigned
    .loc[lambda table: table[1] != 'Not assigned']
    .rename(columns={0: 'PostalCode', 1: 'Borough', 2: 'Neighborhood'})
    # row label 0 is discarded -- presumably the table's header text that was
    # read in as a data row; TODO confirm against the scraped table
    .drop(labels=[0], axis=0)
    # renumber the rows from 0 and throw the old index away
    .reset_index(drop=True)
)
# Where 'Neighborhood' is 'Not assigned', fall back to the row's 'Borough'
unassigned = dfCAN['Neighborhood'] == 'Not assigned'
dfCAN.loc[unassigned, 'Neighborhood'] = dfCAN.loc[unassigned, 'Borough']
dfCAN.head(12)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor
5,M7A,Downtown Toronto,Queen's Park
6,M9A,Etobicoke,Islington Avenue
7,M1B,Scarborough,Rouge
8,M1B,Scarborough,Malvern
9,M3B,North York,Don Mills North


### Restructure data
The resulting DataFrame is grouped by 'PostalCode'. Within each group, the distinct 'Borough' and 'Neighborhood' values are joined into a comma-separated string.

In [12]:
def _join_unique(values):
    """Join the distinct entries of a Series with ', '.

    Series.unique() returns values in order of first appearance, so the
    joined text is identical on every run. Joining a set() of strings is
    not: set iteration order depends on Python's per-process string hashing.
    """
    return ', '.join(values.unique())


# Collapse dfCAN to one row per postal code. Aggregating through the groupby
# object avoids feeding index labels from grCAN.groups into a positional
# .iloc lookup, which is only correct while dfCAN has a fresh 0..n-1 index.
grCAN = dfCAN.groupby('PostalCode')
dfCAN_List = (
    grCAN
    .agg({'Borough': _join_unique, 'Neighborhood': _join_unique})
    # turn the 'PostalCode' group key back into an ordinary column
    .reset_index()
)

### Save the DataFrame for later use

In [13]:
# Persist the grouped table next to the notebook. With to_csv's default
# index=True the row index is written as the first, unnamed column.
dfCAN_List.to_csv(path_or_buf='CAN_neighborhoods.csv')

### Result
First the number of rows is shown, followed by the head of the resulting DataFrame.

In [19]:
# Report how many rows (one per postal code) the grouped table contains
print('The dataframe consists of:', len(dfCAN_List), 'rows.')

The dataframe consists of: 103 rows.


In [20]:
# Display the first 12 rows of the grouped table
dfCAN_List.head(n=12)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"West Hill, Morningside, Guildwood"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"Ionview, East Birchmount Park, Kennedy Park"
7,M1L,Scarborough,"Oakridge, Clairlea, Golden Mile"
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"
