# Segmenting and Clustering Neighborhoods in Toronto - Postal codes

### Import libraries

In [8]:
import pandas as pd

### Scrap the data from Wikipedia

In [9]:
# Retrieve the HTML data
dfHTML=pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')

### Prepare data
First it removes all rows when 'Borough' is 'Not assigned'.  
Then the column names will be named and sorted by the postcode.  
The old index row after sort can be dropped.  
As required it sets 'Neighborhood' to 'Borough' when 'Neighborhood' is 'Not assigned'

In [10]:
# Prepare data
dfCAN=dfHTML[0].loc[dfHTML[0][1]!='Not assigned'].copy()
dfCAN.rename(columns={0: 'Postcode', 1: 'Borough', 2: 'Neighborhood'}, inplace=True)
dfCAN.sort_values(by='Postcode', inplace=True)
dfCAN.drop([0], inplace=True)
dfCAN.reset_index(inplace=True)
dfCAN.drop(['index'], axis=1, inplace=True)
# Set 'Neighbourhood' to 'Borough' when 'Not assigned'
idx = dfCAN[dfCAN['Neighborhood'] == 'Not assigned'].index
dfCAN.loc[idx,'Neighborhood']=dfCAN.loc[idx, 'Borough']
print(dfCAN.describe())
dfCAN.head()

       Postcode    Borough Neighborhood
count       210        210          210
unique      103         10          208
top         M9V  Etobicoke    Runnymede
freq          8         45            2


Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,Malvern
1,M1B,Scarborough,Rouge
2,M1C,Scarborough,Highland Creek
3,M1C,Scarborough,Port Union
4,M1C,Scarborough,Rouge Hill


### Restructure data
The resulting data frame is grouped by 'Postcode'. Furthermore 'Neighborhood' will be converted to a list of values.

In [11]:
grCAN = dfCAN.groupby('Postcode') #, as_index=False)
dfCAN_List = pd.DataFrame(
    [[code, 
      ', '.join(set(dfCAN.iloc[grCAN.groups[code],1])), 
      ', '.join(set(dfCAN.iloc[grCAN.groups[code],2]))] for code in grCAN.groups],
    columns=('Postcode', 'Borough', 'Neighborhood')
    )

### Save the DataFrame for later use

In [12]:
dfCAN_List.to_csv('CAN_neighborhoods.csv')

### Result
First it shows the shape and after that head of the resulting DataFrame

In [13]:
print('Shape of result: ', dfCAN_List.shape)

Shape of result:  (103, 3)


In [14]:
dfCAN_List.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Port Union, Highland Creek, Rouge Hill"
2,M1E,Scarborough,"Morningside, Guildwood, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
