# Segmenting and Clustering Neighborhoods in Toronto

### Importing all necessary libraries

In [1]:
import numpy as np
import pandas as pd
# Passing None removes pandas' display limit, so all columns and rows are shown
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json

!pip install geopy
from geopy.geocoders import Nominatim

import requests
from pandas.io.json import json_normalize

import matplotlib.cm as cm
import matplotlib.colors as colors

from sklearn.cluster import KMeans

!pip install folium
import folium

!pip install lxml html5lib beautifulsoup4



## Part 1

In [48]:
# Download the Wikipedia page and parse every HTML table on it into a DataFrame
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
tables_list = pd.read_html(url)

# Report how many tables were retrieved from the URL
x = 'This DataFrame contains {} tables'.format(len(tables_list))
print(x)

# Preview the first row of each table to identify the one we need
for i, table in enumerate(tables_list):
    print("Table {}".format(i + 1))
    print(table.head(1))
    print("======================================")

This DataFrame contains 3 tables
Table 1
  Postal Code       Borough Neighbourhood
0         M1A  Not assigned  Not assigned
Table 2
    0                      1    2    3    4    5    6    7    8    9    10  \
0  NaN  Canadian postal codes  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN   

    11   12   13   14   15   16   17  
0  NaN  NaN  NaN  NaN  NaN  NaN  NaN  
Table 3
   0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15     16  17
0  NL  NS  PE  NB  QC  QC  QC  ON  ON  ON  ON  ON  MB  SK  AB  BC  NU/NT  YT


As we can see, the page at this URL contains 3 tables, and the table we are interested in is the first one.

Assign the table of interest to a DataFrame.

In [46]:
# The postal-code table is the first table parsed from the page.
raw_df = tables_list[0]

# Count rows per borough; 'Not assigned' rows are still included at this stage.
# (A bare `raw_df.head()` placed before this line would be evaluated and
# discarded, because a cell only renders its last expression.)
raw_df.groupby('Borough')['Borough'].count()

Borough
Central Toronto      9
Downtown Toronto    19
East Toronto         5
East York            5
Etobicoke           12
Mississauga          1
North York          24
Not assigned        77
Scarborough         17
West Toronto         6
York                 5
Name: Borough, dtype: int64

Cleaning the data

In [32]:
# Keep only the rows whose borough is known, then renumber the index from 0
has_borough = raw_df['Borough'].ne('Not assigned')
borough_df = raw_df.loc[has_borough].reset_index(drop=True)
borough_df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


Verify that no 'Not assigned' values remain in the Borough column.

In [33]:
# Per-borough row counts after filtering; 'Not assigned' should no longer appear
borough_df.groupby('Borough')['Borough'].count()

Borough
Central Toronto      9
Downtown Toronto    19
East Toronto         5
East York            5
Etobicoke           12
Mississauga          1
North York          24
Scarborough         17
West Toronto         6
York                 5
Name: Borough, dtype: int64

Where the Neighbourhood is 'Not assigned', use the Borough name as the Neighbourhood.

In [42]:
# Fall back to the borough name wherever the neighbourhood is 'Not assigned'.
# The replacement value is the row's own Borough, not the literal text 'Borough'.
# DataFrame.assign returns a new frame, so borough_df is left untouched
# (a plain `neighbourhood_df = borough_df` would only alias the same object).
unnamed = borough_df['Neighbourhood'] == 'Not assigned'
neighbourhood_df = borough_df.assign(
    Neighbourhood=borough_df['Neighbourhood'].mask(unnamed, borough_df['Borough'])
)

 
 
Verify the uniqueness of 'Postal Code'

In [43]:
# Count rows per postal code; every count should be 1 if the codes are unique
neighbourhood_df.groupby('Postal Code')['Postal Code'].count()

Postal Code
M1B    1
M1C    1
M1E    1
M1G    1
M1H    1
M1J    1
M1K    1
M1L    1
M1M    1
M1N    1
M1P    1
M1R    1
M1S    1
M1T    1
M1V    1
M1W    1
M1X    1
M2H    1
M2J    1
M2K    1
M2L    1
M2M    1
M2N    1
M2P    1
M2R    1
M3A    1
M3B    1
M3C    1
M3H    1
M3J    1
M3K    1
M3L    1
M3M    1
M3N    1
M4A    1
M4B    1
M4C    1
M4E    1
M4G    1
M4H    1
M4J    1
M4K    1
M4L    1
M4M    1
M4N    1
M4P    1
M4R    1
M4S    1
M4T    1
M4V    1
M4W    1
M4X    1
M4Y    1
M5A    1
M5B    1
M5C    1
M5E    1
M5G    1
M5H    1
M5J    1
M5K    1
M5L    1
M5M    1
M5N    1
M5P    1
M5R    1
M5S    1
M5T    1
M5V    1
M5W    1
M5X    1
M6A    1
M6B    1
M6C    1
M6E    1
M6G    1
M6H    1
M6J    1
M6K    1
M6L    1
M6M    1
M6N    1
M6P    1
M6R    1
M6S    1
M7A    1
M7R    1
M7Y    1
M8V    1
M8W    1
M8X    1
M8Y    1
M8Z    1
M9A    1
M9B    1
M9C    1
M9L    1
M9M    1
M9N    1
M9P    1
M9R    1
M9V    1
M9W    1
Name: Postal Code, dtype: int64

In [44]:
# Dimensions of the cleaned frame as (rows, columns)
neighbourhood_df.shape

(103, 3)