# Peer-graded Assignment: Segmenting and Clustering Neighborhoods in Toronto

## Import the dependencies

In [4]:
import pandas as pd 
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

import json
from geopy.geocoders import Nominatim
import requests

from pandas.io.json import json_normalize

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!pip install folium
import folium # map rendering library
from bs4 import BeautifulSoup



## Scrape Wikipedia data into a Dataframe

In [5]:
# send the GET request
data = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
# parse data from the html into a beautifulsoup object
soup = BeautifulSoup(data, 'html.parser')

In [24]:
postalCodesList = []
boroughsList = []
neighborhoodsList = []

### Using beautiful soup to parse the html table and rows and columns

In [27]:
# for each row of the table, find all the table data
for row in soup.find('table').find_all('tr'):
    cells = row.find_all('td')
    
for row in soup.find('table').find_all('tr'):
    cells = row.find_all('td')
    if(len(cells) > 0):
        postalCodesList.append(cells[0].text)
        boroughsList.append(cells[1].text)
        neighborhoodsList.append(cells[2].text.rstrip('\n')) #remove newline character

###  Dataframe with three columns: PostalCode, Borough, and Neighborhood

In [36]:
# create a new DataFrame from the three lists
torontoDataframe = pd.DataFrame({"PostalCode": postalCodesList,
                           "Borough": boroughsList,
                           "Neighborhood": neighborhoodsList})

torontoDataframe.head(20)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
8,M8A,Not assigned,Not assigned
9,M9A,Queen's Park,Not assigned


### Drop cells with a borough that is "Not assigned"

In [37]:
torontoDataframe_na = torontoDataframe[torontoDataframe.Borough != "Not assigned"].reset_index(drop=True)
torontoDataframe_na.head(20)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor
5,M7A,Downtown Toronto,Queen's Park
6,M9A,Queen's Park,Not assigned
7,M1B,Scarborough,Rouge
8,M1B,Scarborough,Malvern
9,M3B,North York,Don Mills North


### Grouping neighborhoods in the same borough

In [38]:
torontoDataframe_Grouped = torontoDataframe_na.groupby(["PostalCode", "Borough"], as_index=False).agg(lambda x: ", ".join(x))
torontoDataframe_Grouped.head(20)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern, Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union, Highla..."
2,M1E,Scarborough,"Guildwood, Morningside, West Hill, Guildwood, ..."
3,M1G,Scarborough,"Woburn, Woburn"
4,M1H,Scarborough,"Cedarbrae, Cedarbrae"
5,M1J,Scarborough,"Scarborough Village, Scarborough Village"
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park, E..."
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge, Clairlea, Gol..."
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village Wes..."
9,M1N,Scarborough,"Birch Cliff, Cliffside West, Birch Cliff, Clif..."


### For Neighborhood with value "Not assigned", make the value the same as Borough

In [39]:
for index, row in torontoDataframe_Grouped.iterrows():
    if row["Neighborhood"] == "Not assigned":
        row["Neighborhood"] = row["Borough"]
        
torontoDataframe_Grouped.head(20)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern, Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union, Highla..."
2,M1E,Scarborough,"Guildwood, Morningside, West Hill, Guildwood, ..."
3,M1G,Scarborough,"Woburn, Woburn"
4,M1H,Scarborough,"Cedarbrae, Cedarbrae"
5,M1J,Scarborough,"Scarborough Village, Scarborough Village"
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park, E..."
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge, Clairlea, Gol..."
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village Wes..."
9,M1N,Scarborough,"Birch Cliff, Cliffside West, Birch Cliff, Clif..."


###  Print the number of rows of the dataframe.

In [40]:
torontoDataframe_Grouped.shape

(103, 3)