# Segmenting and Clustering Neighborhoods in Toronto (Part 1)

### Importing necessary libraries

In [89]:
# library for data analysis
import pandas as pd

# library to handle requests
import requests

# library to parse HTML documents
from bs4 import BeautifulSoup

### Getting the table from the web and making it into a DataFrame

In [90]:
# Download the Wikipedia page listing the Canadian postal codes that start with "M".
# The timeout (in seconds) stops the cell from hanging indefinitely on a stalled
# connection, and raise_for_status() fails here on an HTTP error instead of
# handing an error page to the HTML parser.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()
data = response.text

#### Using _BeautifulSoup_

In [91]:
# Parse the downloaded HTML with Python's built-in parser so it can be searched by tag.
soup = BeautifulSoup(markup=data, features='html.parser')

#### Storing the table cell data values in list format 

In [92]:
# One independent, initially empty list per table column.
PostalCodeList, BoroughList, NeighborhoodList = [], [], []

In [93]:
# Start from empty lists so that re-running this cell does not append duplicate rows.
PostalCodeList.clear()
BoroughList.clear()
NeighborhoodList.clear()

# The postal-code data is read from the first <table> element on the page.
table = soup.find('table')
if table is None:
    raise ValueError("No <table> element found in the downloaded page")

for row in table.find_all('tr'):
    cells = row.find_all('td')  # <td> cells of this row; rows without any (e.g. a <th>-only header) give []

    # Three columns are indexed below, so require at least three cells.
    if len(cells) >= 3:
        # strip() removes surrounding whitespace/newlines without cutting off a
        # real character when a cell's text has no trailing newline.
        PostalCodeList.append(cells[0].text.strip())    # Postal Code column
        BoroughList.append(cells[1].text.strip())       # Borough column
        NeighborhoodList.append(cells[2].text.strip())  # Neighborhood column

#### Creating a DataFrame using Pandas

In [95]:
# Assemble the three scraped lists into one DataFrame, one column per list.
column_data = {
    "PostalCode": PostalCodeList,
    "Borough": BoroughList,
    "Neighborhood": NeighborhoodList,
}
toronto_df = pd.DataFrame(data=column_data)

# Preview the first rows.
toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


### Processing the Dataframe into desired format   
  
#### Dropping the rows with 'Borough' = 'Not assigned'
#### Resetting the index, because it is not reset automatically after rows are dropped

In [96]:
# Keep only the rows whose Borough is assigned, then renumber the index from 0.
has_borough = toronto_df["Borough"] != "Not assigned"
toronto_df = toronto_df.loc[has_borough].reset_index(drop=True)

# Preview the first rows.
toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


#### Combine the neighborhoods that share the same postal code and borough

In [97]:
def _join_names(names):
    """Join the values of one group into a single comma-separated string."""
    return ", ".join(names)


# One output row per (PostalCode, Borough) pair; the remaining column is
# collapsed into a single comma-separated string per group.
rows_by_code = toronto_df.groupby(["PostalCode", "Borough"], as_index=False)
toronto_df = rows_by_code.agg(_join_names)

# Preview the first rows.
toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


#### If a 'Neighborhood' is 'Not assigned', set it to the same value as its 'Borough'

In [98]:
# Rows whose Neighborhood is the placeholder "Not assigned" take their Borough name.
# The assignment goes through .loc on the DataFrame itself: the rows yielded by
# iterrows() may be copies, so writing to them is not guaranteed to change
# toronto_df.
unassigned = toronto_df["Neighborhood"] == "Not assigned"
toronto_df.loc[unassigned, "Neighborhood"] = toronto_df.loc[unassigned, "Borough"]

# Preview the first rows.
toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


### Display the result as asked in the question

In [99]:
# Postal codes to display, in the order they should appear in the result.
test_list = ["M5G", "M2H", "M4B", "M1J", "M4G", "M4M", "M1R", "M9V", "M9L", "M5V", "M1B", "M5A"]

# Collect the matching rows for each code and concatenate them once.
# DataFrame.append was removed in pandas 2.0, and growing a frame inside a loop
# re-copies it on every iteration. Codes with no match contribute an empty
# slice, so they are simply absent from the result.
test_df = pd.concat(
    [toronto_df[toronto_df["PostalCode"] == postcode] for postcode in test_list],
    ignore_index=True,
)
test_df

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M5G,Downtown Toronto,Central Bay Street
1,M2H,North York,Hillcrest Village
2,M4B,East York,"Parkview Hill, Woodbine Gardens"
3,M1J,Scarborough,Scarborough Village
4,M4G,East York,Leaside
5,M4M,East Toronto,Studio District
6,M1R,Scarborough,"Wexford, Maryvale"
7,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest..."
8,M9L,North York,Humber Summit
9,M5V,Downtown Toronto,"CN Tower, King and Spadina, Railway Lands, Har..."


### Dimensions of the DataFrame toronto_df

In [100]:
# Display the (number of rows, number of columns) of toronto_df.
toronto_df.shape

(103, 3)