# Segmenting and Clustering Neighborhoods in Toronto

### Importing Libraries

In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import json
!pip install geopy
!pip install folium
from geopy.geocoders import Nominatim 
import requests 
from pandas.io.json import json_normalize 
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
import folium
from bs4 import BeautifulSoup
print('Libraries imported.')

Libraries imported.


### Scraping the postal code table from Wikipedia with BeautifulSoup

In [2]:
# Download the Wikipedia page listing Canadian postal codes that start with "M".
# The timeout keeps the cell from hanging indefinitely, and raise_for_status()
# turns an HTTP error response into an exception instead of letting an error
# page be parsed as if it were the data.
response = requests.get(
    "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M",
    timeout=30,
)
response.raise_for_status()
source = response.text
soup = BeautifulSoup(source, 'lxml')

# The postal-code table is taken to be the first <table> on the page.
table = soup.find("table")
if table is None:
    raise ValueError("No <table> element found in the downloaded page.")

# Search for rows from the table itself rather than from table.tbody, because
# a <tbody> wrapper is not guaranteed to be present in the parsed markup.
table_rows = table.find_all("tr")

### Building the DataFrame from the scraped table rows, skipping rows with no assigned borough (formatted per the rubric)

In [3]:
# Accumulate one [postal code, borough, neighborhood] list per usable table row.
df = []
for tr in table_rows:
    # Strip surrounding whitespace so the comparisons below do not depend on
    # the trailing newline at the end of each cell's text.
    row = [cell.text.strip() for cell in tr.find_all("td")]

    # Skip anything that is not a full three-column data row (for example rows
    # with no <td> cells), so indexing and DataFrame construction cannot fail.
    if len(row) != 3:
        continue

    # Rows without an assigned borough are dropped.
    if row[1] == "Not assigned":
        continue

    # A row with a borough but no neighborhood reuses the borough name.
    if row[2] == "Not assigned":
        row[2] = row[1]

    df.append(row)

df2 = pd.DataFrame(df, columns = ["PostalCode", "Borough", "Neighborhood"])

### Cleaning up the data and combining neighborhoods that share a postal code and borough

In [4]:
# Remove newline characters from each of the text columns.
for column_name in ('Neighborhood', 'PostalCode', 'Borough'):
    df2[column_name] = df2[column_name].str.replace('\n', "")

# Collapse rows that share a postal code and borough into a single row whose
# Neighborhood value is the comma-separated list of their neighborhoods.
neighborhoods_by_code = df2.groupby(['PostalCode', 'Borough'])['Neighborhood']
df2 = neighborhoods_by_code.apply(", ".join).reset_index()

### Displaying the first 5 rows of the dataframe

In [5]:
# Preview the first five rows of the grouped frame.
df2.head(5)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


### Display the number of rows using the `shape` attribute

In [6]:
# shape is a (rows, columns) tuple, so index 0 is the number of rows.
df2.shape[0]

103

### Opening the CSV file with the latitude/longitude data and saving it as `df_gsp`

In [7]:
# Location of the CSV holding one latitude/longitude pair per postal code.
GEOSPATIAL_CSV_URL = 'http://cocl.us/Geospatial_data'

# Read the coordinates straight from the URL into a DataFrame.
df_gsp = pd.read_csv(GEOSPATIAL_CSV_URL)

### Merging the 2 dataframes

In [20]:
# Columns to keep after the join; the coordinate table's own 'Postal Code'
# key column is left out of the result.
toronto_columns = ["PostalCode", "Borough", "Neighborhood", "Latitude", "Longitude"]

# Attach coordinates to each postal code by matching the two key columns.
merged = df2.merge(df_gsp, left_on='PostalCode', right_on='Postal Code')
df_toronto = merged[toronto_columns]
df_toronto.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
