# Neighborhoods in Toronto -- part-3

In [2]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import requests
import re

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

## create the dataframe

Use BeautifulSoup to parse the table, store the data in three lists, and finally use pandas to create the dataframe.

**In this part I use the re library to handle the data**

In [3]:
# The page source was copied into a local file named canada.html.
# Open it in a context manager so the file handle is closed as soon as
# BeautifulSoup has finished reading and parsing it.
with open('canada.html') as html_file:
    soup = BeautifulSoup(html_file, features='html.parser')

In [4]:
# Three independent, initially empty lists that the parsing loop fills in
# parallel: one entry per table cell in each list.
PostalCode, Borough, Neighborhood = [], [], []


In [5]:
# Both patterns are loop-invariant, so compile them once up front.
# borough_pattern captures the text that precedes the first '('.
borough_pattern = re.compile(r'(.*?)[(]', re.S)
# neighborhood_pattern captures the text between a parenthesis and the next ')'.
neighborhood_pattern = re.compile(r'[()](.*?)[)]', re.S)

for items in soup.find('table').find_all('tr'):  # each `items` is one table row
    for cell in items.find_all('td'):  # walk every data cell of the row
        # Drop the first and last character of the cell text
        # (presumably surrounding whitespace -- confirm against canada.html).
        stritem = str(cell.text)[1:-1]
        PostalCode.append(stritem[0:3])  # the first three characters are the postal code
        strre = stritem[3:-1]  # text after the postal code, minus its last character
        if strre == 'Not assigned':
            Borough.append('Not assigned')
            Neighborhood.append('Not assigned')
            continue

        # Run both lookups BEFORE appending anything. Appending the borough first
        # and then failing on the neighborhood lookup would add two Borough
        # entries for one cell and leave the three lists with unequal lengths.
        borough_hits = borough_pattern.findall(strre)
        neighborhood_hits = neighborhood_pattern.findall(strre)
        if borough_hits and neighborhood_hits:
            Borough.append(borough_hits[0])
            # Neighborhoods are separated by ' / ' in the source; store them comma-separated.
            Neighborhood.append(neighborhood_hits[0].replace(' / ', ','))
        else:
            # No parenthesised neighborhood: reuse the whole text for both columns.
            Borough.append(strre)
            Neighborhood.append(strre)


In [6]:
# Assemble the three parallel lists into one table, one column per list.
table_columns = {
    'PostalCode': PostalCode,
    'Borough': Borough,
    'Neighborhood': Neighborhood,
}
df = pd.DataFrame(table_columns)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park,Harbourfront"


## Drop the rows with "Not assigned" Borough

In [7]:
# Keep only the rows whose borough is assigned. The original row labels are kept.
# The explicit .copy() makes df an independent frame rather than a slice of the
# unfiltered one, so adding columns to it later does not raise
# SettingWithCopyWarning.
df = df[df['Borough'] != 'Not assigned'].copy()
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park,Harbourfront"
5,M6A,North York,"Lawrence Manor,Lawrence Heights"
6,M7A,Queen's Park / Ontario Provincial Government,Queen's Park / Ontario Provincial Government


In [8]:
# Display (number of rows, number of columns) of df after the filtering step.
df.shape

(103, 3)

**we can select the columns as we saw in the homework sample**

In [9]:
column_names = ["PostalCode", "Borough", "Neighborhood"]

post_list = ["M5G", "M2H", "M4B", "M1J", "M4G", "M4M", "M1R", "M9V", "M9L", "M5V", "M1B", "M5A"]

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and growing a
# frame one append at a time copies it on every iteration. Collect the matching
# rows first and concatenate once; the rows keep the order of post_list and are
# renumbered from 0.
matching_rows = [df[df["PostalCode"] == postcode] for postcode in post_list]
dft = pd.concat(matching_rows, ignore_index=True)[column_names]
dft

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M5G,Downtown Toronto,Central Bay Street
1,M2H,North York,Hillcrest Village
2,M4B,East York,"Parkview Hill,Woodbine Gardens"
3,M1J,Scarborough,Scarborough Village
4,M4G,East York,Leaside
5,M4M,East Toronto,Studio District
6,M1R,Scarborough,"Wexford,Maryvale"
7,M9V,Etobicoke,"South Steeles,Silverstone,Humbergate,Jamestown..."
8,M9L,North York,Humber Summit
9,M5V,Downtown Toronto,"CN Tower,King and Spadina,Railway Lands,Harbou..."


## part 2 begin>>

Because of connection problems, I downloaded the CSV file in advance and read it locally.

In [10]:
# Load the coordinates table from the local CSV file. Later cells read its
# columns by position: column 0 is compared with the postal code, columns 1 and 2
# are used as latitude and longitude.
dflocation=pd.read_csv('Geospatial_Coordinates.csv')

We can use a loop to match the rows of our two dataframes.

In [11]:
# Index the coordinates by postal code once, instead of rescanning dflocation for
# every row of df. Columns are read by position (0 = postal code, 1 = latitude,
# 2 = longitude), as in the original lookup. setdefault keeps the first entry if
# a postal code appears more than once.
coords_by_code = {}
for code, lat, lng in zip(dflocation.iloc[:, 0], dflocation.iloc[:, 1], dflocation.iloc[:, 2]):
    coords_by_code.setdefault(code, (lat, lng))

Latitude = []
Longitude = []

# Emit exactly one (latitude, longitude) pair per row of df, whatever its length,
# so both lists always line up with df. A postal code that has no entry in
# dflocation gets NaN rather than being skipped, which would shift every
# following row.
for code in df.iloc[:, 0]:
    lat, lng = coords_by_code.get(code, (np.nan, np.nan))
    Latitude.append(lat)
    Longitude.append(lng)

In [12]:
# Attach the two coordinate lists as new columns, in this order. assign() builds
# a new frame instead of writing into df in place, so it does not raise
# SettingWithCopyWarning when df is the result of a boolean filter, and
# re-running the cell simply overwrites the same two columns.
df = df.assign(Latitude=Latitude, Longitude=Longitude)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
2,M3A,North York,Parkwoods,43.753259,-79.329656
3,M4A,North York,Victoria Village,43.725882,-79.315572
4,M5A,Downtown Toronto,"Regent Park,Harbourfront",43.65426,-79.360636
5,M6A,North York,"Lawrence Manor,Lawrence Heights",43.718518,-79.464763
6,M7A,Queen's Park / Ontario Provincial Government,Queen's Park / Ontario Provincial Government,43.662301,-79.389494


**Since the order is not the same as in the homework sample, I created a test dataframe to check that the data is right.**

In [13]:
column_names = ["PostalCode", "Borough", "Neighborhood","Latitude","Longitude"]

post_list = ["M5G", "M2H", "M4B", "M1J", "M4G", "M4M", "M1R", "M9V", "M9L", "M5V", "M1B", "M5A"]

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and growing a
# frame one append at a time copies it on every iteration. Collect the matching
# rows first and concatenate once; the rows keep the order of post_list and are
# renumbered from 0.
matching_rows = [df[df["PostalCode"] == postcode] for postcode in post_list]
dft = pd.concat(matching_rows, ignore_index=True)[column_names]
dft

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
1,M2H,North York,Hillcrest Village,43.803762,-79.363452
2,M4B,East York,"Parkview Hill,Woodbine Gardens",43.706397,-79.309937
3,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
4,M4G,East York,Leaside,43.70906,-79.363452
5,M4M,East Toronto,Studio District,43.659526,-79.340923
6,M1R,Scarborough,"Wexford,Maryvale",43.750072,-79.295849
7,M9V,Etobicoke,"South Steeles,Silverstone,Humbergate,Jamestown...",43.739416,-79.588437
8,M9L,North York,Humber Summit,43.756303,-79.565963
9,M5V,Downtown Toronto,"CN Tower,King and Spadina,Railway Lands,Harbou...",43.628947,-79.39442


## part 3 begin>>

In [14]:
# Report the number of distinct Borough values and the number of rows in df
# (the second figure is the row count, i.e. one per postal code).
borough_count = len(df['Borough'].unique())
row_count = df.shape[0]
summary = 'The dataframe has {} boroughs and {} neighborhoods.'.format(borough_count, row_count)
print(summary)

The dataframe has 15 boroughs and 103 neighborhoods.


get the location of Toronto

In [18]:
address = 'Toronto'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
# geocode() returns None when the service finds no match. Fail here with a clear
# message instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print(str(latitude)+'...'+str(longitude))

43.653963...-79.387207


### create the map

In [20]:
# Base map centred on the current latitude/longitude values.
map_center = [latitude, longitude]
Toronto_map = folium.Map(location=map_center, zoom_start=10)

# Draw one circle marker per row of df; its popup reads "<neighborhood>, <borough>".
marker_rows = zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighborhood'])
for lat, lng, borough, neighborhood in marker_rows:
    popup_text = '{}, {}'.format(neighborhood, borough)
    popup = folium.Popup(popup_text, parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(Toronto_map)

Toronto_map



**As we can see, a lot of people live in Downtown, so we can look at this area in more detail.**

In [21]:
# Build a new table holding only the Downtown Toronto rows, renumbered from 0.
is_downtown = df['Borough'] == 'Downtown Toronto'
Downtown_data = df[is_downtown].reset_index(drop=True)
Downtown_data.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park,Harbourfront",43.65426,-79.360636
1,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
2,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
3,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
4,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383


Let's get the geographical coordinates of Downtown Toronto

In [22]:
address = 'Downtown,Toronto'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
# geocode() returns None when the service finds no match. Fail here with a clear
# message instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
# Note: this overwrites the latitude/longitude values set for the whole city.
latitude = location.latitude
longitude = location.longitude
# The message used to name Manhattan, although the address looked up is Downtown Toronto.
print('The geographical coordinates of Downtown Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Manhattan are 43.6541737, -79.38081164513409.


In [23]:
# Base map centred on the current latitude/longitude values.
map_center = [latitude, longitude]
Down_map = folium.Map(location=map_center, zoom_start=11)

# Draw one circle marker per row of Downtown_data; the popup shows the Neighborhood value.
marker_rows = zip(Downtown_data['Latitude'], Downtown_data['Longitude'], Downtown_data['Neighborhood'])
for lat, lng, neighborhood in marker_rows:
    popup = folium.Popup(neighborhood, parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(Down_map)

Down_map

In a nutshell, people are more likely to live in Downtown since there are more shops and similar amenities there. Thanks for watching.