# Neighborhoods in Toronto 

#### We need to install webscraping packages; bs4 and requests, and import BeautifulSoup.

#### and also pandas to clean the dataframe.

In [466]:
!pip install bs4
import pandas as pd
from bs4 import BeautifulSoup
import requests



Because we want to check if there are any odd data in the dataframe, thus we set the display function to call on all rows.

In [467]:
pd.set_option("display.max_rows", None)

## 1.  Getting the data.

In [468]:
url = "https://www.zipcodesonline.com/2020/06/postal-code-of-toronto-in-2020.html"
data = requests.get(url).text

In [469]:
soup = BeautifulSoup(data, 'html5lib')

#### There's two tables in the url.

#### We need the one that contains the columns postalcode, borough, neighbourhood.

In [470]:
tables = soup.find_all('table')
len(tables)

2

There are two tables in the website.

In [471]:
for index,table in enumerate(tables):
    if ("POSTAL CODE" in str(table)):
        table_index = index
print(table_index)

1


#### We now know that its the second table index 1 that contains the data. So now, we use pandas to create dataframe from the html.

In [472]:
toronto1 = pd.DataFrame(columns=["no","PostalCode","Borough","Neighbourhood"])

for row in tables[1].tbody.find_all("tr"):
    col = row.find_all("td")
    if (col != []):
        sl_no = col[0].text
        neighbourhood = col[1].text
        postalcode = col[2].text
        district = col[3].text
        toronto1 = toronto1.append({"no":sl_no, "PostalCode":postalcode, "Borough":district, "Neighbourhood":neighbourhood}, ignore_index=True)   

In [473]:
toronto1.head(10)

Unnamed: 0,no,PostalCode,Borough,Neighbourhood
0,\n \nSL. NO. \n,\n \nPOSTAL CODES\n,\n \nDISTRICT\n,\n \nNEIGHBOURHOOD\n
1,,,,
2,\n \n1\n,\n \nM5H\n,\n \nDowntown Toronto\n,\n \n Adelaide\n
3,\n \n2\n,\n \nM1V \n,\n \nScarborough \n,\n \nAgincourt North\n
4,\n \n3\n,\n \nM1S \n,\n \nScarborough\n,\n \nAgincourt\n
5,\n \n4\n,\n \nM9V \n,\n \nEtobicoke\n,\n \nAlbion Gardens\n
6,\n \n5\n,\n \nM8W \n,\n \nEtobicoke\n,\n \nAlderwood\n
7,\n \n6\n,\n \nM3H\n,\n \nNorth York\n,\n \nBathurst Manor\n
8,\n \n7\n,\n \nM5V\n,\n \nDowntown Toronto\n,\n \nBathurst Quay \n
9,\n \n8\n,\n \nM2K\n,\n \nNorth York\n,\n \nBayview Village\n


Now we notice that, the data is very uncleaned. The most obvious one - "\n " in every string. So now, we proceed to data cleaning.

<h2> 2. Cleaning Data I. </h2>

#### We want to remove the first and second row because its irrelevant.

In [474]:
df = toronto1.iloc[2:,:]
df.reset_index(drop=True, inplace=True)
df.head()

Unnamed: 0,no,PostalCode,Borough,Neighbourhood
0,\n \n1\n,\n \nM5H\n,\n \nDowntown Toronto\n,\n \n Adelaide\n
1,\n \n2\n,\n \nM1V \n,\n \nScarborough \n,\n \nAgincourt North\n
2,\n \n3\n,\n \nM1S \n,\n \nScarborough\n,\n \nAgincourt\n
3,\n \n4\n,\n \nM9V \n,\n \nEtobicoke\n,\n \nAlbion Gardens\n
4,\n \n5\n,\n \nM8W \n,\n \nEtobicoke\n,\n \nAlderwood\n


#### We need to remove the annoying \n and space in the data.

In [475]:
df = df.replace(('\n',' '),'', regex=True)
df.head()

Unnamed: 0,no,PostalCode,Borough,Neighbourhood
0,1,M5H,DowntownToronto,Adelaide
1,2,M1V,Scarborough,AgincourtNorth
2,3,M1S,Scarborough,Agincourt
3,4,M9V,Etobicoke,AlbionGardens
4,5,M8W,Etobicoke,Alderwood


#### Just checking what types of data each columns is.

In [476]:
df.columns 

Index(['no', 'PostalCode', 'Borough', 'Neighbourhood'], dtype='object')

#### Just renaming the postal code with adding space to make it neater. Also, we will sort this by the postal code and resetting the index.

In [477]:
# and renaming postal code
df.rename(columns={'PostalCode': 'Postal Code'}, inplace=True)
# removing no 
df.drop(columns=['no'],inplace=True)
# and also sorting based on postal code
df.sort_values(by=['Postal Code'],ignore_index = True,inplace=True)
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1B,Scarborough,Malvern
1,M1B,Scarborough,Rouge
2,M1C,Scarborough,RougeHill
3,M1C,Scarborough,HighlandCreek
4,M1C,Scarborough,PortUnion


In [478]:
df.shape

(205, 3)

## 3. Retrieving Latitude and Longitude.

We just take the given dataframe the website provided by Coursera.

In [479]:
hi = pd.read_csv("https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DS0701EN-SkillsNetwork/labs_v1/Geospatial_Coordinates.csv")
hi.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [480]:
hi.shape

(103, 3)

#### Because we want the neighbourhood to have the latitude and longitude we need to merge this two dataframe: hi and df. 

But, we notice that the length of two dataframe is different; hi is 103 and df is 205.

One obvious reason after inspecting hi is because each postal code only appear once in the column.

#### We now move on to check the the Postal Code column in df.

## 4. Cleaning Data II.

We slice the Postal Code column and change it to list so that we can we view it in the rawest form.

In [481]:
postcode = list(df['Postal Code'])
postcode 

['M1B',
 'M1B',
 'M1C',
 'M1C',
 'M1C',
 'M1E',
 'M1E',
 'M1E',
 'M1G',
 'M1H',
 'M1J',
 'M1K',
 'M1K',
 'M1K',
 'M1L',
 'M1L',
 'M1L',
 'M1M',
 'M1M',
 'M1N',
 'M1N',
 'M1P',
 'M1P',
 'M1P',
 'M1R',
 'M1R',
 'M1S',
 'M1T',
 'M1T',
 'M1T',
 'M1V',
 'M1V',
 'M1V',
 'M1V',
 'M1W',
 'M1W',
 'M1X',
 'M2H',
 'M2J',
 'M2J',
 'M2J',
 'M2K',
 'M2L',
 'M2L',
 'M2M',
 'M2M,M2N,M2R',
 'M2P',
 'M3A',
 'M3B',
 'M3H',
 'M3H',
 'M3H',
 'M3J',
 'M3K',
 'M3M,M3L,M3N',
 'M4A',
 'M4B',
 'M4B',
 'M4C',
 'M4E',
 'M4G',
 'M4H',
 'M4J',
 'M4J',
 'M4K',
 'M4K',
 'M4L',
 'M4L',
 'M4M',
 'M4N',
 'M4P',
 'M4R',
 'M4R',
 'M4S',
 'M4T',
 'M4T',
 'M4V',
 'M4V',
 'M4V',
 'M4V',
 'M4W',
 'M4X',
 'M4X',
 'M4Y',
 'M5A',
 'M5A',
 'M5B',
 'M5B',
 'M5C',
 'M5E',
 'M5G',
 'M5H',
 'M5H',
 'M5J',
 'M5J',
 'M5J',
 'M5K',
 'M5K',
 'M5L',
 'M5L',
 'M5M',
 'M5M',
 'M5N',
 'M5P',
 'M5P',
 'M5R',
 'M5R',
 'M5S',
 'M5S',
 'M5T',
 'M5T',
 'M5T',
 'M5V',
 'M5V',
 'M5V',
 'M5V',
 'M5V',
 'M5V',
 'M5V',
 'M5W',
 'M5X',
 'M5X',
 'M6A',


#### We notice that there is two oddities. 
#### 1. Some of the rows contain multiple postal code. <br/> 2. The last five of the postal code contains '\xa0' in their string.

We solve the second problem first because that's the easier one.

In [482]:
o = []
for i in postcode:
    o.append(i.replace('\xa0',''))
o

['M1B',
 'M1B',
 'M1C',
 'M1C',
 'M1C',
 'M1E',
 'M1E',
 'M1E',
 'M1G',
 'M1H',
 'M1J',
 'M1K',
 'M1K',
 'M1K',
 'M1L',
 'M1L',
 'M1L',
 'M1M',
 'M1M',
 'M1N',
 'M1N',
 'M1P',
 'M1P',
 'M1P',
 'M1R',
 'M1R',
 'M1S',
 'M1T',
 'M1T',
 'M1T',
 'M1V',
 'M1V',
 'M1V',
 'M1V',
 'M1W',
 'M1W',
 'M1X',
 'M2H',
 'M2J',
 'M2J',
 'M2J',
 'M2K',
 'M2L',
 'M2L',
 'M2M',
 'M2M,M2N,M2R',
 'M2P',
 'M3A',
 'M3B',
 'M3H',
 'M3H',
 'M3H',
 'M3J',
 'M3K',
 'M3M,M3L,M3N',
 'M4A',
 'M4B',
 'M4B',
 'M4C',
 'M4E',
 'M4G',
 'M4H',
 'M4J',
 'M4J',
 'M4K',
 'M4K',
 'M4L',
 'M4L',
 'M4M',
 'M4N',
 'M4P',
 'M4R',
 'M4R',
 'M4S',
 'M4T',
 'M4T',
 'M4V',
 'M4V',
 'M4V',
 'M4V',
 'M4W',
 'M4X',
 'M4X',
 'M4Y',
 'M5A',
 'M5A',
 'M5B',
 'M5B',
 'M5C',
 'M5E',
 'M5G',
 'M5H',
 'M5H',
 'M5J',
 'M5J',
 'M5J',
 'M5K',
 'M5K',
 'M5L',
 'M5L',
 'M5M',
 'M5M',
 'M5N',
 'M5P',
 'M5P',
 'M5R',
 'M5R',
 'M5S',
 'M5S',
 'M5T',
 'M5T',
 'M5T',
 'M5V',
 'M5V',
 'M5V',
 'M5V',
 'M5V',
 'M5V',
 'M5V',
 'M5W',
 'M5X',
 'M5X',
 'M6A',


Now, we have removed '\xa0' from all the strings. We just need to replace this in the Postal Code column in df.

In [483]:
df["Postal Code"] = o
df.tail()

Unnamed: 0,Postal Code,Borough,Neighbourhood
200,M4V,CentralToronto,SouthHill
201,M5R,CentralToronto,NorthMidtown
202,M8X,Etobicoke,OldMillNorth
203,M8Z,Etobicoke,SouthofBloor
204,M9R,Etobicoke,RichviewGardens


Before anything, it's good to check that we have the df without the 'xa0'.

In [484]:
postcode = list(df['Postal Code'])
postcode 

['M1B',
 'M1B',
 'M1C',
 'M1C',
 'M1C',
 'M1E',
 'M1E',
 'M1E',
 'M1G',
 'M1H',
 'M1J',
 'M1K',
 'M1K',
 'M1K',
 'M1L',
 'M1L',
 'M1L',
 'M1M',
 'M1M',
 'M1N',
 'M1N',
 'M1P',
 'M1P',
 'M1P',
 'M1R',
 'M1R',
 'M1S',
 'M1T',
 'M1T',
 'M1T',
 'M1V',
 'M1V',
 'M1V',
 'M1V',
 'M1W',
 'M1W',
 'M1X',
 'M2H',
 'M2J',
 'M2J',
 'M2J',
 'M2K',
 'M2L',
 'M2L',
 'M2M',
 'M2M,M2N,M2R',
 'M2P',
 'M3A',
 'M3B',
 'M3H',
 'M3H',
 'M3H',
 'M3J',
 'M3K',
 'M3M,M3L,M3N',
 'M4A',
 'M4B',
 'M4B',
 'M4C',
 'M4E',
 'M4G',
 'M4H',
 'M4J',
 'M4J',
 'M4K',
 'M4K',
 'M4L',
 'M4L',
 'M4M',
 'M4N',
 'M4P',
 'M4R',
 'M4R',
 'M4S',
 'M4T',
 'M4T',
 'M4V',
 'M4V',
 'M4V',
 'M4V',
 'M4W',
 'M4X',
 'M4X',
 'M4Y',
 'M5A',
 'M5A',
 'M5B',
 'M5B',
 'M5C',
 'M5E',
 'M5G',
 'M5H',
 'M5H',
 'M5J',
 'M5J',
 'M5J',
 'M5K',
 'M5K',
 'M5L',
 'M5L',
 'M5M',
 'M5M',
 'M5N',
 'M5P',
 'M5P',
 'M5R',
 'M5R',
 'M5S',
 'M5S',
 'M5T',
 'M5T',
 'M5T',
 'M5V',
 'M5V',
 'M5V',
 'M5V',
 'M5V',
 'M5V',
 'M5V',
 'M5W',
 'M5X',
 'M5X',
 'M6A',


As you can see, the last five do not have the 'xa0' string anymore. Thus, now we are good to proceed to the final problem of the Postal Code string.

### We first inspect how many rows have this multiple Postal Code.

In [485]:
o = []
for index, sublist in enumerate(df["Postal Code"]):
    if "," in sublist:
        o += [index]
        
#         df["Postal Code"]==sublist
moo = df.iloc[o,:]
moo
# moo = df.iloc[o,:]        
# moo

Unnamed: 0,Postal Code,Borough,Neighbourhood
45,"M2M,M2N,M2R",NorthYork,Willowdale
54,"M3M,M3L,M3N",NorthYork,Downsview
177,"M9B,M9A",Etobicoke,IslingtonAvenue


We only have three rows as so.

#### We solve this problem by putting them in another dataframe, df2 and making the same borough and neighbourhood for every postal code that appeared in the rows.

In [486]:
df2 = pd.DataFrame(columns = ['Postal Code', 'Borough', 'Neighbourhood'])

for index, sublist in enumerate(df["Postal Code"]):
    if "," in sublist:
        i = sublist.split(",")
        for ko in i:
            df2 = df2.append({"Postal Code":ko,
                    "Borough":df.iloc[index,1],
                    "Neighbourhood":df.iloc[index,2]},
                      ignore_index = True)

#  df["Postal Code"]==sublist
df2
# moo = df.iloc[o,:]        
# moo

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M2M,NorthYork,Willowdale
1,M2N,NorthYork,Willowdale
2,M2R,NorthYork,Willowdale
3,M3M,NorthYork,Downsview
4,M3L,NorthYork,Downsview
5,M3N,NorthYork,Downsview
6,M9B,Etobicoke,IslingtonAvenue
7,M9A,Etobicoke,IslingtonAvenue


In [487]:
df2.sort_values(by=['Postal Code'])
df2

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M2M,NorthYork,Willowdale
1,M2N,NorthYork,Willowdale
2,M2R,NorthYork,Willowdale
3,M3M,NorthYork,Downsview
4,M3L,NorthYork,Downsview
5,M3N,NorthYork,Downsview
6,M9B,Etobicoke,IslingtonAvenue
7,M9A,Etobicoke,IslingtonAvenue


We now want to remove the ones that appeared multiples in df.

In [488]:
for index, sublist in enumerate(df["Postal Code"]):
    if "," in sublist:
        df.drop([index], axis=0,inplace=True)
#         df["Postal Code"]==sublist
df.head

<bound method NDFrame.head of     Postal Code          Borough                             Neighbourhood
0           M1B      Scarborough                                   Malvern
1           M1B      Scarborough                                     Rouge
2           M1C      Scarborough                                 RougeHill
3           M1C      Scarborough                             HighlandCreek
4           M1C      Scarborough                                 PortUnion
5           M1E      Scarborough                                 Guildwood
6           M1E      Scarborough                                  WestHill
7           M1E      Scarborough                               Morningside
8           M1G      Scarborough                                    Woburn
9           M1H      Scarborough                                 Cedarbrae
10          M1J      Scarborough                        ScarboroughVillage
11          M1K      Scarborough                                   Ion

Now, we want to add df2 to df. and sort them by Postal Code.

In [489]:
df = df.append(df2)
df.sort_values(by=['Postal Code'],inplace=True,ignore_index =True)
df

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1B,Scarborough,Malvern
1,M1B,Scarborough,Rouge
2,M1C,Scarborough,RougeHill
3,M1C,Scarborough,HighlandCreek
4,M1C,Scarborough,PortUnion
5,M1E,Scarborough,Guildwood
6,M1E,Scarborough,WestHill
7,M1E,Scarborough,Morningside
8,M1G,Scarborough,Woburn
9,M1H,Scarborough,Cedarbrae


In [490]:
df.shape

(210, 3)

## 4. Combining different neighbourhoods in df with the same postal code.
### saving it in 'gu'

#### Im just trying out how to do it using M1C postal code

In [491]:
hu = merge.loc[merge['Postal Code'] == 'M1C']
hu
len(hu)
for i in range(0,len(hu)+1):
    print(i)

0
1


Checking if any is null values

In [492]:
hi.isnull().any()

Postal Code    False
Latitude       False
Longitude      False
dtype: bool

In [493]:
wu = hu.loc[:,'Neighbourhood']
print(wu)

1    RougeHill, HighlandCreek, PortUnion
Name: Neighbourhood, dtype: object


In [494]:
full_str = ', '.join([str(elem) for elem in wu])
display(full_str)

# it worked!!

'RougeHill, HighlandCreek, PortUnion'

In [495]:
from collections import Iterable
def flatten(lis):
     for item in lis:
         if isinstance(item, Iterable) and not isinstance(item, str):
             for x in flatten(item):
                 yield x
         else:        
             yield item

#### Putting postal code under one variable

In [496]:
mylist = df[['Postal Code']]
mylist

Unnamed: 0,Postal Code
0,M1B
1,M1B
2,M1C
3,M1C
4,M1C
5,M1E
6,M1E
7,M1E
8,M1G
9,M1H


#### Doing it for all postal code 
#### creating a list for unique postal code and naming it 'full_str'

In [497]:
post = list((np.unique(mylist)))
len(post)

102

In [498]:
full_str = []
for i in range(0, len(post)):
    hu = df.loc[df['Postal Code'] == post[i]]
    wu = hu.loc[:,'Neighbourhood']
    full_str.append(', '.join([str(elem) for elem in wu]))

len(full_str)

102

In [499]:
full_str

['Malvern, Rouge',
 'RougeHill, HighlandCreek, PortUnion',
 'Guildwood, WestHill, Morningside',
 'Woburn',
 'Cedarbrae',
 'ScarboroughVillage',
 'EastBirchmountPark, KennedyPark, Ionview',
 'Clairlea, GoldenMile, Oakridge',
 'Cliffcrest, Cliffside',
 'CliffsideWest, BirchCliff',
 'WexfordHeights, DorsetPark, ScarboroughTownCentre',
 'Maryvale, Wexford',
 'Agincourt',
 "TamO'Shanter, Sullivan, ClarksCorners",
 "L'AmoreauxEast, AgincourtNorth, SteelesEast, Milliken",
 "L'AmoreauxWest, SteelesWest",
 'UpperRouge',
 'HillcrestVillage',
 'HenryFarm, Fairview, Oriole',
 'BayviewVillage',
 'YorkMills, SilverHills',
 'Newtonbrook, Willowdale',
 'Willowdale',
 'YorkMillsWest',
 'Willowdale',
 'Parkwoods',
 'DonMills',
 'WilsonHeights, DownsviewNorth, BathurstManor',
 'NorthwoodPark',
 'Downsview',
 'Downsview',
 'Downsview',
 'Downsview',
 'VictoriaVillage',
 'ParkviewHill, WoodbineGardens,',
 'WoodbineHeights',
 'TheBeaches',
 'Leaside',
 'ThorncliffePark',
 'BroadviewNorth, EastToronto',
 'Ri

In [500]:
gu = df.drop_duplicates(subset =["Postal Code"], keep = "first")
print(gu.shape)
len(gu)

(102, 3)


102

Checking gu doesn't have duplicate of postal code.

In [501]:
gu

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1B,Scarborough,Malvern
2,M1C,Scarborough,RougeHill
5,M1E,Scarborough,Guildwood
8,M1G,Scarborough,Woburn
9,M1H,Scarborough,Cedarbrae
10,M1J,Scarborough,ScarboroughVillage
11,M1K,Scarborough,EastBirchmountPark
14,M1L,Scarborough,Clairlea
17,M1M,Scarborough,Cliffcrest
19,M1N,Scarborough,CliffsideWest


In [502]:
gu.reset_index(drop=True, inplace=True)

Assigning 'gu' Neighbourhood with full_str, that have all the neighbourhoods with the same postal code.

In [503]:
gu.loc[:,'Neighbourhood']= full_str
gu.iloc[-1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item_labels[indexer[info_axis]]] = value


Postal Code                                  M9W
Borough                                Etobicoke
Neighbourhood    WestHumberClairville, Northwest
Name: 101, dtype: object

In [504]:
gu.reset_index(drop=True, inplace=True)
gu

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"RougeHill, HighlandCreek, PortUnion"
2,M1E,Scarborough,"Guildwood, WestHill, Morningside"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,ScarboroughVillage
6,M1K,Scarborough,"EastBirchmountPark, KennedyPark, Ionview"
7,M1L,Scarborough,"Clairlea, GoldenMile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside"
9,M1N,Scarborough,"CliffsideWest, BirchCliff"


In [505]:
gu.shape

(102, 3)

## 5. Merging 'gu' and 'hi'

In [506]:
hi.shape

(103, 3)

We know that hi is bigger than gu by one rows. But we're gonna merge by right(hi) anyways, so that we will know which Postal Code gu doesn't have.

In [507]:
merge = gu.merge(hi, on="Postal Code", how="right")

In [508]:
merge

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"RougeHill, HighlandCreek, PortUnion",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, WestHill, Morningside",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,ScarboroughVillage,43.744734,-79.239476
6,M1K,Scarborough,"EastBirchmountPark, KennedyPark, Ionview",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, GoldenMile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside",43.716316,-79.239476
9,M1N,Scarborough,"CliffsideWest, BirchCliff",43.692657,-79.264848


In [509]:
merge.isnull().values.any()
merge.isnull().sum()

Postal Code      0
Borough          1
Neighbourhood    1
Latitude         0
Longitude        0
dtype: int64

In [510]:
null_data = merge[merge.isnull().any(axis=1)]
null_data

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
27,M3C,,,43.7259,-79.340923


Now, we know that gu doesn't have M3C postcode. That's just for our analysis.

We're just going to remove the null rows because we don't know what neighbourhood and borough is that. Plus, it's only one row.

In [511]:
merge.dropna(inplace=True)
merge.reset_index(drop=True, inplace=True)
merge

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"RougeHill, HighlandCreek, PortUnion",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, WestHill, Morningside",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,ScarboroughVillage,43.744734,-79.239476
6,M1K,Scarborough,"EastBirchmountPark, KennedyPark, Ionview",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, GoldenMile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside",43.716316,-79.239476
9,M1N,Scarborough,"CliffsideWest, BirchCliff",43.692657,-79.264848


In [512]:
print('The dataframe has {} boroughs and {} neighbourhoods.'.format(
        len(merge['Borough'].unique()),
        df.shape[0]
    )
)

The dataframe has 13 boroughs and 210 neighbourhoods.


### Naming it toronto.

In [513]:
toronto = merge

## 6. Exploring Neighbourhoods

<h3>FourSquare Credentials<h3>

In [514]:
CLIENT_ID = 'XSSIZCXDKP53HRAXUM3BKFR0DBUGHSXAT4TKR1R4OKH3YHPF' # your Foursquare ID
CLIENT_SECRET = 'RHEUGTCM3J2A2ZXXAMWL4CD34V5NQTK5ZA4L3LVDII2LB1QQ' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value

print('Your credentials:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)


Your credentials:
CLIENT_ID: XSSIZCXDKP53HRAXUM3BKFR0DBUGHSXAT4TKR1R4OKH3YHPF
CLIENT_SECRET:RHEUGTCM3J2A2ZXXAMWL4CD34V5NQTK5ZA4L3LVDII2LB1QQ


### Finding the nearest venue.
### Trying it on M1B Postal Code.

In [515]:
neighbourhood_latitude = toronto.loc[0, 'Latitude'] # neighbourhood latitude value
neighbourhood_longitude = toronto.loc[0, 'Longitude'] # neighbourhood longitude value

neighbourhood_name = toronto.loc[0, 'Neighbourhood'] # neighbourhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighbourhood_name, 
                                                               neighbourhood_latitude, 
                                                               neighbourhood_longitude))

Latitude and longitude values of Malvern, Rouge are 43.8066863, -79.1943534.


In [516]:
LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 500 # define radius
 # create URL
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighbourhood_latitude, 
    neighbourhood_longitude, 
    radius, 
    LIMIT)

#### Defining url for M1B postal code example

In [517]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '610a68b6ff557a1e049ace4c'},
  'headerLocation': 'Malvern',
  'headerFullLocation': 'Malvern, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 1,
  'suggestedBounds': {'ne': {'lat': 43.811186304500005,
    'lng': -79.1881295807304},
   'sw': {'lat': 43.8021862955, 'lng': -79.20057721926959}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4bb6b9446edc76b0d771311c',
       'name': 'Wendy’s',
       'location': {'crossStreet': 'Morningside & Sheppard',
        'lat': 43.80744841934756,
        'lng': -79.19905558052072,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.80744841934756,
          'lng': -79.19905558052072}],
        'distance': 387,
        'cc': 'CA',
        'city': 'Toronto',

#### Defining category making function

In [518]:
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

### Importing json normalize package

In [519]:
from pandas.io.json import json_normalize # transform JSON file into a pandas dataframe

In [520]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = pd.json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Wendy’s,Fast Food Restaurant,43.807448,-79.199056


In [521]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

1 venues were returned by Foursquare.


<h3> Finding nearby venues doing it for all </h3>

In [522]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighbourhood', 
                  'Neighbourhood Latitude', 
                  'Neighbourhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [523]:
toronto_venues = getNearbyVenues(names=toronto['Neighbourhood'],
                                   latitudes=toronto['Latitude'], 
                                   longitudes=toronto['Longitude']
                                  )
print(toronto_venues[0:5])

Malvern, Rouge
RougeHill, HighlandCreek, PortUnion
Guildwood, WestHill, Morningside
Woburn
Cedarbrae
ScarboroughVillage
EastBirchmountPark, KennedyPark, Ionview
Clairlea, GoldenMile, Oakridge
Cliffcrest, Cliffside
CliffsideWest, BirchCliff
WexfordHeights, DorsetPark, ScarboroughTownCentre
Maryvale, Wexford
Agincourt
TamO'Shanter, Sullivan, ClarksCorners
L'AmoreauxEast, AgincourtNorth, SteelesEast, Milliken
L'AmoreauxWest, SteelesWest
UpperRouge
HillcrestVillage
HenryFarm, Fairview, Oriole
BayviewVillage
YorkMills, SilverHills
Newtonbrook, Willowdale
Willowdale
YorkMillsWest
Willowdale
Parkwoods
DonMills
WilsonHeights, DownsviewNorth, BathurstManor
NorthwoodPark
Downsview
Downsview
Downsview
Downsview
VictoriaVillage
ParkviewHill, WoodbineGardens,
WoodbineHeights
TheBeaches
Leaside
ThorncliffePark
BroadviewNorth, EastToronto
Riverdale, TheDanforthWest
IndiaBazaar, TheBeachesWest
StudioDistrict
LawrencePark
DavisvilleNorth
LawrencePark, NorthTorontoWest
Davisville
MoorePark, SummerhillEa

In [524]:
print(toronto_venues.shape)
toronto_venues.head()

(2132, 7)


Unnamed: 0,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Malvern, Rouge",43.806686,-79.194353,Wendy’s,43.807448,-79.199056,Fast Food Restaurant
1,"RougeHill, HighlandCreek, PortUnion",43.784535,-79.160497,Chris Effects Painting,43.784343,-79.163742,Construction & Landscaping
2,"RougeHill, HighlandCreek, PortUnion",43.784535,-79.160497,Great Shine Window Cleaning,43.783145,-79.157431,Home Service
3,"RougeHill, HighlandCreek, PortUnion",43.784535,-79.160497,Royal Canadian Legion,43.782533,-79.163085,Bar
4,"Guildwood, WestHill, Morningside",43.763573,-79.188711,RBC Royal Bank,43.76679,-79.191151,Bank


In [557]:
k = []
oop = toronto_venues['Venue Category']
for n,i in enumerate(oop):
   if i=='Chinese Restaurant':
    k += [n]  

toronto_venues.iloc[k,:]

Unnamed: 0,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
51,"WexfordHeights, DorsetPark, ScarboroughTownCentre",43.75741,-79.273304,Kim Kim restaurant,43.753833,-79.276611,Chinese Restaurant
66,Agincourt,43.7942,-79.262029,Royal Chinese Seafood Restaurant,43.798496,-79.262196,Chinese Restaurant
69,"TamO'Shanter, Sullivan, ClarksCorners",43.781638,-79.304302,The Royal Chinese Restaurant 避風塘小炒,43.780505,-79.298844,Chinese Restaurant
85,"L'AmoreauxWest, SteelesWest",43.799525,-79.318389,Mr Congee Chinese Cuisine 龍粥記,43.798879,-79.318335,Chinese Restaurant
166,BayviewVillage,43.786947,-79.385975,Sun Star Chinese Cuisine 翠景小炒,43.787914,-79.381234,Chinese Restaurant
473,"LawrencePark, NorthTorontoWest",43.715383,-79.405678,C'est Bon,43.716785,-79.400406,Chinese Restaurant
575,"St.JamesTown, Cabbagetown",43.667967,-79.367675,China Gourmet,43.66418,-79.368359,Chinese Restaurant
787,"Ryerson, GardenDistrict",43.657162,-79.378937,GB Hand-Pulled Noodles,43.656434,-79.383783,Chinese Restaurant
1141,"TorontoIslands, HarbourfrontEast, UnionStation",43.640816,-79.381752,Pearl Harbourfront,43.638157,-79.380688,Chinese Restaurant
1317,"DesignExchange, TorontoDominionCentre",43.647177,-79.381576,Szechuan Express,43.646973,-79.379549,Chinese Restaurant


#### Let's check how many venues were returned for each neighborhood

In [525]:
toronto_venues.groupby('Neighbourhood').count()

Unnamed: 0_level_0,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighbourhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Agincourt,4,4,4,4,4,4
"Alderwood, LongBranch",9,9,9,9,9,9
BayviewVillage,4,4,4,4,4,4
"BedfordPark, ,, LawrenceManorEast",24,24,24,24,24,24
BerczyPark,59,59,59,59,59,59
"BroadviewNorth, EastToronto",4,4,4,4,4,4
"CNTower, HarbourfrontWest, RailwayLands, KingandSpadina, SouthNiagara, BathurstQuay, Islandairport",18,18,18,18,18,18
CaledoniaFairbanks,4,4,4,4,4,4
CanadaPostGatewayProcessingCentre,14,14,14,14,14,14
Cedarbrae,9,9,9,9,9,9


#### Let's find out how many unique categories can be curated from all the returned venues

In [526]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 274 uniques categories.


## 7. Analyzing Neighbourhood.

In [527]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighbourhood'] = toronto_venues['Neighbourhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Neighbourhood,Accessories Store,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Truck Stop,Turkish Restaurant,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,"Malvern, Rouge",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"RougeHill, HighlandCreek, PortUnion",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"RougeHill, HighlandCreek, PortUnion",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"RougeHill, HighlandCreek, PortUnion",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Guildwood, WestHill, Morningside",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


And let's examine the new dataframe size.

In [528]:
toronto_onehot.shape

(2132, 275)

#### Next, let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [529]:
toronto_grouped = toronto_onehot.groupby('Neighbourhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighbourhood,Accessories Store,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Truck Stop,Turkish Restaurant,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Alderwood, LongBranch",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,BayviewVillage,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"BedfordPark, ,, LawrenceManorEast",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.041667,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,BerczyPark,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.016949,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,"BroadviewNorth, EastToronto",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,"CNTower, HarbourfrontWest, RailwayLands, Kinga...",0.0,0.055556,0.055556,0.055556,0.111111,0.166667,0.111111,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,CaledoniaFairbanks,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0
8,CanadaPostGatewayProcessingCentre,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.071429,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Cedarbrae,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Now let's create the new dataframe and display the top 10 venues for each neighborhood.

In [530]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [531]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighbourhoods_venues_sorted = pd.DataFrame(columns=columns)
neighbourhoods_venues_sorted['Neighbourhood'] = toronto_grouped['Neighbourhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighbourhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighbourhoods_venues_sorted.head()

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Agincourt,Latin American Restaurant,Breakfast Spot,Chinese Restaurant,Lounge,Mobile Phone Shop,Motel,Moroccan Restaurant,Monument / Landmark,Molecular Gastronomy Restaurant,Modern European Restaurant
1,"Alderwood, LongBranch",Pizza Place,Gym,Sandwich Place,Coffee Shop,Pub,Playground,Pool,Pharmacy,Park,Movie Theater
2,BayviewVillage,Japanese Restaurant,Bank,Chinese Restaurant,Café,Accessories Store,Monument / Landmark,Movie Theater,Motel,Moroccan Restaurant,Molecular Gastronomy Restaurant
3,"BedfordPark, ,, LawrenceManorEast",Hobby Shop,Italian Restaurant,Coffee Shop,Sandwich Place,Comfort Food Restaurant,Pub,Restaurant,Café,Butcher,Fast Food Restaurant
4,BerczyPark,Coffee Shop,Cocktail Bar,Bakery,Beer Bar,Pub,Pharmacy,Cheese Shop,Restaurant,Farmers Market,Seafood Restaurant


## 8. Cluster Neighborhoods

Run k-means to cluster the neighborhood into 5 clusters.

In [532]:
from sklearn.cluster import KMeans

In [533]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighbourhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 0, 0, 0, 0, 1, 0, 1, 0, 0], dtype=int32)

Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

In [534]:
# add clustering labels
neighbourhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = toronto

# merge toronto_grouped with toronto to add latitude/longitude for each neighbourhood
toronto_merged = toronto_merged.join(neighbourhoods_venues_sorted.set_index('Neighbourhood'), on='Neighbourhood')

toronto_merged.head() # check the last columns!

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353,4.0,Fast Food Restaurant,Accessories Store,Miscellaneous Shop,Motel,Moroccan Restaurant,Monument / Landmark,Molecular Gastronomy Restaurant,Modern European Restaurant,Mobile Phone Shop,Middle Eastern Restaurant
1,M1C,Scarborough,"RougeHill, HighlandCreek, PortUnion",43.784535,-79.160497,0.0,Construction & Landscaping,Home Service,Bar,Modern European Restaurant,Movie Theater,Motel,Moroccan Restaurant,Monument / Landmark,Molecular Gastronomy Restaurant,Mobile Phone Shop
2,M1E,Scarborough,"Guildwood, WestHill, Morningside",43.763573,-79.188711,0.0,Breakfast Spot,Bank,Medical Center,Electronics Store,Intersection,Restaurant,Rental Car Location,Mexican Restaurant,Donut Shop,Mobile Phone Shop
3,M1G,Scarborough,Woburn,43.770992,-79.216917,0.0,Coffee Shop,Convenience Store,Korean BBQ Restaurant,Indian Restaurant,Accessories Store,Molecular Gastronomy Restaurant,Movie Theater,Motel,Moroccan Restaurant,Monument / Landmark
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476,0.0,Bakery,Hakka Restaurant,Thai Restaurant,Caribbean Restaurant,Gas Station,Athletics & Sports,Fried Chicken Joint,Bank,Lounge,Middle Eastern Restaurant


Checking any null values

In [535]:
print(toronto_merged.isnull().values.any())

print("\nSum of NAType is \n{}".format(toronto_merged.isnull().sum()))

True

Sum of NAType is 
Postal Code               0
Borough                   0
Neighbourhood             0
Latitude                  0
Longitude                 0
Cluster Labels            3
1st Most Common Venue     3
2nd Most Common Venue     3
3rd Most Common Venue     3
4th Most Common Venue     3
5th Most Common Venue     3
6th Most Common Venue     3
7th Most Common Venue     3
8th Most Common Venue     3
9th Most Common Venue     3
10th Most Common Venue    3
dtype: int64


#### Turning toronto_merged float64 to int64 

In [536]:
toronto_merged['Cluster Labels']=toronto_merged['Cluster Labels'].astype('Int64')
toronto_merged

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353,4.0,Fast Food Restaurant,Accessories Store,Miscellaneous Shop,Motel,Moroccan Restaurant,Monument / Landmark,Molecular Gastronomy Restaurant,Modern European Restaurant,Mobile Phone Shop,Middle Eastern Restaurant
1,M1C,Scarborough,"RougeHill, HighlandCreek, PortUnion",43.784535,-79.160497,0.0,Construction & Landscaping,Home Service,Bar,Modern European Restaurant,Movie Theater,Motel,Moroccan Restaurant,Monument / Landmark,Molecular Gastronomy Restaurant,Mobile Phone Shop
2,M1E,Scarborough,"Guildwood, WestHill, Morningside",43.763573,-79.188711,0.0,Breakfast Spot,Bank,Medical Center,Electronics Store,Intersection,Restaurant,Rental Car Location,Mexican Restaurant,Donut Shop,Mobile Phone Shop
3,M1G,Scarborough,Woburn,43.770992,-79.216917,0.0,Coffee Shop,Convenience Store,Korean BBQ Restaurant,Indian Restaurant,Accessories Store,Molecular Gastronomy Restaurant,Movie Theater,Motel,Moroccan Restaurant,Monument / Landmark
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476,0.0,Bakery,Hakka Restaurant,Thai Restaurant,Caribbean Restaurant,Gas Station,Athletics & Sports,Fried Chicken Joint,Bank,Lounge,Middle Eastern Restaurant
5,M1J,Scarborough,ScarboroughVillage,43.744734,-79.239476,0.0,Playground,Spa,Grocery Store,Miscellaneous Shop,Motel,Moroccan Restaurant,Monument / Landmark,Molecular Gastronomy Restaurant,Modern European Restaurant,Mobile Phone Shop
6,M1K,Scarborough,"EastBirchmountPark, KennedyPark, Ionview",43.727929,-79.262029,0.0,Discount Store,Hobby Shop,Department Store,Bus Station,Coffee Shop,Middle Eastern Restaurant,Miscellaneous Shop,Mobile Phone Shop,Modern European Restaurant,Music Venue
7,M1L,Scarborough,"Clairlea, GoldenMile, Oakridge",43.711112,-79.284577,0.0,Bus Line,Bakery,Soccer Field,Intersection,Park,Ice Cream Shop,Modern European Restaurant,Motel,Moroccan Restaurant,Monument / Landmark
8,M1M,Scarborough,"Cliffcrest, Cliffside",43.716316,-79.239476,0.0,Motel,American Restaurant,Accessories Store,Mobile Phone Shop,Movie Theater,Moroccan Restaurant,Monument / Landmark,Molecular Gastronomy Restaurant,Modern European Restaurant,Miscellaneous Shop
9,M1N,Scarborough,"CliffsideWest, BirchCliff",43.692657,-79.264848,0.0,General Entertainment,College Stadium,Café,Skating Rink,Men's Store,Mexican Restaurant,Middle Eastern Restaurant,Miscellaneous Shop,Mobile Phone Shop,Music Venue


## 9. Visualizing Toronto Neighbourhood Cluster in Map using Folium

In [537]:
!pip install folium
import folium



In [538]:
address = 'Toronto,Ontario'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geographical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geographical coordinate of Toronto are 43.6534817, -79.3839347.


In [539]:
import matplotlib.cm as cm
import matplotlib.colors as colors

In [540]:
# dropping na values
toronto1=toronto_merged.dropna()
toronto1

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353,4,Fast Food Restaurant,Accessories Store,Miscellaneous Shop,Motel,Moroccan Restaurant,Monument / Landmark,Molecular Gastronomy Restaurant,Modern European Restaurant,Mobile Phone Shop,Middle Eastern Restaurant
1,M1C,Scarborough,"RougeHill, HighlandCreek, PortUnion",43.784535,-79.160497,0,Construction & Landscaping,Home Service,Bar,Modern European Restaurant,Movie Theater,Motel,Moroccan Restaurant,Monument / Landmark,Molecular Gastronomy Restaurant,Mobile Phone Shop
2,M1E,Scarborough,"Guildwood, WestHill, Morningside",43.763573,-79.188711,0,Breakfast Spot,Bank,Medical Center,Electronics Store,Intersection,Restaurant,Rental Car Location,Mexican Restaurant,Donut Shop,Mobile Phone Shop
3,M1G,Scarborough,Woburn,43.770992,-79.216917,0,Coffee Shop,Convenience Store,Korean BBQ Restaurant,Indian Restaurant,Accessories Store,Molecular Gastronomy Restaurant,Movie Theater,Motel,Moroccan Restaurant,Monument / Landmark
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476,0,Bakery,Hakka Restaurant,Thai Restaurant,Caribbean Restaurant,Gas Station,Athletics & Sports,Fried Chicken Joint,Bank,Lounge,Middle Eastern Restaurant
5,M1J,Scarborough,ScarboroughVillage,43.744734,-79.239476,0,Playground,Spa,Grocery Store,Miscellaneous Shop,Motel,Moroccan Restaurant,Monument / Landmark,Molecular Gastronomy Restaurant,Modern European Restaurant,Mobile Phone Shop
6,M1K,Scarborough,"EastBirchmountPark, KennedyPark, Ionview",43.727929,-79.262029,0,Discount Store,Hobby Shop,Department Store,Bus Station,Coffee Shop,Middle Eastern Restaurant,Miscellaneous Shop,Mobile Phone Shop,Modern European Restaurant,Music Venue
7,M1L,Scarborough,"Clairlea, GoldenMile, Oakridge",43.711112,-79.284577,0,Bus Line,Bakery,Soccer Field,Intersection,Park,Ice Cream Shop,Modern European Restaurant,Motel,Moroccan Restaurant,Monument / Landmark
8,M1M,Scarborough,"Cliffcrest, Cliffside",43.716316,-79.239476,0,Motel,American Restaurant,Accessories Store,Mobile Phone Shop,Movie Theater,Moroccan Restaurant,Monument / Landmark,Molecular Gastronomy Restaurant,Modern European Restaurant,Miscellaneous Shop
9,M1N,Scarborough,"CliffsideWest, BirchCliff",43.692657,-79.264848,0,General Entertainment,College Stadium,Café,Skating Rink,Men's Store,Mexican Restaurant,Middle Eastern Restaurant,Miscellaneous Shop,Mobile Phone Shop,Music Venue


### Visualizing postal code in Toronto in a map

In [541]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)


# add markers to map
for lat, lng, label in zip(toronto['Latitude'], toronto['Longitude'], toronto['Neighbourhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

### Visualizing cluster of neighbourhood in Toronto in map

In [542]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]


# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto1['Latitude'], toronto1['Longitude'], toronto1['Neighbourhood'], toronto1['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

## 10. Examining Cluster

Now, we can examine each cluster and determine the discriminating venue categories that distinguish each cluster. Based on the defining categories, we can then assign a name to each cluster.

### Cluster 1: The Restaurant District 

In [543]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,Scarborough,0,Construction & Landscaping,Home Service,Bar,Modern European Restaurant,Movie Theater,Motel,Moroccan Restaurant,Monument / Landmark,Molecular Gastronomy Restaurant,Mobile Phone Shop
2,Scarborough,0,Breakfast Spot,Bank,Medical Center,Electronics Store,Intersection,Restaurant,Rental Car Location,Mexican Restaurant,Donut Shop,Mobile Phone Shop
3,Scarborough,0,Coffee Shop,Convenience Store,Korean BBQ Restaurant,Indian Restaurant,Accessories Store,Molecular Gastronomy Restaurant,Movie Theater,Motel,Moroccan Restaurant,Monument / Landmark
4,Scarborough,0,Bakery,Hakka Restaurant,Thai Restaurant,Caribbean Restaurant,Gas Station,Athletics & Sports,Fried Chicken Joint,Bank,Lounge,Middle Eastern Restaurant
5,Scarborough,0,Playground,Spa,Grocery Store,Miscellaneous Shop,Motel,Moroccan Restaurant,Monument / Landmark,Molecular Gastronomy Restaurant,Modern European Restaurant,Mobile Phone Shop
6,Scarborough,0,Discount Store,Hobby Shop,Department Store,Bus Station,Coffee Shop,Middle Eastern Restaurant,Miscellaneous Shop,Mobile Phone Shop,Modern European Restaurant,Music Venue
7,Scarborough,0,Bus Line,Bakery,Soccer Field,Intersection,Park,Ice Cream Shop,Modern European Restaurant,Motel,Moroccan Restaurant,Monument / Landmark
8,Scarborough,0,Motel,American Restaurant,Accessories Store,Mobile Phone Shop,Movie Theater,Moroccan Restaurant,Monument / Landmark,Molecular Gastronomy Restaurant,Modern European Restaurant,Miscellaneous Shop
9,Scarborough,0,General Entertainment,College Stadium,Café,Skating Rink,Men's Store,Mexican Restaurant,Middle Eastern Restaurant,Miscellaneous Shop,Mobile Phone Shop,Music Venue
10,Scarborough,0,Indian Restaurant,Chinese Restaurant,Pet Store,Vietnamese Restaurant,Thrift / Vintage Store,Light Rail Station,Modern European Restaurant,Motel,Moroccan Restaurant,Monument / Landmark


### Cluster 2: The Leisure District

In [544]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
14,Scarborough,1,Playground,Intersection,Park,Arts & Crafts Store,Modern European Restaurant,Movie Theater,Motel,Moroccan Restaurant,Monument / Landmark,Molecular Gastronomy Restaurant
21,NorthYork,1,Home Service,Park,Accessories Store,Miscellaneous Shop,Motel,Moroccan Restaurant,Monument / Landmark,Molecular Gastronomy Restaurant,Modern European Restaurant,Mobile Phone Shop
23,NorthYork,1,Park,Convenience Store,Accessories Store,Miscellaneous Shop,Motel,Moroccan Restaurant,Monument / Landmark,Molecular Gastronomy Restaurant,Modern European Restaurant,Mobile Phone Shop
25,NorthYork,1,Food & Drink Shop,Park,Fast Food Restaurant,Accessories Store,Miscellaneous Shop,Moroccan Restaurant,Monument / Landmark,Molecular Gastronomy Restaurant,Modern European Restaurant,Mobile Phone Shop
39,EastYork,1,Convenience Store,Intersection,Park,Coffee Shop,Accessories Store,Mobile Phone Shop,Motel,Moroccan Restaurant,Monument / Landmark,Molecular Gastronomy Restaurant
43,CentralToronto,1,Park,Bus Line,Swim School,Accessories Store,Mobile Phone Shop,Moroccan Restaurant,Monument / Landmark,Molecular Gastronomy Restaurant,Modern European Restaurant,Miscellaneous Shop
49,DowntownToronto,1,Park,Playground,Trail,Miscellaneous Shop,Moroccan Restaurant,Monument / Landmark,Molecular Gastronomy Restaurant,Modern European Restaurant,Mobile Phone Shop,Middle Eastern Restaurant
73,York,1,Park,Women's Store,Pool,Accessories Store,Middle Eastern Restaurant,Moroccan Restaurant,Monument / Landmark,Molecular Gastronomy Restaurant,Modern European Restaurant,Mobile Phone Shop
78,NorthYork,1,Construction & Landscaping,Park,Basketball Court,Bakery,Men's Store,Mexican Restaurant,Middle Eastern Restaurant,Miscellaneous Shop,Mediterranean Restaurant,Museum
89,Etobicoke,1,Pool,Park,River,Accessories Store,Miscellaneous Shop,Moroccan Restaurant,Monument / Landmark,Molecular Gastronomy Restaurant,Modern European Restaurant,Mobile Phone Shop


### Cluster 3: The Neighbourhood Area

In [545]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
95,NorthYork,2,Pizza Place,Accessories Store,Middle Eastern Restaurant,Moroccan Restaurant,Monument / Landmark,Molecular Gastronomy Restaurant,Modern European Restaurant,Mobile Phone Shop,Miscellaneous Shop,Mexican Restaurant


### Cluster 4: The Greater Toronto Area

In [546]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
90,Etobicoke,3,Construction & Landscaping,Baseball Field,Mobile Phone Shop,Motel,Moroccan Restaurant,Monument / Landmark,Molecular Gastronomy Restaurant,Modern European Restaurant,Miscellaneous Shop,Museum
96,NorthYork,3,Baseball Field,Accessories Store,Mobile Phone Shop,Motel,Moroccan Restaurant,Monument / Landmark,Molecular Gastronomy Restaurant,Modern European Restaurant,Miscellaneous Shop,Museum


### Cluster 5: The Outer Greater Toronto Area

In [547]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 4, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Scarborough,4,Fast Food Restaurant,Accessories Store,Miscellaneous Shop,Motel,Moroccan Restaurant,Monument / Landmark,Molecular Gastronomy Restaurant,Modern European Restaurant,Mobile Phone Shop,Middle Eastern Restaurant


# Test

In [548]:
food_category = '4d4b7105d754a06374d81259' # 'Root' category for all food-related venues

italian_restaurant_categories = ['4bf58dd8d48988d145941735','52af3a5e3cf9994f4e043bea','52af3a723cf9994f4e043bec',
                                 '52af3a7c3cf9994f4e043bed','58daa1558bbb0b01f18ec1d3','52af3a673cf9994f4e043beb',
                                 '52af3a903cf9994f4e043bee','4bf58dd8d48988d1f5931735','52af3a9f3cf9994f4e043bef',
                                 '52af3aaa3cf9994f4e043bf0','52af3ab53cf9994f4e043bf1','52af3abe3cf9994f4e043bf2',
                                 '52af3ac83cf9994f4e043bf3','52af3ad23cf9994f4e043bf4','52af3add3cf9994f4e043bf5',
                                 '52af3af23cf9994f4e043bf7','52af3ae63cf9994f4e043bf6','52af3afc3cf9994f4e043bf8',
                                 '52af3b053cf9994f4e043bf9','52af3b213cf9994f4e043bfa','52af3b293cf9994f4e043bfb',
                                 '52af3b343cf9994f4e043bfc','52af3b3b3cf9994f4e043bfd','52af3b463cf9994f4e043bfe',
                                 '52af3b633cf9994f4e043c01','52af3b513cf9994f4e043bff','52af3b593cf9994f4e043c00',
                                 '52af3b6e3cf9994f4e043c02','52af3b773cf9994f4e043c03','52af3b813cf9994f4e043c04',
                                 '52af3b893cf9994f4e043c05','52af3b913cf9994f4e043c06','52af3b9a3cf9994f4e043c07',
                                 '52af3ba23cf9994f4e043c08']

def is_restaurant(categories, specific_filter=None):
    restaurant_words = ['restaurant', 'diner', 'taverna', 'steakhouse']
    restaurant = False
    specific = False
    for c in categories:
        category_name = c[0].lower()
        category_id = c[1]
        for r in restaurant_words:
            if r in category_name:
                restaurant = True
        if 'fast food' in category_name:
            restaurant = False
        if not(specific_filter is None) and (category_id in specific_filter):
            specific = True
            restaurant = True
    return restaurant, specific

In [None]:
!pip install shapely
import shapely.geometry

!pip install pyproj
import pyproj

import math

def lonlat_to_xy(lon, lat):
    proj_latlon = pyproj.Proj(proj='latlong',datum='WGS84')
    proj_xy = pyproj.Proj(proj="utm", zone=33, datum='WGS84')
    xy = pyproj.transform(proj_latlon, proj_xy, lon, lat)
    return xy[0], xy[1]

def xy_to_lonlat(x, y):
    proj_latlon = pyproj.Proj(proj='latlong',datum='WGS84')
    proj_xy = pyproj.Proj(proj="utm", zone=33, datum='WGS84')
    lonlat = pyproj.transform(proj_xy, proj_latlon, x, y)
    return lonlat[0], lonlat[1]

def calc_xy_distance(x1, y1, x2, y2):
    dx = x2 - x1
    dy = y2 - y1
    return math.sqrt(dx*dx + dy*dy)

print('Coordinate transformation check')
print('-------------------------------')
print('Berlin center longitude={}, latitude={}'.format(berlin_center[1], berlin_center[0]))
x, y = lonlat_to_xy(berlin_center[1], berlin_center[0])
print('Berlin center UTM X={}, Y={}'.format(x, y))
lo, la = xy_to_lonlat(x, y)
print('Berlin center longitude={}, latitude={}'.format(lo, la))