# Data Section

In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import folium

# Postal Code extraction

In [2]:
link = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
get_text = requests.get(link).text
xml = BeautifulSoup(get_text, 'xml')
table=xml.find('table')
columns = ['Postalcode','Borough','Neighborhood']
df = pd.DataFrame(columns = columns)

# Search for postcodes, borough and neighborhood 
for tr_cell in table.find_all('tr'):
    row_data=[]
    for td_cell in tr_cell.find_all('td'):
        row_data.append(td_cell.text.strip())
    if len(row_data)==3:
        df.loc[len(df)] = row_data

In [3]:
df.head(12)

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
8,M8A,Not assigned,Not assigned
9,M9A,Etobicoke,Islington Avenue


# Data treatment

In [4]:
#1)Bring just rows where Borough is diferent from 'Not assigned'
df = df[df.Borough != 'Not assigned']

#2)If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough. So for the 9th cell in the table on the Wikipedia page, the value of the Borough and the Neighborhood columns will be Queen's Park.
for index, row in df.iterrows():
    if row['Neighborhood'] == 'Not assigned':
        row['Neighborhood'] = row['Borough']

#3)Grouping per postcode and bringing neighborhoods and boroughs to the same line  
df=df.groupby(['Postalcode','Borough'])['Neighborhood'].apply(','.join).reset_index()
df.head(15)


Unnamed: 0,Postalcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park"
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge"
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff,Cliffside West"


In [5]:
df.shape

(103, 3)

# Geting geo data

In [6]:
geo_data=pd.read_csv('http://cocl.us/Geospatial_data')

In [7]:
geo_data.head(10)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


In [8]:
geo_data.rename(columns={'Postal Code':'Postalcode'},inplace=True)
df_geo = pd.merge(geo_data, df, on='Postalcode')

In [9]:
# create map of New York using latitude and longitude values
map = folium.Map(location=[43.806686, -79.194353], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df_geo['Latitude'], df_geo['Longitude'], df_geo['Borough'], df_geo['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='green',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map) 
map

## Lets create a fucntion to explore and bring data from the foursquare api

In [10]:
CLIENT_ID = '44GZRUGU0KV34BK1VJ0ED52T1MESILINW1ZH0OQ3HQY1P1K1' # your Foursquare ID
CLIENT_SECRET = 'CXORDLZIUGYFOAZQZXJNG2DYXMTMLF1DE041QPN3RW4MJ3LC' # your Foursquare Secret
VERSION = '20190425'
limit=1000000

In [11]:
# Let's create a function to repeat the process to all the neighborhoods in Toronto
def getNearbyVenues(names, latitudes,longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names,latitudes,longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            limit)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [12]:
downtown_toronto_venues = getNearbyVenues(names=df_geo['Neighborhood'],latitudes=df_geo['Latitude'],longitudes=df_geo['Longitude'], radius=500)

Rouge,Malvern
Highland Creek,Rouge Hill,Port Union
Guildwood,Morningside,West Hill
Woburn
Cedarbrae
Scarborough Village
East Birchmount Park,Ionview,Kennedy Park
Clairlea,Golden Mile,Oakridge
Cliffcrest,Cliffside,Scarborough Village West
Birch Cliff,Cliffside West
Dorset Park,Scarborough Town Centre,Wexford Heights
Maryvale,Wexford
Agincourt
Clarks Corners,Sullivan,Tam O'Shanter
Agincourt North,L'Amoreaux East,Milliken,Steeles East
L'Amoreaux West
Upper Rouge
Hillcrest Village
Fairview,Henry Farm,Oriole
Bayview Village
Silver Hills,York Mills
Newtonbrook,Willowdale
Willowdale South
York Mills West
Willowdale West
Parkwoods
Don Mills North
Flemingdon Park,Don Mills South
Bathurst Manor,Downsview North,Wilson Heights
Northwood Park,York University
CFB Toronto,Downsview East
Downsview West
Downsview Central
Downsview Northwest
Victoria Village
Woodbine Gardens,Parkview Hill
Woodbine Heights
The Beaches
Leaside
Thorncliffe Park
East Toronto
The Danforth West,Riverdale
The Beaches West,Indi

# Now, we need to calculate the frequency of each type of venue in order to stablish clusters

In [13]:
downtown_toronto_venues['Venue Category2'] = downtown_toronto_venues['Venue Category'].str.contains(pat = 'Restaurant')
downtown_toronto_venues['Venue Category2']= downtown_toronto_venues.apply(lambda x: 'Restaurant' if x['Venue Category2'] else x['Venue Category'], axis=1)
downtown_toronto_venues['Venue Category']=downtown_toronto_venues['Venue Category2']

In [14]:
# one hot encoding
downtown_toronto_onehot = pd.get_dummies(downtown_toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
downtown_toronto_onehot['Neighborhood'] = downtown_toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [downtown_toronto_onehot.columns[-1]] + list(downtown_toronto_onehot.columns[:-1])
downtown_toronto_onehot = downtown_toronto_onehot[fixed_columns]

downtown_toronto_onehot.head() 

Unnamed: 0,Yoga Studio,Accessories Store,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,Antique Shop,Aquarium,...,Toy / Game Store,Trail,Train Station,Video Game Store,Video Store,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
downtown_toronto_grouped = downtown_toronto_onehot.groupby('Neighborhood').mean().reset_index()

In [16]:
num_top_venues = 5

for hood in downtown_toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = downtown_toronto_grouped[downtown_toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Adelaide,King,Richmond----
         venue  freq
0   Restaurant  0.31
1  Coffee Shop  0.07
2         Café  0.04
3          Bar  0.04
4   Steakhouse  0.03


----Agincourt----
            venue  freq
0    Skating Rink  0.25
1          Lounge  0.25
2  Breakfast Spot  0.25
3      Restaurant  0.25
4     Yoga Studio  0.00


----Agincourt North,L'Amoreaux East,Milliken,Steeles East----
         venue  freq
0  Coffee Shop  0.33
1   Playground  0.33
2         Park  0.33
3  Yoga Studio  0.00
4    Nightclub  0.00


----Albion Gardens,Beaumond Heights,Humbergate,Jamestown,Mount Olive,Silverstone,South Steeles,Thistletown----
                 venue  freq
0        Grocery Store  0.25
1             Pharmacy  0.12
2          Pizza Place  0.12
3  Fried Chicken Joint  0.12
4           Beer Store  0.12


----Alderwood,Long Branch----
          venue  freq
0   Pizza Place  0.22
1           Pub  0.11
2      Pharmacy  0.11
3           Gym  0.11
4  Skating Rink  0.11


----Bathurst Manor,Downsview North,W

         venue  freq
0         Park  0.33
1     Bus Line  0.33
2  Swim School  0.33
3  Yoga Studio  0.00
4  Music Venue  0.00


----Leaside----
                    venue  freq
0             Coffee Shop  0.09
1              Restaurant  0.09
2     Sporting Goods Shop  0.09
3  Furniture / Home Store  0.06
4            Burger Joint  0.06


----Little Portugal,Trinity----
         venue  freq
0   Restaurant  0.29
1          Bar  0.12
2  Coffee Shop  0.07
3         Café  0.04
4       Bakery  0.04


----Maryvale,Wexford----
            venue  freq
0      Restaurant  0.29
1  Breakfast Spot  0.14
2   Shopping Mall  0.14
3     Auto Garage  0.14
4  Sandwich Place  0.14


----Moore Park,Summerhill East----
         venue  freq
0   Playground  0.33
1        Trail  0.33
2  Summer Camp  0.33
3  Yoga Studio  0.00
4  Music Store  0.00


----North Toronto West----
                 venue  freq
0           Restaurant  0.20
1       Clothing Store  0.15
2          Coffee Shop  0.10
3          Yoga Studio  0

In [17]:
# Let's put that into a pandas dataframe
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [18]:
import numpy as np
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = downtown_toronto_grouped['Neighborhood']

for ind in np.arange(downtown_toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(downtown_toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted


Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide,King,Richmond",Restaurant,Coffee Shop,Bar,Café,Steakhouse,Hotel,Gym,Gastropub,Pizza Place,Concert Hall
1,Agincourt,Lounge,Skating Rink,Breakfast Spot,Restaurant,Women's Store,Curling Ice,Donut Shop,Dog Run,Distribution Center,Discount Store
2,"Agincourt North,L'Amoreaux East,Milliken,Steel...",Park,Playground,Coffee Shop,Women's Store,Cupcake Shop,Drugstore,Donut Shop,Dog Run,Distribution Center,Discount Store
3,"Albion Gardens,Beaumond Heights,Humbergate,Jam...",Grocery Store,Fried Chicken Joint,Sandwich Place,Beer Store,Pharmacy,Pizza Place,Restaurant,Cupcake Shop,Curling Ice,Dance Studio
4,"Alderwood,Long Branch",Pizza Place,Coffee Shop,Sandwich Place,Pub,Pool,Skating Rink,Pharmacy,Gym,Creperie,Distribution Center
5,"Bathurst Manor,Downsview North,Wilson Heights",Restaurant,Coffee Shop,Shopping Mall,Pizza Place,Supermarket,Deli / Bodega,Fried Chicken Joint,Frozen Yogurt Shop,Bank,Bridal Shop
6,Bayview Village,Restaurant,Bank,Café,Women's Store,Curling Ice,Drugstore,Donut Shop,Dog Run,Distribution Center,Discount Store
7,"Bedford Park,Lawrence Manor East",Restaurant,Coffee Shop,Sandwich Place,Pizza Place,Butcher,Pub,Café,Breakfast Spot,Grocery Store,Pharmacy
8,Berczy Park,Restaurant,Coffee Shop,Beer Bar,Cocktail Bar,Bakery,Cheese Shop,Farmers Market,Café,Park,Beach
9,"Birch Cliff,Cliffside West",Skating Rink,College Stadium,General Entertainment,Café,Women's Store,Cupcake Shop,Donut Shop,Dog Run,Distribution Center,Discount Store
