Get the toronto neighborhood and venues data from wikipedia

In [2]:
import requests
import lxml.html as lh
import pandas as pd

url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
#Create a handle, page, to handle the contents of the website
page = requests.get(url)
#Store the contents of the website under doc
doc = lh.fromstring(page.content)
#Parse data that are stored between <tr>..</tr> of HTML
tr_elements = doc.xpath('//tr')

#Check the length of the first 12 rows
#[len(T) for T in tr_elements[:12]]

#Create empty list
col=[]
i=0
#For each row, store each first element (header) and an empty list
for t in tr_elements[0]:
    i+=1
    name=t.text_content()
    col.append((name,[]))
        
#Since out first row is the header, data is stored on the second row onwards
for j in range(1,len(tr_elements)):
    #T is our j'th row
    T=tr_elements[j]
    
    #If row is not of size 3, the //tr data is not from our table 
    if len(T)!=3:
        break
    
    #i is the index of our column
    i=0
    
    #Iterate through each element of the row
    for t in T.iterchildren():
        data=t.text_content() 
        #Check if row is empty
        if i>0:
        #Convert any numerical value to integers
            try:
                data=int(data)
            except:
                data.rstrip()
                pass
        #Append the data to the empty list of the i'th column
        #print(data)
        col[i][1].append(data)
        #Increment i for the next column
        i+=1
        
        
Dict={title:column for (title,column) in col}

neighs = Dict['Neighbourhood\n']
for i in range(0, len(neighs)):
    newval = neighs[i].rstrip()
    neighs[i]=newval
Dict['Neighbourhood\n'] = neighs
    
df=pd.DataFrame(Dict)
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [3]:
# Remove Not Assigned boroughs
newdf = df[df['Borough'] != 'Not assigned']

#Set 'Not Assigned' neighborhood to borough name
for i in range(0, len(newdf)):
    if newdf.iloc[i]['Neighbourhood\n'] == 'Not assigned\n':
        newdf.iloc[i]['Neighbourhood\n'] = newdf.iloc[i]['Borough']

#newdf.head()

#Group rows by postcode
groupbyDF = newdf.groupby(['Postcode','Borough'], as_index=False, sort=False).agg(",".join)
groupbyDF.shape

(103, 3)

In [4]:
# read CSV file containg geo spactial data
coordDF = pd.read_csv('http://cocl.us/Geospatial_data')  # doctest: +SKIP
coordDF.head()

coordDF=coordDF.rename(columns = {'Postal Code':'Postcode'})

df_merged = pd.merge(groupbyDF, coordDF, on='Postcode')
df_merged.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront,Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights,Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Not assigned,43.662301,-79.389494


In [None]:
import numpy as np # library to handle data in a vectorized manner

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Solving environment: done

# All requested packages already installed.

Solving environment: - 

In [6]:
address = 'Toronto, ON'

geolocator = Nominatim(user_agent="tr_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

# create map of New York using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df_merged['Latitude'], df_merged['Longitude'], df_merged['Borough'], df_merged['Neighbourhood\n']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [9]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [10]:
# Now we are ready to clean the json and structure it into a *pandas* dataframe.
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()


Unnamed: 0,name,categories,lat,lng
0,The Greater Good Bar,Bar,43.669409,-79.439267
1,Parallel,Middle Eastern Restaurant,43.669516,-79.438728
2,Happy Bakery & Pastries,Bakery,43.66705,-79.441791
3,Planet Fitness Toronto Galleria,Gym / Fitness Center,43.667588,-79.442574
4,Blood Brothers Brewing,Brewery,43.669944,-79.436533


In [11]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

16 venues were returned by Foursquare.


#### Let's create a function to repeat the same process to all the neighborhoods in Manhattan

In [141]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        #print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
        
            
        # make the GET request
        temp_results = requests.get(url).json()
        #print (temp_results)
        results = temp_results["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [143]:
# Now let's get vanues for each neighborhood in Tornoto and create a new dataframe called tornoto_venues.¶

toronto_data = df_merged
#print ('hello')
toronot_venues = getNearbyVenues(names=toronto_data['Neighbourhood\n'],
                                   latitudes=toronto_data['Latitude'],
                                   longitudes=toronto_data['Longitude']
                                  )
#print ('done')



Filter neignborhoods based on if they contain "asian restaurants" and NOT any "Indian" restaurnts.

In [144]:
venue_food = ['Indian Restaurant', 'Asian Restaurant', 'Afgan Restaurant', 'Chinese Restaurant','Ethiopian Restaurant', 'Korean Restaurant','Malay Restaurant','Thai Restaurant']

toronot_venues.rename(columns = {'Venue Category' :'VenueCategory'}, inplace=True)
toronto_food_venues = toronot_venues[toronot_venues['VenueCategory'].isin(venue_food)]
print (toronto_food_venues.shape[0], toronto_food_venues.shape[1])
toronto_food_venues.reset_index(inplace=True, drop=True)
#84 total rows

venue_indian_food = ['Indian Restaurant']
toronto_indian_food_venues = toronto_food_venues[toronto_food_venues['VenueCategory'].isin(venue_indian_food)]
print (toronto_indian_food_venues.shape[0], toronto_indian_food_venues.shape[1])
# Result: 15 neghborhoods with indian restaurants.
toronto_indian_food_venues.reset_index(inplace=True, drop=True)

#toronto_indian_food_venues.head(10)
# get rid of neighborhoods with indian restaurants
toronto_food_venues.is_copy = False

for i in range(0, toronto_indian_food_venues.shape[0]):
    for j in range(0, toronto_food_venues.shape[0]):
        #print(toronto_indian_food_venues.iloc[i].Neighborhood)
        if toronto_indian_food_venues.iloc[i].Neighborhood == toronto_food_venues.iloc[j].Neighborhood:
          #print('found a match')
          toronto_food_venues.loc[j,'Neighborhood'] = 'XXX'
print (toronto_food_venues.shape[0], toronto_food_venues.shape[1])

toronto_non_indian_food_venues = toronto_food_venues[toronto_food_venues['Neighborhood'].str.find('XXX') == -1]
toronto_non_indian_food_venues.reset_index(inplace=True, drop=True)

print (toronto_non_indian_food_venues.shape)
#print(toronto_non_indian_food_venues.head())


84 7
15 7


  object.__getattribute__(self, name)
  return object.__setattr__(self, name, value)


84 7
(48, 7)


In [145]:
# get rid of columns not needed for our analysis
try:
  toronto_non_indian_food_venues.drop('Venue Latitude', axis=1,inplace=True)
  toronto_non_indian_food_venues.drop('Venue Longitude', axis=1,inplace=True)
except:
  print('already dropped')

toronto_non_indian_food_venues = toronto_non_indian_food_venues.groupby(['Neighborhood','Neighborhood Latitude', 'Neighborhood Longitude']).count().reset_index()
#print (toronto_non_indian_food_venues.head())

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


This is what a map of Tornoto neighborhoods with asian but not Indian restaurants looks like 

In [135]:
address = 'Toronto, ON'

geolocator = Nominatim(user_agent="tr_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

# create map of New York using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, neighborhood in zip(toronto_non_indian_food_venues['Neighborhood Latitude'], toronto_non_indian_food_venues['Neighborhood Longitude'], toronto_non_indian_food_venues['Neighborhood']):
    label = '{}'.format(neighborhood)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [114]:
toronto_grouped = toronto_non_indian_food_venues
print (toronto_grouped.head(10))

                                    Neighborhood  Neighborhood Latitude  \
0  Bathurst Manor,Downsview North,Wilson Heights              43.754328   
1                                Bayview Village              43.786947   
2                                    Berczy Park              43.644771   
3                                      Cedarbrae              43.773136   
4        Chinatown,Grange Park,Kensington Market              43.653206   
5          Clarks Corners,Sullivan,Tam O'Shanter              43.781638   
6                  Commerce Court,Victoria Hotel              43.648198   
7                               Davisville North              43.712751   
8        Design Exchange,Toronto Dominion Centre              43.647177   
9                              Downsview Central              43.728496   

   Neighborhood Longitude  Venue  VenueCategory  
0              -79.442259      1              1  
1              -79.385975      1              1  
2              -79.37330

In [139]:
# set number of clusters
kclusters = 3
print (toronto_grouped.shape)

# add a dummy column because kmeans need at least two
toronto_grouped['Dummy'] = toronto_grouped['VenueCategory']
print (toronto_grouped.shape)

dfForClustering = toronto_grouped.loc[:, ['VenueCategory', 'Dummy']]


kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(dfForClustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:23] 
#neighborhoods_venues_sorted.head()

(24, 7)
(24, 7)


array([1, 1, 1, 1, 2, 0, 2, 1, 0, 1, 1, 2, 0, 1, 1, 0, 2, 1, 1, 0, 1, 1,
       0], dtype=int32)

In [140]:
# add clustering labels

try:
  #neighborhoods_venues_sorted.drop('Cluster Labels', axis=1,inplace=True)
  toronto_grouped.drop('Cluster Labels', axis=1,inplace=True)
except:
  print ('already dropped')

toronto_grouped.insert(0, 'Cluster Labels', kmeans.labels_)
print(toronto_grouped.head())

   Cluster Labels                                   Neighborhood  \
0               1  Bathurst Manor,Downsview North,Wilson Heights   
1               1                                Bayview Village   
2               1                                    Berczy Park   
3               1                                      Cedarbrae   
4               2        Chinatown,Grange Park,Kensington Market   

   Neighborhood Latitude  Neighborhood Longitude  Venue  VenueCategory  Dummy  
0              43.754328              -79.442259      1              1      1  
1              43.786947              -79.385975      1              1      1  
2              43.644771              -79.373306      1              1      1  
3              43.773136              -79.239476      1              1      1  
4              43.653206              -79.400049      5              5      5  


In [126]:
# create map

address = 'Toronto, ON'
geolocator = Nominatim(user_agent="tr_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of toront are {}, {}.'.format(latitude, longitude))

map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_grouped['Neighborhood Latitude'], toronto_grouped['Neighborhood Longitude'], toronto_grouped['Neighborhood'], toronto_grouped['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

The geograpical coordinate of toront are 43.653963, -79.387207.


In [128]:
    toronto_grouped.loc[toronto_grouped['Cluster Labels'] == 0, toronto_grouped.columns[[1] + list(range(5, toronto_grouped.shape[1]))]]

Unnamed: 0,Neighborhood,VenueCategory,Dummy
5,"Clarks Corners,Sullivan,Tam O'Shanter",2,2
8,"Design Exchange,Toronto Dominion Centre",3,3
12,"Flemingdon Park,Don Mills South",3,3
15,L'Amoreaux West,2,2
19,"Ryerson,Garden District",3,3
22,Studio District,2,2


In [129]:
    toronto_grouped.loc[toronto_grouped['Cluster Labels'] == 1, toronto_grouped.columns[[1] + list(range(5, toronto_grouped.shape[1]))]]

Unnamed: 0,Neighborhood,VenueCategory,Dummy
0,"Bathurst Manor,Downsview North,Wilson Heights",1,1
1,Bayview Village,1,1
2,Berczy Park,1,1
3,Cedarbrae,1,1
7,Davisville North,1,1
9,Downsview Central,1,1
10,"Fairview,Henry Farm,Oriole",1,1
13,"Harbord,University of Toronto",1,1
14,"High Park,The Junction South",1,1
17,North Toronto West,1,1


In [130]:
    toronto_grouped.loc[toronto_grouped['Cluster Labels'] == 2, toronto_grouped.columns[[1] + list(range(5, toronto_grouped.shape[1]))]]

Unnamed: 0,Neighborhood,VenueCategory,Dummy
4,"Chinatown,Grange Park,Kensington Market",5,5
6,"Commerce Court,Victoria Hotel",4,4
11,"First Canadian Place,Underground city",5,5
16,"Little Portugal,Trinity",5,5
