# Importing required packages

In [178]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests

# Pulling raw data from web, using beautiful soup to organize

In [179]:
wikipedia_link='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
raw_data= requests.get(wikipedia_link).text
data = BeautifulSoup(raw_data,'xml')


# Creating lists using 'for' iteration, 'find', and 'count'

In [180]:
#extracting the raw table inside that webpage
table = data.find('table')

Postcode      = []
Borough       = []
Neighbourhood = []




for tr_cell in table.find_all('tr'):
    counter = 1
    Postcode_var      = -1
    Borough_var       = -1
    Neighbourhood_var = -1
    
    for td_cell in tr_cell.find_all('td'):
        if counter == 1: 
            Postcode_var = td_cell.text
            Postcode.append(Postcode_var)
        if counter == 2: 
            Borough_var = td_cell.text
            tag_a_Borough = td_cell.find('a')
            Borough.append(Borough_var)
        if counter == 3: 
            Neighbourhood_var = str(td_cell.text).strip()
            tag_a_Neighbourhood = td_cell.find('a')
            Neighbourhood.append(Neighbourhood_var)
        counter +=1
        

        

        
    
    
    

# Creating unique Postcode list

In [181]:

unique_postcode = set(Postcode)
print('Unique Postal codes:', len(unique_postcode))
Postcode_unique      = []
Borough_unique       = []
Neighbourhood_unique = []


for postcode_unique_element in unique_postcode:
    post_var = ''; bor_var = ''; neigh_var = ''; 
    for postcode_index, postcode_element in enumerate(Postcode):
        if postcode_unique_element == postcode_element:
            post_var = postcode_element;
            bor_var = Borough[postcode_index]
            if neigh_var == '': 
                neigh_var = Neighbourhood[postcode_index]
            else:
                neigh_var = neigh_var + ',' + Neighbourhood[postcode_index]
    Postcode_unique.append(post_var)
    Borough_unique.append(bor_var)
    Neighbourhood_unique.append(neigh_var)

Unique Postal codes: 180


# Creating dictionary and pandas dataframe

In [182]:

toronto_dict = {'Postcode':Postcode_unique, 'Borough':Borough_unique, 'Neighbourhood':Neighbourhood_unique}
df = pd.DataFrame.from_dict(toronto_dict)
df.to_csv('toronto_part1.csv')
df.shape

(180, 3)

# Refining dataframe

In [183]:
df = df.applymap(lambda x: x.replace('\n',''))
df = df[df['Borough'] != 'Not assigned']

In [184]:
df.Neighbourhood = df.Neighbourhood.apply(lambda x: x.replace('/',','))

In [185]:
df.shape

(103, 3)

In [186]:
df

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M9N,York,Weston
4,M1W,Scarborough,"Steeles West , L'Amoreaux West"
8,M4E,East Toronto,The Beaches
9,M5G,Downtown Toronto,Central Bay Street
10,M6K,West Toronto,"Brockton , Parkdale Village , Exhibition Place"
...,...,...,...
169,M6P,West Toronto,"High Park , The Junction South"
171,M7Y,East Toronto,Business reply mail Processing CentrE
173,M5V,Downtown Toronto,"CN Tower , King and Spadina , Railway Lands , ..."
177,M3H,North York,"Bathurst Manor , Wilson Heights , Downsview North"


# Downloading Latitude and Longitude csv

In [187]:
import wget
wget.download('http://cocl.us/Geospatial_data')

'Geospatial_Coordinates (2).csv'

# Reading CSV

In [188]:
lat_lon = pd.read_csv('Geospatial_Coordinates.csv')
lat_lon.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


# Merging Latitude and Longitude with original DF

In [189]:
df = pd.merge(df,lat_lon, how = 'left', left_on ='Postcode', right_on = 'Postal Code', validate = '1:1')


In [190]:
df = df.drop('Postal Code', axis = 1)
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M9N,York,Weston,43.706876,-79.518188
1,M1W,Scarborough,"Steeles West , L'Amoreaux West",43.799525,-79.318389
2,M4E,East Toronto,The Beaches,43.676357,-79.293031
3,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
4,M6K,West Toronto,"Brockton , Parkdale Village , Exhibition Place",43.636847,-79.428191


# Downloading Folium

In [191]:
!conda install -c conda-forge folium=0.5.0 --yes

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.



# Retrieving general Toronto coordinates

In [192]:
from geopy.geocoders import Nominatim

address = 'Toronto, Canada'

geolocator = Nominatim(user_agent="pete")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


# Generating non-clustered map

In [198]:
import folium


map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)


for lat, lng, borough, neighborhood in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

# Enabling foursquare API

In [201]:
CLIENT_ID = 'CCBMXEF4ITVUNFRDKXYHAMX5F55TLG3DCMYIE3DH44AQBCY' # your Foursquare ID
CLIENT_SECRET = 'UXRV1ZBEK1DCQZIZIDC3L1YMHNNWQIVZC4XVISBTKKZAKTW' # your Foursquare Secret
VERSION = '20180604'
LIMIT = 30
print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: CCBMXEF4ITVUNF0RDKXYHAMX5F55TLG3DCMYIE3DH44AQBCY
CLIENT_SECRET:UXRV1ZBEKD1DCQZIZIDC3L1YMHNNWQIVZC4XVISBTKKZAKTW


# Creating function to call all Postcodes

### I used postcodes instead of Neighborhood because of duplicate names in original dataset, Postcodes were all unique

In [202]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

# API request

In [226]:
# type your answer here

toronto_venues = getNearbyVenues(names= df['Postcode'],
                                   latitudes= df['Latitude'],
                                   longitudes= df['Longitude']
                                  )



M9N
M1W
M4E
M5G
M6K
M3L
M4R
M4S
M7A
M1J
M5M
M5L
M7R
M4T
M4A
M6J
M9B
M1C
M2H
M5T
M4M
M2J
M1B
M5E
M5A
M5P
M4G
M8Z
M2R
M6G
M9V
M5H
M9P
M1G
M3B
M6R
M6M
M1K
M3A
M1S
M9R
M1X
M5K
M2N
M4J
M3C
M9C
M6A
M6S
M5J
M1V
M2M
M4L
M1T
M3N
M4B
M3J
M6L
M2P
M4H
M2L
M1P
M5R
M1L
M1R
M3M
M9L
M5C
M4V
M5B
M6H
M8V
M5W
M5N
M4N
M5S
M3K
M4W
M4K
M4Y
M1E
M9W
M8Y
M9M
M8X
M8W
M6C
M1H
M2K
M4P
M4C
M4X
M5X
M6N
M6B
M9A
M1N
M6E
M6P
M7Y
M5V
M3H
M1M


### it should be noted that the API could not pull 3 of the Lat, Lon for unique Postcodes - so the columns have been reduced by 3

In [242]:
toronto_venues.Neighborhood.unique().shape

(100,)

### Unique venues

In [229]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 228 uniques categories.


# Creating numerical dataset for KMeans clustering

In [244]:

toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")


toronto_onehot['Postcode'] = toronto_venues['Neighborhood'] 


toronto_onehot.shape

(1345, 229)

In [245]:
toronto_grouped = toronto_onehot.groupby('Postcode').mean().reset_index()

In [246]:
toronto_grouped.head()

Unnamed: 0,Postcode,Accessories Store,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Aquarium,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,M1B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,M1C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,M1E,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,M1G,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,M1H,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Creating function for sorting top venues

In [248]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [249]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']


columns = ['Postcode']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))


postcode_venues_sorted = pd.DataFrame(columns=columns)
postcode_venues_sorted['Postcode'] = toronto_grouped['Postcode']

for ind in np.arange(toronto_grouped.shape[0]):
    postcode_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

postcode_venues_sorted.head()

Unnamed: 0,Postcode,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M1B,Fast Food Restaurant,Yoga Studio,Dance Studio,Eastern European Restaurant,Drugstore,Donut Shop,Dog Run,Distribution Center,Discount Store,Diner
1,M1C,Bar,History Museum,Yoga Studio,Dance Studio,Eastern European Restaurant,Drugstore,Donut Shop,Dog Run,Distribution Center,Discount Store
2,M1E,Electronics Store,Breakfast Spot,Intersection,Bank,Mexican Restaurant,Rental Car Location,Medical Center,Department Store,Drugstore,Donut Shop
3,M1G,Coffee Shop,Insurance Office,Korean Restaurant,Deli / Bodega,Electronics Store,Eastern European Restaurant,Drugstore,Donut Shop,Dog Run,Distribution Center
4,M1H,Bakery,Bank,Athletics & Sports,Thai Restaurant,Gas Station,Hakka Restaurant,Fried Chicken Joint,Caribbean Restaurant,Dessert Shop,Department Store


# KMeans Clustering of data based off of venue types 

In [300]:
from sklearn.cluster import KMeans

kclusters = 4

toronto_grouped_clustering = toronto_grouped.drop('Postcode', 1)

kmeans = KMeans(n_clusters=kclusters, random_state=1000).fit(toronto_grouped_clustering)


kmeans.labels_[0:10] 

array([1, 0, 0, 0, 0, 3, 0, 0, 0, 0])

### 4 clusters captured much of the variety, as many of the Postcodes fell into 2 large clusters

In [332]:
postcode_venues_sorted.head()

Unnamed: 0,Cluster Labels,Postcode,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,1,M1B,Fast Food Restaurant,Yoga Studio,Dance Studio,Eastern European Restaurant,Drugstore,Donut Shop,Dog Run,Distribution Center,Discount Store,Diner
1,0,M1C,Bar,History Museum,Yoga Studio,Dance Studio,Eastern European Restaurant,Drugstore,Donut Shop,Dog Run,Distribution Center,Discount Store
2,0,M1E,Electronics Store,Breakfast Spot,Intersection,Bank,Mexican Restaurant,Rental Car Location,Medical Center,Department Store,Drugstore,Donut Shop
3,0,M1G,Coffee Shop,Insurance Office,Korean Restaurant,Deli / Bodega,Electronics Store,Eastern European Restaurant,Drugstore,Donut Shop,Dog Run,Distribution Center
4,0,M1H,Bakery,Bank,Athletics & Sports,Thai Restaurant,Gas Station,Hakka Restaurant,Fried Chicken Joint,Caribbean Restaurant,Dessert Shop,Department Store


### Joining cluster labels and top 10 Common Venues with the original dataset (df), using Postcode

In [302]:

#postcode_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)
postcode_venues_sorted['Cluster Labels'] = kmeans.labels_

df_merged = df


df_merged = df_merged.join(postcode_venues_sorted.set_index('Postcode'), on='Postcode')

df_merged.head() 

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M9N,York,Weston,43.706876,-79.518188,2.0,Park,Convenience Store,Curling Ice,Drugstore,Donut Shop,Dog Run,Distribution Center,Discount Store,Diner,Dim Sum Restaurant
1,M1W,Scarborough,"Steeles West , L'Amoreaux West",43.799525,-79.318389,0.0,Fast Food Restaurant,Chinese Restaurant,Coffee Shop,Pharmacy,Discount Store,Sandwich Place,Supermarket,Bank,Pizza Place,Breakfast Spot
2,M4E,East Toronto,The Beaches,43.676357,-79.293031,0.0,Trail,Park,Neighborhood,Pub,Health Food Store,Yoga Studio,Dance Studio,Deli / Bodega,Department Store,Dim Sum Restaurant
3,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383,0.0,Coffee Shop,Café,Italian Restaurant,Yoga Studio,Sandwich Place,Bubble Tea Shop,Spa,Ice Cream Shop,Bar,Hotel
4,M6K,West Toronto,"Brockton , Parkdale Village , Exhibition Place",43.636847,-79.428191,0.0,Café,Breakfast Spot,Nightclub,Coffee Shop,Stadium,Bar,Bakery,Restaurant,Intersection,Convenience Store


### Filled NA's (caused by API not returning values for Postcodes) and transforming columns to Integer 

In [304]:
df_merged['Cluster Labels'].fillna(0, inplace = True)
df_merged['Cluster Labels'] = df_merged['Cluster Labels'].astype(int) 

In [305]:
import matplotlib.cm as cm
import matplotlib.colors as colors


map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)


x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]


markers_colors = []
for lat, lon, poi, cluster in zip(df_merged['Latitude'], df_merged['Longitude'], df_merged['Postcode'], df_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

### Cluster 0 and Cluster 2 are the largest Postcodes

# Cluster Analysis

### You can see that Cluster_0 consists of coffee shops and cafe's - 28 out of  49 cluster points

In [331]:
cluster_0 = df_merged.loc[df_merged['Cluster Labels'] == 0, df_merged.columns[[0] + list(range(5, df_merged.shape[1]))]]


cluster_0.iloc[:,2].value_counts()

Coffee Shop                  15
Café                         13
Pizza Place                   8
Grocery Store                 4
Restaurant                    2
Fast Food Restaurant          2
Bar                           2
Indian Restaurant             2
Clothing Store                2
Baseball Field                2
Italian Restaurant            2
Auto Garage                   1
Light Rail Station            1
Garden                        1
Dance Studio                  1
Bakery                        1
Pub                           1
Liquor Store                  1
Drugstore                     1
Japanese Restaurant           1
Electronics Store             1
Food Truck                    1
Dessert Shop                  1
Sporting Goods Shop           1
Plaza                         1
Golf Course                   1
Airport Service               1
Latin American Restaurant     1
Pharmacy                      1
American Restaurant           1
Greek Restaurant              1
Trail   

### You can see that Cluster_2 is entirely park areas

In [329]:
cluster_2 = df_merged.loc[df_merged['Cluster Labels'] == 2, df_merged.columns[[0] + list(range(5, df_merged.shape[1]))]]
cluster_2.iloc[:,2].value_counts()

Park    12
Name: 1st Most Common Venue, dtype: int64