# Segmenting and Clustering Neighborhoods in Toronto

### Importing Libraries

In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import json
!pip install geopy
!pip install folium
from geopy.geocoders import Nominatim 
import requests 
from pandas.io.json import json_normalize 
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
import os
import folium
from bs4 import BeautifulSoup
print('Libraries imported.')

Libraries imported.


### Importing the data using BeautifulSoup to scrape the data from the wiki page

In [2]:
source = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M").text
soup = BeautifulSoup(source, 'lxml')

table = soup.find("table")
table_rows = table.tbody.find_all("tr")

### Creating the empty dataframe to sort the data into, and formatting based on rubric

In [3]:
df = []
for tr in table_rows:
    td = tr.find_all("td")
    row = [tr.text for tr in td]
    
    if row != [] and row[1] != "Not assigned\n":
        if "Not assigned\n" in row[2]: 
            row[2] = row[1]
        df.append(row)
        
df2 = pd.DataFrame(df, columns = ["PostalCode", "Borough", "Neighborhood"])

### Cleaning up the data and grouping the data by neighborhood

In [4]:
df2['Neighborhood'] = df2['Neighborhood'].str.replace('\n',"")
df2['PostalCode'] = df2['PostalCode'].str.replace('\n',"")
df2['Borough'] = df2['Borough'].str.replace('\n',"")

df2 = df2.groupby(['PostalCode','Borough'])['Neighborhood'].apply(", ".join).reset_index()

### Displaying the first 5 rows of the dataframe

In [15]:
df2

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park"
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge"
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


### Display the number of rows using the shape method

In [6]:
df2.shape[0]

103

### Opening the CVS File with the Longitude/Latitude Data and saving as df

In [7]:
df_gsp = pd.read_csv(r'http://cocl.us/Geospatial_data')

### Merging the 2 dataframes

In [8]:
df_toronto = pd.merge(df2, df_gsp, left_on ='PostalCode',right_on = 'Postal Code')
df_toronto = df_toronto[["PostalCode", "Borough", "Neighborhood", "Latitude", "Longitude"]]
df_toronto.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


### Exploring the Toronto Data

In [9]:
address = 'Toronto, ON'
geolocator = Nominatim(user_agent = 'toronto_explorer')
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print("The latitude and longitude of Toronto are {},{}".format(latitude,longitude))

The latitude and longitude of Toronto are 43.6534817,-79.3839347


### Setting up Folium and Adding Markers to Map

In [10]:
# create map of Manhattan using latitude and longitude values
map1 = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(df_toronto['Latitude'], df_toronto['Longitude'], df_toronto['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map1)  
    
map1

### Mapping only places with toronto in it

In [11]:
df_toronto_new = df_toronto[df_toronto['Neighborhood'].str.contains('Lawrenceburg', "Davisville")].reset_index(drop=True)
df_toronto_new.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude


### Adding New Markers to Map

In [12]:
for lat, lng, borough, neighborhood in zip(
        df_toronto_new['Latitude'], 
        df_toronto_new['Longitude'], 
        df_toronto_new['Borough'], 
        df_toronto_new['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color = 'green',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map1)  
map1

### Foursquare Credentials

In [13]:
CLIENT_ID = 'RWBQAFG4PNX4L0UMTEXSLTGXNYJ0CWCJBQ5GBPVGCLI0RGAH'
CLIENT_SECRET = '3EH3TN05DZIHUXSEOD0XUCUNIX5BDRPHET3VSLYL2BFSNO0J'
VERSION = '20201203'

### Exploring First Neighborhood

In [14]:
n_name = df_toronto_new.loc[0,'Neighborhood']
print(f"The first neighborhood's name is '{n_name}'.")

n_latitude = df_toronto_new.loc[0,'Latitude']
n_longitude = df_toronto_new.loc[0,'Longitude']
print('Latitude and longitude values of {} are {}, {}.'.format(n_name, n_latitude, n_longitude))


KeyError: 0

### Exploring the top 100 venues within 500 meters of The Beaches and getting results

In [None]:
LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 500 # define radius
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    n_latitude, 
    n_longitude, 
    radius, 
    LIMIT)
# get the result to a json file
results = requests.get(url).json()
results

### Extracting the Categories, Cleaning up Results, Show Results

In [None]:
#extracting the categories
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']
    
venues = results['response']['groups'][0]['items']
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues

### Exploring neighborhoods in parts of toronto city

In [None]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    venues_list=[]
    
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

### Creating a new Data Frame for the new neighborhoods

In [None]:
toronto_new_nhoods = getNearbyVenues(names=df_toronto_new['Neighborhood'],
                                   latitudes=df_toronto_new['Latitude'],
                                   longitudes=df_toronto_new['Longitude']
                                  )

### Check the size of the resulting dataframe

In [None]:
print(toronto_new_nhoods.shape)
toronto_new_nhoods.head()

### Check how many venues were returned for each neighborhood

In [None]:
toronto_new_nhoods.groupby('Neighborhood').count()

### Grouping the Neighborhoods and counting the number of Unique Venues and Calculating number of Unique Categories

In [None]:
print("There are {} unique categories.".format(len(toronto_new_nhoods['Venue Category'].unique())))

## Analyze each neighborhood

In [None]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_new_nhoods[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_new_nhoods['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

### Examine New Dataframe Size

In [None]:
toronto_onehot.shape

### Group rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [None]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

### Confirm new size

In [None]:
toronto_grouped.shape

### Let's print each neighborhood along with the top 5 most common venues

In [None]:
num_top_venues = 10

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['Venue','Freq']
    temp = temp.iloc[1:]
    temp['Freq'] = temp['Freq'].astype(float)
    temp = temp.round({'Freq': 2})
    print(temp.sort_values('Freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

### Put this Data Into New Pandas DF

In [None]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

### Now let's create the new dataframe and display the top 10 venues for each neighborhood.

In [None]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted

### Running KMeans Clustering

In [None]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

### Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

In [None]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = df_toronto_new

# merge manhattan_grouped with manhattan_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head() # check the last columns!

#### Visualize the resulting clusters

In [None]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

## Examine Clusters

### determine the discriminating venue categories that distinguish each cluster. 

In [None]:
### Cluster 1

In [None]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

In [None]:
###Cluster 2

In [None]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

In [None]:
###Cluster 3

In [None]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

In [None]:
### Cluster 4

In [None]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

In [None]:
###Cluster 5

In [None]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 4, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

# DONE! 