In [1]:
# Load the raw postal-code table from 'Postalcode.csv' into `df`.
import pandas 
import numpy as np
df = pandas.read_csv('Postalcode.csv')
# Preview the first rows only; nothing is removed here. Rows marked
# 'Not assigned' are dropped/filled by the cells that follow.
print(df.head())


  Postcode           Borough     Neighbourhood
0      M1A      Not assigned      Not assigned
1      M2A      Not assigned      Not assigned
2      M3A        North York         Parkwoods
3      M4A        North York  Victoria Village
4      M5A  Downtown Toronto      Harbourfront


In [2]:
# Remove, in place, every row whose Borough is still 'Not assigned'.
indexNames = df.index[df['Borough'] == 'Not assigned']
df.drop(index=indexNames, inplace=True)

In [3]:
# Check the number of rows to confirm the 'Not assigned' borough rows were deleted.
df.shape

(211, 3)

In [4]:
print(df.head(9))

   Postcode           Borough     Neighbourhood
2       M3A        North York         Parkwoods
3       M4A        North York  Victoria Village
4       M5A  Downtown Toronto      Harbourfront
5       M5A  Downtown Toronto       Regent Park
6       M6A        North York  Lawrence Heights
7       M6A        North York    Lawrence Manor
8       M7A      Queen's Park      Not assigned
10      M9A         Etobicoke  Islington Avenue
11      M1B       Scarborough             Rouge


In [5]:
# A neighbourhood that is 'Not assigned' takes the name of its borough;
# every other neighbourhood is left untouched.
df['Neighbourhood'] = df['Neighbourhood'].mask(
    df['Neighbourhood'] == 'Not assigned',
    df['Borough'],
)
print(df.head(9))

   Postcode           Borough     Neighbourhood
2       M3A        North York         Parkwoods
3       M4A        North York  Victoria Village
4       M5A  Downtown Toronto      Harbourfront
5       M5A  Downtown Toronto       Regent Park
6       M6A        North York  Lawrence Heights
7       M6A        North York    Lawrence Manor
8       M7A      Queen's Park      Queen's Park
10      M9A         Etobicoke  Islington Avenue
11      M1B       Scarborough             Rouge


In [6]:
#Joining Neighbourhoods for the same postalcode
# Result: one row per (Postcode, Borough), with that postcode's neighbourhood
# names collected and joined into a single comma-separated string.
df = (
    df.groupby(['Postcode','Borough'])['Neighbourhood']
      .apply(list)
      .str.join(', ')
      # The shuffle from the original analysis is kept, but it is now seeded:
      # later cells look rows up by label (e.g. neighbors.loc[100, ...]), so
      # the row order has to be the same on every run.
      .sample(frac=1, random_state=0)
      .reset_index()
)
df.head(20)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M4J,East York,East Toronto
1,M4A,North York,Victoria Village
2,M6R,West Toronto,"Parkdale, Roncesvalles"
3,M3M,North York,Downsview Central
4,M9M,North York,"Emery, Humberlea"
5,M9B,Etobicoke,"Cloverdale, Islington, Martin Grove, Princess ..."
6,M5A,Downtown Toronto,"Harbourfront, Regent Park"
7,M5C,Downtown Toronto,St. James Town
8,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ..."
9,M1P,Scarborough,"Dorset Park, Scarborough Town Centre, Wexford ..."


In [7]:
# Report how many distinct values each of the three columns holds.
for label, column in (
    ('Unique Postal Codes = ', 'Postcode'),
    ('Unique Boroughs = ', 'Borough'),
    ('Unique Neighborhoods = ', 'Neighbourhood'),
):
    print(label, len(df[column].unique()))

Unique Postal Codes =  103
Unique Boroughs =  11
Unique Neighborhoods =  103


In [8]:
# Load the coordinates file and rename its 'Postal Code' column to 'Postcode'
# so it shares a key column with the postal-code table for the merge below.
# The last line previews the first 5 rows.
GC = pandas.read_csv('GeoCord.csv').rename(columns={'Postal Code': 'Postcode'})
GC.head()


Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [9]:
##Merging both files on column Postcode
# `pd` is the alias every later cell uses for pandas.
pd = pandas 
# Left join: keep exactly the rows of the postal-code table and attach their
# coordinates. The previous right join kept every row of the coordinates file
# instead, so a postcode present only there would appear with a missing
# Borough, which contradicts the "same number of lines as the postal-code file"
# check below and breaks the later `.str.contains("Toronto")` filter.
result = pd.merge(df, GC, how='left', on='Postcode')



In [10]:
#printing first 20 lines to doublecheck the new file
result.head(20)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M4J,East York,East Toronto,43.685347,-79.338106
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M6R,West Toronto,"Parkdale, Roncesvalles",43.64896,-79.456325
3,M3M,North York,Downsview Central,43.728496,-79.495697
4,M9M,North York,"Emery, Humberlea",43.724766,-79.532242
5,M9B,Etobicoke,"Cloverdale, Islington, Martin Grove, Princess ...",43.650943,-79.554724
6,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
7,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
8,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ...",43.739416,-79.588437
9,M1P,Scarborough,"Dorset Park, Scarborough Town Centre, Wexford ...",43.75741,-79.273304


In [11]:
# Confirm the merge: the merged table should have the same number of rows as
# the grouped postal-code table. The shape is printed explicitly because only
# a cell's last expression is displayed automatically.
print(result.shape)

# Save the merged data for later reference. index=False keeps the row index
# out of the file, so reading it back does not create an 'Unnamed: 0' column.
result.to_csv('neighbors_processed.csv', index=False)

In [12]:
# Keep only the rows whose borough name contains "Toronto"
# (the equivalent of SQL: Borough LIKE '%Toronto%').
# na=False makes a missing borough count as "no match"; without it a NaN ends
# up in the boolean mask and pandas refuses to index with it.
neighborhoods = result[result['Borough'].str.contains("Toronto", na=False)]
neighborhoods


Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
2,M6R,West Toronto,"Parkdale, Roncesvalles",43.64896,-79.456325
6,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
7,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
12,M5T,Downtown Toronto,"Chinatown, Grange Park, Kensington Market",43.653206,-79.400049
15,M4M,East Toronto,Studio District,43.659526,-79.340923
16,M6H,West Toronto,"Dovercourt Village, Dufferin",43.669005,-79.442259
21,M6G,Downtown Toronto,Christie,43.669542,-79.422564
27,M4X,Downtown Toronto,"Cabbagetown, St. James Town",43.667967,-79.367675
28,M6P,West Toronto,"High Park, The Junction South",43.661608,-79.464763
32,M5R,Central Toronto,"The Annex, North Midtown, Yorkville",43.67271,-79.405678


In [13]:
# Summarise the Toronto-only subset: distinct boroughs and number of rows.
borough_total = neighborhoods['Borough'].unique().size
row_total = len(neighborhoods)
summary = 'The dataframe has {} boroughs and {} neighborhoods.'
print(summary.format(borough_total, row_total))

The dataframe has 4 boroughs and 38 neighborhoods.


In [14]:

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors
from IPython.display import Image 
from IPython.core.display import HTML 
import matplotlib.pyplot as plt
from folium import plugins
import seaborn as sns
%matplotlib inline

##import Nominatim
# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library


print('Libraries imported.')
# set number of clusters



Solving environment: done


  current version: 4.5.11
  latest version: 4.7.12

Please update conda by running

    $ conda update -n base -c defaults conda



## Package Plan ##

  environment location: /home/jupyterlab/conda/envs/python

  added / updated specs: 
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    pandas-0.25.3              |   py36hb3f55d8_0        11.4 MB  conda-forge
    tbb4py-2019.9              |   py36hc9558a2_0         245 KB  conda-forge
    openssl-1.1.1d             |       h516909a_0         2.1 MB  conda-forge
    ------------------------------------------------------------
                                           Total:        13.8 MB

The following NEW packages will be INSTALLED:

    tbb:     2019.9-hc9558a2_0     conda-forge
    tbb4py:  2019.9-py36hc9558a2_0 conda-forge

The following packages will be UPDATED:

    openssl: 1.1.1c-h

In [15]:
# Install geopy from conda-forge (shell command run from the notebook), then
# import its Nominatim geocoder, which is used in the next cell.
!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values


Solving environment: done


  current version: 4.5.11
  latest version: 4.7.12

Please update conda by running

    $ conda update -n base -c defaults conda



## Package Plan ##

  environment location: /home/jupyterlab/conda/envs/python

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    geopy-1.20.0               |             py_0          57 KB  conda-forge
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    ------------------------------------------------------------
                                           Total:          91 KB

The following NEW packages will be INSTALLED:

    geographiclib: 1.50-py_0   conda-forge
    geopy:         1.20.0-py_0 conda-forge


Downloading and Extracting Packages
geopy-1.20.0         | 57 KB     | ##################################### | 100% 
geographiclib-1.50   | 34 KB     | ##

In [16]:
# Look up the coordinates of Toronto; `latitude` / `longitude` are used to
# centre every folium map created further down.
address = 'Toronto, ON'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
if location is None:
    # geocode() returns None when the service finds no match. Fall back to the
    # coordinates recorded from a previous successful lookup instead of
    # failing with AttributeError on `None.latitude`.
    latitude = 43.653963
    longitude = -79.387207
else:
    latitude = location.latitude
    longitude = location.longitude

print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [17]:

# Map centred on the geocoded Toronto coordinates.
toronto_centre = [latitude, longitude]
map_toronto = folium.Map(location=toronto_centre, zoom_start=10)

# Read the processed table back from disk and keep only the five data columns
# (this discards any extra index column the CSV may contain).
data_columns = ['Postcode', 'Borough', 'Neighbourhood', 'Latitude', 'Longitude']
neighbors = pd.read_csv('neighbors_processed.csv')[data_columns]

neighbors.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M4J,East York,East Toronto,43.685347,-79.338106
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M6R,West Toronto,"Parkdale, Roncesvalles",43.64896,-79.456325
3,M3M,North York,Downsview Central,43.728496,-79.495697
4,M9M,North York,"Emery, Humberlea",43.724766,-79.532242


In [18]:
neighbors.shape

(103, 5)

In [19]:
# Base map centred on the Toronto coordinates found above; the next cell adds
# one marker per row of `neighbors` to this same `map_To` object.
map_To = folium.Map(location=[latitude, longitude], zoom_start=10)
map_To

In [None]:
# Adds one circle marker per row of `neighbors` to the `map_To` map created in
# the previous cell, then displays the map.
from IPython.display import HTML, display
# add markers to map
for lat, lng, borough, neighborhood in zip(neighbors['Latitude'], neighbors['Longitude'], neighbors['Borough'], neighbors['Neighbourhood']):
    # Popup text is "<neighbourhood(s)>, <borough>".
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    # NOTE(review): `parse_html=False` below is passed to CircleMarker rather
    # than to Popup; confirm CircleMarker actually accepts/uses this option.
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='gray',
        fill_opacity=0.7,
        parse_html=False).add_to(map_To)  
    
map_To

In [None]:
# Checking how many boroughs and neighborhoods we have now.
# `neighborhoods` is the Toronto-only subset filtered from `result` in an
# earlier cell. Each of its rows is one postcode (neighbourhood names were
# joined per postcode), so the second number counts postcode rows.
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
        len(neighborhoods['Borough'].unique()),
        neighborhoods.shape[0]
    )
)

In [None]:
# Install lxml; presumably needed as the HTML parser behind pd.read_html in the next cell (confirm).
pip install lxml

In [None]:
#for Week 5 analysis get Wikipedia link of the data
from io import StringIO

table_url2 = 'https://en.wikipedia.org/wiki/Demographics_of_Toronto_neighbourhoods'
# A timeout stops the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces an HTTP error instead of parsing an error page.
url_req2 = requests.get(table_url2, timeout=30)
url_req2.raise_for_status()

# read_html returns one DataFrame per <table> found in the page. The HTML is
# wrapped in StringIO because newer pandas deprecates passing literal HTML.
demographics_tables = pd.read_html(StringIO(url_req2.text), header=0)
# The table at position 1 is the one used by the rest of the notebook.
toronto_demographics = demographics_tables[1]
toronto_demographics.head()

In [None]:
#Extract neighbourhood name, population, density and average income
# .copy() makes the column subset an independent frame, so modifying it later
# (it is renamed in place further down) cannot raise SettingWithCopyWarning.
toronto_demographics = toronto_demographics[['Name','Population', 'Density (people/km2)','Average Income']].copy()
toronto_demographics.head()

In [None]:
# Give the key column the same name as in the postcode tables
# ('Name' -> 'Neighbourhood') so the frames can be merged on it.
toronto_demographics = toronto_demographics.rename(columns={'Name': 'Neighbourhood'})
toronto_demographics.head()

In [None]:
toronto_demographics.shape

In [None]:
#foursquare version
import os
from getpass import getpass

# Credentials are read from the environment (FOURSQUARE_CLIENT_ID /
# FOURSQUARE_CLIENT_SECRET) or prompted for, instead of being committed in the
# notebook. The key pair that used to be hard-coded here has been exposed and
# should be revoked and replaced.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID') or getpass('Foursquare client ID: ') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET') or getpass('Foursquare client secret: ') # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

# The secret is deliberately not echoed: cell outputs are saved with the notebook.
print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET: (hidden)')

In [None]:
neighbors.loc[100, 'Neighbourhood']

In [None]:
neighbors

In [None]:
# Pick a single row of `neighbors` (row label 100) to try the Foursquare
# request on, and report its name and coordinates.
sample_row = 100

neighborhood_name = neighbors.at[sample_row, 'Neighbourhood'] # neighborhood name
neighborhood_latitude = neighbors.at[sample_row, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = neighbors.at[sample_row, 'Longitude'] # neighborhood longitude value

message = 'Latitude and longitude values of {} are {}, {}.'
print(message.format(neighborhood_name, neighborhood_latitude, neighborhood_longitude))

In [None]:

LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 500 # define radius

# Explore request around the sample neighbourhood chosen in the previous cell.
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)

# A timeout prevents the cell from hanging indefinitely, and raise_for_status()
# reports an HTTP error (e.g. rejected credentials) here rather than as a
# KeyError when the response is unpacked in a later cell.
response = requests.get(url, timeout=30)
response.raise_for_status()
results = response.json()
results


In [None]:

# function that extracts the category of the venue
def get_category_type(row):
    """Return the name of the first category listed for a venue.

    Parameters
    ----------
    row : mapping-like (e.g. a dict or a DataFrame row)
        Must provide the category list under the key 'categories' or, when
        that key is absent, under 'venue.categories'. Each list entry is a
        mapping with a 'name' key.

    Returns
    -------
    The 'name' of the first category, or None when the list is empty or None.

    Raises
    ------
    KeyError
        If neither 'categories' nor 'venue.categories' is present.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # Only a missing key triggers the fallback; any other error is a real
        # problem and is no longer swallowed by a bare `except`.
        categories_list = row['venue.categories']

    if not categories_list:
        return None
    return categories_list[0]['name']


In [None]:
# Venue items returned by the explore request for the sample neighbourhood.
venues = results['response']['groups'][0]['items']

# flatten the nested JSON and keep only the columns of interest
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues = json_normalize(venues).loc[:, filtered_columns]

# replace each venue's category list with the name of its first category
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# keep only the last dotted component of every column name
nearby_venues.columns = [name.rsplit(".", 1)[-1] for name in nearby_venues.columns]

nearby_venues.head()

In [None]:

# Rows of `neighbors` whose borough name contains "Toronto", re-indexed from 0.
# na=False treats a missing borough as "no match" so the boolean mask never
# contains NaN (which pandas cannot index with).
toronto_data = neighbors[neighbors['Borough'].str.contains("Toronto", na=False)].reset_index(drop=True)
print(toronto_data.shape)
toronto_data.head()

In [None]:

#toronto_venues.head()

In [None]:
#toronto_venues.groupby('Neighbourhood').count()

In [None]:

from IPython.display import display

# create map of Toronto using latitude and longitude values
map_toronto_data = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to new toronto_data map
for lat, lng, borough, neighborhood in zip(toronto_data['Latitude'], toronto_data['Longitude'], toronto_data['Borough'], toronto_data['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto_data)  

# Only the last expression of a cell is rendered automatically, so the map
# (which used to be a bare mid-cell expression and was never shown) is
# displayed explicitly.
display(map_toronto_data)

LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 500 # define radius

url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)
# The URL is intentionally not displayed: it embeds the client secret and cell
# outputs are saved with the notebook.

# Timeout + status check: fail fast and with a clear HTTP error instead of
# hanging or handing an error payload to the cells below.
response = requests.get(url, timeout=30)
response.raise_for_status()
results = response.json()
results

In [None]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Query the Foursquare explore endpoint around each location and tabulate the venues.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Consumed in lockstep (zip); one API request is made per (name, lat, lng).
    radius : int, default 500
        Sent as the `radius` query parameter of each request.

    The request also uses the module-level CLIENT_ID, CLIENT_SECRET, VERSION
    and LIMIT values.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighbourhood',
        'Neighbourhood Latitude', 'Neighbourhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude', 'Venue Category'. When no venue is
        returned (or the inputs are empty) the frame is empty but still has
        these columns. 'Venue Category' is None for a venue without categories.

    Raises
    ------
    requests.HTTPError
        If a request comes back with an HTTP error status.
    """
    column_names = ['Neighbourhood', 
                  'Neighbourhood Latitude', 
                  'Neighbourhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)

        # make the GET request; the timeout keeps one stalled call from
        # blocking the whole loop, and HTTP errors are raised immediately
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        items = response.json()["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for item in items:
            venue = item['venue']
            # a venue may come back with an empty category list
            categories = venue.get('categories') or []
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None))

    # Passing the column names to the constructor (rather than assigning
    # `.columns` afterwards) also works when `rows` is empty.
    return pd.DataFrame(rows, columns=column_names)

In [None]:

# Collect the nearby venues for every row of the Toronto-only table.
Toronto_venues = getNearbyVenues(
    toronto_data['Neighbourhood'],
    toronto_data['Latitude'],
    toronto_data['Longitude'],
)

print(Toronto_venues.shape)
Toronto_venues.head()

In [None]:
print('There are {} uniques categories.'.format(len(Toronto_venues['Venue Category'].unique())))

In [None]:

# one hot encoding: one indicator column per venue category
To_spot = pd.get_dummies(Toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
To_spot['Neighbourhood'] = Toronto_venues['Neighbourhood'] 

# move neighborhood column to the first column.
# The reordered frame used to be stored only under `Toronto_onehot`, a name
# nothing else reads, while the un-reordered `To_spot` was displayed and used.
# Both names now refer to the reordered frame.
fixed_columns = [To_spot.columns[-1]] + list(To_spot.columns[:-1])
To_spot = To_spot[fixed_columns]
Toronto_onehot = To_spot

To_spot.head()

In [None]:
# Average the one-hot indicator columns per neighbourhood: each value is the
# share of that neighbourhood's returned venues that fall in the category.
# reset_index() turns 'Neighbourhood' back into the first regular column.
To_grouped = To_spot.groupby('Neighbourhood').mean().reset_index()
To_grouped

In [None]:
To_grouped.shape

In [None]:
To_cafe=To_grouped[['Neighbourhood','Café',]]
To_cafe.head(10)

In [None]:
# Order neighbourhoods by their 'Café' value, highest first, and keep the first ten.
toronto_cafe_sorted = To_cafe.sort_values('Café', ascending=False)
toronto_cafe_sorted_top = toronto_cafe_sorted.iloc[:10]
toronto_cafe_sorted_top

In [None]:
# Horizontal bar chart of the ten neighbourhoods with the highest 'Café' value.
# Note: 'Café' here is the per-neighbourhood mean of the one-hot column (the
# share of venues that are cafés), not a raw count of cafés.
sns.set(style="whitegrid")
ax = sns.barplot(y="Neighbourhood", x="Café", data=toronto_cafe_sorted_top)

In [None]:

to_merged2 = pd.merge(To_cafe, neighbors, on='Neighbourhood', how='inner')
to_merged2.head(10)

In [None]:
to_merged2.shape

In [None]:
# NOTE(review): this rename looks like a no-op. An earlier cell reduced
# `toronto_demographics` to four columns and renamed 'Name' to 'Neighbourhood',
# so there should be no 'Neighborhood' column left to rename; confirm and remove.
toronto_demographics.rename(columns={'Neighborhood':'Neighbourhood'}, inplace = True) 
toronto_demographics.head()

In [None]:

toronto_merged3 = pd.merge(to_merged2, toronto_demographics, on='Neighbourhood', how='inner')
toronto_merged3.head()

In [None]:

td_sorted_population = toronto_merged3.sort_values(by='Population', ascending=False)
td_sorted_population_top = td_sorted_population.head(10)
ax = sns.barplot(y="Neighbourhood", x="Population", data=td_sorted_population_top)

In [None]:

# Input table for the demographic clustering: the demographics columns plus the
# 'Café' value, matched on 'Neighbourhood'. The inner join keeps only names
# that appear identically in both tables.
toronto_cluster = pd.merge(toronto_demographics, To_cafe, on='Neighbourhood', how='inner')
toronto_cluster.head()

In [None]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the highest-valued entries of `row`.

    The first entry of `row` is skipped (callers pass rows whose first entry is
    the neighbourhood name); the remaining entries are sorted in descending
    order and the index labels of the first `num_top_venues` are returned as an
    array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

In [None]:
num_top_venues = 7

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues:
# '1st', '2nd', '3rd' use the suffixes above, every later rank uses 'th'
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    # explicit bounds check instead of a bare `except:` around the list lookup
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

# create a new dataframe with one row per neighbourhood
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighbourhood'] = To_grouped['Neighbourhood']

# fill columns 1.. of each row with that neighbourhood's top categories
for ind in np.arange(To_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(To_grouped.iloc[ind, :], num_top_venues)

# shown explicitly: a bare mid-cell expression is never rendered
display(neighborhoods_venues_sorted.head())

#week 3
kclusters = 5

# drop(columns=...) replaces the positional axis argument `drop(label, 1)`,
# which newer pandas no longer accepts
To_grouped_clustering = To_grouped.drop(columns='Neighbourhood')

# run k-means clustering on the per-neighbourhood category frequencies
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(To_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

In [None]:
# set number of clusters
kclusters = 5

# Every column except the neighbourhood name is used as a clustering feature.
# drop(columns=...) replaces the positional axis argument `drop(label, 1)`,
# which newer pandas no longer accepts.
toronto_clustering = toronto_cluster.drop(columns='Neighbourhood')
# NOTE(review): the features are not rescaled before clustering. If they are on
# very different scales (e.g. population counts next to a café share <= 1), the
# distances are dominated by the largest-valued columns; consider standardising.

# run k-means clustering (this rebinds `kmeans`, replacing any earlier fit)
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

In [None]:

# add clustering labels.
# The labels are inserted into a copy so that `toronto_cluster` stays
# unchanged: inserting into it directly made this cell fail on a second run
# ("already exists") and would feed 'Cluster Label' back into the clustering
# features if the k-means cell were re-run.
toronto_final = toronto_cluster.copy()
toronto_final.insert(0, 'Cluster Label', kmeans.labels_)

# merge with `neighbors` to add postcode, borough and latitude/longitude for each neighbourhood
toronto_final = pd.merge(toronto_final,neighbors, on='Neighbourhood', how ='inner')

toronto_final.head() # check the last columns!

In [None]:
kmeans.labels_

In [None]:
#week 5
toronto_final.loc[toronto_final['Cluster Label'] == 0]

In [None]:

toronto_final.loc[toronto_final['Cluster Label'] == 1]

In [None]:

toronto_final.loc[toronto_final['Cluster Label'] == 2]

In [None]:
toronto_final.loc[toronto_final['Cluster Label'] == 3]

In [None]:
toronto_final.loc[toronto_final['Cluster Label'] == 4]