# Segmenting and Clustering Neighborhoods in Toronto
## Week 3 Peer-graded Assignment: Question 2

### Setting up the First Dataframe:

In [1]:
#Import pandas and use its read_html method to get the postal codes from Wikipedia.

import os

import pandas as pd
import requests

# Raise the row display cap so that long DataFrames can be double checked by eye.
pd.set_option("display.max_rows", 999)

# attrs restricts the scrape to the table(s) carrying this class attribute.
toronto_data = pd.read_html(
    io="https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M",
    attrs={"class": "wikitable sortable"},
)

In [2]:
# toronto_data is not itself a DataFrame: it is a list, and the DataFrame
# holding the postal-code table is its first element, toronto_data[0].
t_df = toronto_data[0]

type(t_df) # Confirm that t_df is a DataFrame.



pandas.core.frame.DataFrame

In [3]:
# Keep only the rows whose Borough has been assigned and renumber them from 0
# (drop=True discards the old index instead of keeping it as a column).
# Filtering builds a new frame rather than mutating the scraped table in place,
# so this cell gives the same result every time it is re-run.
t_df = t_df[t_df['Borough'] != 'Not assigned'].reset_index(drop=True)

# Where the Neighbourhood is "Not assigned", use the Borough of the same row.
# Series.mask performs that row-wise substitution explicitly, and the result is
# assigned back to the column instead of relying on an in-place replace() on a
# column selection, which is not guaranteed to write through to t_df.
t_df['Neighbourhood'] = t_df['Neighbourhood'].mask(
    t_df['Neighbourhood'] == 'Not assigned', t_df['Borough'])

In [4]:
t_df # Display the DataFrame after the "Not assigned" clean-up to verify the changes.

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor
5,M7A,Queen's Park,Queen's Park
6,M9A,Queen's Park,Queen's Park
7,M1B,Scarborough,Rouge
8,M1B,Scarborough,Malvern
9,M3B,North York,Don Mills North


### Getting GeoData and Grouping Neighbourhoods

In [5]:
# Load the geocoding information (coordinates per postal code) from the provided CSV.
# The Geocoder library is not used because Google denies its requests when no
# API key is present.

pc_df = pd.read_csv('Geospatial_Coordinates.csv')
        

In [6]:
# Nested lookup table keyed first by column name and then by postal code,
# i.e. df_dict[<column>][<postal code>]; it is read later as
# df_dict["Latitude"] and df_dict["Longitude"].
df_dict = pc_df.set_index('Postal Code').to_dict()

In [7]:
# Collapse the table to one entry per (Postcode, Borough) pair, joining that pair's
# neighbourhood names into a single comma-separated string.
# GroupedNeighbourhoods is a pandas Series indexed by (Postcode, Borough).
GroupedNeighbourhoods = (
    t_df
    .groupby(['Postcode', 'Borough'])['Neighbourhood']
    .apply(lambda names: ", ".join(names))
)

# Print every joined group, each followed by a blank line, to check the grouping.
for joined_names in GroupedNeighbourhoods:
    print(joined_names, end="\n\n")

Rouge, Malvern

Highland Creek, Rouge Hill, Port Union

Guildwood, Morningside, West Hill

Woburn

Cedarbrae

Scarborough Village

East Birchmount Park, Ionview, Kennedy Park

Clairlea, Golden Mile, Oakridge

Cliffcrest, Cliffside, Scarborough Village West

Birch Cliff, Cliffside West

Dorset Park, Scarborough Town Centre, Wexford Heights

Maryvale, Wexford

Agincourt

Clarks Corners, Sullivan, Tam O'Shanter

Agincourt North, L'Amoreaux East, Milliken, Steeles East

L'Amoreaux West

Upper Rouge

Hillcrest Village

Fairview, Henry Farm, Oriole

Bayview Village

Silver Hills, York Mills

Newtonbrook, Willowdale

Willowdale South

York Mills West

Willowdale West

Parkwoods

Don Mills North

Flemingdon Park, Don Mills South

Bathurst Manor, Downsview North, Wilson Heights

Northwood Park, York University

CFB Toronto, Downsview East

Downsview West

Downsview Central

Downsview Northwest

Victoria Village

Woodbine Gardens, Parkview Hill

Woodbine Heights

The Beaches

Leaside

Thorncliff

In [8]:
# Series.to_dict() gives {(Postcode, Borough): "joined neighbourhood names"}; the key
# is a tuple. Re-key it by postal code alone and move the borough next to the names:
# {Postcode: [Borough, Neighbourhoods]}.
a = GroupedNeighbourhoods.to_dict()
newDict = {
    postcode: [borough, names]
    for (postcode, borough), names in a.items()
}

# Postal codes wanted for question 2 of the Capstone Week 3 Peer-Graded Assignment.
pcList = ["M5G","M2H","M4B","M1J","M4G","M4M","M1R","M9V","M9L","M5V","M1B","M5A"]

# Keep only the newDict entries whose postal code appears in pcList.
preDict = {
    postcode: entry
    for postcode, entry in newDict.items()
    if postcode in pcList
}

# from_dict places the postal codes in the columns, so transpose to get one row per
# postal code and then move the codes out of the index into an ordinary column.
df3 = pd.DataFrame.from_dict(preDict).transpose().reset_index()

# Give the columns their final names.
df3 = df3.rename(columns={"index":"PostalCode", 0: "Borough",1:"Neighbourhoods"})

df3

Unnamed: 0,PostalCode,Borough,Neighbourhoods
0,M1B,Scarborough,"Rouge, Malvern"
1,M1J,Scarborough,Scarborough Village
2,M1R,Scarborough,"Maryvale, Wexford"
3,M2H,North York,Hillcrest Village
4,M4B,East York,"Woodbine Gardens, Parkview Hill"
5,M4G,East York,Leaside
6,M4M,East Toronto,Studio District
7,M5A,Downtown Toronto,Harbourfront
8,M5G,Downtown Toronto,Central Bay Street
9,M5V,Downtown Toronto,"CN Tower, Bathurst Quay, Island airport, Harbo..."


In [9]:
# Add Latitude and Longitude columns to the dataframe, filled from the geocoding
# dictionary created earlier according to the PostalCode of each row.
# dict.get is used for the lookup, so a postal code that is absent from the
# dictionary produces a missing value instead of raising KeyError.
latitude_by_code = df_dict["Latitude"]
longitude_by_code = df_dict["Longitude"]

df3["Latitude"] = df3["PostalCode"].apply(lambda code: latitude_by_code.get(code))
df3["Longitude"] = df3["PostalCode"].apply(lambda code: longitude_by_code.get(code))

df3

Unnamed: 0,PostalCode,Borough,Neighbourhoods,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
2,M1R,Scarborough,"Maryvale, Wexford",43.750072,-79.295849
3,M2H,North York,Hillcrest Village,43.803762,-79.363452
4,M4B,East York,"Woodbine Gardens, Parkview Hill",43.706397,-79.309937
5,M4G,East York,Leaside,43.70906,-79.363452
6,M4M,East Toronto,Studio District,43.659526,-79.340923
7,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
8,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
9,M5V,Downtown Toronto,"CN Tower, Bathurst Quay, Island airport, Harbo...",43.628947,-79.39442


In [10]:
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
import json

from sklearn.cluster import KMeans

In [11]:
# Foursquare API credentials are read from the environment so that they are never
# stored in, or shared together with, the notebook itself. Set
# FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET before starting the kernel.
# NOTE(review): the keys that used to be hard-coded here have been published with
# the notebook and should be rotated.
fsCLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')
fsCLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')
fsAPI_VERSION = '20180323'  # sent as the "v" parameter of every request

# Fail here with a clear message rather than later with an opaque API error.
if not (fsCLIENT_ID and fsCLIENT_SECRET):
    raise RuntimeError(
        "Foursquare credentials are missing: set the FOURSQUARE_CLIENT_ID and "
        "FOURSQUARE_CLIENT_SECRET environment variables and restart the kernel.")

In [12]:
# One record per row of df3. "name" packs the borough and the postal code into a
# single tab-separated string; "lat"/"lng" are the coordinates of the postal code.
# Columns are read by label rather than by integer position, so the records do not
# depend on the column order of df3.
neighborhoods = [
    {
        "name": "{}\t{}".format(row["Borough"], row["PostalCode"]),
        "lat": row["Latitude"],
        "lng": row["Longitude"],
    }
    for _, row in df3.iterrows()
]


In [13]:
fsVenueLimit = 100  # "limit" parameter: maximum number of venues per request
fsRadius = 500      # "radius" parameter: search radius around each location
fsTimeout = 10      # seconds to wait for a response before giving up
fsURL = 'https://api.foursquare.com/v2/venues/explore'

# For every neighbourhood record, ask the Foursquare "explore" endpoint for nearby
# venues and store [borough, postal code, venue items, latitude, longitude].
fsResults = []
for i in neighborhoods:
    params = dict(
        client_id=fsCLIENT_ID,
        client_secret=fsCLIENT_SECRET,
        v=fsAPI_VERSION,
        ll="{},{}".format(i['lat'], i['lng']),
        limit=fsVenueLimit,
        radius=fsRadius)

    # "name" holds "<borough>\t<postal code>"; split it back into its two parts.
    nameList = i['name'].split("\t")

    # The timeout keeps a stalled connection from hanging the notebook, and
    # raise_for_status reports HTTP errors directly instead of letting them
    # surface as a KeyError while indexing into the JSON body.
    response = requests.get(url=fsURL, params=params, timeout=fsTimeout)
    response.raise_for_status()
    fsResponse = response.json()["response"]["groups"][0]['items']

    fsResults.append([nameList[0], nameList[1], fsResponse, i['lat'], i['lng']])
    



In [14]:
# Flatten the per-neighbourhood results into one entry per venue:
# [borough, postal code, venue record, neighbourhood latitude, neighbourhood longitude].
fsList = [
    [borough, postal_code, item['venue'], lat, lng]
    for borough, postal_code, items, lat, lng in fsResults
    for item in items
]
print(len(fsList))

251


In [15]:
venueDict = dict()
venueCount = 0
# Fields of each Foursquare venue record that are used to build a row.
desiredList = ['name', 'location', 'categories']

# Build one row per venue, keyed "Venue1", "Venue2", ... in fsList order:
# [borough, postal code, neighbourhood latitude, neighbourhood longitude,
#  venue name, venue latitude, venue longitude, first category name].
for nbrhd, nbrhd_pc, venue, nbrhd_lat, nbrhd_lng in fsList:
    venueCount += 1

    # Read the fields directly instead of looping over the record's keys, so the row
    # does not depend on key order and can never reuse data from the previous venue.
    categoryNames = [category['name'] for category in (venue.get('categories') or [])]

    venueDict["Venue{}".format(venueCount)] = [
        nbrhd,
        nbrhd_pc,
        nbrhd_lat,
        nbrhd_lng,
        venue['name'],
        venue['location']['lat'],
        venue['location']['lng'],
        # A venue without any category gets None instead of raising IndexError.
        categoryNames[0] if categoryNames else None,
    ]



In [16]:
# venueDict maps "VenueN" to that venue's list of values, so the constructor lays each
# venue out as a column; .T turns that into one row per venue.
v_df = pd.DataFrame(venueDict).T

v_df.columns = [
    'Neighborhood',
    "PostalCode",
    "Neighborhood Latitude",
    "Neighborhood Longitude",
    'Venue',
    "Venue Latitude",
    "Venue Longitude",
    "Venue Categories",
]
v_df

Unnamed: 0,Neighborhood,PostalCode,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Categories
Venue1,Scarborough,M1B,43.8067,-79.1944,Wendy's,43.8074,-79.1991,Fast Food Restaurant
Venue2,Scarborough,M1J,43.7447,-79.2395,McCowan Park,43.7451,-79.2393,Playground
Venue3,Scarborough,M1R,43.7501,-79.2958,Crown Pastries,43.7461,-79.2931,Bakery
Venue4,Scarborough,M1R,43.7501,-79.2958,Wexford Restaurant,43.746,-79.2938,Breakfast Spot
Venue5,Scarborough,M1R,43.7501,-79.2958,Frank's Smoke Shop,43.7459,-79.2949,Smoke Shop
Venue6,Scarborough,M1R,43.7501,-79.2958,Sequoia Lounge,43.7456,-79.2957,Middle Eastern Restaurant
Venue7,North York,M2H,43.8038,-79.3635,Eagle's Nest Golf Club,43.8055,-79.3642,Golf Course
Venue8,North York,M2H,43.8038,-79.3635,New York Fries,43.8037,-79.3639,Fast Food Restaurant
Venue9,North York,M2H,43.8038,-79.3635,AY Jackson Pool,43.8045,-79.3661,Pool
Venue10,North York,M2H,43.8038,-79.3635,Villa Madina,43.8017,-79.3639,Mediterranean Restaurant


In [17]:
import math

In [18]:
# Base map that the neighbourhood markers are added to; the centre coordinates
# and the initial zoom level are fixed here.
mapOfToronto =folium.Map(location=[43.7091,-79.3635], zoom_start=10)

import random

def makeColor():
    colors = {"red":0,
              "blue":0,
              "green":0,
              }
    primary = ['red','green','blue']
    random.shuffle(primary)
    for c in primary:
        colors[c] =hex(math.floor(random.randrange(170,256)/(primary.index(c)+1)))[2:]

    return"#{}{}{}".format(colors['red'],colors['green'],colors['blue'])


# v_df holds one row per venue, so every neighbourhood point appears many times.
# Reduce it to the distinct points first, so that each location gets a single marker
# instead of a stack of identical markers drawn on top of each other.
markerColumns = ['Neighborhood Latitude', 'Neighborhood Longitude', 'Neighborhood', 'PostalCode']
markerPoints = v_df[markerColumns].drop_duplicates()

for lat, lng, neighborhood, postalcode in markerPoints.itertuples(index=False, name=None):
    label = '{}, {}'.format(neighborhood,postalcode)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat,lng],
        radius=5,
        popup=label,
        # Outline and fill colours are generated independently of each other.
        color=makeColor(),
        fill=True,
        fill_color=makeColor(),
        fill_opacity=0.7,
        # NOTE(review): parse_html is a Popup option; confirm CircleMarker needs it.
        parse_html=False).add_to(mapOfToronto)

mapOfToronto