In [31]:
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import numpy as np
# !conda install -c conda-forge BeautifulSoup4 --yes # uncomment this line if you haven't completed the Foursquare API lab
from bs4 import BeautifulSoup
import requests as req
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as colors
import folium

In [4]:
def is_assigned(c) -> bool:
    return str(c.attrs).find("color:#ccc;") == -1


def get_borough_from_raw_text(text: str) -> str:
    if len(text) > 0:
        idx_par = text.find("(")
        if idx_par > 0:
            return text[0:idx_par]
    return ''


def get_neighbours_from_raw_text(t) -> str:
    result = ''
    if len(t) > 0:
        split = t.split("/")
        if len(split) > 0:
            result = str(split[0]).replace("(", "").replace(")", "").strip()
            for i in range(1, len(split)):
                result = result + "," + (str(split[i]).replace("(", "").replace(")", "").strip())

    return result


def get_neighbours_from_span_tags(s) -> [str]:
    result = []
    tags = s.find_all()
    tags = list(filter((lambda t: t.text != ''), tags))
    if len(tags) > 0:
        for tag in tags:
            result.append(tag.text)

    return result


def append_neighbours(neigh: str, n_list_to_append: [str]) -> str:
    result = neigh
    if n_list_to_append is not None and len(n_list_to_append) > 0:
        for i in n_list_to_append:
            result = result + "," + i
    return result


def get_text_from_span(s) -> (str, str):
    borough_result = get_borough_from_raw_text(s.text)
    neighbours_from_span_raw_text = get_neighbours_from_raw_text(s.text[len(borough_result):])
    text_from_span_tags = ''

    if len(borough_result) > 0:
        neighbour_result = append_neighbours(neighbours_from_span_raw_text, text_from_span_tags[1:])
    else:
        borough_result = text_from_span_tags[0]
        neighbour_result = append_neighbours(neighbours_from_span_raw_text, text_from_span_tags)
    return borough_result, neighbour_result


def load_df() -> pd.DataFrame:
    html = req.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M").text

    bs = BeautifulSoup(html, 'html.parser')
    table = bs.find("body").find("div", {"id": "mw-content-text"}).find("table").find('tbody')

    codes = []
    borough = []
    neighbourhood = []
    for html_line_table in table.find_all("tr"):
        for cell in html_line_table.find_all("td"):
            if is_assigned(cell):
                span = cell.find("span")
                b, n = get_text_from_span(span)
                borough.append(b)
                neighbourhood.append(n)
                codes.append(cell.find("b").text)

    df = pd.DataFrame({"code": codes, "borough": borough, "neighbourhood": neighbourhood})

    df_geo = pd.read_csv('https://cocl.us/Geospatial_data')
    df_geo.rename(columns={'Postal Code': 'code'}, inplace=True)

    df = pd.merge(df, df_geo, on="code")

    return df

In [5]:
# @hidden_cell
CLIENT_ID = 'TEUHLLPHPGLZM0TPSIOJ1IAV0L1BVMF3EAXLA3BAVTDFQFUW'
CLIENT_SECRET = 'CQ30ZFSS45TBPDSRO3GQCZKPLTAVQMWTE5H55KZYS5VWHF2R'
VERSION = '20180605'
LIMIT = 100

In [17]:
def get_venues_in_radius(lat: float, long: float, radius: int):
    url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
        CLIENT_ID,
        CLIENT_SECRET,
        VERSION,
        lat,
        long,
        radius,
        LIMIT)
    response = req.get(url)
    r = response.json()['response']['groups'][0]['items']

    return r


def get_venues(lat, long):
    venues = get_venues_in_radius(lat, long, 500)
    return venues


def get_features_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    venues_list = []
    for code, name, lat, long in zip(df['code'], df["neighbourhood"], df["Latitude"], df["Longitude"]):
        print(f'({code}){name} {lat} {long}')
        results = get_venues(lat, long)

        venues_list.append([(
            code,
            name,
            lat,
            long,
            v['venue']['name'],
            v['venue']['location']['lat'],
            v['venue']['location']['lng'],
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Code',
                             'Neighborhood',
                             'Neighborhood Latitude',
                             'Neighborhood Longitude',
                             'Venue',
                             'Venue Latitude',
                             'Venue Longitude',
                             'Venue Category']

    return nearby_venues


def get_one_hot_category_df(df: pd.DataFrame) -> pd.DataFrame:
    result = pd.get_dummies(df['Venue Category'], prefix_sep='_', prefix='cat')
    len(result)
    result["Neighborhood"] = df["Neighborhood"]
    result["Code"] = df["Code"]
    c = ["Code", "Neighborhood"] + list(result.columns[:-2])
    result = result[c]

    result = result.groupby(["Code", "Neighborhood"]).mean().reset_index(drop=True)

    return result


def get_kmeans_clusters(k: int, one_hot_df: pd.DataFrame) -> KMeans:
    x = one_hot_df.iloc[:, 2:]
    kmeans = KMeans(n_clusters=k, random_state=0, n_init=10, max_iter=1000).fit(x)
    return kmeans


def get_best_k(max_k: int, one_hot_df: pd.DataFrame):
    scores = np.zeros(max_k + 1)
    best_inertia = -1
    for k in range(1, max_k + 1):
        kmeans = get_kmeans_clusters(k, one_hot_df)
        scores[k] = kmeans.inertia_
    plt.scatter(x=list(range(max_k + 1)), y=scores)
    plt.show()

In [37]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
df_postal = load_df()

df_venues_categories = pd.read_csv("venues.csv")
df_categories_one_hot = get_one_hot_category_df(df_venues_categories)

result = df_venues_categories.iloc[:, 0:4].drop_duplicates().reset_index(drop=True)
k = 6
labels = get_kmeans_clusters(k, df_categories_one_hot).labels_
result["Cluster"] = labels

result.head()

Unnamed: 0,Code,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Cluster
0,M3A,Parkwoods,43.753259,-79.329656,0
1,M4A,Victoria Village,43.725882,-79.315572,4
2,M5A,"Regent Park,Harbourfront",43.65426,-79.360636,0
3,M6A,"Lawrence Manor,Lawrence Heights",43.718518,-79.464763,0
4,M7A,Ontario Provincial Government,43.662301,-79.389494,0


In [40]:
# create map
map_clusters = folium.Map(location=[43.651070,-79.347015],zoom_start=10)

# set color scheme for the clusters
x = np.arange(k)
ys = [i + x + (i*x)**2 for i in range(k)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for code, lat, lon, neighbourhood, cluster in zip(result["Code"], result['Neighborhood Latitude'], result['Neighborhood Longitude'], result['Neighborhood'], result['Cluster']):
    label = folium.Popup(str(neighbourhood), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=2,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
    folium.Circle([lat, lon],
                    radius=500
                   ).add_to(map_clusters)
       
map_clusters