# __Clustering and analysing Toronto data__

**Daggy1234**

_Aspiring Data Scientist_

### Extracting data from Wikipedia
---
We will use the Wikipedia page
[here](https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M) to load the tabular data into a pandas DataFrame. We will use the requests, BeautifulSoup and pandas libraries to create the DataFrame.

In [59]:
import pandas as pd
from bs4 import  BeautifulSoup
import requests

In [60]:
# Download the Wikipedia page and load its first <table> into a DataFrame.
from io import StringIO

response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
# Stop here on an HTTP error instead of trying to parse an error page.
response.raise_for_status()

soup = BeautifulSoup(response.text, 'html.parser')
table = soup.find('table')
if table is None:
    # Without this check the string 'None' would be handed to read_html.
    raise ValueError('No <table> element found on the postal code page')

# read_html returns a list of DataFrames; the markup holds a single table.
# The markup is wrapped in StringIO because newer pandas releases deprecate
# passing a literal HTML string.
df = pd.read_html(StringIO(str(table)))[0]
df
# df now holds the raw postal code table.

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
...,...,...,...
175,M5Z,Not assigned,
176,M6Z,Not assigned,
177,M7Z,Not assigned,
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


In [61]:
# Keep only the postal codes that have a borough assigned. The explicit
# copy makes dfa an independent frame, so the edit below is not applied to
# a slice of df (which is what raises SettingWithCopyWarning).
dfa = df[df.Borough != 'Not assigned'].copy()

# A row without a neighbourhood name takes the name of its borough.
dfa['Neighborhood'] = dfa['Neighborhood'].fillna(dfa['Borough'])

# Renumber the rows from 0; the previous row labels are kept in a new
# 'index' column.
dfa = dfa.reset_index()
dfa

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)


Unnamed: 0,index,Postal Code,Borough,Neighborhood
0,2,M3A,North York,Parkwoods
1,3,M4A,North York,Victoria Village
2,4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,5,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...,...
98,160,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,165,M4Y,Downtown Toronto,Church and Wellesley
100,168,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
101,169,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [62]:
# Read the CSV of coordinates and preview its first rows.
ll = pd.read_csv(filepath_or_buffer='https://cocl.us/Geospatial_data')
ll.head()


Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [63]:
# Join the cleaned postal codes with their coordinates on 'Postal Code',
# then discard the 'index' column carried over from dfa.
df = dfa.merge(ll, on='Postal Code').drop(columns='index')
df

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509


### Mapping the values
---
Using folium we will create a vivid map of the Toronto area, marking all the areas!




In [64]:
# Keep only the rows whose borough name contains the literal text 'Toronto'
# (regex=False: plain substring match, not a pattern).
dft = df.loc[df['Borough'].str.contains('Toronto', regex=False)]
dft


Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,M4E,East Toronto,The Beaches,43.676357,-79.293031
20,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
24,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
25,M6G,Downtown Toronto,Christie,43.669542,-79.422564
30,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568
31,M6H,West Toronto,"Dufferin, Dovercourt Village",43.669005,-79.442259


In [65]:
import folium

# Base map; every row of dft is added to it as a circle marker.
map_toronto = folium.Map(location=[43.651070, -79.347015], zoom_start=12)

for lat, lng, borough, neighbourhood in zip(
        dft['Latitude'], dft['Longitude'], dft['Borough'], dft['Neighborhood']):
    # Popup text has the form "<neighbourhood>, <borough>".
    popup = folium.Popup('{}, {}'.format(neighbourhood, borough), parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=3,
        popup=popup,
        color='blue',
        # fill must be on for fill_color / fill_opacity to have any meaning.
        fill=True,
        fill_color='#3186cc',
        fill_opacity=1.0,
    ).add_to(map_toronto)
map_toronto

### Machine Learning
----
Let's use k-means clustering to cluster the data into similar subsets and plot them on the map.

We will use latitude and longitude.

In [66]:
from sklearn.cluster import KMeans

# Number of clusters to fit.
k = 4

# Cluster on the coordinates only. The two columns are selected by name,
# which avoids passing the axis to drop() positionally (pandas 2.x rejects
# that).
toronto_clustering = dft[['Latitude', 'Longitude']]

# n_init is pinned to 10, the default before scikit-learn 1.4, so the
# clustering does not change with the installed library version.
kmeans = KMeans(n_clusters=k, random_state=0, n_init=10).fit(toronto_clustering)

# insert() refuses a column name that already exists, so remove the labels
# left by a previous run of this cell before adding the new ones.
if 'Cluster Labels' in dft.columns:
    dft = dft.drop(columns='Cluster Labels')
dft.insert(0, 'Cluster Labels', kmeans.labels_)
dft

Unnamed: 0,Cluster Labels,Postal Code,Borough,Neighborhood,Latitude,Longitude
2,3,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
4,3,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
9,3,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
15,3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,0,M4E,East Toronto,The Beaches,43.676357,-79.293031
20,3,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
24,3,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
25,1,M6G,Downtown Toronto,Christie,43.669542,-79.422564
30,3,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568
31,1,M6H,West Toronto,"Dufferin, Dovercourt Village",43.669005,-79.442259


In [68]:
import numpy as np
import matplotlib.cm as cm
import matplotlib.colors as colors

# NOTE(review): newer folium releases may no longer resolve the
# 'Stamen Toner' tile name - confirm against the installed version.
map_clusters = folium.Map(location=[43.651070, -79.347015], zoom_start=12, tiles='Stamen Toner')

# One hex colour per cluster, evenly spaced along the rainbow colormap.
colors_array = cm.rainbow(np.linspace(0, 1, k))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

for lat, lon, cluster in zip(dft['Latitude'], dft['Longitude'], dft['Cluster Labels']):
    label = folium.Popup(' Cluster ' + str(cluster), parse_html=True)
    # Cluster labels start at 0, so they index the colour list directly.
    cluster_colour = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_colour,
        fill=True,
        fill_color=cluster_colour,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters