# Step 1: Scraped data from Wikipedia, cleaned and organized the data, and created a new dataframe

In [1]:
from bs4 import BeautifulSoup # BeautifulSoup is in bs4 package 
import requests
import pandas as pd
from pandas import DataFrame

URL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
content = requests.get(URL)
soup = BeautifulSoup(content.text, 'html.parser')

row = soup.find("tbody") # Extract and return first occurrence of tr           # Print row with HTML formatting
mylist = str(row.get_text()).split("\n")

del mylist[0:8]

mylist = (" ".join(mylist))

mylist2 = mylist.split("   ")

postal_codes = []
borough = []
neighborhood = []

for i in mylist2:
    mylist3 = []
    mylist3.append(i.split("  "))
    postal_codes.append(mylist3[0][0])
    borough.append(mylist3[0][1])
    neighborhood.append(mylist3[0][2])

for i in borough[:]:
    if i=="Not assigned":
        ind = borough.index(i)
        borough.remove(i)
        del neighborhood[ind]
        del postal_codes[ind]
        
for i in neighborhood[:]:
    if i == 'Not assigned':
        ind = neighborhood.index(i)
        neighborhood[ind]=borough[ind]

df = pd.DataFrame()
df['PostalCode'] = pd.Series(postal_codes)
df['Borough'] = pd.Series(borough)
df['Neighborhood'] = pd.Series(neighborhood)

print(df.head())

  PostalCode           Borough                                 Neighborhood
0        M3A        North York                                    Parkwoods
1        M4A        North York                             Victoria Village
2        M5A  Downtown Toronto                    Regent Park, Harbourfront
3        M6A        North York             Lawrence Manor, Lawrence Heights
4        M7A  Downtown Toronto  Queen's Park, Ontario Provincial Government


# Step 2: Checked to make sure number of rows and columns of data were correct and consistent with the number of rows in the Wikipedia Page

In [2]:
print(df.shape)

(103, 3)


# Step 3: Added Longitude and Latitude columns to original dataframe using a CSV file with the necessary information

In [21]:
df2 = pd.read_csv("Geospatial_Coordinates.csv")

In [26]:
latitude_list = df2['Latitude'].tolist()
longitude_list = df2['Longitude'].tolist()
postal_code_listV2 = df2['Postal Code'].tolist()

In [48]:
original_postal_code_list = df['PostalCode'].tolist()
new_latitude = []
new_longitude = []

In [54]:
for x in postal_code_listV2:
    for y in original_postal_code_list:
        if x==y:
            ind = postal_code_listV2.index(x)
            new_latitude.append(latitude_list[ind])
            new_longitude.append(longitude_list[ind])
        

In [55]:
df['Latitude'] = pd.Series(new_latitude)
df['Longitude'] = pd.Series(new_longitude)

# Step 4: Checked to make sure the new columns were added and that no columns were missing the longitude and latitude data

In [3]:
print(df.tail())

    PostalCode           Borough  \
98         M8X         Etobicoke   
99         M4Y  Downtown Toronto   
100        M7Y      East Toronto   
101        M8Y         Etobicoke   
102        M8Z         Etobicoke   

                                          Neighborhood  
98       The Kingsway, Montgomery Road, Old Mill North  
99                                Church and Wellesley  
100  Business reply mail Processing Centre, South C...  
101  Old Mill South, King's Mill Park, Sunnylea, Hu...  
102  Mimico NW, The Queensway West, South of Bloor,...  


In [4]:
print(df.head())

  PostalCode           Borough                                 Neighborhood
0        M3A        North York                                    Parkwoods
1        M4A        North York                             Victoria Village
2        M5A  Downtown Toronto                    Regent Park, Harbourfront
3        M6A        North York             Lawrence Manor, Lawrence Heights
4        M7A  Downtown Toronto  Queen's Park, Ontario Provincial Government


# Step 5: Used K-Means to generate clusters and displayed clusters on a Folium Map

In [71]:
from sklearn.cluster import KMeans

kclusters = 4

df3 = df[['Latitude', 'Longitude']]

kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(df3)

df3['Cluster Labels'] = pd.Series(kmeans.labels_)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3['Cluster Labels'] = pd.Series(kmeans.labels_)


## Map of Toronto grouped into Four Areas

In [73]:
import folium
import numpy as np 
import matplotlib.cm as cm
import matplotlib.colors as colors

map_clusters = folium.Map(location=[new_latitude[0], new_longitude[0]], zoom_start=11)

x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

markers_colors = []
for lat, lon, poi, cluster in zip(df['Latitude'], df['Longitude'], df['Neighborhood'], df3['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

## Observations:

### 1. There appear to be four regions in Toronto: an eastern and western region and a mid-northern and a mid-southern region.
### 2. Out all the regions, the region with the most postal codes in it is the mid-southern region, marked in light yellow.
### 3. A city in the mid-southern region that seems to have an unusually high number of postal codes is Toronto. This makes sense, as there is an airport nearby, which may be indicative of a high tourist population.
### 4. In turn, a high tourist population would spur growth, leading to more hotels, rental homes, restaurants, museums, and other tourist attractions, crowding the area and leading to a need for more postal codes.
### 5. Toronto is likely a more populated area, compared to the other regions, and may cost more to live in. 