# Gabriel Mantilla Saltos, final projects of Data Science Course in Python

### Use the Notebook to build the code to scrape the following Wikipedia page, https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M, in order to obtain the data that is in the table of postal codes and to transform the data into a pandas dataframe like the one shown below:

### Install add-ons like beautifulsoup4 and import the necessary libraries

In [3]:
#pip install beautifulsoup4
#pip install lxml
#pip install html5lib

import numpy as np 
import pandas as pd
from bs4 import BeautifulSoup
import requests
# Download the Wikipedia page. A timeout and an explicit status check make the
# cell fail fast instead of hanging or silently parsing an HTTP error page.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()
website_url = response.text  # NOTE: despite the name, this holds the page's HTML text
soup = BeautifulSoup(website_url,'lxml')

### Extract the column names

In [4]:
# Locate the table by the value of its class attribute.
table = soup.find('table', attrs={'class': 'wikitable sortable'})
# The text of each <th> cell, with trailing whitespace removed, is a column name.
column_names = []
for header_cell in table.find_all('th'):
    column_names.append(header_cell.get_text().rstrip())

### Extract values in rows 

In [5]:
# Collect every table row as a list of its non-empty, stripped cell texts.
table_body = table.find('tbody')
rows = table_body.find_all('tr')

data = []
for row in rows:
    cell_texts = (cell.text.strip() for cell in row.find_all('td'))
    # Empty cells are dropped; a row without any <td> cell yields an empty list.
    data.append([text for text in cell_texts if text])

### Put all the data into a dataframe

In [6]:
# Build the frame from the scraped rows, turn literal 'None' strings into NaN,
# and drop every row that has a missing value.
df = (
    pd.DataFrame(data, columns=column_names)
    .replace(to_replace='None', value=np.nan)
    .dropna()
)

### Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.


In [7]:
# Keep only rows whose borough is assigned, then index by postcode and borough.
has_borough = df['Borough'] != 'Not assigned'
df = df[has_borough].set_index(['Postcode','Borough'])

### Combined into one row with the neighborhoods separated with a comma

In [8]:
# Comma-join the remaining column values within each (Postcode, Borough) group;
# sort=False keeps the groups in their order of first appearance.
grouped = df.groupby(['Postcode','Borough'], sort=False)
df = grouped.agg(','.join).reset_index(level=[0,1])
# In the last cell of your notebook, use the .shape method to print the number of rows of your dataframe.
df.head()
print(df.shape)

(103, 3)


### Now that you have built a dataframe of the postal code of each neighborhood along with the borough name and neighborhood name, in order to utilize the Foursquare location data, we need to get the latitude and the longitude coordinates of each neighborhood. 

### I tried with this code, but it returned bad coordinates, so I used the provided CSV file to continue with the exercise

In [1]:
# df=df.assign(latitud="",longitud="")
# postal_code=df['Postcode']
# from geopy.geocoders import Nominatim
# geolocator = Nominatim(user_agent="foursquare_agent")
# for i in range(df.shape[0]):
#        if  geolocator.geocode(postal_code[i]) is not None:
#            location = geolocator.geocode(postal_code[i])
#            latitud = location.latitude
#            longitud=location.longitude
#            df.loc[df.index[i],'longitud']=longitud
#            df.loc[df.index[i],'latitud']=latitud

In [9]:
# 2.- Use the Geocoder package or the csv file to create the following dataframe:
path = 'https://cocl.us/Geospatial_data'
# Left-join the coordinates on the postal code, then drop the duplicated key column.
coordinates = pd.read_csv(path)
df = pd.merge(df, coordinates, how='left', left_on='Postcode', right_on='Postal Code')
df = df.drop(columns=['Postal Code'])
df.head()
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 103 entries, 0 to 102
Data columns (total 5 columns):
Postcode         103 non-null object
Borough          103 non-null object
Neighbourhood    103 non-null object
Latitude         103 non-null float64
Longitude        103 non-null float64
dtypes: float64(2), object(3)
memory usage: 4.8+ KB


### Explore and cluster the neighborhoods in Toronto. Work with only boroughs that contain the word Toronto.

### Toronto's coordinates are looked up through the geocoding API

In [10]:
import matplotlib.cm as cm
import matplotlib.colors as colors
import folium 
from geopy.geocoders import Nominatim
# Geocode the city so the map can be centred on it.
address = 'Toronto, ON'
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
# geocode() returns None when nothing matches; fail with a clear message
# instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
Toronto = folium.Map(location=[latitude,longitude], zoom_start=11)

### Create map of Toronto using latitude and longitude values

In [11]:
# Add one circle marker per row of df, using the neighbourhood text as the popup.
for lat, lng, name in zip(df['Latitude'], df['Longitude'], df['Neighbourhood']):
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=folium.Popup(name, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False)
    marker.add_to(Toronto)
Toronto

### Let's simplify the above map and segment and cluster only the neighborhoods in Toronto. 
### So let's slice the original dataframe and create a new dataframe of the Toronto data.

In [12]:
# Keep only the rows whose borough name contains "Toronto".
dft = df.loc[df['Borough'].str.contains("Toronto")]

### Next, we are going to start utilizing the Foursquare API to explore the neighborhoods and segment them
### Define Foursquare Credentials and Version

In [13]:
import os
import warnings

# Credentials are read from the environment so that secrets are not stored in
# the notebook. Set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET before
# starting the kernel.
# NOTE(review): the key pair that used to be hard-coded here was published with
# the notebook and should be revoked.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # your Foursquare Secret
if not (CLIENT_ID and CLIENT_SECRET):
    warnings.warn('Foursquare credentials are not set; API requests are expected to fail.')
VERSION = '20180604'
LIMIT = 50 # limit of number of venues returned by Foursquare API

### Let's create a function that extracts the category of the venue to all the neighborhoods in Toronto


In [14]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Query the Foursquare explore endpoint around each neighbourhood.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Neighbourhood names and their coordinates; iterated in parallel.
    radius : int, default 500
        Value sent as the ``radius`` query parameter of each request.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue: the neighbourhood name and coordinates,
        the venue name and coordinates, and the name of the venue's first
        category (``None`` when the venue lists no category). The seven
        columns are present even when no venue is returned.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status code.
    """
    venue_columns = ['Neighbourhood','Neighbourhood Latitude','Neighbourhood Longitude','Venue','Venue Latitude','Venue Longitude','Venue Category']
    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
        # Let requests build and encode the query string instead of formatting
        # the credentials into the URL by hand.
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': LIMIT,
            },
            timeout=30,
        )
        # Surface HTTP failures here rather than as a KeyError on the payload.
        response.raise_for_status()
        groups = response.json()['response'].get('groups') or []
        results = groups[0]['items'] if groups else []
        # Keep only the relevant information for each nearby venue.
        for v in results:
            venue = v['venue']
            # A venue may come back without categories; indexing [0] would raise.
            categories = venue.get('categories') or []
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None))
    # Passing the column names at construction keeps the schema when no rows
    # were collected; assigning .columns afterwards fails on an empty frame.
    nearby_venues = pd.DataFrame(venue_rows, columns=venue_columns)
    return(nearby_venues)

### Now we write the code that runs the above function on each neighborhood and creates a new dataframe called toronto_venues.


In [15]:
# Collect the venues around every neighbourhood listed in dft.
toronto_venues = getNearbyVenues(
    names=dft['Neighbourhood'],
    latitudes=dft['Latitude'],
    longitudes=dft['Longitude'],
)


Harbourfront,Regent Park
Ryerson,Garden District
St. James Town
The Beaches
Berczy Park
Central Bay Street
Christie
Adelaide,King,Richmond
Dovercourt Village,Dufferin
Harbourfront East,Toronto Islands,Union Station
Little Portugal,Trinity
The Danforth West,Riverdale
Design Exchange,Toronto Dominion Centre
Brockton,Exhibition Place,Parkdale Village
The Beaches West,India Bazaar
Commerce Court,Victoria Hotel
Studio District
Lawrence Park
Roselawn
Davisville North
Forest Hill North,Forest Hill West
High Park,The Junction South
North Toronto West
The Annex,North Midtown,Yorkville
Parkdale,Roncesvalles
Davisville
Harbord,University of Toronto
Runnymede,Swansea
Moore Park,Summerhill East
Chinatown,Grange Park,Kensington Market
Deer Park,Forest Hill SE,Rathnelly,South Hill,Summerhill West
CN Tower,Bathurst Quay,Island airport,Harbourfront West,King and Spadina,Railway Lands,South Niagara
Rosedale
Stn A PO Boxes 25 The Esplanade
Cabbagetown,St. James Town
First Canadian Place,Underground city


### We need to analyze each neighborhood, so we create a one-hot encoding of the venue categories

In [17]:
# One indicator column per venue category; the empty prefix and separator
# leave each column named by the bare category.
venue_categories = toronto_venues[['Venue Category']]
toronto_onehot = pd.get_dummies(venue_categories, prefix="", prefix_sep="")

### Add neighborhood column back to dataframe

In [18]:
# Attach the neighbourhood of each venue as a 'Neighbourhood' column.
toronto_onehot = toronto_onehot.assign(Neighbourhood=toronto_venues['Neighbourhood'])

### Move neighborhood column to the first column

In [19]:
# Put 'Neighbourhood' first and keep the other columns in their current order.
# Selecting the column by name (rather than "last column first") keeps this
# cell correct when it is re-run after the columns were already reordered.
fixed_columns = ['Neighbourhood'] + [col for col in toronto_onehot.columns if col != 'Neighbourhood']

### Next, let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [20]:
# Apply the column order computed in fixed_columns and preview the result.
toronto_onehot = toronto_onehot.loc[:, fixed_columns]
toronto_onehot.head()

Unnamed: 0,Neighbourhood,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,...,Thrift / Vintage Store,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Yoga Studio
0,"Harbourfront,Regent Park",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Harbourfront,Regent Park",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Harbourfront,Regent Park",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Harbourfront,Regent Park",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Harbourfront,Regent Park",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Group rows by neighborhood, taking the mean of the frequency of occurrence of each category

In [22]:
# Average the one-hot indicators within each neighbourhood.
toronto_grouped = toronto_onehot.groupby('Neighbourhood').mean().reset_index()
toronto_grouped.shape
# Feature matrix for clustering: every column except the neighbourhood name.
# `columns=` replaces the positional axis argument (`drop(label, 1)`), which
# pandas deprecated and no longer accepts from version 2.0 on.
toronto_grouped_clustering = toronto_grouped.drop(columns='Neighbourhood')

### Create columns according to number of top venues

In [23]:
# Ordinal suffixes available for the first three ranks.
indicators = ['st', 'nd', 'rd']
# How many top venue categories to show per neighbourhood.
num_top_venues = 5
# Column list for the ranked-venue table; starts with the name column only.
columns = ['Neighbourhood']

### Let's print each neighborhood along with the top 5 most common venues

In [24]:
for hood in toronto_grouped['Neighbourhood']:
    print("----"+hood+"----")
    # Transpose the neighbourhood's row into two columns: venue and freq.
    temp = toronto_grouped[toronto_grouped['Neighbourhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    # Skip the first row (the 'Neighbourhood' entry) and convert on a new frame;
    # assigning into the iloc slice directly can trigger SettingWithCopyWarning.
    temp = temp.iloc[1:].assign(freq=lambda frame: frame['freq'].astype(float))
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Adelaide,King,Richmond----
                 venue  freq
0          Coffee Shop  0.08
1  American Restaurant  0.06
2           Steakhouse  0.06
3                 Café  0.06
4     Asian Restaurant  0.04


----Berczy Park----
          venue  freq
0   Coffee Shop  0.08
1  Cocktail Bar  0.06
2    Steakhouse  0.04
3          Café  0.04
4   Cheese Shop  0.04


----Brockton,Exhibition Place,Parkdale Village----
            venue  freq
0  Breakfast Spot  0.10
1            Café  0.10
2     Coffee Shop  0.10
3   Grocery Store  0.05
4         Stadium  0.05


----Business Reply Mail Processing Centre 969 Eastern----
                venue  freq
0  Light Rail Station  0.11
1         Yoga Studio  0.05
2             Brewery  0.05
3       Garden Center  0.05
4              Garden  0.05


----CN Tower,Bathurst Quay,Island airport,Harbourfront West,King and Spadina,Railway Lands,South Niagara----
              venue  freq
0   Airport Service  0.18
1    Airport Lounge  0.12
2  Airport Terminal  0.12
3

### First, let's write a function to sort the venues in descending order


In [25]:
def return_most_common_venues(row, num_top_venues):
    """Return the index labels of the largest entries of ``row``.

    The first element of ``row`` is skipped. The remaining values are sorted
    in descending order and the labels of the first ``num_top_venues`` of
    them are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

### Now let's create the new dataframe and display the top 10 venues for each neighborhood.

In [28]:
num_top_venues = 10

# Build the header from scratch so that re-running this cell does not keep
# appending duplicate names to a list created in an earlier cell.
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    # The first len(indicators) ranks take their own suffix; later ranks take
    # 'th'. An explicit bounds check replaces the former bare `except:`.
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighbourhood'] = toronto_grouped['Neighbourhood']

# Fill each row with that neighbourhood's top venue categories.
for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

### Run k-means to cluster the neighborhood into 5 clusters.
### Cluster Neighborhoods

In [29]:
from sklearn.cluster import KMeans
# Number of clusters requested from k-means (passed as n_clusters when fitting)
kclusters = 5

### Run k-means clustering

In [30]:
# n_init is pinned because scikit-learn changed its default from 10 to 'auto';
# without it the clustering can differ between library versions.
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(toronto_grouped_clustering)

### Check cluster labels generated for each row in the dataframe

In [31]:
# Preview the first ten cluster labels.
kmeans.labels_[:10]

array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

### Add clustering labels

In [32]:
# insert() raises ValueError when the column already exists, which made this
# cell fail on a second run; drop a stale copy first so it can be re-executed.
if 'Cluster Labels' in neighborhoods_venues_sorted.columns:
    neighborhoods_venues_sorted = neighborhoods_venues_sorted.drop(columns='Cluster Labels')
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)
# Start the merged frame from the Toronto-only table.
toronto_merged = dft

### Merge the Toronto data (dft) with the ranked venues and cluster labels, so each neighborhood keeps its latitude/longitude

In [33]:
# Attach the cluster label and ranked venue columns to each row, matching on
# the neighbourhood name.
venues_by_name = neighborhoods_venues_sorted.set_index('Neighbourhood')
toronto_merged = toronto_merged.join(venues_by_name, on='Neighbourhood')
toronto_merged.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
2,M5A,Downtown Toronto,"Harbourfront,Regent Park",43.65426,-79.360636,2,Coffee Shop,Bakery,Pub,Park,Café,Restaurant,Mexican Restaurant,Breakfast Spot,Antique Shop,Dessert Shop
9,M5B,Downtown Toronto,"Ryerson,Garden District",43.657162,-79.378937,2,Coffee Shop,Café,Fast Food Restaurant,Ramen Restaurant,Clothing Store,Bookstore,Theater,Shopping Mall,Sporting Goods Shop,Bar
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,2,Coffee Shop,Gastropub,Italian Restaurant,Restaurant,Japanese Restaurant,Café,Hotel,Farmers Market,Cocktail Bar,Park
19,M4E,East Toronto,The Beaches,43.676357,-79.293031,0,Health Food Store,Trail,Pub,Neighborhood,Yoga Studio,Department Store,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant
20,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306,2,Coffee Shop,Cocktail Bar,Farmers Market,Steakhouse,Beer Bar,Bakery,Cheese Shop,Café,Seafood Restaurant,Bistro


### Create a new map with the analysis of Kmeans clustering

In [34]:
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)
# One hex colour per cluster, evenly spaced along the rainbow colormap.
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]
# Add markers to the map
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighbourhood'], toronto_merged['Cluster Labels']):
    # The join can leave the label missing (NaN) for a neighbourhood with no
    # matching venue row, which also turns the column into floats. Skip such
    # rows and cast to int so the colour lookup does not raise TypeError.
    if pd.isna(cluster):
        continue
    cluster = int(cluster)
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # cluster-1 sends cluster 0 to the last colour through negative indexing;
    # kept so the colours match the original rendering.
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)

### This is the final result

In [35]:
# Display the clustered map as the cell output.
map_clusters