## IBM DataScience Capstone Project: web_scraping

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import csv
import json

import os
from dotenv import load_dotenv
load_dotenv()

CLIENT_ID = os.environ["CLIENT_ID"]
CLIENT_SECRET = os.environ["CLIENT_SECRET"]
VERSION = os.environ["VERSION"]

VERSION_2 = os.environ["VERSION_2"]

In the cell above, we have imported all libraries necessary for this project. Additionally, I created an .env file to store my Api credentials and keep them secured.

Getting the source webpage and assigining the variable source to it and iniatilizing the beautifulsoup object to soup

In [2]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

src = requests.get(url).text 
soup = BeautifulSoup(src, 'lxml')

On this function, we scrap the given url in order to extract and parse to obtain the data that is in the table of postal codes and to transform the data into a pandas dataframe 

In [3]:
def url_par(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'lxml')
    
    for table in soup.find_all('table', class_="wikitable sortable"):
    # We search for the table that stores the info we want inside the class "wikitable_..."
        n_columns = 0
        n_rows=0
        column_names = []
        
        for row in table.find_all('tr'):
            td_tags = row.find_all('td')
            if len(td_tags) > 0:
                n_rows+=1
                if n_columns == 0:
                    n_columns = len(td_tags)
                        
            th_tags = row.find_all('th') 
            if len(th_tags) > 0 and len(column_names) == 0:
                for th in th_tags:
                    column_names.append(th.get_text())
            columns = row.find_all('td')
    
        if len(column_names) > 0 and len(column_names) != n_columns:
            raise Exception("Column titles != number columns")
    
        columns = column_names if len(column_names) > 0 else range(0,n_columns)
        
        df = pd.DataFrame(columns = columns, index= range(0,n_rows))
        row_marker = 0
       
        for row in table.find_all('tr'):
            column_marker = 0
            columns = row.find_all('td')
            for column in columns:
                df.iat[row_marker,column_marker] = column.get_text()
                column_marker += 1
            if len(columns) > 0:
                row_marker += 1
                    
        for col in df:
            try:
                df[col] = df[col].astype(float)
                
            except ValueError:
                pass
            
        return df

def cleanup(df):
    df = df[df.Borough != 'Not assigned']
    df = df[df['Neighbourhood\n'] != 'Not assigned']

    df = df.replace('\n',' ', regex=True)
    return df

In [4]:
table_init = url_par(url)
df_fin = cleanup(table_init)
df_fin.head()

Unnamed: 0,Postcode,Borough,Neighbourhood\n
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor


In [5]:
df = df_fin.groupby(['Postcode','Borough'])['Neighbourhood\n'].apply(lambda x: ", ".join(x.astype(str))).reset_index()
df_final = df.sample(frac=1).reset_index(drop=True)

print("The dataframe shape is: ",df_final.shape)
display(df_final.head(10))

The dataframe shape is:  (103, 3)


Unnamed: 0,Postcode,Borough,Neighbourhood\n
0,M9B,Etobicoke,"Cloverdale , Islington , Martin Grove , Prince..."
1,M9V,Etobicoke,"Albion Gardens , Beaumond Heights , Humbergate..."
2,M1B,Scarborough,"Rouge , Malvern"
3,M4S,Central Toronto,Davisville
4,M6L,North York,"Downsview , North Park , Upwood Park"
5,M4G,East York,Leaside
6,M6S,West Toronto,"Runnymede , Swansea"
7,M3L,North York,Downsview West
8,M5N,Central Toronto,Roselawn
9,M1M,Scarborough,"Cliffcrest , Cliffside , Scarborough Village W..."


In the cell above we finally get the table requested for the exercise. Therefore, we have the dataframe with the following requirements matched:
- The df consist of three columns: PostalCode, Borough, and Neighborhood
- We have ignored the cells with a borough that is Not assigned. Same with the neighborhood one.
- As more than one neighborhood can exist in one postal code area, these two rows will be combined into one row with the neighborhoods separated with a comma.

## PART 2

In order to utilize the Foursquare location data, we need to get the latitude and the longitude coordinates of each neighborhood. We can use a link to a csv file that has the geographical coordinates of each postal code: http://cocl.us/Geospatial_data

Therefore, we created the requested dataframe:

In [6]:
postal_codes = df_final['Postcode'].values

url_geo ="http://cocl.us/Geospatial_data"

geo_data=pd.read_csv(url_geo)
geo_data.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [7]:
df_geo = pd.merge(left=df_final, right=geo_data, left_on='Postcode', right_on='Postal Code')
df_geo.head()

Unnamed: 0,Postcode,Borough,Neighbourhood\n,Postal Code,Latitude,Longitude
0,M9B,Etobicoke,"Cloverdale , Islington , Martin Grove , Prince...",M9B,43.650943,-79.554724
1,M9V,Etobicoke,"Albion Gardens , Beaumond Heights , Humbergate...",M9V,43.739416,-79.588437
2,M1B,Scarborough,"Rouge , Malvern",M1B,43.806686,-79.194353
3,M4S,Central Toronto,Davisville,M4S,43.704324,-79.38879
4,M6L,North York,"Downsview , North Park , Upwood Park",M6L,43.713756,-79.490074


## Part 3

Finally, we are going to reproduce the NY City analysis here, at Toronto.

First, we import all the libraries we would like to use:

In [8]:
from geopy.geocoders import Nominatim 
import numpy as np
from pandas.io.json import json_normalize # Tranform JSON file into a pandas dataframe

# Visualisation
import matplotlib.cm as cm
import matplotlib.colors as colores
import folium 


#Modeling
from sklearn.cluster import KMeans

Then, we get the geographical coordinates of Toronto to start the clusterization

In [9]:
address = 'Toronto'

geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


  This is separate from the ipykernel package so we can avoid doing imports until


In [10]:
map_geo = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(df_geo['Latitude'], df_geo['Longitude'], df_geo['Neighbourhood\n']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='green',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_geo)  
    
map_geo

In [18]:
# We check the API status
i = 2
neigh_lat = df_geo.loc[i, 'Latitude'] #Latitude
neigh_lng = df_geo.loc[i, 'Longitude']
radius = 500
LIMIT = 100 

url = f'https://api.foursquare.com/v2/venues/explore?client_id={CLIENT_ID}&client_secret={CLIENT_SECRET}&ll={neigh_lat},{neigh_lng}&v={VERSION}&radius={radius}&limit={LIMIT}'

requests.get(url).status_code

200

In [13]:
lista = [{'id': '4bf58dd8d48988d163941735', 'name': 'Park', 'pluralName': 'Parks', 'shortName': 'Park', 'icon': {'prefix': 'https://ss3.4sqi.net/img/categories_v2/parks_outdoors/park_', 'suffix': '.png'}, 'primary': True}]
for i in lista:
    print(list(i.values())[1])
lista[0]['name']

Park


'Park'

In [20]:
# Here, we define a function to obtain the categories
def get_category(row):        
    if len(row) == 0:
        return None
    else:
        return row[0]['name']

def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return list(filter(lambda dic: dic['name'].values(), categories_list))
        #return categories_list[0]['name']
    

### Here we call the Foursquare API to get the data 

explore_df_list = []

for i, neigh_name in enumerate(df_geo['Neighbourhood\n']):  
    try :         
        ### Getting the data of neighbourhood
        neigh_name = df_geo.loc[i, 'Neighbourhood\n'] #neigh_name
        neigh_lat = df_geo.loc[i, 'Latitude'] #Latitude
        neigh_lng = df_geo.loc[i, 'Longitude'] #Longitude
        radius = 500 
        LIMIT = 100 
        # LIMIT up to 100 venues

        url = f'https://api.foursquare.com/v2/venues/explore?client_id={CLIENT_ID}&client_secret={CLIENT_SECRET}&ll={neigh_lat},{neigh_lng}&v={VERSION}&radius={radius}&limit={LIMIT}'



                # Here we call the API

        results = json.loads(requests.get(url).text)
        results = results['response']['groups'][0]['items']

        near = pd.json_normalize(results) # Flattens JSON normalisation

        # Filtering the columns
        filtered_columns = ["venue.name","venue.categories","venue.location.lat","venue.location.lng"]
        near = near.filter(items=filtered_columns)


        # Renaming the columns
        near = near.rename(columns = {'venue.name':'Name', 'venue.categories':'Category', 'venue.location.lat':'Latitude','venue.location.lng':'Longitude'})

        # Gets the categories

        near['Category'] = near['Category'].apply(get_category)

        # Gets the data required
        for i, name in enumerate(near['Name']):
            explore_df_list.append([neigh_name, neigh_lat, neigh_lng] + near.loc[i, :].values.tolist())
    
    except Exception as e:
        print(e)

'Category'
'Category'
'Category'
'Category'


#### We now turn our results to a DF

In [21]:
tor_df = pd.DataFrame([item for item in explore_df_list])
tor_df.columns = ['Neighbourhood', 'Neighbourhood Latitude', 'Neighbourhood Longitude', 'Venue Name', 'Venue Category', 'Venue Latitude', 'Venue Longitude']
display(tor_df.head())
print(tor_df.shape)

Unnamed: 0,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Venue Name,Venue Category,Venue Latitude,Venue Longitude
0,"Albion Gardens , Beaumond Heights , Humbergate...",43.739416,-79.588437,Subway,Sandwich Place,43.742645,-79.589643
1,"Albion Gardens , Beaumond Heights , Humbergate...",43.739416,-79.588437,Shoppers Drug Mart,Pharmacy,43.741685,-79.584487
2,"Albion Gardens , Beaumond Heights , Humbergate...",43.739416,-79.588437,Popeyes Louisiana Kitchen,Fried Chicken Joint,43.741202,-79.584545
3,"Albion Gardens , Beaumond Heights , Humbergate...",43.739416,-79.588437,The Beer Store,Beer Store,43.741694,-79.584373
4,"Albion Gardens , Beaumond Heights , Humbergate...",43.739416,-79.588437,No Frills,Grocery Store,43.741696,-79.584379


(2231, 7)


### One hot encoding
Now, I added neighborhood column to the df and moved it to the first column.

In [22]:
toronto_onehot = pd.get_dummies(tor_df[['Venue Category']], prefix="", prefix_sep="")

toronto_onehot['Neighbourhood'] = tor_df['Neighbourhood'] 

fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Neighbourhood,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,"Albion Gardens , Beaumond Heights , Humbergate...",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Albion Gardens , Beaumond Heights , Humbergate...",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Albion Gardens , Beaumond Heights , Humbergate...",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Albion Gardens , Beaumond Heights , Humbergate...",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Albion Gardens , Beaumond Heights , Humbergate...",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
toronto_grouped = toronto_onehot.groupby('Neighbourhood').mean().reset_index()
toronto_grouped.head()

Unnamed: 0,Neighbourhood,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,"Adelaide , King , Richmond",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,...,0.0,0.02,0.0,0.0,0.0,0.0,0.01,0.0,0.01,0.0
1,Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Agincourt North , L'Amoreaux East , Milliken ,...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"Albion Gardens , Beaumond Heights , Humbergate...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"Alderwood , Long Branch",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
# With this function, we get the most common venues in our df. This way, we can create columns according 
# to number of top venues
def common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [25]:
num_class_venues = 11
indicators = ['st', 'nd', 'rd']

# Columns as number of class venues
columns = ['Neighbourhood']
for ind in np.arange(num_class_venues):
    columns.append(f'{ind+1} Most-common Type Venue')

# Create a new dataframe
venues_sorted = pd.DataFrame(columns=columns)
venues_sorted['Neighbourhood'] = toronto_grouped['Neighbourhood']

for ind in np.arange(toronto_grouped.shape[0]):
    venues_sorted.iloc[ind, 1:] = common_venues(toronto_grouped.iloc[ind, :], num_class_venues)

venues_sorted.head()

Unnamed: 0,Neighbourhood,1 Most-common Type Venue,2 Most-common Type Venue,3 Most-common Type Venue,4 Most-common Type Venue,5 Most-common Type Venue,6 Most-common Type Venue,7 Most-common Type Venue,8 Most-common Type Venue,9 Most-common Type Venue,10 Most-common Type Venue,11 Most-common Type Venue
0,"Adelaide , King , Richmond",Coffee Shop,Restaurant,Café,Bar,Thai Restaurant,Steakhouse,Sushi Restaurant,Gym,Asian Restaurant,Breakfast Spot,Burger Joint
1,Agincourt,Lounge,Latin American Restaurant,Skating Rink,Breakfast Spot,Donut Shop,Diner,Discount Store,Distribution Center,Dog Run,Doner Restaurant,Dumpling Restaurant
2,"Agincourt North , L'Amoreaux East , Milliken ,...",Park,Bakery,Playground,Doner Restaurant,Dim Sum Restaurant,Diner,Discount Store,Distribution Center,Dog Run,Donut Shop,Department Store
3,"Albion Gardens , Beaumond Heights , Humbergate...",Grocery Store,Pizza Place,Fast Food Restaurant,Beer Store,Sandwich Place,Fried Chicken Joint,Pharmacy,Comic Shop,Concert Hall,Electronics Store,Eastern European Restaurant
4,"Alderwood , Long Branch",Pizza Place,Gym,Sandwich Place,Skating Rink,Coffee Shop,Pub,Pharmacy,Athletics & Sports,Dessert Shop,Dim Sum Restaurant,Diner


### K-Means

In [26]:
k = 5
tor_clusters = toronto_grouped.drop('Neighbourhood', 1)

# Run k-means clustering
kmeans = KMeans(n_clusters = k, random_state = 0).fit(tor_clusters)

# Check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

# Add clustering labels
venues_sorted.insert(0, 'K-Labels', kmeans.labels_)

In [27]:
df_final = df_final.rename(columns = {'Neighbourhood\n':'Neighbourhood'})
df_final = pd.merge(left=df_final, right=geo_data, left_on='Postcode', right_on='Postal Code')
tor_merged = df_final

tor_merged = tor_merged.join(venues_sorted.set_index('Neighbourhood'), on='Neighbourhood')
tor_merged.dropna(inplace = True)
tor_merged['K-Labels'] = tor_merged['K-Labels'].astype(int)
tor_merged.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Postal Code,Latitude,Longitude,K-Labels,1 Most-common Type Venue,2 Most-common Type Venue,3 Most-common Type Venue,4 Most-common Type Venue,5 Most-common Type Venue,6 Most-common Type Venue,7 Most-common Type Venue,8 Most-common Type Venue,9 Most-common Type Venue,10 Most-common Type Venue,11 Most-common Type Venue
1,M9V,Etobicoke,"Albion Gardens , Beaumond Heights , Humbergate...",M9V,43.739416,-79.588437,0,Grocery Store,Pizza Place,Fast Food Restaurant,Beer Store,Sandwich Place,Fried Chicken Joint,Pharmacy,Comic Shop,Concert Hall,Electronics Store,Eastern European Restaurant
2,M1B,Scarborough,"Rouge , Malvern",M1B,43.806686,-79.194353,0,Fast Food Restaurant,Dessert Shop,Event Space,Ethiopian Restaurant,Empanada Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Drugstore,Donut Shop,Doner Restaurant
3,M4S,Central Toronto,Davisville,M4S,43.704324,-79.38879,0,Pizza Place,Sandwich Place,Dessert Shop,Gym,Coffee Shop,Italian Restaurant,Café,Sushi Restaurant,Pharmacy,Brewery,Discount Store
4,M6L,North York,"Downsview , North Park , Upwood Park",M6L,43.713756,-79.490074,0,Basketball Court,Park,Bakery,Construction & Landscaping,Yoga Studio,Discount Store,Distribution Center,Dog Run,Doner Restaurant,Donut Shop,Drugstore
5,M4G,East York,Leaside,M4G,43.70906,-79.363452,0,Coffee Shop,Sporting Goods Shop,Furniture / Home Store,Burger Joint,Breakfast Spot,Fish & Chips Shop,Sports Bar,Beer Store,Shopping Mall,Electronics Store,Juice Bar


### Visualisation

In [28]:
map_Kmeans = folium.Map(location=[latitude, longitude], zoom_start=11)


# Color for the clusters
x = np.arange(k)
y = [i + x + (i*x)**2 for i in range(k)]

colors_list = cm.rainbow(np.linspace(0, 1, len(y)))
rainbow = [colores.rgb2hex(i) for i in colors_list]

# Markers to the map
markers_colors = []
for lat, lon, i, cluster in zip(tor_merged['Latitude'], tor_merged['Longitude'], tor_merged['Neighbourhood'], tor_merged['K-Labels']):
    label = folium.Popup(str(i) + ' (Cluster ' + str(cluster) + ')', parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_Kmeans)
       
map_Kmeans