## IBM DataScience Capstone Project: web_scraping

In [9]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import csv

import os
from dotenv import load_dotenv
load_dotenv()

CLIENT_ID = os.environ["CLIENT_ID"]
CLIENT_SECRET = os.environ["CLIENT_SECRET"]


Getting the source webpage and assigining the variable source to it and iniatilizing the beautifulsoup object to soup

In [2]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

src = requests.get(url).text 
soup = BeautifulSoup(src, 'lxml')

In [3]:
def url_par(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'lxml')
    
    for table in soup.find_all('table', class_="wikitable sortable"):
    # We search for the table that stores the info we want inside the class "wikitable_..."
        n_columns = 0
        n_rows=0
        column_names = []
        
        for row in table.find_all('tr'):
            td_tags = row.find_all('td')
            if len(td_tags) > 0:
                n_rows+=1
                if n_columns == 0:
                    n_columns = len(td_tags)
                        
            th_tags = row.find_all('th') 
            if len(th_tags) > 0 and len(column_names) == 0:
                for th in th_tags:
                    column_names.append(th.get_text())
            columns = row.find_all('td')
    
        if len(column_names) > 0 and len(column_names) != n_columns:
            raise Exception("Column titles != number columns")
    
        columns = column_names if len(column_names) > 0 else range(0,n_columns)
        
        df = pd.DataFrame(columns = columns, index= range(0,n_rows))
        row_marker = 0
       
        for row in table.find_all('tr'):
            column_marker = 0
            columns = row.find_all('td')
            for column in columns:
                df.iat[row_marker,column_marker] = column.get_text()
                column_marker += 1
            if len(columns) > 0:
                row_marker += 1
                    
        for col in df:
            try:
                df[col] = df[col].astype(float)
                
            except ValueError:
                pass
            
        return df

def cleanup(df):
    df = df[df.Borough != 'Not assigned']
    df = df[df['Neighbourhood\n'] != 'Not assigned']

    df = df.replace('\n',' ', regex=True)
    return df

In [5]:
table_init = url_par(url)
df_fin = cleanup(table_init)
df_fin.head()

Unnamed: 0,Postcode,Borough,Neighbourhood\n
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor


In [16]:
df = df_fin.groupby(['Postcode','Borough'])['Neighbourhood\n'].apply(lambda x: ", ".join(x.astype(str))).reset_index()
df_final = df.sample(frac=1).reset_index(drop=True)
df_final.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood\n
0,M7Y,East Toronto,Business Reply Mail Processing Centre 969 East...
1,M1E,Scarborough,"Guildwood , Morningside , West Hill"
2,M9A,Etobicoke,Islington Avenue
3,M4S,Central Toronto,Davisville
4,M2M,North York,"Newtonbrook , Willowdale"
5,M5P,Central Toronto,"Forest Hill North , Forest Hill West"
6,M3K,North York,"CFB Toronto , Downsview East"
7,M3N,North York,Downsview Northwest
8,M1V,Scarborough,"Agincourt North , L'Amoreaux East , Milliken ,..."
9,M1X,Scarborough,Upper Rouge


PART 2

In [17]:
postal_codes = df_final['Postcode'].values

url_geo ="http://cocl.us/Geospatial_data"

geo_data=pd.read_csv(url_geo)
geo_data.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [19]:
df_geo = pd.merge(left=df_final, right=geo_data, left_on='Postcode', right_on='Postal Code')
df_geo.head()

Unnamed: 0,Postcode,Borough,Neighbourhood\n,Postal Code,Latitude,Longitude
0,M7Y,East Toronto,Business Reply Mail Processing Centre 969 East...,M7Y,43.662744,-79.321558
1,M1E,Scarborough,"Guildwood , Morningside , West Hill",M1E,43.763573,-79.188711
2,M9A,Etobicoke,Islington Avenue,M9A,43.667856,-79.532242
3,M4S,Central Toronto,Davisville,M4S,43.704324,-79.38879
4,M2M,North York,"Newtonbrook , Willowdale",M2M,43.789053,-79.408493


In [22]:
from geopy.geocoders import Nominatim 
import numpy as np
from pandas.io.json import json_normalize # Tranform JSON file into a pandas dataframe

# Visualisation
import matplotlib.cm as cm
import matplotlib.colors as colors
import folium 


#Modeling
from sklearn.cluster import KMeans

In [23]:
address = 'Toronto'

geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

  This is separate from the ipykernel package so we can avoid doing imports until


The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [26]:
map_geo = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(df_geo['Latitude'], df_geo['Longitude'], df_geo['Neighbourhood\n']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='green',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_geo)  
    
map_geo