# Data Scraping

In [55]:
import pandas as pd
import numpy as np
import requests

from bs4 import BeautifulSoup

In [66]:
# Download the Wikipedia page that lists the Toronto ("M") postal codes.
# The timeout stops the cell from hanging indefinitely and raise_for_status()
# reports an HTTP failure instead of silently parsing an error page.
response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
response.raise_for_status()
source = response.text

soup = BeautifulSoup(source, 'html5lib')

# Maps postal code -> {'borough': ..., 'neighborhoods': ...}
postal_codes_dict = {}
for table_cell in soup.find_all('td'):
    try:
        postal_code = table_cell.p.b.text  # get the postal code
        neighborhoods_data = table_cell.span.text  # get the rest of the data in the cell
    except AttributeError:
        # The cell has no <p><b> or <span> child, so there is nothing to read from it.
        continue

    # Unassigned postal codes are not recorded.
    if neighborhoods_data == 'Not assigned':
        continue

    # Text before the first '(' is the borough; text between the first '(' and
    # the next '(' (or the end of the cell) is the '/'-separated neighborhood list.
    parts = neighborhoods_data.split('(')
    borough = parts[0]
    if len(parts) > 1:
        neighborhoods_names = parts[1].replace(')', ' ').split('/')
        neighborhoods_clean = ', '.join(name.strip() for name in neighborhoods_names)
    else:
        # No parenthesised list: the borough name doubles as the neighborhood.
        borough = borough.strip('\n')
        neighborhoods_clean = borough

    postal_codes_dict[postal_code] = {
        'borough': borough,
        'neighborhoods': neighborhoods_clean,
    }

# Build the dataframe in a single call; row-by-row DataFrame.append is quadratic
# and is no longer available in pandas 2.x.
columns = ['PostalCode', 'Borough', 'Neighborhood']
toronto_data = pd.DataFrame(
    [
        {
            'PostalCode': postal_code,
            'Borough': entry['borough'],
            'Neighborhood': entry['neighborhoods'],
        }
        for postal_code, entry in postal_codes_dict.items()
    ],
    columns=columns,
)

# print number of rows of dataframe
toronto_data.shape[0]

103

# Geospatial

In [68]:
import pandas as pd
from bs4 import BeautifulSoup
import urllib.request,urllib.parse , requests
import re 
import geocoder

In [69]:
from io import StringIO

# Fetch the Wikipedia page; the context manager closes the HTTP response.
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
with urllib.request.urlopen(url) as response:
    data = response.read()
soup = BeautifulSoup(data, 'lxml')
soup_data = soup.find_all('table')

# Wrap the markup in StringIO: read_html accepts file-like input, whereas
# passing a literal HTML string is deprecated in recent pandas releases.
df = pd.read_html(StringIO(str(soup_data)))[0]

data_dict = {
    'PostalCode': [],
    'Borough': [],
    'Neighborhood': [],
}

# Raw strings avoid the invalid '\(' / '\)' escape sequences; compiling once
# keeps the patterns out of the inner loop.
borough_pattern = re.compile(r'^[^(]+')              # text before the first '('
neighborhoods_pattern = re.compile(r'(?<=[(])[^)]+')  # text following a '(' up to ')'

# Walk the table column by column, top to bottom within each column.
for column in df.columns:
    for cell in df[column]:
        if not isinstance(cell, str):
            # Non-text cells (e.g. NaN) cannot be sliced or matched; skip them.
            continue

        postal_code = cell[0:3]  # the first three characters are the postal code
        details = cell[3:]

        borough_match = borough_pattern.search(details)
        if borough_match is None:
            continue
        borough = borough_match.group(0)
        if borough == 'Not assigned':
            continue

        neighborhoods_match = neighborhoods_pattern.search(details)
        if neighborhoods_match is None:
            # No parenthesised list: use the borough name as the neighborhood
            # rather than failing with IndexError.
            neighborhoods = [borough]
        else:
            neighborhoods = neighborhoods_match.group(0).split(' / ')

        data_dict['PostalCode'].append(postal_code)
        data_dict['Borough'].append(borough)
        data_dict['Neighborhood'].append(','.join(neighborhoods))

dataframe = pd.DataFrame(data_dict)

In [70]:
url = 'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DS0701EN-SkillsNetwork/labs_v1/Geospatial_Coordinates.csv'

# Download the coordinates CSV; fail loudly on an HTTP error instead of
# saving an error page to disk.
r = requests.get(url, allow_redirects=True, timeout=30)
r.raise_for_status()

# The context manager flushes and closes the file before it is read back.
with open('GeoSpatialDataset.csv', 'wb') as csv_file:
    csv_file.write(r.content)

geodata = pd.read_csv('GeoSpatialDataset.csv')
# Rename the key column so it matches the column name used in `dataframe`.
geodata = geodata.rename(columns={'Postal Code': 'PostalCode'})
# Default (inner) merge: keeps postal codes present in both frames.
full_data = pd.merge(dataframe, geodata, on='PostalCode')
full_data.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern,Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill,Port Union,Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


# Exploration in Toronto

In [71]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import urllib.request,urllib.parse , requests
import re 
import geocoder
import folium
import matplotlib
import matplotlib.cm as cm
from geopy.geocoders import Nominatim
from sklearn.cluster import KMeans

In [72]:
from io import StringIO

# Fetch the Wikipedia page; the context manager closes the HTTP response.
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
with urllib.request.urlopen(url) as response:
    data = response.read()
soup = BeautifulSoup(data, 'lxml')
soup_data = soup.find_all('table')

# Wrap the markup in StringIO: read_html accepts file-like input, whereas
# passing a literal HTML string is deprecated in recent pandas releases.
df = pd.read_html(StringIO(str(soup_data)))[0]

data_dict = {
    'PostalCode': [],
    'Borough': [],
    'Neighborhood': [],
}

# Raw strings avoid the invalid '\(' / '\)' escape sequences; compiling once
# keeps the patterns out of the inner loop.
borough_pattern = re.compile(r'^[^(]+')              # text before the first '('
neighborhoods_pattern = re.compile(r'(?<=[(])[^)]+')  # text following a '(' up to ')'

# Walk the table column by column, top to bottom within each column.
for column in df.columns:
    for cell in df[column]:
        if not isinstance(cell, str):
            # Non-text cells (e.g. NaN) cannot be sliced or matched; skip them.
            continue

        postal_code = cell[0:3]  # it is a string: the first three characters of the cell
        details = cell[3:]

        borough_match = borough_pattern.search(details)
        if borough_match is None:
            continue
        borough = borough_match.group(0)
        if borough == 'Not assigned':
            continue

        neighborhoods_match = neighborhoods_pattern.search(details)
        if neighborhoods_match is None:
            # No parenthesised list: use the borough name as the neighborhood
            # rather than failing with IndexError.
            neighborhoods = [borough]
        else:
            neighborhoods = neighborhoods_match.group(0).split(' / ')

        data_dict['PostalCode'].append(postal_code)
        data_dict['Borough'].append(borough)
        data_dict['Neighborhood'].append(','.join(neighborhoods))

dataframe = pd.DataFrame(data_dict)

In [73]:
url = 'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DS0701EN-SkillsNetwork/labs_v1/Geospatial_Coordinates.csv'

# Download the coordinates CSV; fail loudly on an HTTP error instead of
# saving an error page to disk.
r = requests.get(url, allow_redirects=True, timeout=30)
r.raise_for_status()

# The context manager flushes and closes the file before it is read back.
with open('GeoSpatialDataset.csv', 'wb') as csv_file:
    csv_file.write(r.content)

geodata = pd.read_csv('GeoSpatialDataset.csv')
# Rename the key column so it matches the column name used in `dataframe`.
geodata = geodata.rename(columns={'Postal Code': 'PostalCode'})
# Default (inner) merge: keeps postal codes present in both frames.
full_data = pd.merge(dataframe, geodata, on='PostalCode')
full_data.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern,Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill,Port Union,Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [74]:
# Look up Toronto with Nominatim and centre the map on the returned coordinates.
geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode('toronto')
toronto_latitude, toronto_longitude = location.latitude, location.longitude

toronto_map = folium.Map(
    location=[toronto_latitude, toronto_longitude],
    zoom_start=11,
)

# Draw one circle marker per row of full_data, labelled "<neighborhood>, <borough>".
marker_rows = zip(
    full_data['Latitude'],
    full_data['Longitude'],
    full_data['Borough'],
    full_data['Neighborhood'],
)
for lat, lng, borough, neighborhood in marker_rows:
    label = folium.Popup(f'{neighborhood}, {borough}', parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        fill=True,
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(toronto_map)

toronto_map

In [75]:
# Keep only the rows whose borough is Scarborough and renumber them from zero.
is_scarborough = full_data['Borough'] == 'Scarborough'
scarborough_df = full_data[is_scarborough].reset_index(drop=True)
scarborough_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern,Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill,Port Union,Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [76]:
# Select the Scarborough rows of full_data and reset the index so it starts at zero.
in_scarborough = full_data['Borough'] == 'Scarborough'
scarborough_df = full_data[in_scarborough].reset_index(drop=True)
scarborough_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern,Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill,Port Union,Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [77]:
# Create the geocoder under the name that is actually used on the next line,
# so this cell does not depend on a `geolocator` left over from another cell.
geolocator = Nominatim(user_agent="scarborough_explorer")
location = geolocator.geocode('Scarborough,Toronto')
borough_latitude = location.latitude
borough_longitude = location.longitude

# Centre the map on the coordinates returned for Scarborough.
scarborough_map = folium.Map(location=[borough_latitude, borough_longitude], zoom_start=11)

# add one circle marker per row of scarborough_df, labelled with its neighborhood text
for lat, lng, neighborhood in zip(scarborough_df['Latitude'], scarborough_df['Longitude'], scarborough_df['Neighborhood']):
    label = folium.Popup(f'{neighborhood}', parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        fill=True,
        fill_opacity=0.7,
        parse_html=False,
    ).add_to(scarborough_map)

scarborough_map