# Scraping a Wikipedia page and transforming its postal-code table into a pandas DataFrame

## Part 1

### Install required libraries 

In [1]:
pip install BeautifulSoup4



In [2]:
pip install lxml



### Importing Libraries

In [3]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
print('Done')

Done


#### Scraping list of postal codes from given Wikipedia page using BeautifulSoup4

In [4]:
# Download the Wikipedia page listing the Canadian postal codes that start with "M".
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of silently parsing an error page.
response.raise_for_status()
extracting_data = response.text
# Parse the HTML with the lxml parser so the table can be navigated in later cells.
wiki_data = BeautifulSoup(extracting_data, 'lxml')

#### Converting content of PostalCode HTML table to dataframe

In [5]:
column_names = ['Postalcode','Borough','Neighborhood']

# Locate the first table inside the article body.
content = wiki_data.find('div', class_='mw-parser-output')
table = content.table.tbody

# Collect one record per table row and build the DataFrame once at the end.
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# copies the whole frame on every iteration.
records = []
for tr in table.find_all('tr'):
    cells = [td.text for td in tr.find_all('td')]
    if len(cells) < len(column_names):
        # Rows without three <td> cells (e.g. a header row built from <th>)
        # carry no postal-code data, so they are skipped rather than recorded
        # with placeholder or stale values.
        continue
    postcode, borough, neighborhood = cells[:3]
    records.append({
        # Strip surrounding whitespace so later comparisons such as
        # Borough != 'Not assigned' match the cell text exactly.
        'Postalcode': postcode.strip(),
        'Borough': borough.strip(),
        'Neighborhood': neighborhood.strip('\n').replace(']',''),
    })

toronto = pd.DataFrame(records, columns=column_names)

In [7]:
# clean dataframe
# Drop rows whose Borough is 'Not assigned' or the integer 0 (a non-text placeholder),
# then renumber the remaining rows from 0.
has_borough = (toronto['Borough'] != 'Not assigned') & (toronto['Borough'] != 0)
toronto = toronto[has_borough].reset_index(drop=True)

# Where the neighborhood is 'Not assigned', use the borough name instead.
# A single .loc assignment is used because the element-wise form
# toronto.iloc[i][2] = ... is chained indexing: it can write to a temporary
# row object and leave the frame itself unchanged.
no_neighborhood = toronto['Neighborhood'] == 'Not assigned'
toronto.loc[no_neighborhood, 'Neighborhood'] = toronto.loc[no_neighborhood, 'Borough']

#### Data Cleaning | Group neighborhoods by postal code and drop rows which contain a 'Not assigned' value

In [8]:
# Join all neighborhoods that share a postal code and borough into one comma-separated string.
neighborhoods_by_code = toronto.groupby(['Postalcode','Borough'])['Neighborhood'].apply(', '.join)
toronto_df = neighborhoods_by_code.reset_index()
# Remove newline characters from every string value.
toronto_df = toronto_df.replace('\n','',regex=True)
# Keep only rows where both the borough and the neighborhood are assigned,
# and renumber the rows from 0.
is_assigned = (toronto_df.Borough != 'Not assigned') & (toronto_df.Neighborhood != 'Not assigned')
toronto_df = toronto_df[is_assigned].reset_index(drop=True)
toronto_df

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ..."
101,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest..."


In [9]:
# Summarize each column: row count, number of distinct values, and the most frequent value.
toronto_df.describe()

Unnamed: 0,Postalcode,Borough,Neighborhood
count,103,103,103
unique,103,10,99
top,M6R,North York,Downsview
freq,1,24,4


## Part 2

### Install required libraries 

In [10]:
pip install geocoder

Collecting geocoder
[?25l  Downloading https://files.pythonhosted.org/packages/4f/6b/13166c909ad2f2d76b929a4227c952630ebaf0d729f6317eb09cbceccbab/geocoder-1.38.1-py2.py3-none-any.whl (98kB)
[K     |███▎                            | 10kB 18.3MB/s eta 0:00:01[K     |██████▋                         | 20kB 6.3MB/s eta 0:00:01[K     |██████████                      | 30kB 7.7MB/s eta 0:00:01[K     |█████████████▎                  | 40kB 8.4MB/s eta 0:00:01[K     |████████████████▋               | 51kB 6.7MB/s eta 0:00:01[K     |████████████████████            | 61kB 7.3MB/s eta 0:00:01[K     |███████████████████████▎        | 71kB 7.7MB/s eta 0:00:01[K     |██████████████████████████▋     | 81kB 8.6MB/s eta 0:00:01[K     |██████████████████████████████  | 92kB 9.1MB/s eta 0:00:01[K     |████████████████████████████████| 102kB 5.8MB/s 
Collecting ratelim
  Downloading https://files.pythonhosted.org/packages/f2/98/7e6d147fd16a10a5f821db6e25f192265d6ecca3d82957a4fdd592cad4

### Importing Libraries

In [11]:
import pandas as pd
import numpy as np
import geocoder
print('Done')

Done


In [12]:
# Adding Columns Latitude & Longitude
# The source frame is empty, so assigning its columns leaves every row of
# toronto_df as NaN; these are placeholders for the coordinates.
df_co_ords = pd.DataFrame(columns=['Latitude', 'Longitude'])
for coord_column in ('Latitude', 'Longitude'):
    toronto_df[coord_column] = df_co_ords[coord_column]
toronto_df

Unnamed: 0,Postalcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",,
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",,
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",,
3,M1G,Scarborough,Woburn,,
4,M1H,Scarborough,Cedarbrae,,
...,...,...,...,...,...
98,M9N,York,Weston,,
99,M9P,Etobicoke,Westmount,,
100,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ...",,
101,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest...",,


In [18]:
def get_latilong(postal_code, max_attempts=10, retry_delay=1.0):
    """Look up the latitude/longitude of a Toronto postal code via ArcGIS.

    Parameters
    ----------
    postal_code : str
        Postal code such as 'M1B'; it is geocoded as
        '<postal_code>, Toronto, Ontario'.
    max_attempts : int, optional
        Maximum number of geocoding requests before giving up. The lookup
        is retried because the service can return no coordinates.
    retry_delay : float, optional
        Seconds to wait between failed attempts.

    Returns
    -------
    list
        The ``latlng`` value reported by the geocoder, i.e.
        ``[latitude, longitude]``.

    Raises
    ------
    RuntimeError
        If no coordinates are returned within ``max_attempts`` requests.
        This replaces an unbounded retry loop that would never terminate
        when the service is unreachable or the code cannot be resolved.
    """
    import time  # local import: only needed for the pause between retries

    query = '{}, Toronto, Ontario'.format(postal_code)
    for attempt in range(1, max_attempts + 1):
        g = geocoder.arcgis(query, timeout=100)
        lati_long_coords = g.latlng
        if lati_long_coords is not None:
            return lati_long_coords
        if attempt < max_attempts:
            time.sleep(retry_delay)
    raise RuntimeError(
        'No coordinates returned for {!r} after {} attempts'.format(query, max_attempts)
    )
    
# Example call (left disabled): get_latilong(toronto_df.Postalcode[102])
# Show the postal code stored under index label 0.
toronto_df.Postalcode[0]

'M1B'

In [21]:
# Number of rows in toronto_df, i.e. how many postal codes will be geocoded.
toronto_df.shape[0]

103

In [22]:
# Geocode every postal code in row order; each entry is the value returned by
# get_latilong for that row. This issues one lookup per row.
df_list_coord = [
    get_latilong(toronto_df.Postalcode[row])
    for row in range(toronto_df.shape[0])
]

In [28]:
# Display the collected coordinate pairs for inspection.
df_list_coord

[[43.81153000000006, -79.19551999999999],
 [43.78564000000006, -79.15870999999999],
 [43.765750000000025, -79.17519999999996],
 [43.768200000000036, -79.21760999999998],
 [43.769690000000026, -79.23943999999995],
 [43.74309000000005, -79.23525999999998],
 [43.72861000000006, -79.26366999999993],
 [43.714060000000075, -79.28411999999997],
 [43.72360000000003, -79.23495999999994],
 [43.69539000000003, -79.26193999999998],
 [43.75998000000004, -79.26836999999995],
 [43.750710000000026, -79.30055999999996],
 [43.79394000000008, -79.26710999999995],
 [43.784730000000025, -79.29936999999995],
 [43.817810000000065, -79.28023999999994],
 [43.80052000000006, -79.32073999999994],
 [43.83422000000007, -79.21669999999995],
 [43.802850000000035, -79.35620999999998],
 [43.780970000000025, -79.34780999999998],
 [43.78102000000007, -79.38059999999996],
 [43.757220000000075, -79.37973999999997],
 [43.79135000000008, -79.41355999999996],
 [43.76714000000004, -79.40706999999998],
 [43.747870000000034, -7

In [None]:
# Write the geocoded coordinates into the frame with one assignment per column.
# The element-wise form toronto_df.Latitude[i] = ... is chained indexing:
# pandas may apply it to a temporary copy (SettingWithCopyWarning), and it has
# no effect when Copy-on-Write is enabled.
assert len(df_list_coord) == len(toronto_df), 'expected one coordinate pair per row'
toronto_df['Latitude'] = [pair[0] for pair in df_list_coord]
toronto_df['Longitude'] = [pair[1] for pair in df_list_coord]

In [33]:
# Preview the first rows to check the Latitude and Longitude columns.
toronto_df.head()

Unnamed: 0,Postalcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.8115,-79.1955
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.7856,-79.1587
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.7658,-79.1752
3,M1G,Scarborough,Woburn,43.7682,-79.2176
4,M1H,Scarborough,Cedarbrae,43.7697,-79.2394


In [31]:
# Save the table to 'toronto_2.csv'; index=False leaves the row index out of the file.
toronto_df.to_csv('toronto_2.csv',index=False)