# Peer-Graded Assignment: Week 3 Part 1


### Step1: Setting up the environment

In [36]:
import numpy as np
import pandas as pd
# Passing None removes the display limit, so DataFrames are shown with every column and row.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

from bs4 import BeautifulSoup
from urllib.request import urlopen

import requests # library to handle requests
# NOTE(review): newer pandas releases expose this as pandas.json_normalize — confirm against the installed version.
from pandas.io.json import json_normalize # transform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Libraries imported.


### Step2: Parsing the HTML file/table

In [37]:
# Wikipedia page that lists the Canadian postal codes starting with "M".
url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Read the page inside a context manager so the HTTP response is closed
# as soon as the body has been downloaded.
with urlopen(url) as response:
    page=response.read().decode('utf-8')
soup=BeautifulSoup(page,'html.parser')

# NOTE(review): this takes the first <table> in the body and assumes the
# served markup already contains an explicit <tbody>; if the page layout
# changes, this lookup is the line to revisit.
wiki_table=soup.body.table.tbody

### Step3: Extract Data from the Wikipedia Page to a pandas Dataframe

In [38]:
def get_cell(element):
    """Return the text of every <td> cell found under ``element``.

    Parameters
    ----------
    element : object exposing ``find_all('td')`` (e.g. a BeautifulSoup <tr> tag)
        The row whose cells are read.

    Returns
    -------
    list of str
        One entry per <td>. When the cell contains a link with non-empty
        text, that link text is used as-is; otherwise the cell's own text is
        used with surrounding whitespace stripped.
    """
    row=[]

    for cell in element.find_all('td'):
        link=cell.a
        if link is not None and link.text:
            row.append(link.text)
            continue

        text=cell.string
        if text is None:
            # ``.string`` is None when the cell has several children (or only
            # an empty link); fall back to the concatenated text instead of
            # failing on ``None.strip()``.
            text=cell.get_text()
        row.append(text.strip())

    return row
                

In [39]:
def get_row():
    """Collect the three-cell rows of the module-level ``wiki_table``.

    Every <tr> of ``wiki_table`` is converted with ``get_cell``; rows that do
    not yield exactly three values are skipped.

    Returns
    -------
    list of list
        The kept rows, in document order.
    """
    parsed_rows=(get_cell(tr) for tr in wiki_table.find_all('tr'))
    return [cells for cells in parsed_rows if len(cells)==3]

In [40]:
# Column labels for the three values kept from each table row.
columns=['Postcode','Borough','Neighborhood']

# Raw scraped rows, loaded into a DataFrame without any cleaning.
data=get_row()
df=pd.DataFrame(data=data,columns=columns)
df.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


The code above only scrapes the data from the Wikipedia page as-is. In the next step we clean the data according to the guidelines given in the assignment instructions.

### Step4: Cleaning the Data as per Instructions

In [41]:
# Keep only rows whose borough is assigned, order them by postcode and then
# borough, and renumber the rows from 0 without keeping the old index.
assigned_rows=df.loc[df['Borough'] != 'Not assigned']
df1=(
    assigned_rows
    .sort_values(by=['Postcode','Borough'])
    .reset_index(drop=True)
)

df1.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,Rouge
1,M1B,Scarborough,Malvern
2,M1C,Scarborough,Highland Creek
3,M1C,Scarborough,Rouge Hill
4,M1C,Scarborough,Port Union


In [45]:
# The following code is for the 'Queen's Park' anomaly: rows that have a
# borough but whose neighborhood is 'Not assigned' take the borough name as
# their neighborhood.
# Select the rows by label and column name rather than feeding index labels to
# .iloc, so the assignment stays correct even if df1's index is not 0..n-1.
unassigned=df1['Neighborhood']=='Not assigned'
df1.loc[unassigned,'Neighborhood']=df1.loc[unassigned,'Borough']

df1.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,Rouge
1,M1B,Scarborough,Malvern
2,M1C,Scarborough,Highland Creek
3,M1C,Scarborough,Rouge Hill
4,M1C,Scarborough,Port Union


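Two rules remain. Where a postcode has a borough but no assigned neighborhood, the neighborhood takes the borough's name (handled in the cell above). A single postcode can also belong to one borough while covering several neighborhoods; those rows are combined into one row per postcode, with the neighborhood names separated by commas. This can be coded as follows: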

In [46]:
# Collapse the neighborhoods of each postcode into one comma-separated string.
aggregations={'Neighborhood':lambda names:','.join(names)}
g=(
    df1
    .groupby('Postcode')
    .agg(aggregations)
    .reset_index()
)

# Postcode/Borough pairs (one per original row) joined to the combined names.
x=df1.drop(columns='Neighborhood')
m=x.merge(g,on='Postcode',how='left')

# Repeated rows are removed; the surviving rows keep their original labels.
df2=m.drop_duplicates()
df2.to_csv('ADSC_Week3_Part1.csv',index=False)
df2

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
2,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
5,M1E,Scarborough,"Guildwood,Morningside,West Hill"
8,M1G,Scarborough,Woburn
9,M1H,Scarborough,Cedarbrae
10,M1J,Scarborough,Scarborough Village
11,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park"
14,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge"
17,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West"
20,M1N,Scarborough,"Birch Cliff,Cliffside West"


In [47]:
# (rows, columns) of the de-duplicated postcode table built above.
df2.shape

(103, 3)