# Coursera Capstone Project Data Science
This notebook will be used for the Data Science Coursera capstone project.

In [1]:
import pandas as pd
import numpy as np

# Smoke test: prints a greeting to confirm the cell (and the imports above) ran.
print("Hello Capstone Project Course!")

Hello Capstone Project Course!


### 1) Import necessary modules

In [2]:
#%pip install bs4
from bs4 import BeautifulSoup
import requests
#%pip install urllib3
import urllib3
import re

### 2) Define url and use requests and bs4 to read in the url

In [3]:
# Wikipedia page listing the Canadian postal codes that start with "M".
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
r = requests.get(url, timeout=30)
# Fail here on an HTTP error (404, 5xx, ...) rather than parsing an error page.
r.raise_for_status()
# Name the parser explicitly: without it bs4 picks whichever parser happens to
# be installed (and warns), so results could differ between environments.
HCE = BeautifulSoup(r.content, 'html.parser')

### 3) Find the table with class 'wikitable sortable'

In [4]:
# Locate the first <table> matching the class 'wikitable sortable'.
table = HCE.find('table',{'class':'wikitable sortable'})
# find() returns None when nothing matches; stop here with a clear message
# instead of failing in a later cell with an AttributeError on None.
if table is None:
    raise ValueError("No table with class 'wikitable sortable' found in the page")

### 4) Find headers with 'th' and rows with 'td'

In [5]:
# Column names come from the text of the table's <th> cells.
headers = [th.text for th in table.find_all('th')]
# One list of cell texts per <tr>; a <tr> without any <td> cells (e.g. a
# header-only row) yields an empty list.
rows = [[td.text for td in tr.find_all('td')] for tr in table.find_all('tr')]

### 5) Define pd dataframe with rows and headers and inspect it

In [6]:
# Assemble the scraped rows into a DataFrame labelled with the scraped headers.
df1 = pd.DataFrame(data=rows, columns=headers)
df1.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,,,
1,M1A,Not assigned,Not assigned\n
2,M2A,Not assigned,Not assigned\n
3,M3A,North York,Parkwoods\n
4,M4A,North York,Victoria Village\n


### 6) "\n"s should be removed...

In [7]:
def preproc(dat):
    """Return a copy of ``dat`` with newline characters removed.

    Newlines are stripped from the column labels and from every string cell.
    The input frame itself is not modified.

    Parameters
    ----------
    dat : pandas.DataFrame
        Frame whose column labels and/or string values may contain newlines.

    Returns
    -------
    pandas.DataFrame
        New frame with the newlines removed.
    """
    def _strip_newlines(label):
        # Non-string labels (e.g. integers) are passed through unchanged.
        return label.replace("\n", "") if isinstance(label, str) else label

    # regex=True makes replace() match "\n" inside longer strings rather than
    # only cells that are exactly "\n".
    cleaned = dat.replace(to_replace="\n", value="", regex=True)
    return cleaned.rename(columns=_strip_newlines)

# Strip the newlines from the scraped frame and preview the result.
df1 = preproc(df1)
df1.head()
#df1.dtypes

Unnamed: 0,Postcode,Borough,Neighbourhood
0,,,
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village


### 7) First row should be removed

In [8]:
# Remove the empty first row (all values missing). Dropping all-NA rows
# instead of slicing with iloc[1:] keeps this cell idempotent: re-running it
# no longer removes one more real data row each time.
df1 = df1.dropna(how='all')
df1.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront


### 8) Remove rows with Borough=='Not assigned'

In [9]:
# Keep only the rows that have a borough assigned, then renumber them from 0.
df1 = df1.loc[df1['Borough'] != 'Not assigned'].reset_index(drop=True)
df1.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


### 9) Group by postcode and append different neighbourhoods into new frame

In [10]:
def f(x):
    """Collapse one postcode group into a single record.

    Parameters
    ----------
    x : pandas.DataFrame
        Rows belonging to one group; must contain the columns ``Postcode``,
        ``Borough`` and ``Neighbourhood``.

    Returns
    -------
    pandas.Series
        ``Postcode`` and ``Borough`` taken from the first row, and
        ``Neighbourhood`` holding all neighbourhood names joined with ", ".
    """
    postcode = x['Postcode'].iloc[0]
    borough = x['Borough'].iloc[0]
    joined_names = ', '.join(x['Neighbourhood'])
    record = {
        'Postcode': postcode,
        'Borough': borough,
        'Neighbourhood': joined_names,
    }
    return pd.Series(record)
    
# Collapse to one row per postcode (result is sorted by postcode, index 0..n-1).
# Column-wise aggregation replaces groupby().apply(f): apply() on a frame that
# still contains the grouping column is deprecated since pandas 2.2, and f
# reads that column. 'first' keeps the first borough of each group; the
# neighbourhood names of a group are joined with ", ".
df1 = (
    df1.groupby('Postcode', as_index=False)
       .agg({'Borough': 'first', 'Neighbourhood': ', '.join})
)
df1.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


### 10) Replace 'Not assigned' neighbourhoods with the respective borough name

In [11]:
# Rows whose neighbourhood is still the placeholder 'Not assigned'.
not_assigned = df1['Neighbourhood'] == 'Not assigned'
indices = df1.index[not_assigned].tolist()
print("Not assigned index: ",indices)
# Copy the borough name into those rows with a single .loc write on df1.
# The earlier chained form df1['Neighbourhood'].iloc[...] = ... assigned into
# an intermediate Series: that triggers SettingWithCopyWarning and leaves df1
# unchanged under pandas copy-on-write. It also passed index labels to the
# positional .iloc indexer, which is only correct for a 0..n-1 index.
helper_series = df1.loc[not_assigned, 'Borough'].rename('Neighbourhood')
df1.loc[not_assigned, 'Neighbourhood'] = helper_series

Not assigned index:  [85]


Inspect changed index 85:

In [12]:
# Positional slice of rows 84-86: the changed row 85 and its two neighbours.
df1[84:87]

Unnamed: 0,Postcode,Borough,Neighbourhood
84,M6S,West Toronto,"Runnymede, Swansea"
85,M7A,Queen's Park,Queen's Park
86,M7R,Mississauga,Canada Post Gateway Processing Centre


### 11) Inspect .shape of df

In [13]:
# (number of rows, number of columns) of the cleaned frame.
df1.shape

(103, 3)

### 12) Add the latitude and longitude coordinates of each neighbourhood using a CSV table

In [14]:
# Download the CSV that maps each postal code to a latitude and longitude
# (requires network access) and preview it.
gsd = pd.read_csv('https://cocl.us/Geospatial_data')
gsd.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [15]:
# (number of rows, number of columns) of the coordinates table.
gsd.shape

(103, 3)

#### Shapes match - now test whether the `Postal Code` column of gsd matches the `Postcode` column of df1
... if so, we can simply add the columns without having to re-sort the rows first

In [16]:
# Series.equals: True only when both columns contain the same elements in the
# same positions.
gsd['Postal Code'].equals(df1['Postcode'])

True

#### Merge latitude and longitude columns into df1

In [17]:
# Attach the coordinates by joining on the postal code rather than by row
# position, so the result stays correct even if the two frames are ordered
# differently. Dropping any existing coordinate columns first keeps the cell
# safe to re-run; validate= raises if a postcode is duplicated on either side.
# how='left' keeps every row of df1 in its current order.
df1 = (
    df1.drop(columns=['Latitude', 'Longitude'], errors='ignore')
       .merge(
           gsd.rename(columns={'Postal Code': 'Postcode'})[['Postcode', 'Latitude', 'Longitude']],
           on='Postcode',
           how='left',
           validate='one_to_one',
       )
)
df1.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


#### Compare with the first 3 rows of the example from the Coursera solution (the rows are sorted differently)

In [18]:
# Select the rows for three sample postcodes so they can be compared with the
# reference example.
df1[df1['Postcode'].isin(['M5G','M2H','M4B'])]

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
17,M2H,North York,Hillcrest Village,43.803762,-79.363452
35,M4B,East York,"Woodbine Gardens, Parkview Hill",43.706397,-79.309937
57,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
