# Web Scraping Toronto neighbourhoods data from Wikipedia

__1. Web Scraping Wikipedia Table__

In [104]:
import requests
import lxml.html as lh
import pandas as pd

In [105]:
url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
page = requests.get(url) #Create a handle, page, to handle the contents of the website
doc = lh.fromstring(page.content) #Store the contents of the website under doc
tr_elements = doc.xpath('//tr') #tr_elements = doc.xpath('//tr') 

In [106]:
#Check the length of the first 12 rows
[len(T) for T in tr_elements[:12]]

[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]

In [107]:
tr_elements = doc.xpath('//tr')
col=[]
i=0
for t in tr_elements[0]: #For each row, store each first element (header) and an empty list
    i+=1
    name=t.text_content()
    print ('%d:"%s"'%(i,name))
    col.append((name,[]))

1:"Postcode"
2:"Borough"
3:"Neighbourhood
"


In [108]:
#Since the first row is the header, data is stored on the second row onwards
for j in range(1,len(tr_elements)):
    #T is our j'th row
    T=tr_elements[j]
    
    #If row is not of size 3, the //tr data is not from our table 
    if len(T)!=3:
        break
    
    #i is the index of our column
    i=0
    
    #Iterate through each element of the row
    for t in T.iterchildren():
        data=t.text_content() 
        #Check if row is empty
        if i>0:
        #Convert any numerical value to integers
            try:
                data=int(data)
            except:
                pass
        #Append the data to the empty list of the i'th column
        col[i][1].append(data)
        #Increment i for the next column
        i+=1

In [109]:
[len(C) for (title,C) in col]

[288, 288, 288]

In [110]:
Dict={title:column for (title,column) in col}
df=pd.DataFrame(Dict)

__2. Dataframe manipulation and computations - Data Transform__

In [111]:
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned\n
1,M2A,Not assigned,Not assigned\n
2,M3A,North York,Parkwoods\n
3,M4A,North York,Victoria Village\n
4,M5A,Downtown Toronto,Harbourfront\n


In [112]:
df.describe()

Unnamed: 0,Postcode,Borough,Neighbourhood
count,288,288,288
unique,180,12,209
top,M8Y,Not assigned,Not assigned\n
freq,8,77,78


In [113]:
for col in df.columns: 
    print(col) 
df.columns

Postcode
Borough
Neighbourhood



Index(['Postcode', 'Borough', 'Neighbourhood\n'], dtype='object')

In [114]:
df.rename(columns={'Neighbourhood\n':'Neighbourhood'}, inplace=True) #last column header had new line escape character, so rename that to remove new line character

In [115]:
df.columns

Index(['Postcode', 'Borough', 'Neighbourhood'], dtype='object')

In [116]:
df2 = df[df['Borough'] != 'Not assigned'] #copy a dataframe without unassigned boroughs to the main dataframe

In [117]:
df2 #to check all boroughs have been assigned

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods\n
3,M4A,North York,Victoria Village\n
4,M5A,Downtown Toronto,Harbourfront\n
5,M5A,Downtown Toronto,Regent Park\n
6,M6A,North York,Lawrence Heights\n
7,M6A,North York,Lawrence Manor\n
8,M7A,Queen's Park,Not assigned\n
10,M9A,Etobicoke,Islington Avenue\n
11,M1B,Scarborough,Rouge\n
12,M1B,Scarborough,Malvern\n


In [118]:
df3 = df2[df2['Neighbourhood'] == 'Not assigned\n'] #copy a dataframe with assigned boroughs but unassigned neighborhoods to a new dataframe

In [119]:
df3

Unnamed: 0,Postcode,Borough,Neighbourhood
8,M7A,Queen's Park,Not assigned\n


In [120]:
df3['Neighbourhood'] = df3['Borough'] #assign unassigned neighborhood in new dataframe to its borough value

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [121]:
df3 #check

Unnamed: 0,Postcode,Borough,Neighbourhood
8,M7A,Queen's Park,Queen's Park


In [122]:
com_df = pd.concat([df2, df3], axis=0) #combine original dataframe and previous data into a combined final dataframe

In [123]:
com_df

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods\n
3,M4A,North York,Victoria Village\n
4,M5A,Downtown Toronto,Harbourfront\n
5,M5A,Downtown Toronto,Regent Park\n
6,M6A,North York,Lawrence Heights\n
7,M6A,North York,Lawrence Manor\n
8,M7A,Queen's Park,Not assigned\n
10,M9A,Etobicoke,Islington Avenue\n
11,M1B,Scarborough,Rouge\n
12,M1B,Scarborough,Malvern\n


___I noticed that all the neighborhood values of the dataframe also have a new line escape character. What follows is the code to remove the escape character at the end of all neighborhood column values___

In [124]:
l = com_df.ix[:,2].tolist() #assign all neighborhood column values to a list

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  if __name__ == '__main__':


In [125]:
for i in range(len(l)-1): #loop through contents of list (last element not required since it was the neighborhood we assigned to its borough) and slice out final two characters
    l[i] = l[i][0:-1]

In [126]:
l #check that all values have no escape characters at the end

['Parkwoods',
 'Victoria Village',
 'Harbourfront',
 'Regent Park',
 'Lawrence Heights',
 'Lawrence Manor',
 'Not assigned',
 'Islington Avenue',
 'Rouge',
 'Malvern',
 'Don Mills North',
 'Woodbine Gardens',
 'Parkview Hill',
 'Ryerson',
 'Garden District',
 'Glencairn',
 'Cloverdale',
 'Islington',
 'Martin Grove',
 'Princess Gardens',
 'West Deane Park',
 'Highland Creek',
 'Rouge Hill',
 'Port Union',
 'Flemingdon Park',
 'Don Mills South',
 'Woodbine Heights',
 'St. James Town',
 'Humewood-Cedarvale',
 'Bloordale Gardens',
 'Eringate',
 'Markland Wood',
 'Old Burnhamthorpe',
 'Guildwood',
 'Morningside',
 'West Hill',
 'The Beaches',
 'Berczy Park',
 'Caledonia-Fairbanks',
 'Woburn',
 'Leaside',
 'Central Bay Street',
 'Christie',
 'Cedarbrae',
 'Hillcrest Village',
 'Bathurst Manor',
 'Downsview North',
 'Wilson Heights',
 'Thorncliffe Park',
 'Adelaide',
 'King',
 'Richmond',
 'Dovercourt Village',
 'Dufferin',
 'Scarborough Village',
 'Fairview',
 'Henry Farm',
 'Oriole',
 'Nor

In [127]:
len(l) 

212

In [128]:
len(com_df) #check if length of df matches length of list with escape chars removed

212

In [129]:
com_df['Neighbourhood'] = l #assign neighbourhood column values to our list

In [130]:
com_df

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


___The following is the code to combine all neighbourhoods with the same postal code into 1 record in the dataframe. Using 'groupby' and 'agg'___

In [131]:
com_df = com_df.groupby('Postcode').agg({'Neighbourhood': ', '.join, 'Borough':'first' }).reset_index() #

In [132]:
com_df #check that neighborhoods have been combined

Unnamed: 0,Postcode,Neighbourhood,Borough
0,M1B,"Rouge, Malvern",Scarborough
1,M1C,"Highland Creek, Rouge Hill, Port Union",Scarborough
2,M1E,"Guildwood, Morningside, West Hill",Scarborough
3,M1G,Woburn,Scarborough
4,M1H,Cedarbrae,Scarborough
5,M1J,Scarborough Village,Scarborough
6,M1K,"East Birchmount Park, Ionview, Kennedy Park",Scarborough
7,M1L,"Clairlea, Golden Mile, Oakridge",Scarborough
8,M1M,"Cliffcrest, Cliffside, Scarborough Village West",Scarborough
9,M1N,"Birch Cliff, Cliffside West",Scarborough


In [133]:
com_df = com_df[['Postcode', 'Borough', 'Neighbourhood']] #rearrange columns 

In [134]:
com_df

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [137]:
print(com_df.shape)

(103, 3)


___Read in csv file and put into dataframe___

In [149]:
coords = pd.read_csv('https://cocl.us/Geospatial_data')
coords

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


In [171]:
com_df['Longitude'] = coords['Longitude'].values #assign longitude values in csv dataframe to our dataframe

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [172]:
com_df['Latitude'] = coords['Latitude'].values #assign latitude values in csv dataframe to our dataframe

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [173]:
com_df

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848
