# Web Scrapping Project - We will be scraping from Wikipedia

### First we need to load all the necessary libraries:

In [2]:
# Importing the libraries
import pandas as pd #library to work with data frames
import numpy as np #library to work with arrays
from bs4 import BeautifulSoup as bs #library to scrape from website
import requests #library to make requests to website.

### Then we are requesting the http to get the text of the website

In [3]:
source = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text

### Now all we have to do is using BeautifulSoup to load the data and build the pandas dataframe.

In [4]:
soup = bs(source, 'lxml')
soup.prettify()
header=[]
htemp=soup.find("table").tr

#Getting the header into a list
counter=0
for x in htemp.find_all("th"):
    header.append(x.text)
    
#Getting the body into a list with different columns
bod=soup.find("table")
body=[]
for i in bod.find_all("td"):
    body.append(i.text)
    #print(i.text)
    
Postcode = body[0:len(body):3]
Borough = body[1:len(body):3]
Neighbourhood = body[2:len(body):3]
Neighbourhood= [s.replace('\n', '') for s in Neighbourhood] #Dropping the \n

#Creating Pandas dataframe
df=pd.DataFrame({'Borough':Borough,
                 'Postcode':Postcode,
                 'Neighbourhood':Neighbourhood})

#Reorganizing the dataframe's columns
df=df[['Postcode','Borough','Neighbourhood']]

#Creating index list with 'Not assigned' boroughs
emptybor=df.index[df['Borough']=='Not assigned'].tolist()

#Dropping the emptyboroughs
df=df.drop(emptybor,axis=0)

#reseting the indexes of the new dataframe
df=df.reset_index(drop=True)

# Finding the indexes for "Not assigned" neighbourhoods:
nan=df.index[df['Neighbourhood']=='Not assigned'].tolist()

#modifying the "Not assigned" neighbouhoods to Boroughs
df.iloc[nan,2] =df.iloc[nan,1]

# Using groupby we can obtain the list of neighbourhoods that are in same postcode.
df = df.groupby(['Postcode','Borough'])['Neighbourhood'].apply(list)

# Reseting the indexes
df=df.reset_index()

#looking at the first 5 items of final dataframe
df


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"[Rouge, Malvern]"
1,M1C,Scarborough,"[Highland Creek, Rouge Hill, Port Union]"
2,M1E,Scarborough,"[Guildwood, Morningside, West Hill]"
3,M1G,Scarborough,[Woburn]
4,M1H,Scarborough,[Cedarbrae]
5,M1J,Scarborough,[Scarborough Village]
6,M1K,Scarborough,"[East Birchmount Park, Ionview, Kennedy Park]"
7,M1L,Scarborough,"[Clairlea, Golden Mile, Oakridge]"
8,M1M,Scarborough,"[Cliffcrest, Cliffside, Scarborough Village West]"
9,M1N,Scarborough,"[Birch Cliff, Cliffside West]"


In [5]:
# Lets see whats the shape of our final dataframe:
df.shape

(103, 3)

The final dataframe contains 103 rows

## Now we are going to find the lattitude and longitude based on our post_codes:

In [15]:
import sys
!{sys.executable} -m pip install geocoder


[33mYou are using pip version 18.1, however version 19.2.3 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [27]:
import geocoder # import geocoder

# initializing the variable to None
lat_lng_coords = None

# loop until we get the coordinates
while(lat_lng_coords is None):
    g = geocoder.google('Mountain View, CA', sensor=True)
    lat_lng_coords = g.latlng

latitude = lat_lng_coords[0]
longitude = lat_lng_coords[1]
latitude

KeyboardInterrupt: 

## geocoder did not quite work. Therefore, below I will use the .csv file to add lattitude and longitude data

In [31]:
import os
os.getcwd()


'/Users/John/Downloads/Data-Scraping---Toronto-neighbourhoods-Wikipedia--master'

In [34]:
latlong=pd.read_csv('Geospatial_Coordinates.csv')
lat=latlong['Latitude']
lng=latlong['Longitude']
df['Latitude']=lat
df['Longitude']=lng
df


Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"[Rouge, Malvern]",43.806686,-79.194353
1,M1C,Scarborough,"[Highland Creek, Rouge Hill, Port Union]",43.784535,-79.160497
2,M1E,Scarborough,"[Guildwood, Morningside, West Hill]",43.763573,-79.188711
3,M1G,Scarborough,[Woburn],43.770992,-79.216917
4,M1H,Scarborough,[Cedarbrae],43.773136,-79.239476
5,M1J,Scarborough,[Scarborough Village],43.744734,-79.239476
6,M1K,Scarborough,"[East Birchmount Park, Ionview, Kennedy Park]",43.727929,-79.262029
7,M1L,Scarborough,"[Clairlea, Golden Mile, Oakridge]",43.711112,-79.284577
8,M1M,Scarborough,"[Cliffcrest, Cliffside, Scarborough Village West]",43.716316,-79.239476
9,M1N,Scarborough,"[Birch Cliff, Cliffside West]",43.692657,-79.264848
