# Part 1- Scrapping data from Wikipedia 

### Libraries needed and may need for assignment 

In [2]:
import requests  
import pandas as pd
import matplotlib.pyplot as plt

### Scrapping data from wikipedia 

In [3]:
# url of wikipedia page from which you want to scrap tabular data.
url1 = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# determine the index of your table
df = pd.read_html(url1, header = 0)[0]

# Igonre cells with not assigned borough 
df = df[df.Borough != 'Not assigned']

### Combining rows where more than one neighbourhood has the same postal code and separating values with a comma. 

In [4]:
df = df.groupby(['Postal Code', 'Borough'])['Neighbourhood'].apply(list).apply(lambda x:', '.join(x)).to_frame().reset_index()
df.head(20)


Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park"
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge"
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


### Assigning boroughs with 'Not assigned' neighborhoods to be the same as the borough

In [5]:
for index, row in df.iterrows():
    if row['Neighbourhood'] == 'Not assigned':
        row['Neighbourhood'] == row['Borough']

In [6]:
df.shape

(103, 3)

## 

# Part 2- Utilizing  Foursquare location data to find latitude and longitude coordinates of each neighborhood

## Importing libraries needed for this part 

In [58]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

!pip install folium
!pip install geocoder
import geocoder
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
import folium 

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

print('Libraries imported.')

Collecting geocoder
  Downloading geocoder-1.38.1-py2.py3-none-any.whl (98 kB)
[K     |████████████████████████████████| 98 kB 8.4 MB/s  eta 0:00:01
Collecting ratelim
  Downloading ratelim-0.1.6-py2.py3-none-any.whl (4.0 kB)
Installing collected packages: ratelim, geocoder
Successfully installed geocoder-1.38.1 ratelim-0.1.6
Libraries imported.


In [33]:
neighborhoods_data = df

df

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude


### Looping though data to fill df one row at a time 

In [46]:
from bs4 import BeautifulSoup 

r = requests.get(url1) 
soup = BeautifulSoup(r.content, 'html5lib') 
table = soup.find('div', attrs = {'id':'container'}) 


In [62]:
postalCodes = [];
boroughs= [];
neighborhoods = [];
columnNum = 1;
passVal = False

for row in soup.find_all('td'):
    for cell in row:
        if cell.string and cell.string[0].isalpha() and len(cell.string) > 2:
            passVal = False
            if columnNum == 1:
                if passVal == False and cell.string[1].isdigit():
                    postalCodes.append(cell.string);   
                    columnNum = 2
                else:
                    continue
            elif columnNum == 2 :
                if cell.string == 'Not assigned':
                    passVal = True
                    del postalCodes[-1]
                    columnNum = 1
                    continue
                else:
                    boroughs.append(cell.string);      
                    columnNum = 3
            elif columnNum == 3 :
                if cell.string == 'Not assigned\n':
                    neighborhoods.append(boroughs[-1])
                else:
                    neighborhoods.append(cell.string); 
                columnNum = 1
                
print('Data Collected.')

Data Collected.


### Turning data into pandas df

In [74]:
# define the dataframe columns
column_names = ['Postal Code', 'Borough', 'Neighborhood', 'Latitude', 'Longitude'] 

# instantiate the dataframe
df = pd.DataFrame(columns=column_names)

df

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude


In [73]:
# initialize your variable to None
lat_lng_coords = None

for data in range(0, len(postalCodes)-1):
    code = postalCodes[data]
    borough = boroughs[data]
    neighborhood_name = neighborhoods[data]
    
    g = geocoder.arcgis('{}, Toronto, Ontario'.format(code))
    lat_lng_coords = g.latlng

    df = df.append({ 'PostalCode': code,
                                   'Borough': borough,
                                   'Neighborhood': neighborhood_name,
                                   'Latitude': lat_lng_coords[0],
                                   'Longitude': lat_lng_coords[1]}, ignore_index=True)
    
df

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,PostalCode
0,,Not assigned\n,Not assigned\n,43.64869,-79.38544,M1A\n
1,,Not assigned\n,Not assigned\n,43.64869,-79.38544,M2A\n
2,,North York\n,Parkwoods\n,43.75245,-79.32991,M3A\n
3,,North York\n,Victoria Village\n,43.73057,-79.31306,M4A\n
4,,Downtown Toronto\n,"Regent Park, Harbourfront\n",43.65512,-79.36264,M5A\n
5,,North York\n,"Lawrence Manor, Lawrence Heights\n",43.72327,-79.45042,M6A\n
6,,Downtown Toronto\n,"Queen's Park, Ontario Provincial Government\n",43.66253,-79.39188,M7A\n
7,,Not assigned\n,Not assigned\n,43.64869,-79.38544,M8A\n
8,,Etobicoke\n,"Islington Avenue, Humber Valley Village\n",43.66263,-79.52831,M9A\n
9,,Scarborough\n,"Malvern, Rouge\n",43.81139,-79.19662,M1B\n


#  