### Peer-graded Assignment: Segmenting and Clustering Neighborhoods in Toronto
Edwin N Asberg

## First Part

#### Step 1: Scraping data on Toronto neighborhoods from Wikipedia, and renaming the columns accordingly

In [1]:
import pandas as pd
# read_html returns a list with one DataFrame per table it finds on the page;
# the postal-code table is the first entry.
wiki_tables = pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', header=0)

# Only 'Postcode' needs a new name; the other two columns already have their final names.
toronto_df = wiki_tables[0].rename(columns={'Postcode': 'PostalCode'})
toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


##### Step 2: ignoring rows with Borough = 'Not assigned'

In [2]:
# Keep only the rows whose Borough is something other than the 'Not assigned' placeholder.
has_borough = toronto_df['Borough'].ne('Not assigned')
toronto_df = toronto_df.loc[has_borough]
toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


##### Step 3: Combining rows that have the same Postal Code

In [3]:
# One output row per (PostalCode, Borough) pair; the remaining column is collapsed by
# joining its values with ', '. sort=False keeps groups in order of first appearance.
by_postal_code = toronto_df.groupby(['PostalCode', 'Borough'], as_index=False, sort=False)
toronto_df = by_postal_code.agg(', '.join)
toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Not assigned


##### Step 4: If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.

In [4]:
import numpy as np
# Where the neighbourhood is the 'Not assigned' placeholder, fall back to the borough name;
# every other row keeps its existing neighbourhood value.
is_unassigned = toronto_df['Neighbourhood'].eq('Not assigned')
toronto_df['Neighbourhood'] = np.where(
    is_unassigned,
    toronto_df['Borough'],
    toronto_df['Neighbourhood'],
)
toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park


##### Step 5: Getting the shape of the final pandas dataframe

In [None]:
# Display the (rows, columns) tuple of the cleaned table.
toronto_df.shape

(103, 3)

## Second Part

In [None]:
# NOTE: this whole cell is commented out and does not run. It sketches a per-postal-code
# lookup through the `geocoder` package; the notebook reads coordinates from a CSV instead.
# NOTE(review): `postal_code` is not defined anywhere in the visible notebook, and the
# while-loop below has no retry limit, so it would spin forever if no coordinates come back.
# #pip install geocoder #done on the anaconda prompt
# import geocoder

# # initialize your variable to None
# lat_lng_coords = None

# # loop until you get the coordinates
# while(lat_lng_coords is None):
#   g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
#   lat_lng_coords = g.latlng

# latitude = lat_lng_coords[0]
# longitude = lat_lng_coords[1]

#### Step 1: reading the Lat / Long dataframe for all Postal Codes in Canada

In [None]:
import pandas as pd
# Download the coordinates table as CSV; the first row of the file supplies the column names.
GEOSPATIAL_URL = 'http://cocl.us/Geospatial_data'
lat_log = pd.read_csv(GEOSPATIAL_URL, header=0)
lat_log.head()

#### Step 2: Merging the dataframes (lat_log and toronto_df)

In [None]:
# Give the coordinates table the same key name as toronto_df before joining, so the merge
# uses a single shared key and leaves no redundant 'Postal Code' column to drop afterwards.
coords = lat_log.rename(columns={'Postal Code': 'PostalCode'})

# Left join: every toronto_df row is kept, with the matching coordinate columns appended.
df = toronto_df.merge(coords, on='PostalCode', how='left')
df.head()

#### Checking if the shape above is correct (same number of rows as in the first part)

In [None]:
# A left merge keeps every toronto_df row, but it adds rows whenever the right-hand table
# repeats a postal code. Check the row count explicitly instead of asserting it in a comment.
assert len(df) == len(toronto_df), (
    f"merge changed the row count: {len(toronto_df)} -> {len(df)}"
)
df.shape