## IBM Applied Data Science Capstone Course by Coursera
### Week-3 Part-1

#### Import Required Libraries

In [1]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

import json # library to handle JSON files

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from bs4 import BeautifulSoup # library to parse HTML and XML documents

from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

import folium # map rendering library

print("Libraries imported.")

Libraries imported.


### Scrape data from Wikipedia page into a DataFrame

In [2]:
# Send the GET request for the Wikipedia page listing the "M" postal codes.
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors (4xx/5xx) here instead of letting an
# error page be parsed as if it were the expected table.
response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
response.raise_for_status()
data = response.text  # raw HTML of the page as a string

In [3]:
# Build a BeautifulSoup tree from the downloaded HTML text, using Python's
# built-in "html.parser" backend.
soup = BeautifulSoup(
    data,
    'html.parser',
)

In [4]:
# Three independent, initially empty accumulators -- one per table column
# (postal code, borough, neighborhood) -- filled while walking the table rows.
postalCodeList, boroughList, neighborhoodList = [], [], []

**Clean the data using Beautiful Soup**

In [5]:
# Walk every <tr> of the first <table> on the page and collect the text of its
# first three <td> cells into the postal code / borough / neighborhood lists.
for row in soup.find('table').find_all('tr'):
    cells = row.find_all('td')  # rows without <td> cells (e.g. a header row) yield an empty list
    # Skip rows that do not carry all three columns: indexing cells[1] or
    # cells[2] on a shorter row would raise IndexError.
    if len(cells) < 3:
        continue
    # rstrip('\n') removes the trailing newline(s) left at the end of each cell's text
    postalCodeList.append(cells[0].text.rstrip('\n'))
    boroughList.append(cells[1].text.rstrip('\n'))
    neighborhoodList.append(cells[2].text.rstrip('\n'))

**Create a new DataFrame from the three lists**

In [6]:
# Assemble the three scraped lists into one table, one column per list.
torontoDF = pd.DataFrame(
    dict(
        PostalCode=postalCodeList,
        Borough=boroughList,
        Neighborhood=neighborhoodList,
    )
)

torontoDF.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


**Drop rows whose borough is "*Not assigned*"**

In [7]:
# Keep only the rows whose Borough is assigned, then renumber the index from 0
# (drop=True discards the old index instead of keeping it as a column).
new_torontoDF = (
    torontoDF
    .loc[torontoDF["Borough"] != "Not assigned"]
    .reset_index(drop=True)
)
new_torontoDF.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


#### Group neighborhoods that share the same postal code and borough

In [8]:
# Collapse rows that share a (PostalCode, Borough) pair into a single row,
# joining their Neighborhood values into one comma-separated string.
# as_index=False keeps the grouping keys as ordinary columns.
grouped_torontoDF = (
    new_torontoDF
    .groupby(["PostalCode", "Borough"], as_index=False)
    .agg(", ".join)
)
grouped_torontoDF.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


#### Handle the rows having a borough but a *Not assigned* neighborhood

The code cell below handles rows that have a borough but a *Not assigned* neighborhood by setting the neighborhood to be the same as the borough.

In [9]:
# For rows where Neighborhood == "Not assigned", make the value the same as Borough.
# The assignment goes through .loc with a boolean mask so that it is written to
# grouped_torontoDF itself. Assigning into the `row` object yielded by
# DataFrame.iterrows() is not guaranteed to write back to the DataFrame (the row
# may be a copy), so a loop-based version can silently change nothing.
unassigned = grouped_torontoDF["Neighborhood"] == "Not assigned"
grouped_torontoDF.loc[unassigned, "Neighborhood"] = grouped_torontoDF.loc[unassigned, "Borough"]

grouped_torontoDF.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


### Print shape (number of rows and columns) of DataFrame

In [10]:
# Display the (number of rows, number of columns) tuple of the grouped DataFrame.
grouped_torontoDF.shape

(103, 3)

### Loading coordinates from *.csv* file

In [11]:
# Load the coordinates CSV and rename its "Postal Code" column to "PostalCode"
# so the column name matches the one used by the scraped DataFrames.
coordinates = (
    pd.read_csv("data/Geospatial_Coordinates.csv")
    .rename(columns={"Postal Code": "PostalCode"})
)

coordinates.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


**Merging two DataFrames**

In [12]:
# Left join on PostalCode: every row of grouped_torontoDF is kept and gains the
# columns of `coordinates` for its postal code (NaN where no match exists).
merged_torontoDF = grouped_torontoDF.merge(
    right=coordinates,
    how="left",
    on="PostalCode",
)
merged_torontoDF.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


### Comparing with the DataFrame given in question

In [13]:
# Build a test DataFrame holding the rows of merged_torontoDF for a fixed list
# of postal codes, in the order the codes are listed below.
rows = ["M5G", "M2H", "M4B", "M1J", "M4G", "M4M", "M1R", "M9V", "M9L", "M5V", "M1B", "M5A"]

# Select the matching rows per postal code and concatenate them once.
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row in
# a loop re-copies the data on every iteration; a single pd.concat avoids both.
# Codes with no match contribute an empty selection and are simply skipped.
testDF = pd.concat(
    [merged_torontoDF[merged_torontoDF["PostalCode"] == postcode] for postcode in rows],
    ignore_index=True,  # renumber the result 0..n-1 instead of keeping source indices
)

testDF

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
1,M2H,North York,Hillcrest Village,43.803762,-79.363452
2,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
3,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
4,M4G,East York,Leaside,43.70906,-79.363452
5,M4M,East Toronto,Studio District,43.659526,-79.340923
6,M1R,Scarborough,"Wexford, Maryvale",43.750072,-79.295849
7,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest...",43.739416,-79.588437
8,M9L,North York,Humber Summit,43.756303,-79.565963
9,M5V,Downtown Toronto,"CN Tower, King and Spadina, Railway Lands, Har...",43.628947,-79.39442
