# Capstone Coursera-IBM Python for Data-Science

This notebook will be used to be peer-reviewed in the module 3 of the capstone IBM.

## Module 3

Import some libraries

In [2]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim
import folium

### Part 1

#### Get data from Wikipedia

In [3]:
data=requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text

#### Use BeautifulSoup to parse data

In [4]:
data = BeautifulSoup(data, 'html.parser')
zc=[] #zipcode
bl=[] #borough list
nl=[] #neighborhood

In [5]:
for r in data.find('table').find_all('tr'):
    c=r.find_all('td')
    if(len(c)>0):
        #print(c[0].text)
        zc.append(c[0].text.rstrip('\n'))
        bl.append(c[1].text.rstrip('\n'))
        nl.append(c[2].text.rstrip('\n'))

In [6]:
t_df=pd.DataFrame({'PostalCode':zc,
                   'Borough':bl,
                   'Neighborhood':nl})

In [7]:
t_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront


In [8]:
t_df.PostalCode.duplicated().any()

False

There's no duplicate value in PostalCode.

#### Drop Borough='Not assigned' lines and correct Neighborhood='Not assigned'

In [9]:
t_df=t_df[t_df.Borough!="Not assigned"].reset_index(drop=True)
for index, r in t_df.iterrows():
    if r["Neighborhood"] == "Not assigned":
        r["Neighborhood"] = r["Borough"]

In [10]:
t_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Regent Park / Harbourfront
3,M6A,North York,Lawrence Manor / Lawrence Heights
4,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government


It seems that something changes on wikipedia as the neiborhood seems to be separed by a ' / '

#### Replace '/' by ','

In [11]:
for r in t_df.index:
    t_df.Neighborhood[r]=t_df.Neighborhood[r].replace(' / ',', ')

In [12]:
t_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


#### Finally, last question of part 1 : shape of the DataFrame

In [13]:
t_df.shape

(103, 3)

### Part 2

#### Load the coordinates from the CSV given by Coursera

In [14]:
coord=pd.read_csv("Geospatial_Coordinates.csv")
coord.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [15]:
coord['Postal Code'].duplicated().any()

False

There's no duplicate value in Postal Code.
The column is entitled "PostalCode" in t_df, the Toronto DataFrame from Wikipedia and "Postal Code" in coord, the DataFrame form Coursera. It should be made consistent before to merge.

In [16]:
coord.rename(columns={"Postal Code":"PostalCode"},inplace=True)
coord.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


#### Merge the two tables

In [17]:
t_df_full=t_df.merge(coord, on="PostalCode", how="left")

In [18]:
t_df_full.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


In [27]:
t_df_full

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road , Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto,Business reply mail Processing CentrE,43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509


#### Give the same DF as in the Coursera Assignments

In [19]:
t_df_pr=pd.DataFrame(columns=t_df_full.columns)
t_list=["M5G", "M2H", "M4B", "M1J", "M4G", "M4M", "M1R", "M9V", "M9L", "M5V", "M1B", "M5A"]
for pc in t_list:
    t_df_pr=t_df_pr.append(t_df_full[t_df_full['PostalCode'] == pc],ignore_index=True)

t_df_pr

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
1,M2H,North York,Hillcrest Village,43.803762,-79.363452
2,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
3,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
4,M4G,East York,Leaside,43.70906,-79.363452
5,M4M,East Toronto,Studio District,43.659526,-79.340923
6,M1R,Scarborough,"Wexford, Maryvale",43.750072,-79.295849
7,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest...",43.739416,-79.588437
8,M9L,North York,Humber Summit,43.756303,-79.565963
9,M5V,Downtown Toronto,"CN Tower, King and Spadina, Railway Lands, Har...",43.628947,-79.39442


### Part 3

#### Use Geopy and folium to create a map of Toronto

In [35]:
address = 'Toronto'

geolocator = Nominatim(user_agent="my-application")
loc = geolocator.geocode(address)
lat = loc.latitude
long = loc.longitude
print('The geograpical coordinate of {} are {}, {}.'.format(address,lat, long))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [36]:
map_toronto=folium.Map(location=[lat,long],zoom_start=9)
map_toronto

#### Add Borough containing "Toronto" and Neighborhood

In [90]:
bt=list(set([t_df_full.Borough[i] for i in range(0,t_df_full.Borough.shape[0])])) #df->list->set(to avoid duplicate)->list
bt=[bt[i] for i in range(0,len(bt)) if "toronto" in bt[i].lower()] #list->list with only "toronto-name"
bt

['Downtown Toronto', 'East Toronto', 'Central Toronto', 'West Toronto']

In [91]:
tt_df=t_df_full[t_df_full.Borough.isin(bt)].reset_index(drop=True)
tt_df

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,M4E,East Toronto,The Beaches,43.676357,-79.293031
5,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
6,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
7,M6G,Downtown Toronto,Christie,43.669542,-79.422564
8,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568
9,M6H,West Toronto,"Dufferin, Dovercourt Village",43.669005,-79.442259


In [93]:
map_toronto = folium.Map(location=[lat, long], zoom_start=12)

for la, lo, borough, neighborhood in zip(tt_df['Latitude'], tt_df['Longitude'], tt_df['Borough'], tt_df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [la, lo],
        radius=5,
        popup=label,
        color='blue').add_to(map_toronto)  
    
map_toronto 

#### Let's go on Foursquare

In [113]:
fs_api=pd.read_csv('foursquare.csv',header=None) #Credentials

In [114]:
CLIENT_ID = fs_api[0][0] # your Foursquare ID
CLIENT_SECRET = fs_api[1][0] # your Foursquare Secret
VERSION = fs_api[2][0] # Foursquare API version

Top 20 in the 500m around the (lat,long) of each borough

In [None]:
limit=20
r=500

venues=[]

for la, lo, post, borough, neighborhood in zip(tt_df['Latitude'], tt_df['Longitude'], tt_df['PostalCode'], 
                                               tt_df['Borough'], tt_df['Neighborhood']):
    url = "https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}".format(
        CLIENT_ID, CLIENT_SECRET, VERSION,
        la, lo,
        r, limit)
    
    results = requests.get(url).json()["response"]['groups'][0]['items']
    
    for venue in results:
        venues.append((
            post, 
            borough,
            neighborhood,
            la, 
            lo, 
            venue['venue']['name'], 
            venue['venue']['location']['lat'], 
            venue['venue']['location']['lng'],  
            venue['venue']['categories'][0]['name']))


In [1]:
limit=20
r=500

venues=[]

for la, lo, post, borough, neighborhood in zip(tt_df['Latitude'], tt_df['Longitude'], tt_df['PostalCode'], 
                                               tt_df['Borough'], tt_df['Neighborhood']):
    print(la,lo,post,borough,neighborhood)

NameError: name 'tt_df' is not defined