# Collecting the Neighbourhoods of Paris

In this notebook we collect the neighbourhoods of Paris from a Wikipedia page.
After cleaning the neighbourhood data, we enrich it with geographical coordinates.

## Importing libraries

In [1]:
!pip install beautifulsoup4
!pip install lxml
import requests # library to handle requests
import pandas as pd # library for data analysis
import numpy as np # library to handle data in a vectorized manner

from bs4 import BeautifulSoup # Library for scraping webpage
from IPython.display import display_html # Library for displaying HTML

#!pip install geopy
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
from geopy.extra.rate_limiter import RateLimiter # wrapper that enforces a delay between successive geocoding calls

# Library for saving and reading data from the project
#from project_lib import Project

!pip install folium 
import folium # plotting library

print('Importing ready!')

Importing ready!


## Retrieve neighbourhoods of Paris from Wikipedia webpage

In [2]:
# Get webpage. A timeout keeps the cell from hanging forever on a stalled
# connection, and raise_for_status() stops us from parsing an HTTP error page.
url = 'https://nl.wikipedia.org/wiki/Lijst_van_wijken_in_Parijs'
response = requests.get(url, timeout=30)
response.raise_for_status()
source = response.text
# Scrape webpage
soup = BeautifulSoup(source, 'lxml')
# Check title of webpage
print(soup.title)
# Get the first table from the webpage; fail loudly when the page has none,
# otherwise str(None) would be handed to the parsing step as the text "None"
if soup.table is None:
    raise ValueError('No <table> element found on ' + url)
html_table = str(soup.table)
# Display table
display_html(html_table, raw=True)

<title>Lijst van wijken in Parijs - Wikipedia</title>


Arrondissement,Wijknummer,Naam,Kaart
1e arrondissement,1,Saint-Germain-l'Auxerrois,
1e arrondissement,2,Les Halles,
1e arrondissement,3,Palais-Royal,
1e arrondissement,4,Place-Vendôme,
2e arrondissement,5,Gaillon,
2e arrondissement,6,Vivienne,
2e arrondissement,7,Mail,
2e arrondissement,8,Bonne-Nouvelle,
3e arrondissement,9,Arts-et-Métiers,
3e arrondissement,10,Enfants-Rouges,


In [3]:
# Local standard-library import so this cell stays self-contained
from io import StringIO

# Parse the HTML snippet into a list of DataFrames (one per <table>).
# The text is wrapped in StringIO because passing literal HTML to read_html is
# deprecated in recent pandas, and the result is no longer bound to the name
# `list`, which would shadow the builtin for the rest of the notebook.
tables = pd.read_html(StringIO(html_table))
# The snippet holds exactly one table: take it as the working dataframe
df = tables[0]
df

Unnamed: 0,Arrondissement,Wijknummer,Naam,Kaart
0,1e arrondissement,1,Saint-Germain-l'Auxerrois,
1,1e arrondissement,2,Les Halles,
2,1e arrondissement,3,Palais-Royal,
3,1e arrondissement,4,Place-Vendôme,
4,2e arrondissement,5,Gaillon,
...,...,...,...,...
75,19e arrondissement,76,Combat,
76,20e arrondissement,77,Belleville,
77,20e arrondissement,78,Saint-Fargeau,
78,20e arrondissement,79,Père-Lachaise,


## Cleaning and preparing the neighbourhoods

In [4]:
# Remove the unnecessary 'Kaart' (map) column.
# The column is dropped by name instead of by position, so a change in the
# table layout cannot silently remove the wrong column; errors='ignore' keeps
# the cell safe to re-run once the column is already gone.
df = df.drop(columns=['Kaart'], errors='ignore')
df

Unnamed: 0,Arrondissement,Wijknummer,Naam
0,1e arrondissement,1,Saint-Germain-l'Auxerrois
1,1e arrondissement,2,Les Halles
2,1e arrondissement,3,Palais-Royal
3,1e arrondissement,4,Place-Vendôme
4,2e arrondissement,5,Gaillon
...,...,...,...
75,19e arrondissement,76,Combat
76,20e arrondissement,77,Belleville
77,20e arrondissement,78,Saint-Fargeau
78,20e arrondissement,79,Père-Lachaise


In [5]:
# Translate the Dutch column headers to English
dutch_to_english = {
    'Arrondissement': 'Borough',
    'Wijknummer': 'Neighbourhood_nbr',
    'Naam': 'Neighbourhood',
}
df.rename(columns=dutch_to_english, inplace=True)
df

Unnamed: 0,Borough,Neighbourhood_nbr,Neighbourhood
0,1e arrondissement,1,Saint-Germain-l'Auxerrois
1,1e arrondissement,2,Les Halles
2,1e arrondissement,3,Palais-Royal
3,1e arrondissement,4,Place-Vendôme
4,2e arrondissement,5,Gaillon
...,...,...,...
75,19e arrondissement,76,Combat
76,20e arrondissement,77,Belleville
77,20e arrondissement,78,Saint-Fargeau
78,20e arrondissement,79,Père-Lachaise


In [6]:
# Tag every row with the city name and move that column to the front
df['City'] = 'Paris'
remaining_columns = [name for name in df.columns if name != 'City']
df = df[['City'] + remaining_columns]
df

Unnamed: 0,City,Borough,Neighbourhood_nbr,Neighbourhood
0,Paris,1e arrondissement,1,Saint-Germain-l'Auxerrois
1,Paris,1e arrondissement,2,Les Halles
2,Paris,1e arrondissement,3,Palais-Royal
3,Paris,1e arrondissement,4,Place-Vendôme
4,Paris,2e arrondissement,5,Gaillon
...,...,...,...,...
75,Paris,19e arrondissement,76,Combat
76,Paris,20e arrondissement,77,Belleville
77,Paris,20e arrondissement,78,Saint-Fargeau
78,Paris,20e arrondissement,79,Père-Lachaise


In [7]:
# Sanity check: (rows, columns) of the cleaned neighbourhood frame
df.shape

(80, 4)

## Collecting the geographical coordinates for the neighbourhoods of Paris

In [8]:
# Build the "<neighbourhood>, <city>" search string used for geocoding.
# assign() returns a new frame, so df itself is left untouched.
df1 = df.assign(Address=df["Neighbourhood"] + ', ' + df["City"])
df1

Unnamed: 0,City,Borough,Neighbourhood_nbr,Neighbourhood,Address
0,Paris,1e arrondissement,1,Saint-Germain-l'Auxerrois,"Saint-Germain-l'Auxerrois, Paris"
1,Paris,1e arrondissement,2,Les Halles,"Les Halles, Paris"
2,Paris,1e arrondissement,3,Palais-Royal,"Palais-Royal, Paris"
3,Paris,1e arrondissement,4,Place-Vendôme,"Place-Vendôme, Paris"
4,Paris,2e arrondissement,5,Gaillon,"Gaillon, Paris"
...,...,...,...,...,...
75,Paris,19e arrondissement,76,Combat,"Combat, Paris"
76,Paris,20e arrondissement,77,Belleville,"Belleville, Paris"
77,Paris,20e arrondissement,78,Saint-Fargeau,"Saint-Fargeau, Paris"
78,Paris,20e arrondissement,79,Père-Lachaise,"Père-Lachaise, Paris"


In [9]:
# Get the geographical coordinates of one neighbourhood, to check that the geolocator works
address = 'Charonne, Paris'

geolocator = Nominatim(user_agent="neighbourhoud_explorer")

location = geolocator.geocode(address)
if location is None:
    # geocode() yields None when nothing matches; report that explicitly
    # instead of failing with an AttributeError on the next line
    raise ValueError('No geographical coordinates found for {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
# Report the address that was actually looked up (the message used to say "Paris")
print('The geograpical coordinate of {} are {}, {}.'.format(address, latitude, longitude))

The geograpical coordinate of Paris are 48.8547441, 2.3853565.


In [10]:
# Step 1 - wrap geolocator.geocode in a RateLimiter so the bulk lookup that
# follows is throttled; min_delay_seconds=1 is the minimum pause between calls
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)

In [11]:
# Step 2 - geocode every address through the rate-limited wrapper and store the
# result in a 'location' column of a new frame, leaving df1 unchanged
df2 = df1.assign(location=df1['Address'].apply(geocode))

In [12]:
# Step 3 - derive a 'point' tuple from each geocoding result
def _location_to_point(loc):
    """Return ``tuple(loc.point)`` for a truthy result, ``None`` otherwise."""
    if not loc:
        return None
    return tuple(loc.point)


df2['point'] = df2['location'].apply(_location_to_point)

In [13]:
# Check for neighbourhoods without geographical coordinates:
# first the per-column counts of those rows, then the rows themselves
missing_location = df2["location"].isnull()
print(df2.loc[missing_location].count())
df2.loc[missing_location]

City                 1
Borough              1
Neighbourhood_nbr    1
Neighbourhood        1
Address              1
location             0
point                0
dtype: int64


Unnamed: 0,City,Borough,Neighbourhood_nbr,Neighbourhood,Address,location,point
73,Paris,19e arrondissement,74,Pont-de-Flandres,"Pont-de-Flandres, Paris",,


In [14]:
# Remove the rows without geographical coordinates.
# Rows are judged on the 'location' column alone, so a missing value in an
# unrelated column can no longer cause a geocoded neighbourhood to be dropped.
df2 = df2.dropna(subset=['location']).reset_index(drop=True)
df2

Unnamed: 0,City,Borough,Neighbourhood_nbr,Neighbourhood,Address,location,point
0,Paris,1e arrondissement,1,Saint-Germain-l'Auxerrois,"Saint-Germain-l'Auxerrois, Paris","(Quartier Saint-Germain-l'Auxerrois, Paris 1er...","(48.860211199999995, 2.3362988847682233, 0.0)"
1,Paris,1e arrondissement,2,Les Halles,"Les Halles, Paris","(Les Halles, Allée Saint-John Perse, Quartier ...","(48.8624659, 2.3460086, 0.0)"
2,Paris,1e arrondissement,3,Palais-Royal,"Palais-Royal, Paris","(Palais Royal, Rue de Valois, Quartier du Pala...","(48.863584700000004, 2.3362042200938715, 0.0)"
3,Paris,1e arrondissement,4,Place-Vendôme,"Place-Vendôme, Paris","(Place Vendôme, Accès Parking Vendôme, Quartie...","(48.867463400000005, 2.329428116825194, 0.0)"
4,Paris,2e arrondissement,5,Gaillon,"Gaillon, Paris","(Quartier Gaillon, Paris 2e Arrondissement, Pa...","(48.869135150000005, 2.332908770335507, 0.0)"
...,...,...,...,...,...,...,...
74,Paris,19e arrondissement,76,Combat,"Combat, Paris","(Colonel Fabien, Boulevard de la Villette, Qua...","(48.8774215, 2.3710197, 0.0)"
75,Paris,20e arrondissement,77,Belleville,"Belleville, Paris","(Quartier de Belleville, Paris 20e Arrondissem...","(48.8717265, 2.385085, 0.0)"
76,Paris,20e arrondissement,78,Saint-Fargeau,"Saint-Fargeau, Paris","(Quartier Saint-Fargeau, Paris 20e Arrondissem...","(48.8703623, 2.4067355726428277, 0.0)"
77,Paris,20e arrondissement,79,Père-Lachaise,"Père-Lachaise, Paris","(Cimetière du Père-Lachaise, 8, Boulevard de M...","(48.8612168, 2.3939292638106417, 0.0)"


In [15]:
# Confirm the (rows, columns) that remain after the clean-up
df2.shape

(79, 7)

In [16]:
# Step 4 - expand each 'point' tuple into separate coordinate columns
coordinate_columns = ['latitude', 'longitude', 'altitude']
point_values = df2['point'].tolist()
df2[coordinate_columns] = pd.DataFrame(point_values, index=df2.index)
df2

Unnamed: 0,City,Borough,Neighbourhood_nbr,Neighbourhood,Address,location,point,latitude,longitude,altitude
0,Paris,1e arrondissement,1,Saint-Germain-l'Auxerrois,"Saint-Germain-l'Auxerrois, Paris","(Quartier Saint-Germain-l'Auxerrois, Paris 1er...","(48.860211199999995, 2.3362988847682233, 0.0)",48.860211,2.336299,0.0
1,Paris,1e arrondissement,2,Les Halles,"Les Halles, Paris","(Les Halles, Allée Saint-John Perse, Quartier ...","(48.8624659, 2.3460086, 0.0)",48.862466,2.346009,0.0
2,Paris,1e arrondissement,3,Palais-Royal,"Palais-Royal, Paris","(Palais Royal, Rue de Valois, Quartier du Pala...","(48.863584700000004, 2.3362042200938715, 0.0)",48.863585,2.336204,0.0
3,Paris,1e arrondissement,4,Place-Vendôme,"Place-Vendôme, Paris","(Place Vendôme, Accès Parking Vendôme, Quartie...","(48.867463400000005, 2.329428116825194, 0.0)",48.867463,2.329428,0.0
4,Paris,2e arrondissement,5,Gaillon,"Gaillon, Paris","(Quartier Gaillon, Paris 2e Arrondissement, Pa...","(48.869135150000005, 2.332908770335507, 0.0)",48.869135,2.332909,0.0
...,...,...,...,...,...,...,...,...,...,...
74,Paris,19e arrondissement,76,Combat,"Combat, Paris","(Colonel Fabien, Boulevard de la Villette, Qua...","(48.8774215, 2.3710197, 0.0)",48.877421,2.371020,0.0
75,Paris,20e arrondissement,77,Belleville,"Belleville, Paris","(Quartier de Belleville, Paris 20e Arrondissem...","(48.8717265, 2.385085, 0.0)",48.871727,2.385085,0.0
76,Paris,20e arrondissement,78,Saint-Fargeau,"Saint-Fargeau, Paris","(Quartier Saint-Fargeau, Paris 20e Arrondissem...","(48.8703623, 2.4067355726428277, 0.0)",48.870362,2.406736,0.0
77,Paris,20e arrondissement,79,Père-Lachaise,"Père-Lachaise, Paris","(Cimetière du Père-Lachaise, 8, Boulevard de M...","(48.8612168, 2.3939292638106417, 0.0)",48.861217,2.393929,0.0


## Create a map with neighbourhoods superimposed on top.

In [17]:
def getGeolocation(city):
    """Look up the latitude and longitude of ``city`` with Nominatim.

    Parameters
    ----------
    city : str
        Place name or address handed to the geocoder.

    Returns
    -------
    list
        ``[latitude, longitude]`` of the location returned by the geocoder.

    Raises
    ------
    ValueError
        If the geocoder returns no result for ``city``.
    """
    geolocator = Nominatim(user_agent="city_explorer")
    location = geolocator.geocode(city)
    if location is None:
        # No match: fail with a clear message instead of an AttributeError
        # raised by reading ``.latitude`` on None.
        raise ValueError('No geographical coordinates found for {!r}'.format(city))

    latitude = location.latitude
    longitude = location.longitude
    print('The geograpical coordinate of {} are {}, {}.'.format(city, latitude, longitude))

    return [latitude, longitude]

In [18]:
def printMap(dta, city, zoom):
    """Build a folium map of ``city`` with one circle marker per neighbourhood.

    Parameters
    ----------
    dta : DataFrame
        Frame providing the 'City', 'Neighbourhood', 'latitude' and
        'longitude' columns read below.
    city : str
        City to centre the map on; only rows whose 'City' equals this value
        are plotted.
    zoom : int
        Value passed to folium as ``zoom_start``.

    Returns
    -------
    The folium map object with the markers added.
    """
    print(city)
    centre = getGeolocation(city)
    city_map = folium.Map(location=centre, zoom_start=zoom)

    city_rows = dta[dta["City"] == city]
    coordinates = zip(city_rows['latitude'], city_rows['longitude'])
    names = zip(city_rows['Neighbourhood'], city_rows['City'])

    # one marker per neighbourhood, labelled "<neighbourhood>, <city>"
    for (lat, lng), (neighbourhood, row_city) in zip(coordinates, names):
        popup = folium.Popup('{}, {}'.format(neighbourhood, row_city), parse_html=True)
        marker = folium.CircleMarker(
            [lat, lng],
            radius=5,
            popup=popup,
            color='blue',
            fill=True,
            fill_color='#3186cc',
            fill_opacity=0.7,
            parse_html=False)
        marker.add_to(city_map)

    return city_map

In [19]:
# Draw the Paris neighbourhoods from df2 on a map, starting at zoom level 12
printMap(df2, 'Paris', 12)

Paris
The geograpical coordinate of Paris are 48.8566969, 2.3514616.


## Save Neighbourhood information in CSV

In [20]:
# @hidden_cell
# Read the project token from the environment so the secret is never stored in
# the notebook itself.
# NOTE(review): the token that used to be hardcoded in this cell has been
# exposed and should be revoked; the variable name PROJECT_TOKEN is a
# suggestion - adapt it to the deployment.
import os

token = os.environ.get('PROJECT_TOKEN')

In [21]:
# Create an access to this project
#project = Project.access(None,token,token)

# Save the collected Neighbourhoods and geographical data in project data bucket
#project.save_data(file_name="geo_paris.csv", data=df2.to_csv(index=False))

In [22]:
# Write the enriched frame next to the notebook (relative path, no index column)
csv_path = 'Neighbourhoods_of_Paris.csv'
df2.to_csv(csv_path, index=False)
print('Geographical data are saved in Neighbourhoods_of_Paris.csv')

Geographical data are saved in Neighbourhoods_of_Paris.csv
