# The Battle of the Neighbourhoods - Rome

In this notebook we collect the neighbourhoods of Rome from Wikipedia webpages. <br>
After cleaning the neighbourhood data, it is enriched with geographical coordinates.

## Importing libraries

In [1]:
!pip install beautifulsoup4
!pip install lxml
import requests # library to handle requests
import pandas as pd # library for data analsysis
import numpy as np # library to handle data in a vectorized manner

from bs4 import BeautifulSoup # Library for scraping webpage
from IPython.display import display_html # Library for displaying HTML

#!pip install geopy
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
from geopy.extra.rate_limiter import RateLimiter # ratelimiter for stopping if it takes to long to get the geocodes

# Library for saving and reading data from the project
from project_lib import Project

print('Importing ready!')

Importing ready!


## Retrieve neighbourhoods of Rome from Wikipedia webpages

### First the Quarters of Rome

In [2]:
# Download the English Wikipedia page that lists the quarters of Rome.
# The timeout prevents the cell from hanging forever on a stalled connection.
response = requests.get('https://en.wikipedia.org/wiki/Quarters_of_Rome', timeout=30)
# Fail loudly on an HTTP error instead of silently parsing an error page
response.raise_for_status()
source = response.text
# Parse the HTML
soup = BeautifulSoup(source, 'lxml')
# Check title of webpage
print(soup.title)
# Collect the text of every <li> inside each <div class="div-col"> as a
# ['Rome', <list item text>] pair.
# NOTE: the name `list` shadows the Python builtin; it is kept unchanged here
# because the next cell builds the dataframe from this exact name.
list = [
    ['Rome', sub_item.text]
    for item in soup.find_all('div', {'class': 'div-col'})
    for sub_item in item.find_all('li')
]
list

<title>Quarters of Rome - Wikipedia</title>


[['Rome', 'Q. I Flaminio'],
 ['Rome', 'Q. II Parioli'],
 ['Rome', 'Q. III Pinciano'],
 ['Rome', 'Q. IV Salario'],
 ['Rome', 'Q. V Nomentano'],
 ['Rome', 'Q. VI Tiburtino'],
 ['Rome', 'Q. VII Prenestino-Labicano'],
 ['Rome', 'Q. VIII Tuscolano'],
 ['Rome', 'Q. IX Appio-Latino'],
 ['Rome', 'Q. X Ostiense'],
 ['Rome', 'Q. XI Portuense'],
 ['Rome', 'Q. XII Gianicolense'],
 ['Rome', 'Q. XIII Aurelio'],
 ['Rome', 'Q. XIV Trionfale'],
 ['Rome', 'Q. XV Della Vittoria'],
 ['Rome', 'Q. XVI Monte Sacro'],
 ['Rome', 'Q. XVII Trieste'],
 ['Rome', 'Q. XVIII Tor di Quinto'],
 ['Rome', 'Q. XIX Prenestino-Centocelle'],
 ['Rome', 'Q. XX Ardeatino'],
 ['Rome', 'Q. XXI Pietralata'],
 ['Rome', 'Q. XXII Collatino'],
 ['Rome', 'Q. XXIII Alessandrino'],
 ['Rome', 'Q. XXIV Don Bosco'],
 ['Rome', 'Q. XXV Appio Claudio'],
 ['Rome', 'Q. XXVI Appio-Pignatelli'],
 ['Rome', 'Q. XXVII Primavalle'],
 ['Rome', 'Q. XXVIII Monte Sacro Alto'],
 ['Rome', 'Q. XXIX Ponte Mammolo'],
 ['Rome', 'Q. XXX San Basilio'],
 ['Rome', 

In [3]:
# Build the quarters dataframe for Rome: one row per scraped [city, quarter] pair
df_quarters = pd.DataFrame(data=list, columns=['City', 'Quarter'])
df_quarters

Unnamed: 0,City,Quarter
0,Rome,Q. I Flaminio
1,Rome,Q. II Parioli
2,Rome,Q. III Pinciano
3,Rome,Q. IV Salario
4,Rome,Q. V Nomentano
5,Rome,Q. VI Tiburtino
6,Rome,Q. VII Prenestino-Labicano
7,Rome,Q. VIII Tuscolano
8,Rome,Q. IX Appio-Latino
9,Rome,Q. X Ostiense


In [4]:
# Show the (rows, columns) dimensions of the quarters dataframe
df_quarters.shape

(35, 2)

In [5]:
# Break each 'Quarter' string on whitespace into at most three pieces:
# the type abbreviation, the number and the (possibly multi-word) name
quarter_parts = df_quarters['Quarter'].str.split(n=2, expand=True)
df_quarters[['Type', 'Number', 'Neighbourhood']] = quarter_parts
df_quarters

Unnamed: 0,City,Quarter,Type,Number,Neighbourhood
0,Rome,Q. I Flaminio,Q.,I,Flaminio
1,Rome,Q. II Parioli,Q.,II,Parioli
2,Rome,Q. III Pinciano,Q.,III,Pinciano
3,Rome,Q. IV Salario,Q.,IV,Salario
4,Rome,Q. V Nomentano,Q.,V,Nomentano
5,Rome,Q. VI Tiburtino,Q.,VI,Tiburtino
6,Rome,Q. VII Prenestino-Labicano,Q.,VII,Prenestino-Labicano
7,Rome,Q. VIII Tuscolano,Q.,VIII,Tuscolano
8,Rome,Q. IX Appio-Latino,Q.,IX,Appio-Latino
9,Rome,Q. X Ostiense,Q.,X,Ostiense


In [6]:
# Drop the raw 'Quarter' column now that it has been split into separate columns.
# Dropping by name instead of by position keeps this cell safe to re-run:
# a second positional drop of column 1 would remove 'Type' instead.
df_quarters = df_quarters.drop(columns='Quarter', errors='ignore')
# Remove the dot from the type abbreviation ('Q.' -> 'Q').
# regex=False makes '.' a literal dot; interpreted as a regular expression
# it would match any character.
df_quarters['Type'] = df_quarters['Type'].str.replace('.', '', regex=False)
df_quarters

Unnamed: 0,City,Type,Number,Neighbourhood
0,Rome,Q,I,Flaminio
1,Rome,Q,II,Parioli
2,Rome,Q,III,Pinciano
3,Rome,Q,IV,Salario
4,Rome,Q,V,Nomentano
5,Rome,Q,VI,Tiburtino
6,Rome,Q,VII,Prenestino-Labicano
7,Rome,Q,VIII,Tuscolano
8,Rome,Q,IX,Appio-Latino
9,Rome,Q,X,Ostiense


In [7]:
# Show the (rows, columns) dimensions after splitting and dropping the 'Quarter' column
df_quarters.shape

(35, 4)

### Then the Rioni of Rome

In [8]:
# Download the Italian Wikipedia page that lists the rioni of Rome.
# The timeout prevents the cell from hanging forever on a stalled connection.
response = requests.get('https://it.wikipedia.org/wiki/Rioni_di_Roma', timeout=30)
# Fail loudly on an HTTP error instead of silently parsing an error page
response.raise_for_status()
source = response.text
# Parse the HTML
soup = BeautifulSoup(source, 'lxml')
# Check title of webpage
print(soup.title)
# Keep the HTML of every table whose class is 'wikitable sortable', as one string
html_table = str(soup.find_all('table', {'class': 'wikitable sortable'}))
# Render the raw HTML in the notebook to inspect the table
display_html(html_table, raw=True)

<title>Rioni di Roma - Wikipedia</title>


Stemma,Numero,Nome,Popolazione[4],Superficie,Densità,Circoscrizione,Mappa
,R. I,Monti,13 028,"1,6508 km²","7891,93 ab./km²",Municipio I,
,R. II,Trevi,2 327,"0,5503 km²","4228,6 ab./km²",Municipio I,
,R. III,Colonna,2 111,"0,2689 km²","7850,5 ab./km²",Municipio I,
,R. IV,Campo Marzio,5 860,"0,8817 km²","6646,25 ab./km²",Municipio I,
,R. V,Ponte,3 596,"0,3189 km²","11276,26 ab./km²",Municipio I,
,R. VI,Parione,2 572,"0,1938 km²","13271,41 ab./km²",Municipio I,
,R. VII,Regola,3 238,"0,3189 km²","10153,65 ab./km²",Municipio I,
,R. VIII,Sant'Eustachio,1 962,"0,1688 km²","11623,22 ab./km²",Municipio I,
,R. IX,Pigna,10 737,"0,2063 km²","52045,56 ab./km²",Municipio I,
,R. X,Campitelli,552,"0,5990 km²","921,54 ab./km²",Municipio I,


In [22]:
from io import StringIO  # local import: only this cell needs it

# Parse the scraped HTML into a list of dataframes (one per <table>).
# The markup is wrapped in StringIO because passing literal HTML to read_html
# is deprecated in recent pandas versions. The result gets its own name so the
# builtin-shadowing name `list` is not rebound again.
rioni_tables = pd.read_html(StringIO(html_table))
# The first parsed table is used as the rioni dataframe
df_rioni = rioni_tables[0]
df_rioni


Unnamed: 0,Stemma,Numero,Nome,Popolazione[4],Superficie,Densità,Circoscrizione,Mappa
0,,R. I,Monti,13 028,"1,6508 km²","7891,93 ab./km²",Municipio I,
1,,R. II,Trevi,2 327,"0,5503 km²","4228,6 ab./km²",Municipio I,
2,,R. III,Colonna,2 111,"0,2689 km²","7850,5 ab./km²",Municipio I,
3,,R. IV,Campo Marzio,5 860,"0,8817 km²","6646,25 ab./km²",Municipio I,
4,,R. V,Ponte,3 596,"0,3189 km²","11276,26 ab./km²",Municipio I,
5,,R. VI,Parione,2 572,"0,1938 km²","13271,41 ab./km²",Municipio I,
6,,R. VII,Regola,3 238,"0,3189 km²","10153,65 ab./km²",Municipio I,
7,,R. VIII,Sant'Eustachio,1 962,"0,1688 km²","11623,22 ab./km²",Municipio I,
8,,R. IX,Pigna,10 737,"0,2063 km²","52045,56 ab./km²",Municipio I,
9,,R. X,Campitelli,552,"0,5990 km²","921,54 ab./km²",Municipio I,


In [23]:
# Keep only the number ('Numero') and the name ('Nome') of each rione.
# Selecting the columns by name (instead of dropping the others by position,
# in place) gives the same two columns but can be re-run without an IndexError.
# .copy() makes the result an independent frame, so the column assignments in
# later cells do not trigger a SettingWithCopyWarning.
df_rioni = df_rioni.loc[:, ['Numero', 'Nome']].copy()
df_rioni

Unnamed: 0,Numero,Nome
0,R. I,Monti
1,R. II,Trevi
2,R. III,Colonna
3,R. IV,Campo Marzio
4,R. V,Ponte
5,R. VI,Parione
6,R. VII,Regola
7,R. VIII,Sant'Eustachio
8,R. IX,Pigna
9,R. X,Campitelli


In [24]:
# Discard every row (not column) that still holds at least one missing value
df_rioni.dropna(axis='index', how='any', inplace=True)
df_rioni

Unnamed: 0,Numero,Nome
0,R. I,Monti
1,R. II,Trevi
2,R. III,Colonna
3,R. IV,Campo Marzio
4,R. V,Ponte
5,R. VI,Parione
6,R. VII,Regola
7,R. VIII,Sant'Eustachio
8,R. IX,Pigna
9,R. X,Campitelli


In [25]:
# Tag every rione with its city and move that column to the front
df_rioni['City'] = 'Rome'
remaining_columns = df_rioni.columns.drop('City').tolist()
df_rioni = df_rioni[['City'] + remaining_columns]
df_rioni

Unnamed: 0,City,Numero,Nome
0,Rome,R. I,Monti
1,Rome,R. II,Trevi
2,Rome,R. III,Colonna
3,Rome,R. IV,Campo Marzio
4,Rome,R. V,Ponte
5,Rome,R. VI,Parione
6,Rome,R. VII,Regola
7,Rome,R. VIII,Sant'Eustachio
8,Rome,R. IX,Pigna
9,Rome,R. X,Campitelli


### Cleaning and preparing the neighbourhoods

#### Remove rows with 'Template' in the Neighbourhood

In [None]:
# `df` is not created by any earlier cell, so it is built here: the quarters
# and the rioni are stacked into one table with the columns of df_quarters.
# NOTE(review): combining both tables is the presumed intent of this notebook - confirm.
rioni_numbering = df_rioni['Numero'].str.split(n=1, expand=True)  # e.g. 'R. I' -> 'R.', 'I'
df = pd.concat(
    [
        df_quarters[['City', 'Type', 'Number', 'Neighbourhood']],
        pd.DataFrame({
            'City': df_rioni['City'],
            # literal dot removal, same clean-up as for the quarters
            'Type': rioni_numbering[0].str.replace('.', '', regex=False),
            'Number': rioni_numbering[1],
            'Neighbourhood': df_rioni['Nome'],
        }),
    ],
    ignore_index=True,
)
# Count the rows whose Neighbourhood begins with 'Template';
# na=False treats a missing name as "does not match" instead of failing the mask
df.loc[df.Neighbourhood.str.startswith('Template', na=False), 'Neighbourhood'].count()

In [None]:
# Keep only the rows whose Neighbourhood does not begin with 'Template'.
# startswith is used so this filter removes exactly the rows counted in the
# previous cell. .copy() makes df1 an independent frame, because later cells
# assign new columns to it (avoids SettingWithCopyWarning).
df1 = df[~df.Neighbourhood.str.startswith('Template', na=False)].copy()
df1.head()

In [None]:
# Show the (rows, columns) dimensions after removing the 'Template' rows
df1.shape

#### Remove text between brackets () in the column Neighbourhood

In [None]:
# Count the neighbourhoods whose name contains an opening bracket '('.
# A literal match (regex=False) replaces the pattern '\(' , which is an invalid
# escape sequence in a normal Python string.
df1.loc[df1.Neighbourhood.str.contains('(', regex=False, na=False), 'Neighbourhood'].count()

In [None]:
# Keep only the part of the name before the opening bracket '('.
# A single-character separator is matched literally by str.split, so no escaped
# pattern is needed. The final strip() removes the whitespace that is left
# behind in front of the bracket.
df1['Neighbourhood'] = df1['Neighbourhood'].str.split('(').str[0].str.strip()
df1

In [None]:
# Check whether any closing bracket ')' is left in the Neighbourhood names.
# A literal match (regex=False) replaces the pattern '\)' , which is an invalid
# escape sequence in a normal Python string.
df1.loc[df1.Neighbourhood.str.contains(')', regex=False, na=False), 'Neighbourhood'].count()

In [None]:
# Build the 'Address' text ("<neighbourhood>, <city>") that is sent to the geocoder
df1["Address"] = df1["Neighbourhood"].str.cat(df1["City"], sep=', ')
df1

In [None]:
# Work on an independent copy of the cleaned neighbourhood data:
# the cells below add columns to and drop rows from df2, leaving df1 untouched
df2 = df1.copy()
df2

## Collecting the geographical coordinates for the neighbourhoods of Rome

In [None]:
# Look up the coordinates of one neighbourhood to check that the geolocator works.
# The test address is a rione of Rome (this notebook is about Rome, not Amsterdam).
address = 'Monti, Rome'

geolocator = Nominatim(user_agent="neighbourhoud_explorer")

location = geolocator.geocode(address)
# geocode() yields None when nothing is found; report that instead of failing
# with an AttributeError on location.latitude
if location is None:
    print('No geographical coordinates found for {}.'.format(address))
else:
    latitude = location.latitude
    longitude = location.longitude
    print('The geographical coordinates of {} are {}, {}.'.format(address, latitude, longitude))

In [None]:
# 1 - wrap the geocoder in a RateLimiter so that consecutive geocoding calls
#     are spaced by a minimum delay of one second (min_delay_seconds=1)
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)

In [None]:
# 2 - create the 'location' column by geocoding every address through the
#     rate-limited wrapper (one lookup per row)
df2['location'] = df2['Address'].apply(geocode)

In [None]:
# 3 - turn each geocoding result into a plain tuple taken from its `point`
#     (split into latitude, longitude and altitude further below)
def _point_or_none(loc):
    """Return ``tuple(loc.point)`` for a truthy geocoding result, otherwise None."""
    if not loc:
        return None
    return tuple(loc.point)

df2['point'] = df2['location'].apply(_point_or_none)


In [None]:
# List the neighbourhoods for which no location is available:
# first the per-column count of those rows, then the rows themselves
rows_without_location = df2[df2["location"].isna()]
print(rows_without_location.count())
rows_without_location

In [None]:
# Remove the rows without geographical coordinates and renumber the index.
# subset=['location'] restricts the check to the geocoding result; a plain
# dropna() would also discard rows that merely have a missing value in an
# unrelated column.
df2 = df2.dropna(subset=['location']).reset_index(drop=True)
df2

In [None]:
# Show the (rows, columns) dimensions after removing rows without coordinates
df2.shape

In [None]:
# 4 - expand the 'point' tuples into separate latitude, longitude and altitude
#     columns, keeping the rows aligned through the shared index
coordinate_columns = ['latitude', 'longitude', 'altitude']
coordinates = pd.DataFrame(df2['point'].tolist(), index=df2.index)
df2[coordinate_columns] = coordinates
df2

In [None]:
# The code was removed by Watson Studio for sharing.

In [None]:
# Create an access to this project.
# NOTE(review): `token` is not defined in any visible cell - presumably it is set
# in the cell that Watson Studio removed for sharing; confirm.
project = Project.access(None, token, token)

# Save the collected neighbourhoods and geographical data in the project data bucket.
# The file is named after Rome: the former name "geo_amsterdam.csv" was a leftover
# from the Amsterdam notebook and would store Rome data under the Amsterdam name.
project.save_data(file_name="geo_rome.csv", data=df2.to_csv(index=False))