In [1]:
% cd ~/Master_Data_Science/TFM/2_Collect_data/
% pwd

/home/giltrapo/Master_Data_Science/TFM/2_Collect_data


u'/home/giltrapo/Master_Data_Science/TFM/2_Collect_data'

### Collect codes of public elementary schools in municipality of Madrid.

In [2]:
import re
import requests
from bs4 import BeautifulSoup
import pandas as pd
import pickle

In [3]:
# Url to perform advanced searches.
url_advsearch = "http://www.madrid.org/wpad_pub/run/j/BusquedaAvanzada.icm"

In [4]:
# Parameter for searching public primary schools in municipality of Madrid
params = {"titularidadPublica": "S", "cdMuni": "079", "cdNivelEdu": "6545"}

In [5]:
# Request and parse list of schools.
schools = BeautifulSoup(requests.post(url_advsearch, data = params).content, "lxml")

In [6]:
# Extract list of school codes.
school_codes = schools.findAll(attrs = {"name": "codCentrosExp", "value": re.compile("^.+$")})[0]["value"]

In [7]:
# Convert from string to list.
school_codes = school_codes.split(";")

# Check codes.
# 247 it's ok.
print len(school_codes)

247


In [8]:
# Save the list of school codes
with open("school_codes.txt", "wb") as f:
    pickle.dump(school_codes, f)

In [9]:
# open list of school codes
with open("school_codes.txt", "rb") as f:
    school_codes = pickle.load(f)

### Extract tables from school cards.

In [10]:
# The data that we want to obtain from each school is contained in tables,
# whose content is generated on-the-fly through JavaScript code.
# We can't use 'request' library again because only fetch source code of
# the web page but it doesn't run code. So we need to mimic the rendering
# process of a browser. Let's use dryscrape

import dryscrape

In [11]:
# School card url.
url_schoolcard = "http://www.madrid.org/wpad_pub/run/j/MostrarFichaCentro.icm"

# School code parameter.
school_code_par = "cdCentro="

# List of schools urls
schools_urls = [url_schoolcard + "?" + school_code_par + code for code in school_codes]

In [12]:
# Check with a single url.

render = dryscrape.Session()
render.visit("http://www.madrid.org/wpad_pub/run/j/MostrarFichaCentro.icm?cdCentro=28063799")
source = render.body()
school_card = BeautifulSoup(source, "lxml")
school_tables = school_card.findAll('table', class_="tablaGraficaDatos")

table = list(school_tables)[1]
pd.read_html(table.prettify())

[  PROCESO DE ADMISIÓN (solicitudes presentadas, admitidas, no admitidas)  \
 0                                        Presentadas                       
 1                                          Admitidas                       
 2                                       No admitidas                       
 
    2012-2013  2013-2014  2014-2015  2015-2016  2016-2017  
 0        103         93         74         71         64  
 1         95         89         74         67         63  
 2          8          4          0          4          1  ]

In [13]:
type(table)

bs4.element.Tag

In [14]:
# The second table is not that one I want. This table is generated by default when the page loads.
# I need to update it by clicking on radio button labeled "Primary".
# I try this with "at_css" method.

render = dryscrape.Session()
render.visit("http://www.madrid.org/wpad_pub/run/j/MostrarFichaCentro.icm?cdCentro=28063799")
radiob = render.at_css('#nivEd12\.grafica3')
radiob.click()
source = render.body()
school_card = BeautifulSoup(source, "lxml")
school_tables = school_card.findAll('table', class_="tablaGraficaDatos")

table = list(school_tables)[1]
pd.read_html(table.prettify())

InvalidResponseError: {"class":"ClickFailed","message":"Failed to find position for element /html/body/div[@id='contenedor']/div[@id='solapas']/div[10]/table/tbody/tr[1]/td[1]/div[@id='solapaspanel1']/div[@id='cuerpoL']/div/div[@id='capaSelGrafica']/div[@id='display.grafica3']/table/tbody/tr[2]/td[2]/input[@id='nivEd12.grafica3'] because it is not visible"}

In [15]:
# The method doesn't work. The radio button is invisible because the parent node is not
# displayed (<div id="solapaspanel1" style="display: none;">...</div>).
# I try to run a piece of javascript to trigger that click event.

render = dryscrape.Session()
render.visit("http://www.madrid.org/wpad_pub/run/j/MostrarFichaCentro.icm?cdCentro=28063799")
render.driver.exec_script('document.getElementById("nivEd12.grafica3").click();')
source = render.body()
school_card = BeautifulSoup(source, "lxml")
school_tables = school_card.findAll('table', class_="tablaGraficaDatos")
table = list(school_tables)[1]
pd.read_html(table.prettify())

[  PROCESO DE ADMISIÓN (solicitudes presentadas, admitidas, no admitidas)  \
 0                                        Presentadas                       
 1                                          Admitidas                       
 2                                       No admitidas                       
 
    2012-2013  2013-2014  2014-2015  2015-2016  2016-2017  
 0         20         25         25         20         18  
 1         14         11          7          9         11  
 2          6         14         18         11          7  ]

In [29]:
# Work fine. Now, the loop test.

schools_urls2 = ['http://www.madrid.org/wpad_pub/run/j/MostrarFichaCentro.icm?cdCentro=28077865',
 'http://www.madrid.org/wpad_pub/run/j/MostrarFichaCentro.icm?cdCentro=28063751',
 'http://www.madrid.org/wpad_pub/run/j/MostrarFichaCentro.icm?cdCentro=28004989',
 'http://www.madrid.org/wpad_pub/run/j/MostrarFichaCentro.icm?cdCentro=28004990']

school_tables_collection = {}
school_name_collection = []

render = dryscrape.Session()
for z, school in enumerate(schools_urls2):
    render.visit(school)
    render.driver.exec_script('document.getElementById("nivEd12.grafica3").click();')
    source = render.body()
    school_card = BeautifulSoup(source, "lxml")
    school_tables = school_card.findAll('table', class_="tablaGraficaDatos")
    school_name = school_card.find(style="text-transform:uppercase").next.next
    for i, table in list(enumerate(school_tables)):
        if i <= 1:
            school_tables_collection[school_name + "_" + str(i)] = \
            pd.read_html(table.prettify())
            school_name_collection.append(school_name)

            
            print "Tables of school %s extracted" % schools_urls[z]

InvalidResponseError: {"class":"InvalidResponseError","message":"Javascript failed to execute"}

In [58]:
schools_urls2 = ['http://www.madrid.org/wpad_pub/run/j/MostrarFichaCentro.icm?cdCentro=28077865',
 'http://www.madrid.org/wpad_pub/run/j/MostrarFichaCentro.icm?cdCentro=28063751',
 'http://www.madrid.org/wpad_pub/run/j/MostrarFichaCentro.icm?cdCentro=28004989',
 'http://www.madrid.org/wpad_pub/run/j/MostrarFichaCentro.icm?cdCentro=28004990']


for z, school in enumerate(schools_urls2):
    render = dryscrape.Session()
    render.visit(url)
    #render.driver.exec_script('document.getElementById("nivEd12.grafica3").click();')
    #source = render.body()
    #school_card = BeautifulSoup(source, "lxml")
    #school_tables = school_card.findAll('table', class_="tablaGraficaDatos")
    #table = list(school_tables)[1]
    #print pd.read_html(table.prettify())
    render.reset()
    print school
    

http://www.madrid.org/wpad_pub/run/j/MostrarFichaCentro.icm?cdCentro=28077865
http://www.madrid.org/wpad_pub/run/j/MostrarFichaCentro.icm?cdCentro=28063751
http://www.madrid.org/wpad_pub/run/j/MostrarFichaCentro.icm?cdCentro=28004989
http://www.madrid.org/wpad_pub/run/j/MostrarFichaCentro.icm?cdCentro=28004990


In [42]:
schools_urls2 = ["'http://www.madrid.org/wpad_pub/run/j/MostrarFichaCentro.icm?cdCentro=28077865'",
 "'http://www.madrid.org/wpad_pub/run/j/MostrarFichaCentro.icm?cdCentro=28063751'",
 "'http://www.madrid.org/wpad_pub/run/j/MostrarFichaCentro.icm?cdCentro=28004989'",
 "'http://www.madrid.org/wpad_pub/run/j/MostrarFichaCentro.icm?cdCentro=28004990'"]

for z, school in enumerate(schools_urls2):
    print z, school

0 'http://www.madrid.org/wpad_pub/run/j/MostrarFichaCentro.icm?cdCentro=28077865'
1 'http://www.madrid.org/wpad_pub/run/j/MostrarFichaCentro.icm?cdCentro=28063751'
2 'http://www.madrid.org/wpad_pub/run/j/MostrarFichaCentro.icm?cdCentro=28004989'
3 'http://www.madrid.org/wpad_pub/run/j/MostrarFichaCentro.icm?cdCentro=28004990'
