In [1]:
# This script uses the sys, re, requests, PyQt4, bs4 and pandas Python libraries.

import sys
import requests
from bs4 import BeautifulSoup
from PyQt4.QtGui import QApplication
from PyQt4.QtCore import QUrl
from PyQt4.QtWebKit import QWebPage
import pandas as pd
import re

### Collect codes of public elementary schools in municipality of Madrid.

In [2]:
# Address used to run advanced searches.
url_advsearch = (
  "http://www.madrid.org"
  "/wpad_pub/run/j/BusquedaAvanzada.icm"
)

In [3]:
# Form fields that restrict the search to public primary schools in the
# municipality of Madrid.
params = dict(
  titularidadPublica="S",
  cdMuni="079",
  cdNivelEdu="6545",
)

In [4]:
# Request the list of schools and parse the returned HTML.
# The timeout keeps the cell from hanging on an unresponsive server, and
# raise_for_status() stops on an HTTP error instead of parsing an error page.
search_response = requests.post(url_advsearch, data=params, timeout=60)
search_response.raise_for_status()
schools = BeautifulSoup(search_response.content, "lxml")

In [5]:
# Take the "value" attribute of the first element named "codCentrosExp"
# whose value matches ^.+$ (i.e. holds at least one character).
school_codes = schools.findAll(
  attrs={
    "name": "codCentrosExp",
    "value": re.compile("^.+$"),
  }
)[0]["value"]

In [6]:
# Convert from a ";"-separated string to a list.
# The type check makes this cell safe to re-run: once school_codes is a
# list it is left untouched instead of failing on a second .split().
if not isinstance(school_codes, list):
  school_codes = school_codes.split(";")

# Check codes: 247 are expected.
# Parenthesised so the line is valid in both Python 2 and Python 3.
print(len(school_codes))

247


### Extract tables from school cards.

In [7]:
# The data we want from each school is contained in tables whose content
# is generated on the fly by JavaScript code.
# We can't use the 'requests' library again because it only fetches the
# source code of the web page and doesn't run that code, so we need to
# mimic the rendering process of a browser.
# The QtWebKit module of the PyQt4 toolkit implements a web browser
# engine based on the open-source WebKit browser engine.


# Define the class 'mimic_render', which inherits from QWebPage.
class mimic_render(QWebPage):
  """Load a web page with QtWebKit and keep its main frame.

  Constructing an instance starts loading ``url`` and runs the Qt event
  loop until the ``loadFinished`` signal fires, so the constructor only
  returns once the page load has ended.

  Attributes set on the instance:
    app:   the QApplication whose event loop is run while loading.
    frame: the page's main frame, stored when loading finishes.
  """

  def __init__(self, url):
    """Load ``url`` and block until the page has finished loading.

    url: address of the page to load; it is wrapped in a QUrl.
    """
    # Qt allows only one QApplication per process. Reuse the existing one
    # so that creating several 'mimic_render' objects in the same kernel
    # does not try to build a second application object.
    app = QApplication.instance()
    if app is None:
      app = QApplication(sys.argv)
    self.app = app
    QWebPage.__init__(self)
    self.loadFinished.connect(self.on_page_load)
    self.mainFrame().load(QUrl(url))
    # Blocks here until on_page_load() calls quit().
    self.app.exec_()

  def on_page_load(self, result):
    """Slot for ``loadFinished``: keep the main frame and stop the loop.

    result: value delivered by the ``loadFinished`` signal; not used here.
    """
    self.frame = self.mainFrame()
    self.app.quit()

In [8]:
# Base address of a school card.
url_schoolcard = "http://www.madrid.org/wpad_pub/run/j/MostrarFichaCentro.icm"

# Query-string prefix that carries the school code.
school_code_par = "cdCentro="

# Card URLs used to instantiate 'mimic_render', built for the schools at
# positions 0, 100 and 200 of school_codes.
url_card_0, url_card_100, url_card_200 = (
  "".join([url_schoolcard, "?", school_code_par, school_codes[position]])
  for position in (0, 100, 200)
)

In [9]:
# Create an instance of the class 'mimic_render' for the first school card.
# PyQt4 loads the web page while the 'mimic_render' object is created; the
# object is a QWebPage subclass, and the call returns once loading finishes.
render_content_0 = mimic_render(url_card_0)

In [None]:
# Load the card of the school at position 100 of school_codes.
render_content_100 = mimic_render(url_card_100)

In [None]:
# Load the card of the school at position 200 of school_codes.
render_content_200 = mimic_render(url_card_200)

In [None]:
# PyQt4: read the HTML source held by the main frame of each loaded page.
source_0, source_100, source_200 = (
  rendered_page.frame.toHtml()
  for rendered_page in (render_content_0, render_content_100, render_content_200)
)

In [None]:
# Convert each QString to a Python unicode string for BeautifulSoup.
# unicode() keeps every character, whereas toAscii() goes through an 8-bit
# encoding that cannot represent all characters and leaves BeautifulSoup
# to guess how the resulting bytes were encoded.
formatted_source_0 = unicode(source_0)
formatted_source_100 = unicode(source_100)
formatted_source_200 = unicode(source_200)

In [None]:
# Build a BeautifulSoup tree (lxml parser) from the HTML of each card.
school_card_0, school_card_100, school_card_200 = (
  BeautifulSoup(card_html, "lxml")
  for card_html in (formatted_source_0, formatted_source_100, formatted_source_200)
)

In [None]:
# Collect every <table> with class "tablaGraficaDatos" from each card.
school_tables_0, school_tables_100, school_tables_200 = (
  card.findAll('table', class_="tablaGraficaDatos")
  for card in (school_card_0, school_card_100, school_card_200)
)

In [None]:
# Extract the school name: find the element styled
# "text-transform:uppercase" and step two nodes forward with .next.
school_name_0, school_name_100, school_name_200 = (
  card.find(style="text-transform:uppercase").next.next
  for card in (school_card_0, school_card_100, school_card_200)
)