# Data collection

In [18]:
#%load_ext soup
import requests
from bs4 import BeautifulSoup
import pandas as pd
from os import path

## Preparation

URL of the filtering page (this is different from the results URL). The `ww_i_reportModel` parameter is mandatory.

In [19]:
# Endpoint of the filter (form) page. NOTE: `url` is rebound to the results
# endpoint in a later cell, so this cell must be re-run before re-fetching the form.
url = "http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.filter"

In [20]:
# Query-string parameters for the request; later cells add more keys to this same dict.
arguments = {'ww_i_reportModel':133685247} # presumably the ID of the report model to query -- TODO confirm

In [21]:
# Fetch the filtering page and parse it so that its <select> elements can be inspected.
r = requests.get(url, params=arguments)
r.raise_for_status()  # fail here on an HTTP error rather than parsing an error page
isa_form = BeautifulSoup(r.text, 'html.parser')

Extract the parameters from the filtering page to make querying easier:

In [22]:
# Map every <option>'s visible label to the (select name, option value) pair that
# has to be sent as a query parameter to pick that option.
# If two options share the same label, the one found last wins.
request_codes = {}
for select in isa_form.find_all('select'):   # `select`, not `input`: don't shadow the builtin
    for option in select.find_all('option'):
        request_codes[option.text] = (select['name'], option['value'])

This is the base url for data results, different from the filtering one:

In [23]:
# NOTE: this rebinds `url`, which previously held the filter-page endpoint.
# Re-running the earlier form-fetching cell after this one would query the
# results endpoint instead.
url = "http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.html?"

Add hidden default params:

In [24]:
# Hidden default parameters, added to the query in one go.
arguments.update({
    "ww_x_GPS": "-1",  # this one is mandatory and always -1
    "ww_i_reportModel": "133685247",
    "ww_i_reportModelXsl": "133685270",
    "ww_x_HIVERETE": "null",
})

And specify only the IN section:

In [25]:
# Restrict the query to the "Informatique" option: look up its
# (parameter name, value) pair and add it to the request parameters.
section_param, section_value = request_codes["Informatique"]
arguments[section_param] = section_value

## Collection

In [30]:
def download_data(url, params, period_name, base_filename, year, semester, verbose=False):
    """Download the table for one academic period and save it to disk as CSV.

    The output file is ``<base_filename>_<year>_<semester>.csv``. If that file
    already exists, no request is made. Otherwise the page at ``url`` is
    requested, its first HTML table is parsed, all-empty columns are dropped,
    ``year`` and ``semester`` columns are added, and the result is written with
    ``DataFrame.to_csv`` (index included).

    Parameters
    ----------
    url : str
        Address of the results page.
    params : dict
        Base query parameters. The dict is not modified; the period parameter
        is added to a copy.
    period_name : str
        Label of the period; must be a key of the module-level
        ``request_codes`` mapping.
    base_filename : str
        Path prefix of the output file. Its directory is created if missing.
    year : int
        Value stored in the ``year`` column and used in the file name.
    semester : int or str
        Value stored in the ``semester`` column and used in the file name.
    verbose : bool, optional
        Print progress messages when true.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If ``period_name`` is not present in ``request_codes``.
    """
    import os
    from io import StringIO

    filename = base_filename + '_%d_%s.csv' % (year, str(semester))
    if path.isfile(filename):
        if verbose:
            print("Already have file %s" % filename)
        return  # avoid unnecessary requests

    # Translate the period name into its request parameter. Work on a copy of
    # the caller's parameters (not on a global) so the function really uses
    # what it was given and leaves it untouched.
    period_param, period_value = request_codes[period_name]
    query = dict(params)
    query[period_param] = period_value

    if verbose:
        print("Getting data %s..." % filename, end=' ')
    r = requests.get(url, params=query)

    try:
        # Data is in the first HTML <table>; its first row is 'noise'.
        # The markup is wrapped in StringIO because newer pandas versions
        # deprecate passing literal HTML strings.
        tables = pd.read_html(StringIO(r.text), flavor='lxml', skiprows=1, header=0)
    except ValueError:
        # pandas couldn't read, i.e. there was no table to process
        if verbose:
            print("Failed to read %s" % filename)
        return

    if len(tables) == 0:  # pandas didn't read any frame
        if verbose:
            print("Period contains no data. Skipping...")
        return

    table = tables[0].dropna(axis=1, how='all')  # remove empty columns
    table['year'] = year
    table['semester'] = semester

    # Make sure the destination directory exists so the download is not lost.
    target_dir = path.dirname(filename)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)
    table.to_csv(filename)
    if verbose:
        print("Saved period as %s" % filename)

In [31]:
# Semesters to fetch for each programme, and the path prefix of the files they go to.
bachelor_semesters = [1, 5, 6]    # we need the 5th to detect 4-5-6-5
bachelor_base_file = "data/bachelor"

master_semesters = [1, 2, 3, 4]   # though in the 4th there's no data
master_base_file = 'data/master'

project_semesters = ['automne', 'printemps']
project_base_file = 'data/projet_master'

# One entry per programme: (period label template, semesters, output prefix).
# The order of the entries is the order in which periods are downloaded.
programmes = (
    ("Bachelor semestre %d", bachelor_semesters, bachelor_base_file),
    ("Master semestre %d", master_semesters, master_base_file),
    ("Projet Master %s", project_semesters, project_base_file),
)

for year in range(2007, 2017):
    # fill in params request: year
    y_range = "%d-%d" % (year, year + 1)
    year_param, year_value = request_codes[y_range]
    arguments[year_param] = year_value

    for label_template, semesters, base_file in programmes:
        for semester in semesters:
            # fill in params request: semester
            academic_period = label_template % semester
            download_data(url, arguments, academic_period, base_file, year, semester, verbose=True)

Already have file data/bachelor_2007_1.csv
Already have file data/bachelor_2007_5.csv
Already have file data/bachelor_2007_6.csv
Already have file data/master_2007_1.csv
Already have file data/master_2007_2.csv
Already have file data/master_2007_3.csv
Getting data data/master_2007_4.csv... Failed to read data/master_2007_4.csv
Already have file data/projet_master_2007_automne.csv
Already have file data/projet_master_2007_printemps.csv
Already have file data/bachelor_2008_1.csv
Already have file data/bachelor_2008_5.csv
Already have file data/bachelor_2008_6.csv
Already have file data/master_2008_1.csv
Already have file data/master_2008_2.csv
Already have file data/master_2008_3.csv
Getting data data/master_2008_4.csv... Failed to read data/master_2008_4.csv
Already have file data/projet_master_2008_automne.csv
Already have file data/projet_master_2008_printemps.csv
Already have file data/bachelor_2009_1.csv
Already have file data/bachelor_2009_5.csv
Already have file data/bachelor_2009

### Let's take a look at a small sample

In [32]:
# Load one saved bachelor file (year 2007, semester 1) and show its first rows.
# The first unnamed column is the index that `to_csv` stored when the file was saved.
pd0 = pd.read_csv('data/bachelor_2007_1.csv')
pd0.head()

Unnamed: 0.1,Unnamed: 0,Civilité,Nom Prénom,No Sciper,year,semester
0,0,Monsieur,Arévalo Christian,169569,2007,1
1,1,Monsieur,Aubelle Flavien,174905,2007,1
2,2,Monsieur,Badoud Morgan,173922,2007,1
3,3,Monsieur,Baeriswyl Jonathan,179406,2007,1
4,4,Monsieur,Barroco Michael,179428,2007,1


In [22]:
# Load one saved master file (year 2007, semester 1) and show its first rows.
# The first unnamed column is the index that `to_csv` stored when the file was saved.
pd1 = pd.read_csv("data/master_2007_1.csv")
pd1.head()

Unnamed: 0.1,Unnamed: 0,Civilité,Nom Prénom,Spécialisation,Statut,Type Echange,Ecole Echange,No Sciper,year,semester
0,0,Monsieur,Aeberhard François-Xavier,,Présent,,,153066,2007,1
1,1,Madame,Agarwal Megha,,Présent,,,180027,2007,1
2,2,Monsieur,Anagnostaras David,,Présent,,,152232,2007,1
3,3,Monsieur,Auroux Damien,,Présent,,,177395,2007,1
4,4,Monsieur,Awalebo Joseph,,Présent,,,161970,2007,1


In [21]:
# Load one saved master-project file (year 2007, 'printemps') and show its first rows.
# The first unnamed column is the index that `to_csv` stored when the file was saved.
pd2 = pd.read_csv("data/projet_master_2007_printemps.csv")
pd2.head()

Unnamed: 0.1,Unnamed: 0,Civilité,Nom Prénom,Spécialisation,Statut,No Sciper,year,semester
0,0,Monsieur,Brutsche Florian,Internet computing,Congé,159852,2007,printemps
1,1,Monsieur,Dotta Mirco,,Stage,153819,2007,printemps
2,2,Monsieur,Hügli Michael,,Stage,145957,2007,printemps
3,3,Monsieur,Indra Saurabh,,Présent,173257,2007,printemps
4,4,Monsieur,Lépine Simon,Biocomputing,Présent,160150,2007,printemps
