In [47]:
import numpy as np
import requests
from bs4 import BeautifulSoup

In [119]:
# Parser backend handed to BeautifulSoup ('lxml' is an alternative)
parser = 'html.parser'

# Endpoints: the filter form and the rendered report table
filter_url = 'http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.filter'
table_url = 'http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.html'

# Query parameters used when fetching the filter form
payload = {'ww_i_reportModel': '133685247'}

# Readable field name -> name attribute of the matching <select> in the filter form
table_name = {
    'unite_faculty': 'ww_x_UNITE_ACAD',
    'academic_year': 'ww_x_PERIODE_ACAD',
    'semester': 'ww_x_PERIODE_PEDAGO',
    'season': 'ww_x_HIVERETE',
}

# Base query parameters shared by every report request.
# NOTE(review): 'ww_i_reportmodel' is spelled with a lower-case "m" here but as
# 'ww_i_reportModel' in `payload` -- confirm the server ignores the key's case.
default_parameter_dict = {
    'ww_b_list': 1,
    'ww_i_reportmodel': '133685247',
    'ww_i_reportModelXsl': '133685270',
}

# Column labels of the report table, listed in positional order
table_columns = [
    'Civilité', 'Nom Prénom',
    'Orientation Bachelor', 'Orientation Master',
    'Spécialisation', 'Filière opt.', 'Mineur',
    'Statut',
    'Type Echange', 'Ecole Echange',
    'No Sciper',
]

# Column label -> zero-based position inside a table row
table_columns_dict = {label: position for position, label in enumerate(table_columns)}

Find the URL parameters: scrape the filter form to map each option label to the value the server expects

In [54]:
def build_key_map(timeout = 30):
    """Scrape the filter form and build the label -> value lookup tables.

    Requests ``filter_url`` with ``payload`` and, for every entry of
    ``table_name``, reads the ``<option>`` elements of the ``<select>`` whose
    ``name`` is that entry's value.

    Parameters
    ----------
    timeout : seconds passed to ``requests.get``, so an unresponsive server
        cannot block the notebook indefinitely.

    Returns
    -------
    dict mapping each key of ``table_name`` to a dict
    ``{option label: option 'value' attribute}``. Options without any content
    are skipped.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status. Without this check an
        error page would parse into empty lookups and only fail later as a
        ``KeyError`` when a label is looked up.
    """
    response = requests.get(filter_url, params = payload, timeout = timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, parser)

    key_map = {}
    for field, select_name in table_name.items():
        options = soup.select('select[name=%s] option' % select_name)
        # An option with no content has no label to use as a key
        key_map[field] = {
            option.contents[0]: option['value']
            for option in options
            if option.contents
        }
    return key_map

# Build the label -> form-value lookup tables; this call sends an HTTP request to filter_url
dict_unite = build_key_map()

In [61]:
# function to complete the parameter dictionary
def func_parameter_dict(academic_year_str,season_str,semester_str,unite_faculty_str,dict_unite, gps = False):
    """Build the query parameters for one report request.

    Starts from a copy of ``default_parameter_dict`` and adds, for each of the
    four filter fields, the form value that ``dict_unite`` associates with the
    given label.

    Parameters
    ----------
    academic_year_str, season_str, semester_str, unite_faculty_str :
        Labels used as keys into the matching ``dict_unite`` sub-dictionary.
    dict_unite : nested dict ``{field: {label: form value}}`` with the fields
        ``'academic_year'``, ``'season'``, ``'semester'`` and ``'unite_faculty'``.
    gps : when true, ``'ww_x_GPS'`` is added with the value ``-1``.

    Returns
    -------
    A new dict on every call. The module-level defaults are never modified,
    so a ``gps=True`` call cannot leak ``'ww_x_GPS'`` into later calls.

    Raises
    ------
    KeyError
        If a label is not present in ``dict_unite``.
    """
    # Copy: assigning the module-level dict directly would alias it
    parameter_dict = dict(default_parameter_dict)

    selections = (
        ('academic_year', academic_year_str),
        ('season', season_str),
        ('semester', semester_str),
        ('unite_faculty', unite_faculty_str),
    )
    for field, label in selections:
        parameter_dict[table_name[field]] = dict_unite[field][label]

    if gps:
        parameter_dict['ww_x_GPS'] = -1
    return parameter_dict

In [71]:
def get_required_webpage(academic_year_str,season_str,semester_str,unite_faculty_str,dict_unite, timeout = 30):
    """Fetch and parse the report table page for one filter selection.

    First requests ``filter_url`` with the selection and checks that the
    answer contains at least one ``<a>`` element of class ``ww_x_GPS``; only
    then is ``table_url`` requested with ``ww_x_GPS`` added to the parameters.

    Parameters
    ----------
    academic_year_str, season_str, semester_str, unite_faculty_str :
        Labels forwarded to ``func_parameter_dict``.
    dict_unite : label -> form-value lookup forwarded to ``func_parameter_dict``.
    timeout : seconds passed to each ``requests.get`` call.

    Returns
    -------
    BeautifulSoup document built from the ``table_url`` response.

    Raises
    ------
    RuntimeError
        If the filter page contains no ``a`` element of class ``ww_x_GPS``.
    requests.HTTPError
        If either request is answered with an error status.
    KeyError
        Propagated from ``func_parameter_dict`` for an unknown label.
    """
    parameter_dict = func_parameter_dict(academic_year_str,season_str,semester_str,unite_faculty_str,dict_unite)

    # Step 1: ask the filter page whether this selection yields a result link
    r = requests.get(filter_url, params = parameter_dict, timeout = timeout)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, parser)
    if not soup.select('a[class=%s]' % "ww_x_GPS"):
        raise RuntimeError('The input is not valide')

    # Step 2: repeat the selection with ww_x_GPS set and fetch the table itself
    parameter_dict_gps = func_parameter_dict(academic_year_str,season_str,semester_str,
                                             unite_faculty_str,dict_unite, gps = True)
    r_table = requests.get(table_url, params = parameter_dict_gps, timeout = timeout)
    r_table.raise_for_status()
    return BeautifulSoup(r_table.text, parser)

In [150]:
import pandas as pd

def build_dataframe(soup, list_of_columns):
    """Extract the requested columns of the report table into a DataFrame.

    Parameters
    ----------
    soup : parsed page; its rows are read with ``soup.select('table tr')``.
    list_of_columns : column labels to extract. Each label is translated to a
        cell position through ``table_columns_dict``.

    Returns
    -------
    pandas.DataFrame with one column per requested label and one row per
    table row after the first two (table title and column labels). A cell
    with no content becomes ``None`` instead of raising ``IndexError``.

    Raises
    ------
    KeyError
        If a requested label is missing from ``table_columns_dict`` and the
        table has at least one data row.
    """
    # Row 0 is the table title and row 1 the column labels
    data_rows = soup.select('table tr')[2:]

    def cell_value(row, position):
        # First child of the cell, or None when the cell is empty
        children = row.contents[position].contents
        return children[0] if children else None

    column_dict = {
        column: [cell_value(row, table_columns_dict[column]) for row in data_rows]
        for column in list_of_columns
    }
    return pd.DataFrame(column_dict)

# Example

In [84]:
# Selection used by the example cells below. func_parameter_dict looks each
# string up as a key in the matching dict_unite sub-dictionary, so every value
# has to equal one of the option labels collected by build_key_map.
academic_year_str = '2007-2008'
season_str = "Semestre d'automne"
semester_str = 'Bachelor semestre 1'
unite_faculty_str =  'Informatique'

In [154]:
# Fetch the report table page for the selection above (returned as a BeautifulSoup document)
soup_table = get_required_webpage(academic_year_str,season_str,semester_str,unite_faculty_str,dict_unite)

In [155]:
# Build a DataFrame from the parsed page, keeping only the two listed columns
df = build_dataframe(soup_table, ['No Sciper', 'Statut'])