In [47]:
import numpy as np
import requests
from bs4 import BeautifulSoup

In [119]:
# HTML parser backend handed to BeautifulSoup ('lxml' is an alternative).
parser = 'html.parser'

# filter_url serves the page with the <select> filters; table_url serves the report table.
filter_url = 'http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.filter'
table_url = 'http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.html'

# Query string sent when fetching the filter page.
payload = dict(ww_i_reportModel='133685247')

# Friendly field name -> name of the matching <select> element, which is also
# the query-parameter name used when requesting a report.
table_name = dict(
    unite_faculty='ww_x_UNITE_ACAD',
    academic_year='ww_x_PERIODE_ACAD',
    semester='ww_x_PERIODE_PEDAGO',
    season='ww_x_HIVERETE',
)

# Parameters that are sent with every report request.
default_parameter_dict = {
    'ww_b_list': 1,
    'ww_i_reportmodel': '133685247',
    'ww_i_reportModelXsl': '133685270',
}

# Column labels of the report table, in the order the cells are indexed.
table_columns = [
    'Civilité', 'Nom Prénom',
    'Orientation Bachelor', 'Orientation Master',
    'Spécialisation', 'Filière opt.', 'Mineur',
    'Statut',
    'Type Echange', 'Ecole Echange',
    'No Sciper',
]

# Column label -> position of that column within a table row.
table_columns_dict = {
    label: position for position, label in enumerate(table_columns)
}

Find the URL

In [54]:
def build_key_map():
    """Download the filter page and collect the options of each filter.

    Returns
    -------
    dict
        One entry per key of ``table_name``; each value maps the first child
        of an ``<option>`` element (its label) to that option's ``value``
        attribute. Options without any content are skipped.
    """
    response = requests.get(filter_url, params=payload)
    page = BeautifulSoup(response.text, parser)

    def options_of(select_name):
        # Label -> value for every non-empty <option> under the named <select>.
        return {
            option.contents[0]: option['value']
            for option in page.select('select[name=%s] option' % select_name)
            if option.contents
        }

    return {
        field: options_of(select_name)
        for field, select_name in table_name.items()
    }

# Fetch the filter page once (network request) and keep the label -> value
# lookup for the cells below.
dict_unite = build_key_map()

In [61]:
# Build the request parameters for one (year, season, semester, unit) selection.
def func_parameter_dict(academic_year_str,season_str,semester_str,unite_faculty_str,dict_unite, gps = False):
    """Return the query parameters for the selected filters.

    Each label is translated to its form value through ``dict_unite`` and
    stored under the parameter name listed in ``table_name``.

    Parameters
    ----------
    academic_year_str, season_str, semester_str, unite_faculty_str : str
        Labels looked up in ``dict_unite['academic_year']``,
        ``dict_unite['season']``, ``dict_unite['semester']`` and
        ``dict_unite['unite_faculty']`` respectively.
    dict_unite : dict
        Mapping field name -> {label: value}, e.g. the result of
        ``build_key_map()``.
    gps : bool, optional
        When true, ``'ww_x_GPS'`` is additionally set to ``-1``.

    Returns
    -------
    dict
        A new dictionary on every call; ``default_parameter_dict`` itself is
        never modified.

    Raises
    ------
    KeyError
        If one of the labels is not present in ``dict_unite``.
    """
    # Work on a copy: writing into the shared module-level defaults would leak
    # values (notably 'ww_x_GPS') into every later call, even with gps=False.
    parameter_dict = dict(default_parameter_dict)

    selection = (
        ('academic_year', academic_year_str),
        ('season', season_str),
        ('semester', semester_str),
        ('unite_faculty', unite_faculty_str),
    )
    for field, label in selection:
        parameter_dict[table_name[field]] = dict_unite[field][label]

    if gps:
        parameter_dict['ww_x_GPS'] = -1
    return parameter_dict

In [71]:
def get_required_webpage(academic_year_str,season_str,semester_str,unite_faculty_str,dict_unite):
    """Fetch the report page for one filter selection and return it parsed.

    The function first requests ``filter_url`` with the selection parameters
    and requires the response to contain at least one ``<a>`` element whose
    class is ``ww_x_GPS``; otherwise the selection is treated as invalid.
    It then requests ``table_url`` with the same parameters plus
    ``ww_x_GPS = -1``.

    Parameters
    ----------
    academic_year_str, season_str, semester_str, unite_faculty_str : str
        Filter labels, forwarded to ``func_parameter_dict``.
    dict_unite : dict
        Label lookup, forwarded to ``func_parameter_dict``.

    Returns
    -------
    BeautifulSoup
        The parsed response of the ``table_url`` request.

    Raises
    ------
    RuntimeError
        If the filter response contains no ``ww_x_GPS`` link.
    KeyError
        Propagated from ``func_parameter_dict`` for an unknown label.
    """
    parameter_dict = func_parameter_dict(
        academic_year_str, season_str, semester_str, unite_faculty_str, dict_unite
    )

    # Parse the response that was just fetched. The previous code read an
    # undefined name (`r2`) here and only ran when a stale variable of that
    # name happened to exist in the kernel.
    filter_response = requests.get(filter_url, params=parameter_dict)
    soup = BeautifulSoup(filter_response.text, parser)

    # No ww_x_GPS link in the filter response -> nothing to request for this selection.
    if not soup.select('a[class=%s]' % "ww_x_GPS"):
        raise RuntimeError('The input is not valide')

    # Same selection plus ww_x_GPS; built from a copy so the parameters used
    # for the validity check above are not modified.
    parameter_dict_gps = dict(parameter_dict)
    parameter_dict_gps['ww_x_GPS'] = -1

    r_table = requests.get(table_url, params=parameter_dict_gps)
    soup_table = BeautifulSoup(r_table.text, parser)
    return soup_table

In [150]:
import pandas as pd

def build_dataframe(soup, list_of_columns):
    """Extract the requested columns of the report table into a DataFrame.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed report page; its rows are read with ``soup.select('table tr')``.
    list_of_columns : list of str
        Column labels to extract. Each label is translated to a cell position
        through ``table_columns_dict``.

    Returns
    -------
    pandas.DataFrame
        One column per requested label and one row per data row of the table.
        A cell without any content yields ``None`` instead of raising.

    Raises
    ------
    KeyError
        If a requested label is not in ``table_columns_dict`` (raised while
        reading the first data row).
    """
    # Row 0 is the table title and row 1 holds the column labels; data starts at row 2.
    data_rows = soup.select('table tr')[2:]

    def cell_value(row, column):
        # First child of the cell, or None when the cell is empty. Indexing an
        # empty cell with [0] used to abort the whole extraction with IndexError.
        cell_children = row.contents[table_columns_dict[column]].contents
        return cell_children[0] if cell_children else None

    column_dict = {
        column: [cell_value(row, column) for row in data_rows]
        for column in list_of_columns
    }
    return pd.DataFrame(column_dict)

# Example

In [84]:
# Filter labels for the example query. Each one must be a key of the matching
# dict_unite sub-dictionary, otherwise func_parameter_dict raises KeyError.
academic_year_str = '2007-2008'
season_str = "Semestre d'automne"
semester_str = 'Bachelor semestre 1'
unite_faculty_str =  'Informatique'

In [154]:
# Get the raw webpage: the parsed report page for the example filters
# (raises RuntimeError when the filter page offers no ww_x_GPS link for them).
soup_table = get_required_webpage(academic_year_str,season_str,semester_str,unite_faculty_str,dict_unite)

In [199]:
# Create a DataFrame from the raw webpage, keeping only the listed columns
# (labels must be entries of table_columns).
df = build_dataframe(soup_table, ['No Sciper', 'Statut'])

# Assignment 1

In [202]:
# Filters that stay fixed while the academic year varies.
season_str = "Semestre d'automne"
semester_str = 'Bachelor semestre 1'
unite_faculty_str =  'Informatique'

# One report request per academic year, '2007-2008' through '2016-2017'.
df_list = []
for year in range(2007, 2017):
    academic_year_str = str(year) + '-' + str(year+1)
    print(academic_year_str)
    soup_table = get_required_webpage(academic_year_str,season_str,semester_str,unite_faculty_str,dict_unite)
    df_list.append(build_dataframe(soup_table, ['Nom Prénom', 'No Sciper', 'Statut']))

# Concatenate once, after all years are collected. Doing it inside the loop
# rebuilt the growing frame on every iteration for the same final result.
# Row labels of the yearly frames are kept, so they repeat across years.
df = pd.concat(df_list, axis=0)

df['Semester'] = semester_str
print(df.head())

2007-2008
2008-2009
2009-2010
2010-2011
2011-2012
2012-2013
2013-2014
2014-2015
2015-2016
2016-2017
  No Sciper          Nom Prénom   Statut             Semester
0    169569   Arévalo Christian  Présent  Bachelor semestre 1
1    174905     Aubelle Flavien  Présent  Bachelor semestre 1
2    173922       Badoud Morgan  Présent  Bachelor semestre 1
3    179406  Baeriswyl Jonathan  Présent  Bachelor semestre 1
4    179428     Barroco Michael  Présent  Bachelor semestre 1


In [205]:
# Inspect the per-year DataFrames collected by the loop (one list entry per academic year).
df_list

[   No Sciper              Nom Prénom   Statut
 0     169569       Arévalo Christian  Présent
 1     174905         Aubelle Flavien  Présent
 2     173922           Badoud Morgan  Présent
 3     179406      Baeriswyl Jonathan  Présent
 4     179428         Barroco Michael  Présent
 5     179324          Belfis Nicolas  Présent
 6     174597       Beliaev Stanislav  Présent
 7     179449   Bindschaedler Vincent  Présent
 8     178553      Bloch Marc-Olivier  Présent
 9     179426              Bloch Remi  Présent
 10    178271  Boéchat Marc-Alexandre  Présent
 11    182433          Böhler Michael  Présent
 12    180731    Bricola Jean-Charles  Présent
 13    171619    Buchschacher Nicolas  Présent
 14    179837            Caloz Daniel  Présent
 15    179157          Calvert Thomas  Présent
 16    179864        Cambazard Thomas  Présent
 17    174590     Camenzind Guillaume  Présent
 18    178843          Cattin Nicolas  Présent
 19    178711            Cino Emanuel  Présent
 20    178786

In [207]:
# DataFrame.size is rows * columns (total number of cells), not the row count.
df.size

6876

In [165]:
# Get the raw webpage again and display the parsed HTML.
# The filter variables hold whatever the most recently executed cell assigned.
soup_table = get_required_webpage(academic_year_str,season_str,semester_str,unite_faculty_str,dict_unite)
soup_table

<html><head><meta content="text/html; charset=utf-8" http-equiv="Content-Type"><link href="gedpublicreports.css?ww_x_path=Gestac.Moniteur.Style" rel="stylesheet" type="text/css"/></meta></head><body alink="#666666" bgcolor="#ffffff" link="#666666" marginheight="0" marginwidth="5" vlink="#666666"><fieldset style="text-align:right; width:40%; position:relative; margin-right: 10px;float:right; border: 0; padding: 0 0 8px 0;"><a href="!GEDREPORTS.html?ww_x_PERIODE_ACAD=978181&amp;ww_i_reportmodel=133685247&amp;ww_x_GPS=-1&amp;ww_x_UNITE_ACAD=249847&amp;ww_b_list=1&amp;ww_x_HIVERETE=2936286&amp;ww_x_PERIODE_PEDAGO=249108&amp;ww_i_reportModelXsl=133685270" style="color:#990033;">Identification pour accéder aux e-mails<br>Login to access email adresses</br></a></fieldset><script>
  function mailList(x) {
   var vtop = (screen.height-200)/2;
   var vleft=(screen.width-600)/2;
   var w=open("", "emaillist", "Scrollbars=1,resizable=1,width=600,height=200,top="+vtop+",left="+vleft);
   w.document

In [190]:
# Two toy frames with identical columns; the value 8 occurs in column 'B' of both.
df1 = pd.DataFrame([[1, 2], [3, 8]], columns=['A', 'B'])
df2 = pd.DataFrame([[1, 3], [2, 8]], columns=['A', 'B'])

In [196]:
# Stack df1 on top of df2 (row-wise is pd.concat's default axis); row labels are kept as-is.
df3 = pd.concat([df1, df2])
# Re-index the stacked frame by column 'B'.
df4 = df3.set_index(keys='B')

In [197]:
# Display the frame indexed by 'B'.
df4

Unnamed: 0_level_0,A
B,Unnamed: 1_level_1
2,1
8,3
3,1
8,2


In [198]:
# Label 8 occurs twice in the index, so .loc[8] returns both matching rows as a DataFrame.
df4.loc[8]

Unnamed: 0_level_0,A
B,Unnamed: 1_level_1
8,3
8,2
