In [1]:
import pandas as pd
import numpy as np
import glob # to find all files in folder
from datetime import datetime
from datetime import date, time
from dateutil.parser import parse
import seaborn as sns
import matplotlib.pyplot as plt
import itertools
%matplotlib inline
sns.set_context('notebook')
pd.options.mode.chained_assignment = None  # default='warn'

import requests
from bs4 import BeautifulSoup

# Collect the data

## Get the select options

In [2]:
# Identifier of the ISA public report that is scraped in this notebook.
reportModel = 133685247
# Build the filter-form URL from reportModel so the ID is defined in one place only.
full_form_url = 'http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.filter?ww_i_reportModel={}'.format(reportModel)
# Use a timeout so a stalled server cannot hang the kernel, and fail loudly
# on an HTTP error instead of silently parsing an error page.
r = requests.get(full_form_url, timeout=30)
r.raise_for_status()
soup = BeautifulSoup(r.text, 'html.parser')


In [3]:
# Collect every <select> element of the form and read its `name` attribute;
# these names are the query parameters the report expects.
select = soup.find_all('select')
select_name = [dropdown['name'] for dropdown in select]
select_name

['ww_x_UNITE_ACAD',
 'ww_x_PERIODE_ACAD',
 'ww_x_PERIODE_PEDAGO',
 'ww_x_HIVERETE']

In [4]:
# The HTML of each <select> field, in the same order as select_name.
select_fields = [
    soup.find('select', attrs={'name': field_name})
    for field_name in select_name
]

In [5]:
# Options of the first dropdown; searched next for the Informatique section.
unite_acad_select = select_fields[0]
unite_acad_options = unite_acad_select.find_all('option')

In [6]:
# Map option value -> label, keeping only the option labelled 'Informatique'.
unite_acad_informatique = dict(
    (choice['value'], choice.text)
    for choice in unite_acad_options
    if choice.text == 'Informatique'
)
unite_acad_informatique

{'249847': 'Informatique'}

In [7]:
# Academic periods: map option value -> label (e.g. '2007-2008'), skipping the
# 'null' placeholder and every period whose first year is before 2007.
period_acad = {
    choice['value']: choice.text
    for choice in select_fields[1].find_all('option')
    if choice['value'] != 'null' and int(choice.text.partition('-')[0]) >= 2007
}
period_acad

{'123455150': '2011-2012',
 '123456101': '2012-2013',
 '213637754': '2013-2014',
 '213637922': '2014-2015',
 '213638028': '2015-2016',
 '355925344': '2016-2017',
 '39486325': '2010-2011',
 '978181': '2007-2008',
 '978187': '2008-2009',
 '978195': '2009-2010'}

In [8]:
# Pedagogic periods: map option value -> label, skipping options with an empty label.
option = select_fields[2].find_all('option')
period_pedago = dict(
    (choice['value'], choice.text)
    for choice in option
    if choice.text
)
period_pedago

{'2063602308': 'Mise à niveau',
 '2226616': 'Stage automne 4ème année',
 '2226626': 'Stage printemps 4ème année',
 '2226768': 'Bachelor semestre 5b',
 '2226785': 'Bachelor semestre 6b',
 '2227132': 'Stage printemps master',
 '2230106': 'Master semestre 1',
 '2230128': 'Master semestre 3',
 '2230140': 'Master semestre 4',
 '2335667': 'Mineur semestre 1',
 '2335676': 'Mineur semestre 2',
 '249108': 'Bachelor semestre 1',
 '249114': 'Bachelor semestre 2',
 '249127': 'Projet Master automne',
 '2754553': 'Semestre printemps',
 '3781783': 'Projet Master printemps',
 '942120': 'Bachelor semestre 5',
 '942155': 'Bachelor semestre 3',
 '942163': 'Bachelor semestre 4',
 '942175': 'Bachelor semestre 6',
 '942192': 'Master semestre 2',
 '953137': 'Stage automne 3ème année',
 '953159': 'Semestre automne',
 '983606': 'Stage printemps 3ème année'}

In [9]:
# Semester type: map option value -> label, skipping the 'null' placeholder option.
option = select_fields[3].find_all('option')
hiverEte = {
    choice['value']: choice.text
    for choice in option
    if choice['value'] != 'null'
}
hiverEte

{'2936286': "Semestre d'automne", '2936295': 'Semestre de printemps'}

## Collect

In [10]:
# arguments are tuples (key, 'description') eg: ('2936286', "Semestre d'automne")
def collect_dataframe(t_unite_acad, t_periode_acad, t_periode_pedago, t_hiver_ete, final_headers):
    """Fetch one ISA report page and return its tables as DataFrames.

    Each ``t_*`` argument is a ``(key, description)`` tuple. The key is sent
    to the server as the matching ``ww_x_*`` query parameter; the descriptions
    of ``t_periode_acad`` and ``t_periode_pedago`` are appended to every row.

    Parameters
    ----------
    t_unite_acad, t_periode_acad, t_periode_pedago, t_hiver_ete : tuple
        ``(key, description)`` pairs for the four form fields.
    final_headers : list of str
        Column names of the returned frames: the scraped cells followed by
        the three appended columns (table title, academic period,
        pedagogic period).

    Returns
    -------
    list of pandas.DataFrame
        One frame per table found in the page, in page order. Empty when the
        page contains no ``<tr>`` rows.

    Raises
    ------
    requests.RequestException
        If the request times out, fails, or returns an HTTP error status.
    ValueError
        Raised by pandas when a row does not have ``len(final_headers)`` fields.

    Notes
    -----
    Reads the notebook-level ``reportModel`` for the ``ww_i_reportModel``
    parameter.
    """
    print("collect_dataframe: input: "+str(t_unite_acad)+" & "+str(t_periode_acad)+" &"+str(t_periode_pedago)+" & "+str(t_hiver_ete))

    # Send request
    params = {
                'ww_x_GPS': -1,
                'ww_i_reportModel': reportModel,
                'ww_i_reportModelXsl': 133685270,
                'ww_x_UNITE_ACAD': t_unite_acad[0],
                'ww_x_PERIODE_ACAD': t_periode_acad[0],
                'ww_x_PERIODE_PEDAGO': t_periode_pedago[0],
                'ww_x_HIVERETE': t_hiver_ete[0]
             }

    url = 'http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.html'

    # A timeout keeps one stalled request from blocking the whole collection
    # loop; raise_for_status prevents an error page from being parsed as if
    # it were an (empty) report.
    r = requests.get(url, params=params, timeout=30)
    r.raise_for_status()
    soupe = BeautifulSoup(r.text, 'html.parser')

    # Rows collected per table title (dicts keep insertion order, so the
    # returned frames follow the order of the tables in the page).
    data = {}

    # There may be several tables; this is the fallback title for student
    # rows that appear before any title row.
    current_table = 't1'

    # For each tr tag, determine if it is a table title, a header row or a student row.
    for tr in soupe.find_all('tr'):
        th = tr.find_all('th')
        if len(th) == 1:
            # Table title: keep only its first line. setdefault avoids
            # discarding rows already collected if a title is repeated.
            current_table = th[0].text.split('\n')[0]
            data.setdefault(current_table, [])
            continue

        if len(th) > 1:
            # Column header row: ignored because it is always the same.
            continue

        td_tags = tr.find_all('td')
        if not td_tags:
            # Neither a title, a header nor a student: nothing to record.
            continue

        # Drop the last td because it is always empty.
        student = [td.text.replace('\xa0', ' ') for td in td_tags[:-1]]
        # Add the desired extra columns.
        student.extend([current_table, t_periode_acad[1], t_periode_pedago[1]])
        # setdefault so a student row seen before any title does not raise KeyError.
        data.setdefault(current_table, []).append(student)

    # One dataframe per table.
    return [pd.DataFrame(rows, columns=final_headers) for rows in data.values()]
    

Make one request for every combination of (unite_acad, periode_acad, periode_pedago). We ignore hiverete because it is redundant.

In [11]:
# Cartesian product of the three option dictionaries; every element is a
# triple of (key, description) pairs.
perm_list = list(
    itertools.product(
        unite_acad_informatique.items(),
        period_acad.items(),
        period_pedago.items(),
    )
)


In [12]:
# Column names of the collected frames: the scraped cells followed by the
# three columns that collect_dataframe appends to every row.
header = ['Civilité', 'Nom_Prénom', 'Orientation_Bachelor', 'Orientation_Master', 'Spécialisation', 'Filière_opt.', 'Mineur', 'Statut', 'Type_Echange', 'Ecole_Echange', 'No_Sciper', 'title', 'periode_acad', 'periode_pedago']

# One request per (unite_acad, periode_acad, periode_pedago) combination.
# The semester-type field is always sent as 'null'.
dframes = []
for (ua, pa, pp) in perm_list:
    res = collect_dataframe(ua, pa, pp,('null', 'null'), header)
    # extend() rather than a list comprehension used only for its side effects.
    dframes.extend(res)


collect_dataframe: input: ('249847', 'Informatique') & ('213637922', '2014-2015') &('2230128', 'Master semestre 3') & ('null', 'null')
collect_dataframe: input: ('249847', 'Informatique') & ('213637922', '2014-2015') &('2063602308', 'Mise à niveau') & ('null', 'null')
collect_dataframe: input: ('249847', 'Informatique') & ('213637922', '2014-2015') &('2230140', 'Master semestre 4') & ('null', 'null')
collect_dataframe: input: ('249847', 'Informatique') & ('213637922', '2014-2015') &('2335676', 'Mineur semestre 2') & ('null', 'null')
collect_dataframe: input: ('249847', 'Informatique') & ('213637922', '2014-2015') &('2226616', 'Stage automne 4ème année') & ('null', 'null')
collect_dataframe: input: ('249847', 'Informatique') & ('213637922', '2014-2015') &('249114', 'Bachelor semestre 2') & ('null', 'null')
collect_dataframe: input: ('249847', 'Informatique') & ('213637922', '2014-2015') &('942120', 'Bachelor semestre 5') & ('null', 'null')
collect_dataframe: input: ('249847', 'Informati

In [13]:
# Stack every collected table into a single frame (each frame keeps its own row index).
all_data = pd.concat(dframes)

In [14]:
# write to file
#all_data.to_csv('all_data.csv')

In [15]:
# Total number of collected rows.
all_data.shape[0]

8942

Note that the `Mineur semestre X` periods are always empty, which is why they don't appear here.

In [16]:
# Distinct pedagogic periods for which at least one row was collected.
all_data.loc[:, 'periode_pedago'].unique()

array(['Master semestre 3', 'Bachelor semestre 2', 'Bachelor semestre 5',
       'Semestre printemps', 'Master semestre 2', 'Bachelor semestre 3',
       'Bachelor semestre 1', 'Bachelor semestre 6', 'Semestre automne',
       'Master semestre 1', 'Projet Master automne', 'Bachelor semestre 4',
       'Projet Master printemps'], dtype=object)