In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_context('notebook')
# New imports for HW2:
import requests
from bs4 import BeautifulSoup

In [2]:
# Both ISA endpoints share the same report path and differ only by suffix:
# '.filter' serves the search form, '.html' serves the generated report.
isa_report_root = 'http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS'
form_url = isa_report_root + '.filter'
base_url = isa_report_root + '.html'

# Query parameters sent with every request. The values are opaque ISA
# identifiers copied verbatim; further entries are added later via the
# scraped <select> options.
params = dict(
    ww_i_reportModel='133685247',
    ww_i_reportModelXsl='133685270',
    ww_x_GPS='-1',
)

In [3]:
# Fetch the search form; its <select> elements list every valid filter value.
# A timeout (seconds) keeps the cell from hanging forever on a stalled
# connection, and raise_for_status() stops the notebook on an HTTP error
# instead of letting an error page be parsed as if it were the form.
page = requests.get(form_url, params=params, timeout=60)
page.raise_for_status()

In [4]:
# Parse the form and build, for every <select>, a mapping
#   keys[select name][option label] -> option value
# so that parameters can later be chosen by their human-readable label.
soup = BeautifulSoup(page.content, 'lxml')
keys = {}
for select in soup.find_all('select'):
    for option in select.find_all('option'):
        label = option.text
        if label == '':
            # Options without a label are the empty defaults; skip them.
            continue
        options_by_label = keys.setdefault(select['name'], {})
        options_by_label[label] = option['value']
            
def updateParam(name, key):
    """Set one request parameter from its human-readable option label.

    Looks up the form value stored for option ``key`` of the select named
    ``name`` in the module-level ``keys`` mapping, and stores it under
    ``name`` in the module-level ``params`` dict (which is mutated in place).

    name : the parameter (eg. 'ww_x_PERIODE_PEDAGO')
    key : the key in plain text (eg. 'Bachelor semestre 1')

    Returns None.
    Raises KeyError (listing the valid choices) if ``name`` is not a known
    parameter or ``key`` is not one of its option labels.
    """
    try:
        options = keys[name]
    except KeyError:
        raise KeyError('unknown parameter %r; known parameters: %s'
                       % (name, sorted(keys))) from None
    try:
        value = options[key]
    except KeyError:
        raise KeyError('unknown option %r for parameter %r; known options: %s'
                       % (key, name, sorted(options))) from None
    params[name] = value

In [5]:
# Restrict every subsequent request to the 'Informatique' academic unit
# (stores the matching form value in the shared `params` dict).
updateParam('ww_x_UNITE_ACAD', 'Informatique')
# Display the label -> form value mapping scraped for the pedagogical-period
# selector, to see which periods can be queried.
keys['ww_x_PERIODE_PEDAGO']

{'Bachelor semestre 1': '249108',
 'Bachelor semestre 2': '249114',
 'Bachelor semestre 3': '942155',
 'Bachelor semestre 4': '942163',
 'Bachelor semestre 5': '942120',
 'Bachelor semestre 5b': '2226768',
 'Bachelor semestre 6': '942175',
 'Bachelor semestre 6b': '2226785',
 'Master semestre 1': '2230106',
 'Master semestre 2': '942192',
 'Master semestre 3': '2230128',
 'Master semestre 4': '2230140',
 'Mineur semestre 1': '2335667',
 'Mineur semestre 2': '2335676',
 'Mise à niveau': '2063602308',
 'Projet Master automne': '249127',
 'Projet Master printemps': '3781783',
 'Semestre automne': '953159',
 'Semestre printemps': '2754553',
 'Stage automne 3ème année': '953137',
 'Stage automne 4ème année': '2226616',
 'Stage printemps 3ème année': '983606',
 'Stage printemps 4ème année': '2226626',
 'Stage printemps master': '2227132'}

In [8]:
# Keep only the pedagogical periods whose label contains 'Master'
# (case-sensitive, so e.g. 'Stage printemps master' is not selected).
semestres = [label for label in keys['ww_x_PERIODE_PEDAGO'] if 'Master' in label]
# Mapping of academic-year label -> form value; iterated when scraping.
years = keys['ww_x_PERIODE_ACAD']
semestres

['Projet Master printemps',
 'Master semestre 2',
 'Master semestre 4',
 'Master semestre 3',
 'Master semestre 1',
 'Projet Master automne']

In [9]:
# Build one record per student, keyed by SCIPER, by fetching the report for
# every (academic year, Master period) combination.
# Each record holds 'gender', 'spe' (specialisation), 'minor', plus one entry
# per period label counting 0.5 for every report in which the student appears
# (i.e. the time spent in that period, in years, one semester = 0.5).
students = {}
for year in years:
    # The academic year only changes with the outer loop, so set it once here
    # rather than before every request.
    updateParam('ww_x_PERIODE_ACAD', year)
    for sem in semestres:
        updateParam('ww_x_PERIODE_PEDAGO', sem)
        # Timeout (seconds) prevents a stalled connection from hanging the cell.
        response = requests.get(base_url, params=params, timeout=60)
        # Stop on HTTP errors: an error page contains no student rows and
        # would otherwise be silently counted as "nobody enrolled".
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'lxml')
        for row in soup.find_all('tr'):
            cells = row.contents
            if cells[0].name == 'th':
                continue  # header row, no student data
            # Cell positions follow the layout of the ISA report table:
            # 0 = civility, 4 = specialisation, 6 = minor, 10 = SCIPER.
            # NOTE(review): positions are hard-coded; verify if the report
            # layout ever changes.
            gender = 'M' if cells[0].text == 'Monsieur' else 'F'
            sciper = cells[10].text
            spe = cells[4].text
            minor = cells[6].text
            # Gender is recorded the first time a student is seen.
            record = students.setdefault(
                sciper, {'gender': gender, 'spe': '', 'minor': ''})
            # Keep the latest non-empty specialisation / minor encountered.
            if spe:
                record['spe'] = spe
            if minor:
                record['minor'] = minor
            record[sem] = record.get(sem, 0) + 0.5

In [10]:
# Filtering only finished students
# A student is kept when they appear in semester 1 or 2 AND in semester 2 or 3.
# NOTE(review): a student seen only in 'Master semestre 2' satisfies both
# conditions -- confirm this is the intended definition of "finished".
students_finished = {}
for sciper, v in students.items():
    in_early_semester = 'Master semestre 1' in v or 'Master semestre 2' in v
    in_late_semester = 'Master semestre 2' in v or 'Master semestre 3' in v
    if not (in_early_semester and in_late_semester):
        continue
    # Sanity check: report students that have a specialisation but were never
    # seen in semester 3. A period key only exists in the record once the
    # student appeared in that period (its value is then at least 0.5), so
    # "no semester 3" must be tested as absence of the key, not as a 0 value.
    if v['spe'] != '' and 'Master semestre 3' not in v:
        print(sciper)
    students_finished[sciper] = v

In [11]:
# Mean of students with specialization
with_spe = {k: v for k, v in students_finished.items() if v['spe'] != ''}
# One column per student, one row per record key; drop the descriptive rows so
# that only the per-period durations (in years) remain.
pf = pd.DataFrame(with_spe).drop(['gender', 'minor', 'spe'])
# reindex (instead of .loc with a list) yields an all-NaN row rather than a
# KeyError when no student of this group attended one of the periods.
# The frame is object-typed because it originally held strings, hence astype.
ps = pf.reindex(['Master semestre 1', 'Master semestre 2', 'Master semestre 3']).astype(float).fillna(0)
ps = ps.sum()
pm = pf.reindex(['Projet Master automne', 'Projet Master printemps']).astype(float).fillna(0)
# Students with no recorded Master project are counted as 0.5 year for it
# (presumably the project is still to be done -- TODO confirm).
pm = pm.sum().replace(0, 0.5)
# ps and pm are both indexed by student, so they can be added directly; this
# replaces pd.concat(..., join_axes=...), which was removed in pandas 1.0.
lengths = ps + pm
lengths.mean()

2.3346456692913384

In [12]:
# Mean without specialization
no_spe = {k: v for k, v in students_finished.items() if v['spe'] == ''}
# One column per student, one row per record key; drop the descriptive rows so
# that only the per-period durations (in years) remain.
pf = pd.DataFrame(no_spe).drop(['gender', 'minor', 'spe'])
# reindex (instead of .loc with a list) yields an all-NaN row rather than a
# KeyError when no student of this group attended one of the periods.
# The frame is object-typed because it originally held strings, hence astype.
ps = pf.reindex(['Master semestre 1', 'Master semestre 2', 'Master semestre 3']).astype(float).fillna(0)
ps = ps.sum()
pm = pf.reindex(['Projet Master automne', 'Projet Master printemps']).astype(float).fillna(0)
# Students with no recorded Master project are counted as 0.5 year for it
# (presumably the project is still to be done -- TODO confirm).
pm = pm.sum().replace(0, 0.5)
# ps and pm are both indexed by student, so they can be added directly; this
# replaces pd.concat(..., join_axes=...), which was removed in pandas 1.0.
lengths = ps + pm
lengths.mean()

1.997196261682243