In [89]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_context('notebook')
# New imports for HW3:
import requests
from bs4 import BeautifulSoup

# Intro

## IS-Academia

After looking at the webpage and some requests (using Postman), we can derive the following:
* the URL for the search form (containing all the possibilities)
* the URL for the querries
* the base parameters used for querries (espacially the `ww_x_GPS` set to -1 to get all the answers)

In [97]:
url = 'http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.filter'
req_url = 'http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.html'
params = {'ww_i_reportModel':'133685247', 
          'ww_i_reportModelXsl':'133685270', 
          'ww_x_GPS':'-1'}

In [98]:
page = requests.get(url, params)

Now that we get the page, we can use Beautifulsoup to parse it and retrieve all the select fields and their corresponding options:

In [148]:
soup = BeautifulSoup(page.content, 'lxml')
keys = {}
for s in soup.find_all('select'):
    for o in s.find_all('option'):
        if o.text != '': # No need for empty (default values)
            keys.setdefault(s['name'], {}).update({o.text: o['value']})

# A shorter version but maybe less readable:
# keys = {s['name']: {o.text: o['value'] for o in s.find_all('option')} for s in soup.find_all('select')}

def updateParam(name, key):
    return params.update({name: keys[name][key]})

Now we have the information needed to querry the server.  
__Remark:__ the field for winter/spring semester is purely redondent. (eg. selecting 'Bachelor semester 1' and spring semester produces an empty answer.)

## Task 1

Getting all Bachelor students from year 2007:

In [140]:
updateParam('ww_x_UNITE_ACAD', 'Informatique')
# Getting all the years from 2007:
years = keys['ww_x_PERIODE_ACAD']

# Getting Bachelor semester 1 and 6:
semesters = ['Bachelor semestre 1', 'Bachelor semestre 6']

# Requesting all the needed students' information:
students = {}
for year in years.keys():
    for sem in semesters:
        updateParam('ww_x_PERIODE_ACAD', year)
        updateParam('ww_x_PERIODE_PEDAGO', sem)
        r = requests.get(req_url, params).text
        soup = BeautifulSoup(r, 'lxml')
        for row in soup.findAll('tr'):
            if row.contents[0].name != 'th':
                gender = 'M' if row.contents[0].text == 'Monsieur' else 'F'
                sciper = row.contents[10].text
                if not sciper in students:
                    students[sciper] = {'gender':gender, 'sem':{}}
                if not sem in students[sciper]['sem']:
                    students[sciper]['sem'][sem] = []
                students[sciper]['sem'][sem].append(year)


{'181612': {'gender': 'F', 'sem': {'Bachelor semestre 1': ['2007-2008']}},
 '274615': {'gender': 'M', 'sem': {'Bachelor semestre 1': ['2016-2017']}},
 '187143': {'gender': 'M', 'sem': {'Bachelor semestre 1': ['2009-2010']}},
 '272430': {'gender': 'M', 'sem': {'Bachelor semestre 1': ['2016-2017']}},
 '260589': {'gender': 'M', 'sem': {'Bachelor semestre 1': ['2016-2017']}},
 '170225': {'gender': 'M', 'sem': {'Bachelor semestre 6': ['2008-2009']}},
 '169795': {'gender': 'M',
  'sem': {'Bachelor semestre 1': ['2007-2008'],
   'Bachelor semestre 6': ['2009-2010', '2010-2011']}},
 '260709': {'gender': 'M',
  'sem': {'Bachelor semestre 1': ['2016-2017', '2015-2016']}},
 '184228': {'gender': 'M', 'sem': {'Bachelor semestre 1': ['2008-2009']}},
 '219731': {'gender': 'F', 'sem': {'Bachelor semestre 1': ['2013-2014']}},
 '181445': {'gender': 'M', 'sem': {'Bachelor semestre 1': ['2007-2008']}},
 '245846': {'gender': 'M', 'sem': {'Bachelor semestre 1': ['2014-2015']}},
 '262259': {'gender': 'M',
  

In [142]:
students['234523']

{'gender': 'M',
 'sem': {'Bachelor semestre 1': ['2013-2014'],
  'Bachelor semestre 6': ['2015-2016']}}

In [146]:
updateParam('ww_x_PERIODE_ACAD', '2007-2008')
updateParam('ww_x_PERIODE_PEDAGO', 'Bachelor semestre 5')
r = requests.get(req_url, params).text
table = BeautifulSoup(r, 'lxml').find('table')
#print(table)
bachelors = pd.read_html(r, skiprows=1, header=0, index_col=10)[0]
df = bachelors[bachelors.columns[[0,1,7]]]
df.loc[df['Statut'] != 'Présent']

Unnamed: 0_level_0,Civilité,Nom Prénom,Statut
No Sciper,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
166548,Monsieur,Delort Vincent,Congé
161091,Madame,Grivet Ekaterina,Congé
161127,Monsieur,Sahy François,Congé


In [147]:
students = {}
params['ww_x_UNITE_ACAD'] = keys['ww_x_UNITE_ACAD']['Informatique']
for year, year_value in years:
    for sem, sem_value in semesters.items():
        params['ww_x_PERIODE_ACAD'] = year_value
        params['ww_x_PERIODE_PEDAGO'] = sem_value
        r = requests.get(req_url, params).text
        soup = BeautifulSoup(r, 'lxml')
        for row in soup.findAll('tr'):
            if row.contents[0].name != 'th':
                sciper = row.contents[10].text
                gender = 'M' if row.contents[0].text == 'Monsieur' else 'F'
                if not sciper in students:
                    students[sciper] = {'gender':gender, 'sem':{}}
                if not sem in students[sciper]['sem']:
                    students[sciper]['sem'][sem] = 0
                students[sciper]['sem'][sem] += 1

ValueError: too many values to unpack (expected 2)

In [None]:
#{k: v for k, v in students.items() if k == '235568'}
students_finished = {k: v for k, v in students.items() if 'Bachelor semestre 6' in v['sem'] and 'Bachelor semestre 1' in v['sem']}
result = []
for k, v in students_finished.items():
    count = 0
    for s, c in v['sem'].items():
        count += c
    result.append([v['gender'],count])
result = pd.DataFrame(result, columns=['gender', 'sem']).groupby('gender')
print(result.describe())
males = result.get_group('M')['sem']
females = result.get_group('F')['sem']
plt.hist([males, females], color=['r','b'], alpha=0.5, normed=True)
#males.plot(kind='hist', alpha=.4, legend=True)
#result.get_group('M').describe()
#result.get_group('F').describe()
#print(result[0].value_counts())
#print(result.describe())
#sns.distplot([result], kde=False)
#plt.hist(result)
#plt.show()