In [2]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_context('notebook')
# New imports for HW3:
import requests
from bs4 import BeautifulSoup

# Intro
## Requirements
This homework requires the Python package `html5lib`. Install it with:

```conda install html5lib```

## IS-Academia

First, by inspecting the public portal, we retrieve the URL used for requests to the server API, together with the base report-model key/value pair that must be sent to get a reply.

In [3]:
# Address of the public report filter form on IS-Academia.
url = "http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.filter"
# Query string for that form: only the report-model identifier is sent.
params = dict(ww_i_reportModel="133685247")

In [4]:
# Download the filter form. The timeout stops a stalled server from hanging the
# kernel indefinitely, and raise_for_status() reports an HTTP error right here
# instead of letting an error page be parsed as if it were the form.
r = requests.get(url, params=params, timeout=30)
r.raise_for_status()

In [5]:
# Parse the downloaded form page.
soup = BeautifulSoup(r.content, 'lxml')

# For every <select> element, keyed by its name attribute, collect a
# {option text: option value} mapping of the choices it offers.
isa_form = {
    select['name']: {
        option.text: option['value']
        for option in select.find_all('option')
    }
    for select in soup.find_all('select')
}
isa_form

{'ww_x_HIVERETE': {'': 'null',
  "Semestre d'automne": '2936286',
  'Semestre de printemps': '2936295'},
 'ww_x_PERIODE_ACAD': {'': 'null',
  '2007-2008': '978181',
  '2008-2009': '978187',
  '2009-2010': '978195',
  '2010-2011': '39486325',
  '2011-2012': '123455150',
  '2012-2013': '123456101',
  '2013-2014': '213637754',
  '2014-2015': '213637922',
  '2015-2016': '213638028',
  '2016-2017': '355925344'},
 'ww_x_PERIODE_PEDAGO': {'': 'null',
  'Bachelor semestre 1': '249108',
  'Bachelor semestre 2': '249114',
  'Bachelor semestre 3': '942155',
  'Bachelor semestre 4': '942163',
  'Bachelor semestre 5': '942120',
  'Bachelor semestre 5b': '2226768',
  'Bachelor semestre 6': '942175',
  'Bachelor semestre 6b': '2226785',
  'Master semestre 1': '2230106',
  'Master semestre 2': '942192',
  'Master semestre 3': '2230128',
  'Master semestre 4': '2230140',
  'Mineur semestre 1': '2335667',
  'Mineur semestre 2': '2335676',
  'Mise à niveau': '2063602308',
  'Projet Master automne': '2491

Now we will extract and save some request parameters from the above dict.

In [6]:
# Academic periods offered by the form, leaving out the entry whose label is
# empty (the form's blank placeholder choice).
periode_acad_list = {
    label: code
    for label, code in isa_form["ww_x_PERIODE_ACAD"].items()
    if label
}
periode_acad_list

{'2007-2008': '978181',
 '2008-2009': '978187',
 '2009-2010': '978195',
 '2010-2011': '39486325',
 '2011-2012': '123455150',
 '2012-2013': '123456101',
 '2013-2014': '213637754',
 '2014-2015': '213637922',
 '2015-2016': '213638028',
 '2016-2017': '355925344'}

In [7]:
# Form value identifying the "Informatique" academic unit.
unite_acad = isa_form["ww_x_UNITE_ACAD"]["Informatique"]

# Split the pedagogic periods into bachelor and master groups by their label.
_pedago_options = isa_form["ww_x_PERIODE_PEDAGO"]
periode_pedago_list_bachelor = {
    label: code
    for label, code in _pedago_options.items()
    if label.startswith("Bachelor")
}
# Master-related labels use either capitalisation of the word, so accept both.
periode_pedago_list_master = {
    label: code
    for label, code in _pedago_options.items()
    if any(word in label for word in ("Master", "master"))
}
periode_pedago_list_master

{'Master semestre 1': '2230106',
 'Master semestre 2': '942192',
 'Master semestre 3': '2230128',
 'Master semestre 4': '2230140',
 'Projet Master automne': '249127',
 'Projet Master printemps': '3781783',
 'Stage printemps master': '2227132'}

In [8]:
# Semester-type choices from the form (label -> form value), kept as parsed,
# i.e. still including the form's empty placeholder entry.
hiver_ete = isa_form["ww_x_HIVERETE"] 

Now, for each combination of "Période académique" and "Période pédagogique", we fetch the data and store it. We then merge all those dataframes into one.

In [14]:
# Endpoint that renders a report as an HTML page.
report_url = "http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.html"
# Query parameters shared by every request; the two period parameters are
# filled in for each request inside the loop below.
params = {
    "ww_x_GPS": "-1",
    "ww_i_reportModel" : "133685247",
    "ww_i_reportModelXsl" : "133685270",
    "ww_x_HIVERETE" : "null",
    "ww_x_UNITE_ACAD" : "249847" # = section Informatique
}

# pieces[academic period label] -> frame holding every master period found
# for that academic period.
pieces = {}

for periode_acad_k, periode_acad_v in periode_acad_list.items():
    # pieces_acad[pedagogic period label] -> table parsed from one report page.
    pieces_acad = {}
    for periode_pedago_k, periode_pedago_v in periode_pedago_list_master.items():
        params["ww_x_PERIODE_PEDAGO"] = periode_pedago_v
        params["ww_x_PERIODE_ACAD"] = periode_acad_v
        # The timeout prevents one stalled request from blocking the whole scrape.
        r = requests.get(report_url, params=params, timeout=30)
        if not r.ok:
            # An HTTP error page is not a report: say so and move on.
            print("Skipping {} / {}: HTTP {}".format(
                periode_acad_k, periode_pedago_k, r.status_code))
            continue

        try:
            # header=1 takes the second table row as column names; columns 10
            # and 0 become the row index (named "sciper" and "civ" further down).
            tables = pd.read_html(r.content, header=1, index_col=[10, 0])
        except ValueError:
            # Skip pages with no tables
            continue
        if tables:
            pieces_acad[periode_pedago_k] = tables[0]

    if pieces_acad:
        pieces[periode_acad_k] = pd.concat(pieces_acad)

if not pieces:
    # Fail with an explicit message rather than pd.concat's generic
    # "No objects to concatenate".
    raise ValueError("No report tables could be fetched from " + report_url)

# Concatenating the nested dicts puts two extra index levels (outer dict key,
# inner dict key) in front of the two index columns read from each table.
data = pd.concat(pieces)
# Snapshot taken before the index is renamed and reordered.
data_Bak = data.copy()
data.index = data.index.set_names(["year", "semester", "sciper", "civ"])
# Constant marker column: one row counts as one unit when rows are summed.
data["count"] = 1.0
# Swap the first and third levels, giving (sciper, semester, year, civ).
data.index = data.index.swaplevel(0, 2)

In [15]:
# Display the merged frame built in the previous cell.
data
# --- Scratch experiments below are disabled and are not executed. ---
# NOTE(review): the last disabled line is the only place in this part of the
# notebook where `unstacked` is assigned — confirm where the later cell gets it.
#data.index = data.index.sortlevel(0)
#total_semesters_per_sciper = data.sort_index(0)[["count"]].unstack(1)#.groupby(level="sciper").agg(np.sum).dropna()#.stack()\
#.groupby(level="sciper").agg(np.sum)
#total_semesters_per_sciper


#unstacked  = data.sort_index(0)[["count"]].unstack(1).reset_index().set_index(["sciper", "civ"])




Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Nom Prénom,Orientation Bachelor,Orientation Master,Spécialisation,Filière opt.,Mineur,Statut,Type Echange,Ecole Echange,Unnamed: 11,count
sciper,semester,year,civ,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
153066,Master semestre 1,2007-2008,Monsieur,Aeberhard François-Xavier,,,,,,Présent,,,,1.0
180027,Master semestre 1,2007-2008,Madame,Agarwal Megha,,,,,,Présent,,,,1.0
152232,Master semestre 1,2007-2008,Monsieur,Anagnostaras David,,,,,,Présent,,,,1.0
177395,Master semestre 1,2007-2008,Monsieur,Auroux Damien,,,,,,Présent,,,,1.0
161970,Master semestre 1,2007-2008,Monsieur,Awalebo Joseph,,,,,,Présent,,,,1.0
166258,Master semestre 1,2007-2008,Monsieur,Balet Ken,,,,,,Présent,,,,1.0
173600,Master semestre 1,2007-2008,Monsieur,Barazzutti Raphaël Pierre,,,,,,Présent,,,,1.0
178879,Master semestre 1,2007-2008,Monsieur,Bayramoglu Ersoy,,,,,,Présent,,,,1.0
154573,Master semestre 1,2007-2008,Madame,Benabdallah Zeineb,,,,,,Présent,,,,1.0
160492,Master semestre 1,2007-2008,Monsieur,Bettex Marc,,,,,,Présent,,,,1.0


In [169]:
# NOTE(review): `unstacked` is not assigned by any active statement in the
# cells shown above, and the execution counter jumps from 15 to 169 — confirm
# that it is defined before this cell runs on a fresh kernel.
#
# Sum every column per student (index level "sciper"), then drop the rows that
# still contain a missing value. The string "sum" selects the same built-in
# pandas reduction that passing np.sum resolves to, without relying on the
# deprecated numpy-callable aliasing.
semester_aggregation = unstacked.groupby(level="sciper").agg("sum").dropna()
semester_aggregation

Unnamed: 0_level_0,year,count,count,count,count,count,count
semester,Unnamed: 1_level_1,Bachelor semestre 1,Bachelor semestre 2,Bachelor semestre 3,Bachelor semestre 4,Bachelor semestre 5,Bachelor semestre 6
sciper,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
147008,2008-20092009-20102010-2011,1,1,1,1,1,1
169731,2007-20082009-20102010-2011,1,1,1,1,1,1
174905,2007-20082009-20102010-20112011-2012,1,1,1,1,2,2
175379,2007-20082008-20092009-20102010-2011,2,2,1,1,1,1
175407,2008-20092009-20102010-2011,1,1,1,1,1,1
175410,2009-20102010-20112011-20122012-20132013-2014,1,1,1,1,3,3
178287,2008-20092009-20102010-2011,1,1,1,1,1,1
178942,2008-20092009-20102010-2011,1,1,1,1,1,1
179703,2008-20092009-20102010-2011,1,1,1,1,1,1
179848,2008-20092009-20102010-2011,1,1,1,1,1,1
