In [1]:
import numpy as np
import requests
from bs4 import BeautifulSoup

In [2]:
# Parser backend passed to BeautifulSoup; 'lxml' can be substituted.
parser = 'html.parser'

# Report endpoints: the filter page (its <select> menus are scraped for option
# values) and the page that returns the result table.
filter_url = 'http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.filter'
table_url = 'http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.html'

# Query string sent when fetching the filter page.
# NOTE(review): this key is spelled 'ww_i_reportModel' while
# default_parameter_dict uses 'ww_i_reportmodel' -- confirm whether the server
# treats parameter names case-insensitively.
payload = dict(ww_i_reportModel='133685247')

# Logical field name -> `name` attribute of the matching <select> element.
table_name = dict(
    unite_faculty='ww_x_UNITE_ACAD',
    academic_year='ww_x_PERIODE_ACAD',
    semester='ww_x_PERIODE_PEDAGO',
    season='ww_x_HIVERETE',
)

# Parameters shared by every request; per-query selections are added on top.
default_parameter_dict = dict(
    ww_b_list=1,
    ww_i_reportmodel='133685247',
    ww_i_reportModelXsl='133685270',
)

# Column labels of the result table. The position of a label in this list is
# the cell index used when rows are read.
table_columns = [
    'Civilité', 'Nom Prénom',
    'Orientation Bachelor', 'Orientation Master',
    'Spécialisation', 'Filière opt.', 'Mineur',
    'Statut',
    'Type Echange', 'Ecole Echange',
    'No Sciper',
]

# Column label -> cell index.
table_columns_dict = {label: position for position, label in enumerate(table_columns)}

Find the URL

In [3]:
def build_key_map():
    """Scrape the filter page and map every option label to its form value.

    Returns:
        dict: one entry per key of ``table_name``; each entry is a dict
        mapping the first child of an ``<option>`` element (its label) to the
        option's ``value`` attribute. Options without any child are skipped.
    """
    response = requests.get(filter_url, params = payload)
    page = BeautifulSoup(response.text, parser)

    key_map = {}
    for field, select_name in table_name.items():
        options = page.select('select[name=%s] option' % select_name)
        # Empty <option> elements have no label to key on, so they are dropped.
        key_map[field] = {
            option.contents[0]: option['value']
            for option in options
            if len(option.contents) > 0
        }
    return key_map

# Scrape the filter page once; the label -> value map is passed to the request helpers in later cells.
dict_unite = build_key_map()

In [4]:
def func_parameter_dict(academic_year_str,season_str,semester_str,unite_faculty_str,dict_unite, gps = False):
    """Build the request parameters for one selection of the report form.

    Args:
        academic_year_str: label of the academic year, looked up in
            ``dict_unite['academic_year']``.
        season_str: label looked up in ``dict_unite['season']``.
        semester_str: label looked up in ``dict_unite['semester']``.
        unite_faculty_str: label looked up in ``dict_unite['unite_faculty']``.
        dict_unite: mapping of field name -> {label: form value}.
        gps: when true, ``'ww_x_GPS': -1`` is added to the parameters.

    Returns:
        dict: a new dictionary made of ``default_parameter_dict`` plus the
        four selections (and ``ww_x_GPS`` when requested).

    Raises:
        KeyError: if one of the labels is missing from ``dict_unite``.
    """
    # Work on a copy: writing into default_parameter_dict itself would make
    # entries from one call (notably 'ww_x_GPS') leak into every later call.
    parameter_dict = dict(default_parameter_dict)
    for field, label in (
        ('academic_year', academic_year_str),
        ('season', season_str),
        ('semester', semester_str),
        ('unite_faculty', unite_faculty_str),
    ):
        parameter_dict[table_name[field]] = dict_unite[field][label]
    if gps:
        parameter_dict['ww_x_GPS'] = -1
    return parameter_dict

In [310]:
def get_required_webpage(academic_year_str,season_str,semester_str,unite_faculty_str,dict_unite):
    """Fetch and parse the table page for one selection of the report form.

    The filter page is requested first. If its HTML contains no
    ``<a class="ww_x_GPS">`` element the selection is rejected; otherwise the
    table page is requested with ``ww_x_GPS=-1`` added to the parameters.

    Args:
        academic_year_str, season_str, semester_str, unite_faculty_str:
            labels forwarded to ``func_parameter_dict``.
        dict_unite: mapping of field name -> {label: form value}.

    Returns:
        BeautifulSoup: the parsed response of the table page.

    Raises:
        RuntimeError: if the filter page has no ``ww_x_GPS`` link; the message
            names the rejected selection.
    """
    parameter_dict = func_parameter_dict(academic_year_str,season_str,semester_str,unite_faculty_str,dict_unite)
    filter_response = requests.get(filter_url, params = parameter_dict)
    filter_soup = BeautifulSoup(filter_response.text, parser)

    # Carry the diagnostics on the exception itself so they survive in the
    # traceback instead of being printed separately.
    if not filter_soup.select('a[class=%s]' % "ww_x_GPS"):
        raise RuntimeError(
            'The input is not valid: no ww_x_GPS link for academic year %r, '
            'season %r, semester %r, unit %r'
            % (academic_year_str, season_str, semester_str, unite_faculty_str)
        )

    # Same selection again, this time with ww_x_GPS added, for the table page.
    parameter_dict_gps = func_parameter_dict(academic_year_str,season_str,semester_str,
                                             unite_faculty_str,dict_unite, True)
    table_response = requests.get(table_url, params = parameter_dict_gps)
    return BeautifulSoup(table_response.text, parser)

In [306]:
import pandas as pd

def build_dataframe(soup, list_of_columns):
    """Turn the rows of the scraped table into a DataFrame.

    Args:
        soup: parsed page; ``soup.select('table tr')`` must return the rows.
        list_of_columns: labels (keys of ``table_columns_dict``) to extract.

    Returns:
        pandas.DataFrame: one column per requested label. A cell with no
        child nodes is stored as the string ``'None'``; otherwise the first
        child of the cell is stored.
    """
    # Row 0 is the table title and row 1 the column labels; data starts at row 2.
    data_rows = soup.select('table tr')[2:]

    def first_child_or_placeholder(row, position):
        children = row.contents[position].contents
        return children[0] if children else 'None'

    return pd.DataFrame({
        label: [
            first_child_or_placeholder(row, table_columns_dict[label])
            for row in data_rows
        ]
        for label in list_of_columns
    })

# Example

In [136]:
# Example selection. Each string is used as a lookup key in the matching
# sub-dictionary of dict_unite, so it must be one of the scraped option labels.
academic_year_str = '2007-2008'
season_str = "Semestre d'automne"
semester_str = 'Bachelor semestre 1'
unite_faculty_str =  'Informatique'

In [137]:
# Fetch and parse the table page for the example selection defined above.
soup_table = get_required_webpage(academic_year_str,season_str,semester_str,unite_faculty_str,dict_unite)

In [139]:
# Build a DataFrame from the parsed page, keeping only the 'No Sciper' and 'Statut' columns.
df = build_dataframe(soup_table, ['No Sciper', 'Statut'])

# Assignment 1

In [281]:
def func_get_semester_student(season_str,semester_str,unite_faculty_str, feature_list,
                              first_year = 2007, last_year = 2016):
    """Collect the student table of one semester over a range of academic years.

    For every starting year in ``first_year..last_year`` (both inclusive) the
    page for academic year ``'<year>-<year+1>'`` is fetched with
    ``get_required_webpage`` (using the module-level ``dict_unite``) and
    converted with ``build_dataframe``.

    Args:
        season_str: season label forwarded to ``get_required_webpage``.
        semester_str: semester label; also stored in the ``'Semester'`` column.
        unite_faculty_str: unit label forwarded to ``get_required_webpage``.
        feature_list: column labels to extract; must contain ``'No Sciper'``
            because the result is indexed on it.
        first_year: first starting year to query (default 2007).
        last_year: last starting year to query (default 2016, so the default
            range ends with ``'2016-2017'``).

    Returns:
        tuple: ``(frame, unique_index)`` where ``frame`` is the concatenation
        of all years indexed by ``['No Sciper', 'academic_year']`` and
        ``unique_index`` holds the distinct ``'No Sciper'`` values.
    """
    yearly_frames = []
    for year in range(first_year, last_year + 1):
        academic_year_str = str(year) + '-' + str(year+1)
        soup_table = get_required_webpage(academic_year_str,season_str,semester_str,unite_faculty_str,dict_unite)
        yearly = build_dataframe(soup_table, feature_list)
        yearly['academic_year'] = academic_year_str
        yearly_frames.append(yearly)

    students = pd.concat(yearly_frames)
    students['Semester'] = semester_str
    indexed = students.set_index(['No Sciper','academic_year'])
    unique_index = students.set_index(['No Sciper']).index.unique()
    return indexed, unique_index

In [183]:
# Bachelor cohorts for Informatique: semester 1 is queried in the autumn
# season and semester 6 in the spring season, each over the function's default
# range of academic years. 'No Sciper' must be in feature_list because the
# result is indexed on it.
# NOTE: season_str and semester_str are rebound here from strings to lists.
season_str = ["Semestre d'automne","Semestre de printemps"]
semester_str = ['Bachelor semestre 1','Bachelor semestre 6']
feature_list = ['Nom Prénom', 'No Sciper', 'Statut', 'Civilité']
unite_faculty_str =  'Informatique'
df_semester_1,unique_index_1 = func_get_semester_student(season_str[0],semester_str[0],unite_faculty_str, feature_list)
df_semester_6,unique_index_6 = func_get_semester_student(season_str[1],semester_str[1],unite_faculty_str, feature_list)

In [308]:
def find_end_year(df_semester_6,unique_index_6, indicate_end ):
    """Pick, for every student, the first or the last academic year on record.

    Args:
        df_semester_6: frame indexed by ``['No Sciper', 'academic_year']``
            with a ``'Civilité'`` column.
        unique_index_6: iterable of the ``'No Sciper'`` values to look up.
        indicate_end: truthy -> take the last academic year of each student
            (after sorting the index); falsy -> take the first one.

    Returns:
        pandas.DataFrame: indexed by ``'No Sciper'`` with the columns
        ``'academic_year'`` and ``'Civilité'`` of the selected row.
    """
    # After sort_index() the rows of one student are ordered by academic year,
    # so position -1 is the latest year and position 0 the earliest.
    position = -1 if indicate_end else 0
    ordered = df_semester_6.sort_index()

    scipers = []
    years = []
    civilities = []
    for sciper in unique_index_6:
        year = ordered.loc[sciper,:].iloc[position].name
        scipers.append(sciper)
        years.append(year)
        civilities.append(ordered['Civilité'].loc[(sciper, year)])

    summary = pd.DataFrame({'No Sciper':scipers, 'academic_year':years, 'Civilité':civilities})
    return summary.set_index('No Sciper')

In [228]:
# Per student: the first academic year seen in Bachelor semester 1
# (indicate_end=False) and the last academic year seen in Bachelor semester 6
# (indicate_end=True).
semester1_frame = find_end_year(df_semester_1,unique_index_1, False)
semester6_frame = find_end_year(df_semester_6,unique_index_6, True)

In [266]:
# Students that appear in both the semester-1 and the semester-6 frames.
# NOTE: `id` shadows the builtin; the name is kept because it is a notebook global.
id = semester1_frame.index.intersection(semester6_frame.index)

# Select the columns by name so that `result` always has the layout
# [Civilité, academic_year, Civilité, academic_year, period]. The positional
# selections on `result` (here and in later cells) rely on that layout, and the
# column order of a DataFrame built from a dict differs between pandas versions.
frame_columns = ['Civilité', 'academic_year']
result = pd.concat([semester1_frame.loc[id, frame_columns],
                    semester6_frame.loc[id, frame_columns]], axis = 1)

# Duration = closing year of the last semester-6 academic year minus opening
# year of the first semester-1 academic year (years look like '2007-2008').
start_year = semester1_frame.loc[id, 'academic_year'].str.split('-').str[0].astype(int)
end_year = semester6_frame.loc[id, 'academic_year'].str.split('-').str[1].astype(int)
result['period'] = end_year - start_year

# Mean duration per Civilité (column 0 is Civilité, column 4 is period).
result.iloc[:,[0, 4]].groupby('Civilité')[['period']].mean()

Unnamed: 0_level_0,period
Civilité,Unnamed: 1_level_1
Madame,3.310345
Monsieur,3.480978


In [280]:
# Number of students per (Civilité, period) pair; columns 0 and 4 of `result` are selected by position.
result.iloc[:,[0, 4]].reset_index().groupby(['Civilité','period']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,No Sciper
Civilité,period,Unnamed: 2_level_1
Madame,3,21
Madame,4,7
Madame,5,1
Monsieur,3,238
Monsieur,4,91
Monsieur,5,32
Monsieur,6,6
Monsieur,7,1


# Assignment 2

In [312]:
# Extract the student lists for Master semesters 1, 2 and 3 and for the two
# Master project semesters (autumn and spring) of Informatique.
# Index 0 of season_str is the autumn season, index 1 the spring season.
season_str = ["Semestre d'automne","Semestre de printemps"]
semester_str = ['Master semestre 1','Master semestre 2','Master semestre 3','Projet Master automne','Projet Master printemps']
feature_list = ['Nom Prénom', 'No Sciper', 'Statut', 'Civilité','Spécialisation','Mineur']
unite_faculty_str =  'Informatique'
df_master_semester_1,unique_master_index_1 = func_get_semester_student(season_str[0],semester_str[0],unite_faculty_str, feature_list)
df_master_semester_2,unique_master_index_2 = func_get_semester_student(season_str[1],semester_str[1],unite_faculty_str, feature_list)
df_master_semester_3,unique_master_index_3 = func_get_semester_student(season_str[0],semester_str[2],unite_faculty_str, feature_list)
df_master_project_1,unique_master_project_index_1 = func_get_semester_student(season_str[0],semester_str[3],unite_faculty_str, feature_list)
df_master_project_2,unique_master_project_index_2 = func_get_semester_student(season_str[1],semester_str[4],unite_faculty_str, feature_list)

In [319]:
# Preview the first rows of the spring Master-project frame.
df_master_project_2.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Civilité,Mineur,Nom Prénom,Spécialisation,Statut,Semester
No Sciper,academic_year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
159852,2007-2008,Monsieur,,Brutsche Florian,Internet computing,Congé,Projet Master printemps
153819,2007-2008,Monsieur,,Dotta Mirco,,Stage,Projet Master printemps
145957,2007-2008,Monsieur,,Hügli Michael,,Stage,Projet Master printemps
173257,2007-2008,Monsieur,,Indra Saurabh,,Présent,Projet Master printemps
160150,2007-2008,Monsieur,,Lépine Simon,Biocomputing,Présent,Projet Master printemps


In [338]:
# Stack all five Master frames; reset_index() turns the
# ['No Sciper', 'academic_year'] index back into ordinary columns.
master_all_semesters = pd.concat([df_master_semester_1, df_master_semester_2,\
                                  df_master_semester_3, df_master_project_1, df_master_project_2]).reset_index()

In [344]:
# Number of rows per (student, status) pair, taken from the count of the 'Civilité' column.
present_in_epfl = master_all_semesters.groupby(['No Sciper', 'Statut']).count()['Civilité']

In [351]:
# Keep only the 'Présent' counts (second index level), one value per student,
# then average them. average_stay is computed but not displayed by this cell.
count_stay = present_in_epfl.loc[:, 'Présent']
average_stay = count_stay.mean()

In [365]:
# Rows with status 'Présent' and a specialization other than the string 'None'
# (the placeholder build_dataframe stores for empty cells), counted per
# (specialization, student) pair.
specialization_count = master_all_semesters[(master_all_semesters.Statut == 'Présent') 
                     & (master_all_semesters.Spécialisation != 'None')].groupby(['Spécialisation','No Sciper']).count()['Civilité']

In [403]:
# Display the per-(specialization, student) row counts.
specialization_count

Spécialisation                  No Sciper
Biocomputing                    153640       1
                                154573       4
                                154936       1
                                154969       1
                                160150       2
                                160831       1
                                161090       1
                                169734       3
                                172500       1
                                172503       1
                                172611       1
                                183185       1
Computer Engineering - SP       161344       1
                                166262       1
                                175775       1
                                178271       3
                                178283       2
                                183984       2
                                185005       2
                                185458       3
                  

In [369]:
# Mean number of counted rows per student within each specialization.
# Only the count column ('Civilité') is averaged: after reset_index() the frame
# also holds 'No Sciper', which is not numeric and cannot be averaged.
specialization_count.reset_index().groupby(['Spécialisation'])[['Civilité']].mean()

Unnamed: 0_level_0,Civilité
Spécialisation,Unnamed: 1_level_1
Biocomputing,1.5
Computer Engineering - SP,2.227273
Computer Science Theory,2.0
Data Analytics,1.75
Foundations of Software,2.289855
Information Security - SP,2.428571
Internet Information Systems,2.0
Internet computing,2.240741
Service science,2.0
"Signals, Images and Interfaces",2.175


In [371]:
# Inspect every row (any status) whose specialization is 'Biocomputing'.
master_all_semesters[(master_all_semesters.Spécialisation == 'Biocomputing') ]

Unnamed: 0,No Sciper,academic_year,Civilité,Mineur,Nom Prénom,Spécialisation,Statut,Semester
42,172611,2007-2008,Monsieur,,Kwanga Rodrigue,Biocomputing,Présent,Master semestre 1
971,172500,2007-2008,Madame,,Diatchka Kremena Sotirova,Biocomputing,Présent,Master semestre 2
1039,154573,2008-2009,Madame,,Benabdallah Zeineb,Biocomputing,Présent,Master semestre 2
1068,169734,2008-2009,Monsieur,,Koller Yannick,Biocomputing,Présent,Master semestre 2
2022,172500,2007-2008,Madame,,Diatchka Kremena Sotirova,Biocomputing,Stage,Master semestre 3
2024,160831,2007-2008,Monsieur,,Dubout Charles,Biocomputing,Présent,Master semestre 3
2026,172503,2007-2008,Monsieur,,Evans Nathaniel,Biocomputing,Présent,Master semestre 3
2033,161090,2007-2008,Monsieur,,Habersaat Alexandre,Biocomputing,Présent,Master semestre 3
2046,160150,2007-2008,Monsieur,,Lépine Simon,Biocomputing,Présent,Master semestre 3
2048,153640,2007-2008,Monsieur,,Matthey-de-l'Endroit Loïc,Biocomputing,Présent,Master semestre 3


In [378]:
# Distinct 'No Sciper' values of students having at least one row with status
# 'Présent' and a specialization other than the placeholder string 'None'.
people_with_spec_idx = master_all_semesters[(master_all_semesters.Statut == 'Présent') 
                     & (master_all_semesters.Spécialisation != 'None')]['No Sciper'].unique()

In [406]:
# Recount per (specialization, student) using ALL rows of the selected
# students: no filter on status or specialization is applied here, so rows
# carrying the 'None' placeholder are included.
# NOTE: this rebinds specialization_count, replacing the earlier filtered version.
specialization_count = master_all_semesters.set_index('No Sciper').loc[people_with_spec_idx].reset_index().groupby(['Spécialisation','No Sciper']).count()['Civilité']

In [407]:
# Mean number of rows per student within each specialization, based on the
# recount over all rows of the selected students.
# Only the count column ('Civilité') is averaged: after reset_index() the frame
# also holds 'No Sciper', which is not numeric and cannot be averaged.
specialization_count.reset_index().groupby(['Spécialisation'])[['Civilité']].mean()

Unnamed: 0_level_0,Civilité
Spécialisation,Unnamed: 1_level_1
Biocomputing,1.583333
Computer Engineering - SP,2.409091
Computer Science Theory,2.0
Data Analytics,1.75
Foundations of Software,2.782609
Information Security - SP,2.428571
Internet Information Systems,2.0
Internet computing,2.555556
,1.381443
Service science,2.0


# Bonus