In [1]:
# imports (may be more than needed)
import pandas as pd
import numpy as np
import glob # to find all files in folder
from datetime import datetime
from datetime import date, time
from dateutil.parser import parse
import seaborn as sns
import matplotlib.pyplot as plt
import itertools
%matplotlib inline
sns.set_context('notebook')
pd.options.mode.chained_assignment = None  # default='warn'


#  Master Data
_Obtain all the data for the Bachelor students, starting from 2007. Keep only the students for which you have an entry for both Bachelor semestre 1 and Bachelor semestre 6. Compute how many months it took each student to go from the first to the sixth semester. Partition the data between male and female students, and compute the average -- is the difference in average statistically significant?_


2) Perform a similar operation to what described above, this time for Master students. Notice that this data is more tricky, as there are many missing records in the IS-Academia database. Therefore, try to guess how much time a master student spent at EPFL by at least checking the distance in months between Master semestre 1 and Master semestre 2. If the Mineur field is not empty, the student should also appear registered in Master semestre 3. Last but not the least, don't forget to check if the student has an entry also in the Projet Master tables. Once you can handle well this data, compute the "average stay at EPFL" for master students. Now extract all the students with a Spécialisation and compute the "average stay" per each category of that attribute -- compared to the general average, can you find any specialization for which the difference in average is statistically significant?

## Read the data from csv

In [2]:
all_data = pd.read_csv('all_data.csv', index_col=0)

In [3]:
all_data.head()

Unnamed: 0,Civilité,Nom_Prénom,Orientation_Bachelor,Orientation_Master,Spécialisation,Filière_opt.,Mineur,Statut,Type_Echange,Ecole_Echange,No_Sciper,title,periode_acad,periode_pedago
0,Monsieur,Arévalo Christian,,,,,,Présent,,,169569,"Informatique, 2009-2010, Bachelor semestre 5",2009-2010,Bachelor semestre 5
1,Monsieur,Badoud Morgan,,,,,,Présent,,,173922,"Informatique, 2009-2010, Bachelor semestre 5",2009-2010,Bachelor semestre 5
2,Monsieur,Baeriswyl Jonathan,,,,,,Présent,,,179406,"Informatique, 2009-2010, Bachelor semestre 5",2009-2010,Bachelor semestre 5
3,Monsieur,Balas Martin,,,,,,Présent,,,174774,"Informatique, 2009-2010, Bachelor semestre 5",2009-2010,Bachelor semestre 5
4,Monsieur,Barroco Michael,,,,,,Présent,,,179428,"Informatique, 2009-2010, Bachelor semestre 5",2009-2010,Bachelor semestre 5


In [4]:
len(all_data)

8942

## Clean the data

In [5]:
all_data['periode_pedago'].unique()

array(['Bachelor semestre 5', 'Master semestre 1', 'Bachelor semestre 2',
       'Bachelor semestre 6', 'Master semestre 2',
       'Projet Master printemps', 'Bachelor semestre 3',
       'Bachelor semestre 1', 'Master semestre 3', 'Semestre printemps',
       'Projet Master automne', 'Semestre automne', 'Bachelor semestre 4'], dtype=object)

In [6]:
all_data['title'].unique()

array(['Informatique, 2009-2010, Bachelor semestre 5',
       'Informatique, 2009-2010, Master semestre 1',
       'Informatique, 2009-2010, Bachelor semestre 2',
       'Informatique, 2009-2010, Bachelor semestre 6',
       'Informatique, 2009-2010, Master semestre 2',
       'Informatique, 2009-2010, Projet Master printemps',
       'Informatique, 2009-2010, Bachelor semestre 3',
       'Informatique, 2009-2010, Bachelor semestre 1',
       'Informatique, 2009-2010, Master semestre 3',
       'Passerelle HES - IN, 2009-2010, Semestre printemps',
       'Informatique, 2009-2010, Projet Master automne',
       'Passerelle HES - IN, 2009-2010, Semestre automne',
       'Echange IN, 2009-2010, Semestre automne',
       'Informatique, 2009-2010, Bachelor semestre 4',
       'Informatique, 2014-2015, Bachelor semestre 5',
       'Informatique, 2014-2015, Master semestre 1',
       'Informatique, 2014-2015, Bachelor semestre 2',
       'Informatique, 2014-2015, Bachelor semestre 6',
       

checkout what the Semester printemps and automne are

In [7]:
all_data[all_data['periode_pedago'].isin(['Semestre printemps', 'Semestre automne'])]['title'].unique()

array(['Passerelle HES - IN, 2009-2010, Semestre printemps',
       'Passerelle HES - IN, 2009-2010, Semestre automne',
       'Echange IN, 2009-2010, Semestre automne',
       'Passerelle HES - IN, 2014-2015, Semestre printemps',
       'Echange IN, 2014-2015, Semestre printemps',
       'Passerelle HES - IN, 2014-2015, Semestre automne',
       'Echange IN, 2014-2015, Semestre automne',
       'Passerelle HES - IN, 2012-2013, Semestre printemps',
       'Passerelle HES - IN, 2012-2013, Semestre automne',
       'Echange IN, 2012-2013, Semestre automne',
       'Echange IN, 2008-2009, Semestre printemps',
       'Echange IN, 2008-2009, Semestre automne',
       'Passerelle HES - IN, 2015-2016, Semestre printemps',
       'Echange IN, 2015-2016, Semestre printemps',
       'Echange IN, 2015-2016, Semestre automne',
       'Passerelle HES - IN, 2015-2016, Semestre automne',
       'Passerelle HES - IN, 2011-2012, Semestre printemps',
       'Echange IN, 2011-2012, Semestre printemps',
 

This are the students in exchange (epfl students in exchange somewhere else) and students doing the Passerelle HES. As the students in exchange are in thir 3rd year Bachelor we ignore them. Also we won't count students in Passerelle HES as beeing in the Master just yet because they have to succeed the passerelle to optain a master. So if they do a master they are inscribed in Master anyways.

In [8]:
master_periode_pedago = ['Master semestre 1', 'Master semestre 2', 'Projet Master printemps', 'Master semestre 3', 'Projet Master automne']
master_data = all_data[all_data['periode_pedago'].isin(master_periode_pedago)]

remove the columns with only NaN. (In particular the 'Orientation_Bachelor' col)

In [9]:
master_data.dropna(axis=1, how='all')

Unnamed: 0,Civilité,Nom_Prénom,Spécialisation,Mineur,Statut,Type_Echange,Ecole_Echange,No_Sciper,title,periode_acad,periode_pedago
0,Monsieur,Barras Florian,,"Mineur en Management, technologie et entrepren...",Présent,,,170220,"Informatique, 2009-2010, Master semestre 1",2009-2010,Master semestre 1
1,Monsieur,Beuret Thibaut,,,Présent,,,166701,"Informatique, 2009-2010, Master semestre 1",2009-2010,Master semestre 1
2,Monsieur,Bindschaedler Laurent,,,Présent,,,170654,"Informatique, 2009-2010, Master semestre 1",2009-2010,Master semestre 1
3,Monsieur,Biollay Jean Isaac Jamal Pachacutec,,,Présent,,,161279,"Informatique, 2009-2010, Master semestre 1",2009-2010,Master semestre 1
4,Monsieur,Blanc Régis William,,,Congé,Bilatéral,"University of California, Berkeley (UCB)",175339,"Informatique, 2009-2010, Master semestre 1",2009-2010,Master semestre 1
5,Monsieur,Blatter Jérémy,,,Présent,,,166344,"Informatique, 2009-2010, Master semestre 1",2009-2010,Master semestre 1
6,Monsieur,Bolkensteyn Dinesh,,Mineur en Etudes asiatiques contemporaines,Présent,,,170451,"Informatique, 2009-2010, Master semestre 1",2009-2010,Master semestre 1
7,Monsieur,Brot Benoît,,,Présent,,,191471,"Informatique, 2009-2010, Master semestre 1",2009-2010,Master semestre 1
8,Monsieur,Brunet Yorick,,,Présent,,,178283,"Informatique, 2009-2010, Master semestre 1",2009-2010,Master semestre 1
9,Monsieur,Burgener Raphaël,,Mineur en Biocomputing,Présent,,,170225,"Informatique, 2009-2010, Master semestre 1",2009-2010,Master semestre 1


Rename the columns (remove the é and give shorter names)

In [10]:
master_data.rename(columns={'Civilité': 'Sex', 'Nom_Prénom': 'Name', 'Spécialisation': 'Specialisation'}, inplace=True)

In [11]:
master_data.sort_values(by='Name')

Unnamed: 0,Sex,Name,Orientation_Bachelor,Orientation_Master,Specialisation,Filière_opt.,Mineur,Statut,Type_Echange,Ecole_Echange,No_Sciper,title,periode_acad,periode_pedago
0,Madame,Abbadi Hajar,,,,,,Présent,,,238067,"Informatique, 2013-2014, Master semestre 1",2013-2014,Master semestre 1
0,Madame,Abbadi Hajar,,,,,"Mineur en Management, technologie et entrepren...",Présent,,,238067,"Informatique, 2014-2015, Master semestre 3",2014-2015,Master semestre 3
0,Madame,Abbadi Hajar,,,,,"Mineur en Management, technologie et entrepren...",Présent,,,238067,"Informatique, 2013-2014, Master semestre 2",2013-2014,Master semestre 2
0,Monsieur,Abelenda Diego,,,,,,Présent,,,170646,"Informatique, 2011-2012, Master semestre 3",2011-2012,Master semestre 3
0,Monsieur,Abelenda Diego,,,,,,Présent,,,170646,"Informatique, 2011-2012, Master semestre 2",2011-2012,Master semestre 2
0,Monsieur,Abelenda Diego,,,"Signals, Images and Interfaces",,,Présent,,,170646,"Informatique, 2010-2011, Master semestre 1",2010-2011,Master semestre 1
0,Monsieur,Abelenda Diego,,,"Signals, Images and Interfaces",,,Présent,,,170646,"Informatique, 2010-2011, Master semestre 2",2010-2011,Master semestre 2
0,Madame,Abi Akar Nora,,,,,,Présent,,,251253,"Informatique, 2016-2017, Master semestre 3",2016-2017,Master semestre 3
0,Madame,Abi Akar Nora,,,,,,Présent,,,251253,"Informatique, 2015-2016, Master semestre 2",2015-2016,Master semestre 2
0,Madame,Abi Akar Nora,,,,,,Présent,,,251253,"Informatique, 2015-2016, Master semestre 1",2015-2016,Master semestre 1


## How many month did it take each student

## And per specialisation

## Female vs Male