In [2]:
import pandas as pd
from matplotlib import pyplot as plt
import plotly.express as px
import numpy as np
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import networkx as nx

%matplotlib inline

# Course statistics

In this notebook we compute statistics for each course. In our final visualization, clicking on a course opens a chart that shows some insights into the history of that course. Here, we build an ad-hoc dataset for that purpose: precomputing the statistics improves the performance of our website, because the values needed for the plots do not have to be recomputed every time.

## 1. Professor

One important piece of information about a course is the professor (or the list of professors) who teaches it.

In [36]:
# Load the course table. The preview below shows a course_id, a course_name
# and an academic-year label such as '2006-2007' on every row.
df_course = pd.read_csv('../../data/csv/courses.csv')
# Show the first rows as a quick check of the loaded columns.
df_course.head()

Unnamed: 0.1,Unnamed: 0,course_id,course_name,year
0,0,0,Biological and physiological transport,2006-2007
1,1,1,Biological and physiological transport,2007-2008
2,2,2,Special topics in reactor physics,2015-2016
3,3,3,Special topics in reactor physics,2014-2015
4,4,4,Special topics in reactor physics,2013-2014


In [37]:
# Load the teaching table. The preview below shows one (course_id, prof)
# pair per row; it is joined to the courses on course_id further down.
df_teaching = pd.read_csv('../../data/csv/teaching.csv')
# Show the first rows as a quick check of the loaded columns.
df_teaching.head()

Unnamed: 0.1,Unnamed: 0,course_id,prof
0,0,0,Swartz Melody
1,2,1,Swartz Melody
2,7,2,Pautz Andreas
3,13,3,Pautz Andreas
4,21,4,Mikityuk Konstantin


After loading the datasets, we want to retrieve, for each course name, who the lecturer was the last time the course was held.

In [38]:
# Keep, for every course name, only the row of its most recent edition.
# The first four characters of the year label (e.g. '2006' in '2006-2007')
# are turned into an integer; sorting on it newest-first means that
# drop_duplicates, which keeps the first occurrence, retains the latest row.
df_course_last = (
    df_course
    .assign(year=df_course['year'].apply(lambda label: int(label[:4])))
    .sort_values('year', ascending=False)
    .drop_duplicates('course_name')
)

In [39]:
# Sanity check: for each starting year, count the non-null values per column
# among the rows kept as "latest edition" of a course name.
df_course_last.groupby('year').count()

Unnamed: 0_level_0,Unnamed: 0,course_id,course_name
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2004,198,198,198
2005,122,122,122
2006,132,132,132
2007,131,131,131
2008,150,150,150
2009,179,179,179
2010,134,134,134
2011,145,145,145
2012,134,134,134
2013,122,122,122


In [40]:
# Attach the teaching records to the latest edition of each course and
# collect the professors of that edition into one list per course name.
# The merge uses the default inner join, so a course without any teaching
# record does not appear in the result.
df_last_prof = (
    df_course_last
    .merge(df_teaching, on='course_id')
    .loc[:, ['course_name', 'prof']]
    .groupby('course_name')['prof']
    .apply(list)
    .to_frame()
)
df_last_prof.head()

Unnamed: 0_level_0,prof
course_name,Unnamed: 1_level_1
Numerical approximation of PDE's II,[Picasso Marco]
3D Electron Microscopy and FIB-Nanotomography,[Cantoni Marco]
A History of Evolutionary Theory,[Jensen Jeffrey David]
A Political History of Urban Form,[Aureli Pier Vittorio]
A guided tour for engineers in applied stochastic modelling,[Hongler Max-Olivier]


In [41]:
# Persist the professor lists. The frame is indexed by course_name, so the
# course name becomes the first CSV column; each 'prof' cell holds a Python
# list and is written in its string form (e.g. "['Picasso Marco']").
df_last_prof.to_csv('../../data/csv/server/course_professor_v0.csv')

## 2. Number of enrolled students by year

In [42]:
# Load the student table and the enrollment table. They are joined to the
# courses in the next cell on student_id and course_id respectively.
df_student = pd.read_csv('../../data/csv/student.csv')
df_enrollment = pd.read_csv('../../data/csv/enrollment.csv')

In [43]:
# Build one wide table: courses joined to their enrollment rows, which are
# in turn joined to the enrolled student's attributes. Both joins are inner
# joins, so rows without a match on the key are discarded.
df_complete = (
    df_course
    .merge(df_enrollment, how='inner', on='course_id')
    .merge(df_student, how='inner', on='student_id')
)
df_complete.head()

Unnamed: 0.1,Unnamed: 0_x,course_id,course_name,year,Unnamed: 0_y,student_id,semester,Unnamed: 0,student_name,section
0,0,0,Biological and physiological transport,2006-2007,0,3692,Master semestre 2,3692,Berlier Guillaume,Bioingénierie
1,381,381,Biomaterials,2006-2007,14382,3692,Master semestre 1,3692,Berlier Guillaume,Bioingénierie
2,1435,1435,Instrumentation bio-optique,2006-2007,74292,3692,Master semestre 1,3692,Berlier Guillaume,Bioingénierie
3,1442,1442,Capteurs en instrumentation médicale,2006-2007,74354,3692,Master semestre 2,3692,Berlier Guillaume,Bioingénierie
4,1779,1779,Drug discovery from bench to clinics,2006-2007,86653,3692,Master semestre 2,3692,Berlier Guillaume,Bioingénierie


In [65]:
# Count the enrollment rows of every (course_name, year) pair and list the
# largest classes first. Note that this counts rows with a non-null
# student_id, not distinct students.
df_student_by_year = (
    df_complete
    .groupby(['course_name', 'year'])[['student_id']]
    .count()
    .sort_values('student_id', ascending=False)
    .rename(columns={'student_id': 'nr_students'})
)
df_student_by_year.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,nr_students
course_name,year,Unnamed: 2_level_1
Chimie générale,2012-2013,1208
Analyse III,2018-2019,1133
Analyse III,2019-2020,1103
Probabilités et statistique,2013-2014,1015
Chimie générale,2011-2012,995


In [66]:
# Persist the per-year counts. The frame is indexed by (course_name, year),
# so those two index levels are written as the leading CSV columns.
df_student_by_year.to_csv('../../data/csv/server/students_by_year_v0.csv')

## 3. Number of students divided by year of enrollment

In [70]:
# Break the enrollment counts of each course edition down by the semester
# label stored on the enrollment row (e.g. 'Master semestre 2'), then turn
# the group keys back into ordinary columns.
df_student_by_enroyear = (
    df_complete
    .groupby(['course_name', 'year', 'semester'])[['student_id']]
    .count()
    .rename(columns={'student_id': 'nr_students'})
    .reset_index()
)
df_student_by_enroyear.head()

Unnamed: 0,course_name,year,semester,nr_students
0,Numerical approximation of PDE's II,2012-2013,Master semestre 2,7
1,Numerical approximation of PDE's II,2013-2014,Master semestre 2,4
2,Numerical approximation of PDE's II,2013-2014,Master semestre 4,1
3,Numerical approximation of PDE's II,2014-2015,Master semestre 2,4
4,Numerical approximation of PDE's II,2015-2016,Master semestre 2,11


In [71]:
# Persist the per-semester counts.
# NOTE(review): the frame was reset to a default integer index before this
# call, and to_csv writes the index by default, so the file gets an unnamed
# leading column of row numbers. Pass index=False if no consumer relies on
# that column - TODO confirm with the code that reads this file.
df_student_by_enroyear.to_csv('../../data/csv/server/students_by_enroyear_v0.csv')

## 4. Number of students by section

In [74]:
# Break the enrollment counts of each course edition down by the section of
# the enrolled student, then turn the group keys back into ordinary columns.
df_student_by_section = (
    df_complete
    .groupby(['course_name', 'year', 'section'])[['student_id']]
    .count()
    .rename(columns={'student_id': 'nr_students'})
    .reset_index()
)
df_student_by_section.head()

Unnamed: 0,course_name,year,section,nr_students
0,Numerical approximation of PDE's II,2012-2013,Ingénierie mathématique,4
1,Numerical approximation of PDE's II,2012-2013,Mathématiques - master,1
2,Numerical approximation of PDE's II,2012-2013,Science et ingénierie computationnelles,2
3,Numerical approximation of PDE's II,2013-2014,Ingénierie mathématique,1
4,Numerical approximation of PDE's II,2013-2014,Mathématiques - master,2


In [75]:
# Persist the per-section counts.
# NOTE(review): the frame was reset to a default integer index before this
# call, and to_csv writes the index by default, so the file gets an unnamed
# leading column of row numbers. Pass index=False if no consumer relies on
# that column - TODO confirm with the code that reads this file.
df_student_by_section.to_csv('../../data/csv/server/students_by_section_v0.csv')

In [78]:
# Spot-check the per-section breakdown on a single, well-known course.
df_student_by_section.loc[df_student_by_section['course_name'].eq('Machine learning')]

Unnamed: 0,course_name,year,section,nr_students
13041,Machine learning,2017-2018,Data Science,15
13042,Machine learning,2017-2018,Informatique,168
13043,Machine learning,2017-2018,Informatique et communications,7
13044,Machine learning,2017-2018,"Management, technologie et entrepreneuriat",1
13045,Machine learning,2017-2018,Mineur en Biocomputing,1
13046,Machine learning,2017-2018,Science et ingénierie computationnelles,9
13047,Machine learning,2017-2018,Sciences et technologies du vivant - master,8
13048,Machine learning,2017-2018,Systèmes de communication - master,19
13049,Machine learning,2018-2019,Data Science,36
13050,Machine learning,2018-2019,Génie électrique,5
