In [1]:
import pandas as pd
from matplotlib import pyplot as plt
import plotly.express as px
import numpy as np
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import networkx as nx

%matplotlib inline

# Course statistics

In this notebook we compute statistics for each course. In our final visualization, clicking on a course opens a chart showing some insights about the history of that course. Here we build an ad-hoc dataset for that purpose: precomputing it improves the performance of our website, because the statistics needed for the plots do not have to be recomputed every time.

## 1. Professor

One important piece of information about a course is the professor (or the list of professors) who teaches it.

In [2]:
# Load the course table (course_id, course_name, academic year) and preview it.
df_course = pd.read_csv(filepath_or_buffer='../../data/csv/courses.csv')
df_course.head(5)

Unnamed: 0.1,Unnamed: 0,course_id,course_name,year
0,0,0,Biological and physiological transport,2006-2007
1,1,1,Biological and physiological transport,2007-2008
2,2,2,Special topics in reactor physics,2015-2016
3,3,3,Special topics in reactor physics,2014-2015
4,4,4,Special topics in reactor physics,2013-2014


In [3]:
# Load the teaching table, which links each course_id to a professor, and preview it.
df_teaching = pd.read_csv(filepath_or_buffer='../../data/csv/teaching.csv')
df_teaching.head(5)

Unnamed: 0.1,Unnamed: 0,course_id,prof
0,0,0,Swartz Melody
1,13,1,Swartz Melody
2,23,2,Pautz Andreas
3,29,3,Pautz Andreas
4,37,4,Mikityuk Konstantin


After loading the datasets, we want to retrieve, for each course name, who the lecturer was the last time the course was held.

In [4]:
# Keep only the most recent edition of every course name.
# The academic year string (e.g. '2006-2007') is reduced to its first four
# characters and stored as an integer, rows are ordered from newest to oldest,
# and drop_duplicates then keeps the first (i.e. newest) row per course name.
# `assign` works on a new frame, so df_course itself is left untouched.
df_course_last = (
    df_course
    .assign(year=df_course['year'].str[:4].astype('int64'))
    .sort_values('year', ascending=False)
    .drop_duplicates('course_name')
)

In [5]:
# Count, per starting year, the courses whose most recent edition falls in that
# year (df_course_last holds exactly one row per course name).
df_course_last.groupby(by='year').agg('count')

Unnamed: 0_level_0,Unnamed: 0,course_id,course_name
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2004,198,198,198
2005,122,122,122
2006,132,132,132
2007,131,131,131
2008,150,150,150
2009,179,179,179
2010,134,134,134
2011,145,145,145
2012,134,134,134
2013,122,122,122


In [6]:
# Attach the teaching records to the most recent edition of every course and
# gather its professors into one list per course name.
# The merge is an inner join on course_id, so a course without any teaching
# record does not appear in the result.
# NOTE(review): only the single course_id kept in df_course_last is looked up;
# if a course name can have several course_ids in its last year, the other
# professors are not collected -- confirm against the data.
df_last_prof = (
    pd.merge(df_course_last, df_teaching, on='course_id')
    .loc[:, ['course_name', 'prof']]
    .groupby('course_name')['prof']
    .apply(list)
    .to_frame()
)
df_last_prof.head(5)

Unnamed: 0_level_0,prof
course_name,Unnamed: 1_level_1
Numerical approximation of PDE's II,[Picasso Marco]
3D Electron Microscopy and FIB-Nanotomography,[Cantoni Marco]
A History of Evolutionary Theory,[Jensen Jeffrey David]
A Political History of Urban Form,[Aureli Pier Vittorio]
A guided tour for engineers in applied stochastic modelling,[Hongler Max-Olivier]


In [7]:
# Persist the course -> professors table for the server. The index
# (course_name) is written as the first column; the 'prof' column holds the
# string representation of each Python list.
df_last_prof.to_csv(path_or_buf='../../data/csv/server/course_professor_v0.csv')

## 2. Number of enrolled students by year, semester, major

In [8]:
# Load the student table and the enrollment table (in that order).
df_student, df_enrollment = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../../data/csv/student.csv',
        '../../data/csv/enrollment.csv',
    )
)

In [9]:
# Build one wide table: courses joined to their enrollments (on course_id),
# then to the enrolled students (on student_id). Both joins are inner joins,
# so rows without a match on either key are dropped.
df_complete = pd.merge(
    pd.merge(df_course, df_enrollment, on='course_id'),
    df_student,
    on='student_id',
)
df_complete.head(5)

Unnamed: 0.1,Unnamed: 0_x,course_id,course_name,year,Unnamed: 0_y,student_id,semester,Unnamed: 0,student_name,section
0,0,0,Biological and physiological transport,2006-2007,0,2959,Master semestre 2,2959,Bays Emmanuelle,Ingénierie des sciences du vivant
1,19,19,Bioinformatics II,2005-2006,268,2959,Bachelor semestre 6,2959,Bays Emmanuelle,Ingénierie des sciences du vivant
2,270,270,Advanced transport phenomena,2006-2007,14244,2959,Master semestre 1,2959,Bays Emmanuelle,Ingénierie des sciences du vivant
3,381,381,Biomaterials,2006-2007,19320,2959,Master semestre 1,2959,Bays Emmanuelle,Ingénierie des sciences du vivant
4,401,401,"Biomechanics, Biofluids and Biotransport",2005-2006,20287,2959,Bachelor semestre 5,2959,Bays Emmanuelle,Ingénierie des sciences du vivant


In [10]:
# Number of enrollment rows per (course name, academic year, semester, section).
# 'count' tallies the non-null student_id values in each group; the result is
# flattened back to regular columns with the tally named 'nr_students'.
df_course_stud = (
    df_complete
    .groupby(['course_name', 'year', 'semester', 'section'])[['student_id']]
    .count()
    .rename(columns={'student_id': 'nr_students'})
    .reset_index()
)
df_course_stud.head(5)

Unnamed: 0,course_name,year,semester,section,nr_students
0,Numerical approximation of PDE's II,2012-2013,Master semestre 2,Ingénierie mathématique,5
1,Numerical approximation of PDE's II,2012-2013,Master semestre 2,Mathématiques,11
2,Numerical approximation of PDE's II,2012-2013,Master semestre 2,Mathématiques - master,1
3,Numerical approximation of PDE's II,2012-2013,Master semestre 2,Science et ingénierie computationnelles,2
4,Numerical approximation of PDE's II,2013-2014,Master semestre 2,Mathématiques,1


In [11]:
# Persist the enrollment counts for the server.
# NOTE(review): with the default index=True the positional index is written as
# an unnamed first column, which shows up as 'Unnamed: 0' when the file is read
# back with pandas. Pass index=False if the server does not rely on that
# column -- confirm with the consumer before changing the file layout.
df_course_stud.to_csv(path_or_buf='../../data/csv/server/course_stud.csv')

We now analyze the semester and the major further, to better understand which visualization would be the most appropriate.

In [16]:
# For every course edition (course name, year, semester), count how many
# distinct sections have students enrolled, listing the editions with the
# most sections first.
(
    df_course_stud
    .loc[:, ['course_name', 'year', 'semester', 'section']]
    .drop_duplicates()
    .groupby(['course_name', 'year', 'semester'])
    .count()
    .reset_index()
    .sort_values(by='section', ascending=False)
)

Unnamed: 0,course_name,year,semester,section
11315,Machine learning,2018-2019,Master semestre 1,32
5256,Deep learning,2018-2019,Master semestre 2,30
11318,Machine learning,2019-2020,Master semestre 1,28
3539,Chimie générale,2012-2013,Bachelor semestre 1,26
9757,Informatique I,2012-2013,Bachelor semestre 1,26
...,...,...,...,...
17579,Selected topics on security and cryptography,2005-2006,edoc,1
5236,Decision-aid methodologies in transportation,2011-2012,Master semestre 2,1
17577,Selected topics on security and cryptography,2004-2005,Semestre printemps,1
11598,Materials science,2006-2007,Bachelor semestre 6,1
