# Compute course graph

In [1]:
import pandas as pd
from matplotlib import pyplot as plt
import plotly.express as px
import numpy as np
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

%matplotlib inline

Read three tables into dataframe.

In [2]:
df_enrollment = pd.read_csv('../../data/csv/enrollment.csv')
df_course = pd.read_csv('../../data/csv/courses.csv')
df_student = pd.read_csv('../../data/csv/student.csv')

In [7]:
df_course.head(2)

Unnamed: 0.1,Unnamed: 0,course_id,course_name,year
0,0,0,Biological and physiological transport,2006-2007
1,1,1,Biological and physiological transport,2007-2008


In [8]:
df_enrollment.head(1)

Unnamed: 0.1,Unnamed: 0,student_id,course_id,semester
0,0,3692,0,Master semestre 2


In [9]:
df_student.head(2)

Unnamed: 0.1,Unnamed: 0,student_name,section,student_id
0,0,Aabid Fouad,Génie mécanique,0
1,1,Aamodt Simen,Génie mécanique,1


## 1. Compute list of student for each course

For each course we will now compute the set of students attending that course, we will create a file 'jaccard.csv' to save this info as we will use it often in the visualization.

In [20]:
df_complete = df_course.merge(df_enrollment, on='course_id')
df_jaccard = df_complete.groupby('course_name')['student_id'].agg({'set': lambda x: set(x)}).reset_index()
df_jaccard.head(2)


using a dict on a Series for aggregation
is deprecated and will be removed in a future version. Use                 named aggregation instead.

    >>> grouper.agg(name_1=func_1, name_2=func_2)




Unnamed: 0,course_name,set
0,Numerical approximation of PDE's II,"{33792, 18439, 36362, 37773, 15374, 20879, 359..."
1,3D Electron Microscopy and FIB-Nanotomography,"{35843, 34309, 5650, 539, 32805, 7720, 7733, 2..."


In [21]:
df_jaccard.to_csv('../../data/csv/jaccard.csv')

We will particularly use this file in order to compute the jaccard coefficient between two courses using the students enrolled over the years.

## 2. The course network

Now, we will compute the connection between the courses in our database. We will again use the jaccard coefficient to obtain a similarity measure between courses.

In [40]:
df_jaccard['key'] = 1
df_product = df_jaccard.merge(df_jaccard, on='key')
df_product.drop('key', axis=1, inplace=True)
df_product.head(2)

Unnamed: 0,course_name_x,set_x,course_name_y,set_y
0,Numerical approximation of PDE's II,"{33792, 18439, 36362, 37773, 15374, 20879, 359...",Numerical approximation of PDE's II,"{33792, 18439, 36362, 37773, 15374, 20879, 359..."
1,Numerical approximation of PDE's II,"{33792, 18439, 36362, 37773, 15374, 20879, 359...",3D Electron Microscopy and FIB-Nanotomography,"{35843, 34309, 5650, 539, 32805, 7720, 7733, 2..."


In [41]:
def jaccard(s1, s2):
    common = 0
    for e in s1:
        if e in s2:
            common +=1
    return common / (len(s1) + len(s2) - common)

df_edges = df_product.copy()
df_edges['jaccard'] = df_product.apply(lambda x: jaccard(x['set_x'], x['set_y']), axis=1)
#df_edges['jaccard'] = df_save
df_edges.drop(['set_x', 'set_y'], axis=1, inplace=True)
df_edges.head()

Unnamed: 0,course_name_x,course_name_y,jaccard
0,Numerical approximation of PDE's II,Numerical approximation of PDE's II,1.0
1,Numerical approximation of PDE's II,3D Electron Microscopy and FIB-Nanotomography,0.0
2,Numerical approximation of PDE's II,A History of Evolutionary Theory,0.0
3,Numerical approximation of PDE's II,A Political History of Urban Form,0.0
4,Numerical approximation of PDE's II,A guided tour for engineers in applied stochas...,0.0


Save all the jaccard coefficient computed.

In [42]:
df_edges.to_csv('../../data/csv/all_edges.csv')

Now, we will apply an euristic method in order to keep only the most relevant edges. For each node we will keep the top 5% of its neighbours (jaccard > 0).

In [49]:
df_count = df_edges[df_edges['jaccard'] > 0].groupby('course_name_x').count().drop('course_name_y', axis=1).reset_index()
df_count.rename(columns={"course_name_x": "course_name", "jaccard": "neighbours"}, inplace=True)
df_count.head()

Unnamed: 0,course_name,neighbours
0,Numerical approximation of PDE's II,94
1,3D Electron Microscopy and FIB-Nanotomography,147
2,A History of Evolutionary Theory,3
3,A Political History of Urban Form,118
4,A guided tour for engineers in applied stochas...,99


In [53]:
size_map = {}
for index, row in df_count.iterrows():
    size_map[row['course_name']] = row['neighbours']
len(size_map) # Should be 2893

2893

In [55]:
# How many isolated nodes? Looks like none 
df_count[df_count['neighbours'] == 0]

Unnamed: 0,course_name,neighbours


In [61]:
# Remove self-loops
df_edges = df_edges[df_edges['course_name_x'] != df_edges['course_name_y']]
# Remove edges with zero value
df_edges = df_edges[df_edges['jaccard'] > 0]
df_edges.head(5)

Unnamed: 0,course_name_x,course_name_y,jaccard
63,Numerical approximation of PDE's II,Advanced methods in computational solid mechanics,0.017857
65,Numerical approximation of PDE's II,Advanced multiprocessor architecture,0.007407
67,Numerical approximation of PDE's II,Advanced numerical analysis,0.025
74,Numerical approximation of PDE's II,Advanced regression,0.02439
76,Numerical approximation of PDE's II,Advanced scientific computing,0.018182


In [77]:
def keep_friends(g):
    for course_name in set(g['course_name_x']):
        keep = max(3, int(size_map[course_name]/100*5))
        return g.nlargest(keep, "jaccard")

df_edges_5 = df_edges.groupby('course_name_x', group_keys=False).apply(keep_friends)
df_edges_5.head(20)

Unnamed: 0,course_name_x,course_name_y,jaccard
1768,Numerical approximation of PDE's II,Numerical approximation of PDE's I,0.242105
1773,Numerical approximation of PDE's II,Numerical integration of dynamical systems,0.219512
532,Numerical approximation of PDE's II,Computational linear algebra,0.176471
1774,Numerical approximation of PDE's II,Numerical methods for conservation laws,0.12
4644,3D Electron Microscopy and FIB-Nanotomography,Non-destructive evaluation methods,0.153846
3562,3D Electron Microscopy and FIB-Nanotomography,Céramiques,0.08589
5447,3D Electron Microscopy and FIB-Nanotomography,Thermodynamique,0.075556
3303,3D Electron Microscopy and FIB-Nanotomography,"Ceramics, properties",0.074257
3659,3D Electron Microscopy and FIB-Nanotomography,Déformation et rupture à basse température,0.068182
3554,3D Electron Microscopy and FIB-Nanotomography,Cristallographie,0.065637


Save this edges in a CSV file.

In [78]:
df_edges_5.to_csv('../../data/csv/personal_graph.csv')