# Pre-processing

In [1]:
import os
import requests
from bs4 import BeautifulSoup
import pandas as pd

## 1. Select master courses relevant files

In [2]:
# Directory with the .xls files taken from is-academia public access
xls_dir = '../../data/'

# Iterate over files and keep only the master courses files
master_files = []
for file in os.listdir(xls_dir):
    path = xls_dir + file
    if os.path.isdir(path):
        continue
    f = open(path, encoding='latin-1')      
    content = f.read()
    if 'Master' in content:
        master_files += [file]

In [3]:
len(master_files)

3484

## 2. Parse files

In [4]:
file = master_files[143]

In [5]:
def parse_class_info(class_info):
    class_name = prof = ''
    if len(class_info) > 0:
        class_name = class_info[0].text
    if len(class_info) > 1:
        prof_info = class_info[1].text
        if ':' in prof_info:
            prof = prof_info.split(':')[1]
        if prof.find('Assistant-e-(s)') != -1:
            prof = prof[:prof.find('Assistant-e-(s)')]
        prof = prof.split(',')
        prof_lst = [x.strip() for x in prof]
    return class_name, prof_lst
    
    

In [29]:
def parse_group_info(group_info, def_section, def_year, def_sem):
    def_section = def_year = def_sem = ''
    info = group_info.split(',')
    if len(info) == 0:
        section = def_section
        year = def_year
        sem = def_sem
    elif len(info) == 1:
        section = info
        year = def_year
        sem = def_sem
    elif len(info) == 2:
        section, sem = info
        year = def_year
    elif len(info) == 3:
        section, year, sem = info[:3]
    if len(info) > 3:
        section = group_info[:group_info.find('201') - 2]
        year = group_info[group_info.find('201'):]
        year = year[:year.find(',')]
        sem = group_info.split(',')[-1]
    return section, year.strip(), sem

In [30]:
%%time
course_id = 1
enrollment_lst = []
ignored_files = []
for file in master_files: 
    ignoreFile = False
    f = open(xls_dir + file, encoding='latin-1')      
    page_body = f.read()
    soup = BeautifulSoup(page_body, 'html.parser')
    
    # Parse the course general information:
    # - class_name
    # - professors
    
    # empty values
    class_name = None
    profs = []
    def_section = None
    def_year = None
    def_semester = None
    for tr in soup.body.table.findAll('tr',recursive=False):
        class_info = tr.findAll('th', recursive=False)
        # If it contains info for a new class
        if len(class_info) > 0:
            if len(class_info) != 2:
                print('Error: More than two lines descrbing the course (the lines should be course name and prof)')
            class_name, profs = parse_class_info(class_info)
            #print(file, class_name, profs)
        else:
            # Check if it contains a table of students or it start a new set of students
            # New set of students
            if len(tr.findAll('table')) == 0:
                # Parse common info for this group of students
                def_section, def_year, def_semester = parse_group_info(tr.text, def_section, def_year, def_semester)
                
            # Table containing the students
            else:
                for table in tr.findAll('table', recurive=False):
                    # Format for files that ocntains project info
                    if len(table.findAll('table')) > 0:
                        ignoreFile = True
                        break
                    
                    # Format for list of students in class
                    for student in table.findAll('tr'):
                        tds = student.findAll('td')
                        #if len(tds) != 3:
                            #print(file, student)
                            #ignoreFile = True
                        name, student_info, extra = tds[:3]
                        name = name.text
                        if len(extra) > 0:
                            print("Error: " + extra)
                        if ',' in student_info:
                            section, year, semester = parse_group_info(student_info.text, def_section, def_year, def_semester)
                        else:
                            section, year, semester = def_section, def_year, def_semester
                        
                        # Save enrollment
                        if not ignoreFile:
                            #if len(year) == 0:
                            #    print(file, name)
                            enrollment_lst += [(name, section, year, semester, class_name, profs)]
                        
    if ignoreFile:
        ignored_files += [file]
#enrollment_lst

CPU times: user 3min 29s, sys: 2.41 s, total: 3min 31s
Wall time: 3min 46s


Strange formats:
- 1563.xls Cours UNIL - Faculté de biologie et de médecine II (automne)  Profs divers *
- 5739.xls Slope stability  Laloui Lyesse, Ferrari Alessio
- 6508.xls Théorie et critique du projet MA1 (Lamunière) ['Lamunière Inès']



In [35]:
# Example file prettify for the project files (aka 3rd field of the entry for a student in table is not empty)
# E.g. file 5184
f = open(xls_dir + '5184.xls', encoding='latin-1')      
page_body = f.read()
soup = BeautifulSoup(page_body, 'html.parser')
pretty_f = open(xls_dir + 'pretty/' + '5184.xls', 'w', encoding='latin-1')
pretty_f.write(soup.prettify())

16751

In [36]:
len(ignored_files)

289

## 3. Create tables

In [37]:
len(enrollment_lst)

831129

In [38]:
enrollment_lst[:3]

[('Bays Emmanuelle',
  'Bioingénierie',
  '2006-2007',
  ' Master semestre 2',
  'Biological and physiological transport',
  ['Swartz Melody']),
 ('Berlier Guillaume',
  'Bioingénierie',
  '2006-2007',
  ' Master semestre 2',
  'Biological and physiological transport',
  ['Swartz Melody']),
 ('Cosson Steffen',
  'Bioingénierie',
  '2006-2007',
  ' Master semestre 2',
  'Biological and physiological transport',
  ['Swartz Melody'])]

In [39]:
df_enrollment_large = pd.DataFrame(enrollment_lst, columns=['student_name', 'section', 'year', 'semester', 'course_name', 'profs'])
df_enrollment_large.head(2)

Unnamed: 0,student_name,section,year,semester,course_name,profs
0,Bays Emmanuelle,Bioingénierie,2006-2007,Master semestre 2,Biological and physiological transport,[Swartz Melody]
1,Berlier Guillaume,Bioingénierie,2006-2007,Master semestre 2,Biological and physiological transport,[Swartz Melody]


In [40]:
df_course = df_enrollment_large[['course_name', 'year']]
df_course = df_course.drop_duplicates().reset_index().drop('index',axis=1)
df_course[df_course['course_name'] == 'Data visualization']

Unnamed: 0,course_name,year
12450,Data visualization,2018-2019
12451,Data visualization,
12452,Data visualization,2019-2020
12453,Data visualization,2017-2018


In [41]:
df_course = df_course.reset_index().rename(columns={'index': 'course_id'})
df_course.head()

Unnamed: 0,course_id,course_name,year
0,0,Biological and physiological transport,2006-2007
1,1,Biological and physiological transport,2007-2008
2,2,Special topics in reactor physics,2015-2016
3,3,Special topics in reactor physics,2014-2015
4,4,Special topics in reactor physics,2013-2014


In [42]:
tmp = df_enrollment_large.join(df_enrollment_large, lsuffix='_1', rsuffix='_2')
tmp[(tmp['student_name_1'] == tmp['student_name_2']) & (tmp['section_1'] != tmp['section_2'])]

Unnamed: 0,student_name_1,section_1,year_1,semester_1,course_name_1,profs_1,student_name_2,section_2,year_2,semester_2,course_name_2,profs_2


We checked that there are not people withthe same name but different section.

In [174]:
df_students = df_enrollment_large[['student_name', 'section']].drop_duplicates()
df_students.head(2)

Unnamed: 0,student_name,section
0,Bays Emmanuelle,Bioingénierie
1,Berlier Guillaume,Bioingénierie


In [177]:
df_tmp_students = df_students
df_tmp_students['student_id'] = df_students.index
df_enrollment_tmp = df_tmp_students.merge(df_enrollment_large, left_on = ['student_name', 'section'], right_on = ['student_name', 'section'])
df_enrollment_med = df_enrollment_tmp.drop(['student_name', 'section'], axis=1)
df_enrollment_med

Unnamed: 0,student_id,year,semester,course_name,profs
0,0,2006-2007,Master semestre 2,Biological and physiological transport,[Swartz Melody]
1,0,2006-2007,Master semestre 1,Advanced transport phenomena,[Swartz Melody]
2,0,2006-2007,Master semestre 1,Biomaterials,[Hubbell Jeffrey Alan]
3,0,2006-2007,Master semestre 2,Drug discovery from bench to clinics,[Wells Timothy]
4,0,2006-2007,Master semestre 1,Topics in bioinformatics I,"[Bucher Philipp, Moret Bernard, Naef Felix]"
...,...,...,...,...,...
831124,831039,2015-2016,Master semestre 1,Hardware systems modeling I,[Vachoux Alain]
831125,831044,2015-2016,Master semestre 1,Hardware systems modeling I,[Vachoux Alain]
831126,831086,2014-2015,Master semestre 1,Hardware systems modeling I,[Vachoux Alain]
831127,831097,2014-2015,Master semestre 1,Hardware systems modeling I,[Vachoux Alain]


<b>Create Teaching table:</b> course_id, prof <br>
N.B. There will be multiple tuple with the same course_id in df_teaching because one class can be taught by morethan one prof

In [185]:
df_tmp = df_course.merge(df_enrollment_med, left_on = ['year', 'course_name'], right_on =  ['year', 'course_name'])
df_tmp.head()

Unnamed: 0,course_id,course_name,year,student_id,semester,profs
0,0,Biological and physiological transport,2006-2007,0,Master semestre 2,[Swartz Melody]
1,0,Biological and physiological transport,2006-2007,1,Master semestre 2,[Swartz Melody]
2,0,Biological and physiological transport,2006-2007,2,Master semestre 2,[Swartz Melody]
3,0,Biological and physiological transport,2006-2007,3,Master semestre 2,[Swartz Melody]
4,0,Biological and physiological transport,2006-2007,4,Master semestre 2,[Swartz Melody]


In [190]:
df_enrollment = df_tmp[['student_id', 'course_id', 'semester']].drop_duplicates()
df_enrollment.head()

Unnamed: 0,student_id,course_id,semester
0,0,0,Master semestre 2
1,1,0,Master semestre 2
2,2,0,Master semestre 2
3,3,0,Master semestre 2
4,4,0,Master semestre 2


In [207]:
df_teaching = df_tmp[['course_id', 'profs']]
df_teaching = df_teaching.explode('profs').drop_duplicates().rename(columns={'profs':'prof'})
df_teaching

Unnamed: 0,course_id,prof
0,0,Swartz Melody
13,1,Swartz Melody
23,2,Pautz Andreas
29,3,Pautz Andreas
37,4,Mikityuk Konstantin
...,...,...
831057,15736,Vachoux Alain
831075,15737,Vachoux Alain
831102,15738,Dutoit Bertrand
831108,15739,Dutoit Bertrand


## 4. Save table

In [202]:
# Save student table
csv_dir = 'csv/'
df_students.to_csv(xls_dir + csv_dir + 'student.csv')

In [209]:
# Save courses table
df_course.to_csv(xls_dir + csv_dir + 'courses.csv')

In [208]:
# Save teaching table
df_teaching.to_csv(xls_dir + csv_dir + 'teaching.csv')

In [205]:
# Save enrollment table
df_enrollment.to_csv(xls_dir + csv_dir + 'enrollment.csv')

In [206]:
# Save all in one table
df_enrollment.to_csv(xls_dir + csv_dir + 'enrollment_large.csv')