# Pre-processing

In [80]:
import os
import requests
from bs4 import BeautifulSoup
import pandas as pd

## 1. Select master courses relevant files

In [81]:
# Directory with the .xls files taken from is-academia public access
xls_dir = '../../data/'

# Iterate over files and keep only the master courses files
master_files = []
for file in os.listdir(xls_dir):
    path = xls_dir + file
    if os.path.isdir(path):
        continue
    f = open(path, encoding='latin-1')      
    content = f.read()
    if 'Master' in content:
        master_files += [file]

In [82]:
len(master_files)

3484

## 2. Parse files

In [83]:
file = master_files[143]

In [84]:
def parse_class_info(class_info):
    class_name = prof = ''
    if len(class_info) > 0:
        class_name = class_info[0].text
    if len(class_info) > 1:
        prof_info = class_info[1].text
        if ':' in prof_info:
            prof = prof_info.split(':')[1]
        if prof.find('Assistant-e-(s)') != -1:
            prof = prof[:prof.find('Assistant-e-(s)')]
        prof = prof.split(',')
        prof_lst = [x.strip() for x in prof]
    return class_name, prof_lst
    
    

In [85]:
def parse_group_info(group_info, def_section, def_year, def_sem):
    def_section = def_year = def_sem = ''
    info = group_info.split(',')
    if len(info) == 0:
        section = def_section
        year = def_year
        sem = def_sem
    elif len(info) == 1:
        section = info
        year = def_year
        sem = def_sem
    elif len(info) == 2:
        section, sem = info
        year = def_year
    else:  
         section, year, sem = info[:3]
    return section, year.strip(), sem

In [86]:
%%time
course_id = 1
course_lst = []
enrollment_lst = []
ignored_files = []
for file in master_files: 
    ignoreFile = False
    f = open(xls_dir + file, encoding='latin-1')      
    page_body = f.read()
    soup = BeautifulSoup(page_body, 'html.parser')
    
    # Parse the course general information:
    # - class_name
    # - professors
    
    # empty values
    class_name = None
    profs = []
    def_section = None
    def_year = None
    def_semester = None
    for tr in soup.body.table.findAll('tr',recursive=False):
        class_info = tr.findAll('th', recursive=False)
        # If it contains info for a new class
        if len(class_info) > 0:
            if len(class_info) != 2:
                print('Error: More than two lines descrbing the course (the lines should be course name and prof)')
            class_name, profs = parse_class_info(class_info)
            #print(file, class_name, profs)
        else:
            # Check if it contains a table of students or it start a new set of students
            # New set of students
            if len(tr.findAll('table')) == 0:
                # Parse common info for this group of students
                def_section, def_year, def_semester = parse_group_info(tr.text, def_section, def_year, def_semester)
                # Save course info
                if not ignoreFile:
                    course_lst += [(course_id, class_name, profs, def_year)]
                    course_id += 1
                    if course_id % 10000 == 0:
                        print("Found courses: " + str(course_id))
            # Table containing the students
            else:
                for table in tr.findAll('table', recurive=False):
                    # Format for files that ocntains project info
                    if len(table.findAll('table')) > 0:
                        ignoreFile = True
                        break
                    
                    # Format for list of students in class
                    for student in table.findAll('tr'):
                        tds = student.findAll('td')
                        #if len(tds) != 3:
                            #print(file, student)
                            #ignoreFile = True
                        name, student_info, extra = tds[:3]
                        name = name.text
                        if len(extra) > 0:
                            print("Error: " + extra)
                        if ',' in student_info:
                            section, year, semester = parse_group_info(student_info.text, def_section, def_year, def_semester)
                        else:
                            section, year, semester = def_section, def_year, def_semester
                        
                        # Save enrollment
                        if not ignoreFile:
                            enrollment_lst += [(name, section, year, semester, course_id)]
                        
    if ignoreFile:
        ignored_files += [file]
#enrollment_lst

Found courses: 10000
Found courses: 20000
Found courses: 30000
Found courses: 40000
CPU times: user 3min 39s, sys: 1.94 s, total: 3min 41s
Wall time: 3min 45s


Strange formats:
- 1563.xls Cours UNIL - Faculté de biologie et de médecine II (automne)  Profs divers *
- 5739.xls Slope stability  Laloui Lyesse, Ferrari Alessio
- 6508.xls Théorie et critique du projet MA1 (Lamunière) ['Lamunière Inès']



In [87]:
# Example file prettify for the project files (aka 3rd field of the entry for a student in table is not empty)
# E.g. file 5184
f = open(xls_dir + '5184.xls', encoding='latin-1')      
page_body = f.read()
soup = BeautifulSoup(page_body, 'html.parser')
pretty_f = open(xls_dir + 'pretty/' + '5184.xls', 'w', encoding='latin-1')
pretty_f.write(soup.prettify())

16751

In [88]:
len(ignored_files)

289

## 3. Create tables

In [89]:
len(course_lst)

41060

In [90]:
len(enrollment_lst)

831129

In [91]:
course_lst[:3]

[(1, 'Biological and physiological transport', ['Swartz Melody'], '2006-2007'),
 (2, 'Biological and physiological transport', ['Swartz Melody'], '2007-2008'),
 (3, 'Biological and physiological transport', ['Swartz Melody'], '2007-2008')]

In [92]:
enrollment_lst[:3]

[('Bays Emmanuelle', 'Bioingénierie', '2006-2007', ' Master semestre 2', 2),
 ('Berlier Guillaume', 'Bioingénierie', '2006-2007', ' Master semestre 2', 2),
 ('Cosson Steffen', 'Bioingénierie', '2006-2007', ' Master semestre 2', 2)]

In [93]:
df_course = pd.DataFrame(course_lst, columns=['course_id', 'course_name', 'profs', 'year' ]).set_index('course_id')
df_course.head(2)

Unnamed: 0_level_0,course_name,profs,year
course_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Biological and physiological transport,[Swartz Melody],2006-2007
2,Biological and physiological transport,[Swartz Melody],2007-2008


In [94]:
df_enrollment_large = pd.DataFrame(enrollment_lst, columns=['student_name', 'section', 'year', 'semester', 'course_id'])
df_enrollment.head(2)

Unnamed: 0,student_name,section,student_id,year,semester,course_id
0,Bays Emmanuelle,Bioingénierie,0,2006-2007,Master semestre 2,2
1,Bays Emmanuelle,Bioingénierie,0,2006-2007,Master semestre 1,754


In [95]:
tmp = df_enrollment_large.join(df_enrollment_large, lsuffix='_1', rsuffix='_2')
tmp[(tmp['student_name_1'] == tmp['student_name_2']) & (tmp['section_1'] != tmp['section_2'])]

Unnamed: 0,student_name_1,section_1,year_1,semester_1,course_id_1,student_name_2,section_2,year_2,semester_2,course_id_2


We checked that there are not people withthe same name but different section.

In [98]:
df_students = df_enrollment_large[['student_name', 'section']].drop_duplicates()
df_students.head(2)

Unnamed: 0,student_name,section
0,Bays Emmanuelle,Bioingénierie
1,Berlier Guillaume,Bioingénierie


In [107]:
df_tmp_students = df_students
df_tmp_students['student_id'] = df_students.index
df_enrollment_tmp = df_tmp_students.merge(df_enrollment_large, left_on = ['student_name', 'section'], right_on = ['student_name', 'section'])
df_enrollment = df_enrollment_tmp.drop(['student_name', 'section'], axis=1)
df_enrollment

Unnamed: 0,student_id,year,semester,course_id
0,0,2006-2007,Master semestre 2,2
1,0,2006-2007,Master semestre 1,754
2,0,2006-2007,Master semestre 1,1038
3,0,2006-2007,Master semestre 2,4693
4,0,2006-2007,Master semestre 1,8632
...,...,...,...,...
831124,831039,2015-2016,Master semestre 1,41051
831125,831044,2015-2016,Master semestre 1,41051
831126,831086,2014-2015,Master semestre 1,41055
831127,831097,2014-2015,Master semestre 1,41055


### <b> Split courses table in two tables</b>
- Course table: course_id, name, year
- Teaching table: course_id, prof
N.B. There will be multiple tuple with the same course_id in df_teaching because one class can be taught by morethan one prof

In [134]:
df_courses_large = df_course.reset_index()
df_courses_large = df_courses_large.explode('profs')
df_courses_large.head(2)

Unnamed: 0,course_id,course_name,profs,year
0,1,Biological and physiological transport,Swartz Melody,2006-2007
1,2,Biological and physiological transport,Swartz Melody,2007-2008


In [135]:
len(df_course),len(df_courses_large)

(41060, 54603)

In [139]:
df_courses = df_courses_large[['course_id', 'course_name', 'year']].drop_duplicates().set_index('course_id')
df_courses.head(2)

Unnamed: 0_level_0,course_name,year
course_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Biological and physiological transport,2006-2007
2,Biological and physiological transport,2007-2008


In [144]:
df_teaching = df_courses_large[['course_id', 'profs']]
df_teaching = df_teaching.rename(columns={'profs':'prof'})
df_teaching.head()

Unnamed: 0,course_id,prof
0,1,Swartz Melody
1,2,Swartz Melody
2,3,Swartz Melody
3,4,Pautz Andreas
4,5,Pautz Andreas


## 4. Save table

In [146]:
# Save student table
csv_dir = 'csv/'
df_students.to_csv(xls_dir + csv_dir + 'student.csv')

In [147]:
# Save courses table
df_courses.to_csv(xls_dir + csv_dir + 'courses.csv')

In [148]:
# Save teaching table
df_teaching.to_csv(xls_dir + csv_dir + 'teaching.csv')

In [149]:
# Save enrollment table
df_enrollment.to_csv(xls_dir + csv_dir + 'enrollment.csv')