# Pre-processing

In [23]:
import os
import requests
from bs4 import BeautifulSoup

## 1. Select master courses relevant files

In [109]:
# Directory with the .xls files taken from is-academia public access
xls_dir = '../../data/'

# Iterate over files and keep only the master courses files
master_files = []
for file in os.listdir(xls_dir):
    f = open(xls_dir + file, encoding='latin-1')      
    content = f.read()
    if 'Master' in content:
        master_files += [file]

In [110]:
len(master_files)

3484

## 2. Parse files

In [111]:
file = master_files[143]

In [187]:
def parse_class_info(class_info):
    class_name = prof = ''
    if len(class_info) > 0:
        class_name = class_info[0].text
    if len(class_info) > 1:
        prof_info = class_info[1].text
        if ':' in prof_info:
            prof = prof_info.split(':')[1]
        if prof.find('Assistant-e-(s)') != -1:
            prof = prof[:prof.find('Assistant-e-(s)')]
        prof = prof.split(',')
        prof_lst = [x.strip() for x in prof]
    return class_name, prof_lst
    
    

In [267]:
def parse_group_info(group_info, def_section, def_year, def_sem):
    def_section = def_year = def_sem = ''
    info = group_info.split(',')
    if len(info) == 0:
        section = def_section
        year = def_year
        sem = def_sem
    elif len(info) == 1:
        section = info
        year = def_year
        sem = def_sem
    elif len(info) == 2:
        section, sem = info
        year = def_year
    else:  
         section, year, sem = info[:3]
    return section, year.strip(), sem

In [270]:
%%time
course_id = 1
course_lst = []
enrollment_lst = []
ignored_files = []
for file in master_files: 
    ignoreFile = False
    f = open(xls_dir + file, encoding='latin-1')      
    page_body = f.read()
    soup = BeautifulSoup(page_body, 'html.parser')
    
    # Parse the course general information:
    # - class_name
    # - professors
    
    # empty values
    class_name = None
    profs = []
    def_section = None
    def_year = None
    def_semester = None
    for tr in soup.body.table.findAll('tr',recursive=False):
        class_info = tr.findAll('th', recursive=False)
        # If it contains info for a new class
        if len(class_info) > 0:
            if len(class_info) != 2:
                print('Error: More than two lines descrbing the course (the lines should be course name and prof)')
            class_name, profs = parse_class_info(class_info)
            #print(file, class_name, profs)
        else:
            # Check if it contains a table of students or it start a new set of students
            # New set of students
            if len(tr.findAll('table')) == 0:
                # Parse common info for this group of students
                def_section, def_year, def_semester = parse_group_info(student_info.text, def_section, def_year, def_semester)
                # Save course info
                if not ignoreFile:
                    course_lst += [(course_id, class_name, profs, def_year)]
                    course_id += 1
                    if course_id % 10000 == 0:
                        print("Found courses: " + str(course_id))
            # Table containing the students
            else:
                for table in tr.findAll('table', recurive=False):
                    # Format for files that ocntains project info
                    if len(table.findAll('table')) > 0:
                        ignoreFile = True
                        break
                    
                    # Format for list of students in class
                    for student in table.findAll('tr'):
                        tds = student.findAll('td')
                        #if len(tds) != 3:
                            #print(file, student)
                            #ignoreFile = True
                        name, student_info, extra = tds[:3]
                        name = name.text
                        if len(extra) > 0:
                            print("Error: " + extra)
                        if ',' in student_info:
                            section, year, semester = parse_group_info(student_info.text, def_section, def_year, def_semester)
                        else:
                            section, year, semester = def_section, def_year, def_semester
                        
                        # Save enrollment
                        if not ignoreFile:
                            enrollment_lst += [(name, section, year, semester, course_id)]
                        
    if ignoreFile:
        ignored_files += [file]
        

Found courses: 10000
Found courses: 20000
Found courses: 30000
Found courses: 40000
CPU times: user 3min 22s, sys: 1.94 s, total: 3min 24s
Wall time: 3min 28s


Strange formats:
- 1563.xls Cours UNIL - Faculté de biologie et de médecine II (automne)  Profs divers *
- 5739.xls Slope stability  Laloui Lyesse, Ferrari Alessio
- 6508.xls Théorie et critique du projet MA1 (Lamunière) ['Lamunière Inès']



In [229]:
# Example file prettify for the project files (aka 3rd field of the entry for a student in table is not empty)
# E.g. file 5184
f = open(xls_dir + '5184.xls', encoding='latin-1')      
page_body = f.read()
soup = BeautifulSoup(page_body, 'html.parser')
pretty_f = open(xls_dir + 'pretty/' + '5184.xls', 'w', encoding='latin-1')
pretty_f.write(soup.prettify())

16751

In [272]:
print(len(ignored_files))

289


## 3. Create tables

In [274]:
len(course_lst)

41060

In [275]:
len(enrollment_lst)

831129

In [276]:
course_lst[:10]

[(1, 'Biological and physiological transport', ['Swartz Melody'], '2009-2010'),
 (2, 'Biological and physiological transport', ['Swartz Melody'], ''),
 (3, 'Biological and physiological transport', ['Swartz Melody'], ''),
 (4, 'Special topics in reactor physics', ['Pautz Andreas'], '2007-2008'),
 (5, 'Special topics in reactor physics', ['Pautz Andreas'], ''),
 (6,
  'Special topics in reactor physics',
  ['Mikityuk Konstantin', 'Pautz Andreas'],
  ''),
 (7, 'Special topics in reactor physics', ['Mikityuk Konstantin'], ''),
 (8, 'Special topics in reactor physics', ['Mikityuk Konstantin'], ''),
 (9, 'Adaptation and learning', ['Sayed Ali H.'], ''),
 (10, 'Adaptation and learning', ['Sayed Ali H.'], '')]

### What is wrong?
- the course list is not made of completed entry I should better define when to assign a course id

## 4. Save table