In [1]:
!pip install pyyaml
!pip install icalendar
!pip install python-dateutil



In [2]:
import re
import csv
import json

#pip install beautifulsoup4
from bs4 import BeautifulSoup

from pathlib import Path

import shutil
import json
import csv
import yaml

# List index of each field of interest among the `td.dddefault` cells of one
# Banner schedule row, keyed by a short column name. Indexes 13-15 are not used.
banner_cols = dict(
    CRN=1,
    Subj=2,
    Crse=3,
    Sec=4,
    Cmp=5,
    Cred=6,
    Title=7,
    Days=8,
    Time=9,
    Cap=10,
    Act=11,
    Rem=12,
    Instructor=16,
    Date=17,
    Location=18,
)

In [3]:
def scrape_banner_course_schedule(filename):
    '''
    Uses Beautiful Soup to parse an HTML file exported from Banner Web.

    Reads the rows of the first table with class "datadisplaytable". Every row
    whose CRN cell is non-empty starts a new course; a row with an empty CRN
    cell is treated as a continuation of the previous course and only adds one
    more meeting/timecode to it.

    Parameters:
        filename: path of the exported HTML file.

    Returns:
        A list of course dicts with the keys 'crn' (int), 'catalog_id',
        'section', 'credits', 'title', 'meetings' (list of dicts with 'days',
        'times', 'dates', 'location'), 'timecodes' (list of
        "days times dates location" strings, parallel to 'meetings'),
        'primary_instructor', 'instructors', 'cap', 'act' and 'rem'.
    '''

    def cell_text(cols, name):
        # stringified `.string` of the cell holding the named Banner column
        return str(cols[banner_cols[name]].string)

    with open(filename) as fp:
        soup = BeautifulSoup(fp,'html.parser')
        data_display_table_rows = soup.find('table',class_='datadisplaytable').find_all('tr')

    course_specs = []

    # read the table one row at a time, skipping rows without data cells
    for row in data_display_table_rows:
        cols = row.select("td.dddefault")
        if not cols:
            continue

        # empty cells come through as a non-breaking space, hence the strip
        crn_raw = cell_text(cols, 'CRN').strip('\xa0')

        # normalise "9:30 am-10:45 am" to "930am-1045am"
        times = cell_text(cols, 'Time')
        times = times.replace(' pm','pm').replace(' am','am').replace(':','')
        times = times.strip('\xa0')

        meeting = {'days':cell_text(cols, 'Days').strip('\xa0'),
                   'times':times,
                   'dates':cell_text(cols, 'Date'),
                   'location':cell_text(cols, 'Location')}
        timecode = " ".join([meeting['days'], meeting['times'], meeting['dates'], meeting['location']])

        if crn_raw:
            # the normal case, not a continuation of the previous row with more timecodes
            instructors = str(cols[banner_cols['Instructor']].get_text())
            course_spec = {
                'crn': int(crn_raw),
                'catalog_id': cell_text(cols, 'Subj')+" "+cell_text(cols, 'Crse'),
                'section': cell_text(cols, 'Sec'),
                'credits': cell_text(cols, 'Cred'),
                'title': cell_text(cols, 'Title'),
                'meetings': [],
                'timecodes': [],
            }
            if meeting['days']:
                course_spec['meetings'].append(meeting)
                course_spec['timecodes'].append(timecode)
            # the primary instructor is whatever precedes the " (P)" marker
            course_spec['primary_instructor'] = instructors.split(' (P)')[0]
            course_spec['instructors'] = instructors
            course_spec['cap'] = cell_text(cols, 'Cap')
            course_spec['act'] = cell_text(cols, 'Act')
            course_spec['rem'] = cell_text(cols, 'Rem')
            course_specs.append(course_spec)
        elif meeting['days'] and course_specs:
            # extra timecodes for the previous course; recorded exactly once so
            # that 'timecodes' stays parallel to 'meetings'. A continuation row
            # that appears before any course row has nothing to attach to and
            # is ignored.
            course_specs[-1]['meetings'].append(meeting)
            course_specs[-1]['timecodes'].append(timecode)

    return course_specs

In [4]:
from dateutil.rrule import *
from dateutil.parser import *

from datetime import date,time,datetime,timedelta

# conda install -c conda-forge icalendar
from icalendar import Calendar, Event,vPeriod,vDatetime

# courses = []

# Regular expressions and lookup tables for decoding timecode strings.

# "M/D-M/D" date range; groups: start month, start day, end month, end day
tc_date_range_re = re.compile(r'([0-9]+)/([0-9]+)-([0-9]+)/([0-9]+)')

# "HHMMam-HHMMpm" time range; groups: start time, end time
tc_time_range_re = re.compile(r'([0-9][0-9][0-9][0-9][PpAa][Mm])-([0-9][0-9][0-9][0-9][PpAa][Mm])')

# leading run of day letters followed by a space
tc_days_of_week_re = re.compile('(^[MTWRFSU]*) ')

# day letter -> two-letter weekday code (needed for icalendar)
tc_day_map = dict(M='MO', T='TU', W='WE', R='TH', F='FR', S='SA', U='SU')

# day letter -> dateutil weekday constant (needed for dateutil)
tc_day_map_du = dict(M=MO, T=TU, W=WE, R=TH, F=FR, S=SA, U=SU)

tc_days_of_week = "UMTWRFS"

def generate_course_calendar(course,cal_rules):
    '''
    Generates one icalendar Calendar() and a list of meetings from the course timecodes
    Timecodes are dicts with 'days' (of week),'times' (range),'dates' (range), and 'location'

    Parameters:
        course: dict with 'crn', 'catalog_id', 'section', 'title',
            'primary_instructor' and 'meetings' (a list of timecode dicts).
        cal_rules: dict with 'term-year' (the year applied to every M/D date)
            and optionally 'date-shift-rules' (entries with 'from-date' and
            'to-date' as YYYY-MM-DD plus optional 'exclusions') and
            'holiday-rules' (entries with 'start-dt' and 'end-dt' as
            YYYY-MM-DDTHH:MM plus optional 'exclusions'). Exclusions are
            regular expressions tested against course['catalog_id'].

    Returns:
        {'ical': the calendar serialized with Calendar.to_ical(),
         'meetings': list of dicts with 'crn', 'location', 'day', 'start', 'end'}
        with one 'meetings' entry per individual class session; 'start' and
        'end' are ISO-format datetimes and 'day' is one letter of "MTWRFSU".

    A meeting that has no days, no parsable time range, no usable date range
    or no resulting occurrences is skipped; the remaining meetings of the
    course are still processed.
    '''

    meetings = []
    cal = Calendar()
    cal['summary'] = str(course['crn']) + " " + course['catalog_id'] + " " + str(course['section'])
    cal['description'] = course['catalog_id'] + ":" + course['title'] + " (" + course['primary_instructor'] + ")"

    # Most recent explicit date range. A meeting whose 'dates' holds no M/D-M/D
    # range reuses the range of the preceding meeting of the same course.
    startdate = enddate = None

    # generate one recurring event per timecode
    for meeting in course['meetings']:

        # skip if there are no days
        if not meeting['days'] or meeting['days'] == [''] or meeting['days'] == '\xa0':
            continue
        day_letters = meeting['days'].strip()

        # handle explicit date ranges
        range_match = tc_date_range_re.search(meeting['dates'])
        if range_match:
            year = cal_rules['term-year']
            start_month, start_day, end_month, end_day = (int(g) for g in range_match.groups())
            startdate = date(year, start_month, start_day)
            enddate = date(year, end_month, end_day)
        if startdate is None:
            # no date range seen yet for this course: nothing can be scheduled
            continue

        # timing parameters
        trange = tc_time_range_re.findall(meeting['times'])
        if not trange:
            continue
        starttime = datetime.strptime(trange[0][0],'%I%M%p')
        endtime = datetime.strptime(trange[0][1],'%I%M%p')

        # use datetime and dateutil to enumerate all the event start times
        tc_rrule = None  # stays None for a one-day meeting
        first_start = datetime.combine(startdate,starttime.time())
        if startdate == enddate:
            mdates = [first_start]
        else:
            wdays = [tc_day_map_du[d] for d in day_letters]
            tc_rrule = rrule(WEEKLY, dtstart=first_start, byweekday=wdays,
                             until=datetime.combine(enddate,endtime.time()))
            mdates = list(tc_rrule)
        if not mdates:
            # the date range contains none of the requested weekdays
            continue

        # event metadata
        event = Event()
        event['summary'] = course['catalog_id']+" "+course['section']+" "+meeting['location']

        # the first icalendar event
        event.add('dtstart',datetime.combine(mdates[0].date(),starttime.time()))
        event.add('dtend', datetime.combine(mdates[0].date(),endtime.time()))

        # icalendar recurrence rules: one BYDAY entry per day letter
        days = [tc_day_map[d] for d in day_letters]
        event.add('rrule',{'freq':'weekly','byday':days,'until':enddate})

        # set up to use the rules to modify the icalendar dates
        cancel_dates = [] # These become exclusions to the rrule
        new_dates = [] # These are additional dates not covered by the rrule

        # date shift rules ("Tuesday is on a Monday schedule")
        for ds_rule in cal_rules.get('date-shift-rules') or []:
            # an excluded course ignores this rule only; later rules still apply
            # (date-shift exclusions are anchored at the start of the catalog id)
            if any(re.match(pattern, course['catalog_id'])
                   for pattern in ds_rule.get('exclusions') or []):
                continue

            to_date = datetime.strptime(ds_rule['to-date'],"%Y-%m-%d").date()
            from_date = datetime.strptime(ds_rule['from-date'],"%Y-%m-%d").date()

            for rdate in mdates:
                # cancel (pre-empt) classes on to-date
                if rdate.date() == to_date:
                    cancel_dates.append(rdate)

                # classes that meet on from-date get a session on to-date
                if rdate.date() == from_date:
                    new_dates.append({'dtstart':datetime.combine(to_date,starttime.time()),
                                      'dtend':datetime.combine(to_date,endtime.time())})

        # holiday rules
        for holiday_rule in cal_rules.get('holiday-rules') or []:
            # an excluded course ignores this rule only; later rules still apply
            # (holiday exclusions may match anywhere in the catalog id)
            if any(re.search(pattern, course['catalog_id'])
                   for pattern in holiday_rule.get('exclusions') or []):
                continue
            if tc_rrule is not None:
                start_dt = datetime.strptime(holiday_rule['start-dt'],"%Y-%m-%dT%H:%M")
                end_dt = datetime.strptime(holiday_rule['end-dt'],"%Y-%m-%dT%H:%M")
                cancel_dates += tc_rrule.between(start_dt,end_dt)

        # Overlapping rules can cancel the same occurrence more than once;
        # keep each cancelled occurrence a single time, in first-seen order.
        unique_cancels = []
        for cancelled in cancel_dates:
            if cancelled in mdates and cancelled not in unique_cancels:
                unique_cancels.append(cancelled)
        if unique_cancels:
            mdates = [m for m in mdates if m not in unique_cancels]
            event.add('exdate',unique_cancels)

        cal.add_component(event)

        # add an event for each new_date not covered by the recurrence rule
        for new_date in new_dates:
            mdates.append(new_date['dtstart'])
            new_event = Event()
            new_event['summary'] = event['summary']
            new_event.add('dtstart',new_date['dtstart'])
            new_event.add('dtend',new_date['dtend'])
            cal.add_component(new_event)

        # one output row per individual class session
        for m in mdates:
            starttime_iso = datetime.isoformat(datetime.combine(m.date(),starttime.time()))
            endtime_iso = datetime.isoformat(datetime.combine(m.date(),endtime.time()))
            meetings.append({'crn':course['crn'],
                             'location':meeting['location'],
                             'day':"MTWRFSU"[m.date().weekday()],
                             'start':starttime_iso,
                             'end':endtime_iso})

    return {'ical':cal.to_ical(),'meetings':meetings}

In [18]:
# Course Offering and Course Meeting Data
#
# For every term folder under SourceData/ this cell:
#   1. scrapes banner.html into course_offerings.json and courses.csv
#   2. loads cal_rules.yaml and expands every course into individual sessions
#   3. writes those sessions to course_meetings.csv
terms = [   'Fall2014', 'Winter2015', 'Spring2015', 'Summer2015',
            'Fall2015', 'Winter2016', 'Spring2016', 'Summer2016',
            'Fall2016', 'Winter2017', 'Spring2017', 'SpringBreak2017', 'Summer2017',
            'Fall2017', 'Winter2018','Spring2018','Summer2018',
            'Fall2018', 'Winter2019','Spring2019','SummerI2019','SummerII2019','Summer2019',
            'Fall2019', 'Winter2020','Spring2020','Summer2020',
            'Fall2020', 'Winter2021','Spring2021','Summer2021',
            'Fall2021', 'Winter2022','Spring2022','Summer2022',
            'Fall2022'
        ]

for term in terms:
    banner_html_path = f"SourceData/{term}/banner.html"
    print(term)
    course_offerings = scrape_banner_course_schedule(banner_html_path)

    # written before 'term' is added to the course dicts, so the JSON has no 'term' key
    json_path = f"SourceData/{term}/course_offerings.json"
    with open(json_path,"w") as json_file:
        json.dump(course_offerings,json_file)

    if not course_offerings:
        # nothing was scraped: there are no columns to derive and no calendars to build
        print(f"  no course offerings found for {term}; skipping CSV and calendar generation")
        continue

    # generate courses.csv from course_offerings.json
    courses_csv_path_str = f"SourceData/{term}/courses.csv"
    with Path(courses_csv_path_str).open('w',newline='') as csvfile:
        field_names = ['term']+list(course_offerings[0].keys())
        writer = csv.DictWriter(csvfile,field_names)
        writer.writeheader()
        for c in course_offerings:
            c['term'] = term
            writer.writerow(c)

    # generate calendars and meetings
    yaml_path_str = f"SourceData/{term}/cal_rules.yaml"
    with open(yaml_path_str,'r') as rules_file:
        # NOTE(review): this loader can construct arbitrary Python objects; if
        # cal_rules.yaml could ever come from an untrusted source, switch to
        # yaml.safe_load. CLoader only exists when PyYAML was built with
        # libyaml, so fall back to the pure-Python Loader otherwise.
        cal_rules = yaml.load(rules_file,getattr(yaml,'CLoader',yaml.Loader))

    meetings=[]
    for c in course_offerings:
        out = generate_course_calendar(c,cal_rules)
        meetings += out['meetings']
        # cal = out['ical']
        # with term_folder_path.joinpath("Calendars").joinpath(str(c['crn'])+".ics").open("wb") as icalfile:
        #     icalfile.write(cal)

    meetings_csv_path_str = f"SourceData/{term}/course_meetings.csv"
    with Path(meetings_csv_path_str).open('w',newline='') as csvfile:
        # a term without any sessions still gets a header row with the usual columns
        meeting_keys = list(meetings[0].keys()) if meetings else ['crn','location','day','start','end']
        field_names = ['term']+ meeting_keys
        writer = csv.DictWriter(csvfile,field_names)
        writer.writeheader()
        for meeting in meetings:
            meeting['term'] = term
            writer.writerow(meeting)

Fall2014
Winter2015
Spring2015
Summer2015
Fall2015
Winter2016
Spring2016
Summer2016
Fall2016
Winter2017
Spring2017
SpringBreak2017
Summer2017
Fall2017
Winter2018
Spring2018
Summer2018
Fall2018
Winter2019
Spring2019
SummerI2019
SummerII2019
Summer2019
Fall2019
Winter2020
Spring2020
Summer2020
Fall2020
Winter2021
Spring2021
Summer2021
Fall2021
Winter2022
Spring2022
Summer2022
Fall2022
