In [535]:
import pandas as pd
import bs4
from bs4 import BeautifulSoup
import re
import datetime

# Read the saved transcript page. The context manager closes the file handle
# as soon as the text has been read instead of leaving it to the garbage collector.
with open('transcript.html') as transcriptFile:
    contents = transcriptFile.read()

# Parse the page; 'html' lets Beautiful Soup choose an installed HTML parser.
soup = BeautifulSoup(contents, 'html')

In [536]:
# Collect every <span> carrying the 'fieldOrangetextbold' class.
# find_all is the bs4 spelling; findAll is the deprecated BS3-era alias.
sectionHeaders = soup.find_all('span', attrs={'class': 'fieldOrangetextbold'})
# NOTE(review): the first matching span is skipped -- presumably it is not a
# term heading; confirm against the HTML.
quartersBlock = sectionHeaders[1:]

In [581]:
def parseRecord(element):
    """Return the stripped text of each child tag of ``element``, in order.

    Only ``bs4.element.Tag`` children are read. String-like nodes between the
    tags (whitespace, and NavigableString subclasses such as comments) are
    skipped so the result lines up one-to-one with the element's child tags.

    :param element: a parsed element exposing ``.children``.
    :return: list of ``str``, one entry per child tag.
    """
    return [
        child.text.strip()
        for child in element.children
        # isinstance on Tag (rather than an exact-type test against
        # NavigableString) also rejects Comment/CData nodes.
        if isinstance(child, bs4.element.Tag)
    ]

def quarterToDate(quarter, firstYear=11):
    """Convert a term label such as 'Fall Quarter 11-12' to a date and class standing.

    The label must start with ``<Season> Quarter <YY>-<YY>``. The first
    two-digit year is the academic year's starting year; Winter, Spring and
    Summer terms fall in the following calendar year.

    :param quarter: term label text.
    :param firstYear: two-digit starting year of the academic year that counts
        as 'Freshman'; each later academic year advances one standing.
    :return: ``(datetime.date, str)`` -- the first day of the term's nominal
        starting month, and the standing name.
    :raises ValueError: if ``quarter`` does not match the expected pattern.
    :raises KeyError: if the season is not Fall/Winter/Spring/Summer, or the
        academic year is outside the five standings counted from ``firstYear``.
    """
    # [a-zA-Z] -- the previous class [a-zA-z] also matched '[', '\\', ']', '^', '_' and '`'.
    matches = re.match(r'([a-zA-Z]+) Quarter ([0-9]+)-([0-9]+)', quarter)
    if matches is None:
        raise ValueError('Unrecognized quarter label: %r' % (quarter,))
    season, _year, _ = matches.groups()
    academicYear = int(_year)
    seasonMonth = {'Fall': 9, 'Winter': 1, 'Spring': 4, 'Summer': 7}
    yearAdjustment = 1 if season in ['Winter', 'Spring', 'Summer'] else 0
    # Arithmetic rather than string concatenation, so single-digit years
    # such as '08' become 2008 instead of 208.
    year = 2000 + academicYear + yearAdjustment
    date = datetime.date(year=year, month=seasonMonth[season], day=1)
    standings = ['Freshman', 'Sophomore', 'Pre-Junior', 'Junior', 'Senior']
    status = dict((firstYear + offset, name) for offset, name in enumerate(standings))
    return date, status[academicYear]

def gradeToPoints(grade):
    """Translate a transcript grade mark into grade points.

    The marks 'CR' and 'DCU' are scored as 4.0 and 'W' as 0.0; letter grades
    follow the scale below.

    :param grade: grade mark as it appears on the transcript, e.g. 'B+'.
    :return: grade points as a float.
    :raises KeyError: if the mark is not one of the recognised values.
    """
    # Non-letter marks are resolved before the letter scale is consulted.
    specialMarks = {'CR': 4.0, 'DCU': 4.0, 'W': 0.0}
    if grade in specialMarks:
        return specialMarks[grade]

    letterScale = dict([
        ("A+", 4.0), ("A", 4.0), ("A-", 3.67),
        ("B+", 3.33), ("B", 3.0), ("B-", 2.67),
        ("C+", 2.33), ("C", 2.0), ("C-", 1.67),
        ("D+", 1.33), ("D", 1.0),
        ("F", 0.0),
    ])
    return letterScale[grade]

def courseType(subject):
    """Classify a subject code into a course category.

    :param subject: subject code, e.g. 'CS' or 'MATH'.
    :return: 'la', 'stem', 'cs' or 'co-op' when the code is listed below;
        otherwise the subject code itself, unchanged.
    """
    mapping = {
        'la' : ['HNRS', 'UNIV', 'PHTO', 'MGMT', 'COM', 'HIST', 'ENGL', 'PHIL', 'COOP'],
        'stem' : ['ENGR', 'PHYS', 'MATH', 'ECE'],
        'cs' : ['CS', 'CI'],
        'co-op' : ['CO-OP']
    }
    # .items() exists on both Python 2 and 3; .iteritems() is Python 2 only.
    for (name, subjects) in mapping.items():
        if subject in subjects:
            return name
    return subject

In [587]:
# Running count of co-op records seen so far; used to label them 'CO-OP a/b'.
co_op_counter = 0
def getDataFromSectionHeader(header):
    """Parse the course rows that follow one term header into record dicts.

    Starting at the row that contains ``header``, the function scans forward
    through up to 16 siblings for the row whose first cell reads 'Subject',
    takes that row's cells as column names, and reads each following row until
    one containing 'Term Totals' (or the end of the siblings).

    Each record has the excluded columns removed and gains 'Quarter', 'Date'
    and 'Year'. It also gains 'Name', 'Grade Points' and 'Type' when the
    columns they are derived from are present.

    Side effect: increments the module-level ``co_op_counter`` once per co-op
    record, so the 'CO-OP a/b' labels depend on how many co-op records earlier
    calls have already seen.

    :param header: the term header element found in the parsed page.
    :return: list of dicts, one per course row of the term.
    """
    global co_op_counter
    stringNode = bs4.element.NavigableString
    # NOTE(review): assumes the row holding the header is two levels above the
    # header element -- confirm against the HTML.
    headerRow = header.parent.parent
    curRow = headerRow
    for _ in range(16):
        curRow = curRow.nextSibling
        # isinstance also skips NavigableString subclasses (e.g. comments).
        if isinstance(curRow, stringNode):
            continue
        # Index 1 presumably lands on the first cell because index 0 is the
        # whitespace string that precedes it -- TODO confirm.
        firstColContents = [child for child in curRow.children][1].text
        if firstColContents == 'Subject':
            break
    cols = [col.text for col in curRow.children if isinstance(col, bs4.element.Tag)]

    # Step to the first data row, skipping however many string nodes sit between rows.
    curData = curRow.nextSibling
    while isinstance(curData, stringNode):
        curData = curData.nextSibling
    records = []
    while curData is not None and not re.search('Term Totals', curData.text):
        records.append(parseRecord(curData))
        curData = curData.nextSibling
        while isinstance(curData, stringNode):
            curData = curData.nextSibling

    excluded = ['Start and End Dates', 'Quality Points', 'Level', 'R']
    data = [dict(zip(cols, record)) for record in records]
    quarter = headerRow.text.strip().replace("Term: ", '')
    for record in data:
        for col in excluded:
            record.pop(col, None)
        record['Quarter'] = quarter
        # Resolved per record so a term with no rows never triggers the lookup.
        record['Date'], record['Year'] = quarterToDate(quarter)
        if 'Course' in record and 'Subject' in record:
            if record.get('Title', '').startswith('Co-op Experience'):
                # Put a separator after 'Experience' in the co-op title text.
                record['Title'] = record['Title'].replace('Experience', 'Experience - ')
                # Floor division keeps the label integral on Python 2 and 3:
                # a advances every second co-op record, b alternates 1, 2.
                record['Name'] = 'CO-OP %d/%d' % (1 + co_op_counter // 2, 1 + co_op_counter % 2)
                record['Subject'] = 'CO-OP'
                co_op_counter += 1
            else:
                record['Name'] = record['Subject'] + ' ' + record['Course']
        if 'Grade' in record:
            record['Grade Points'] = gradeToPoints(record['Grade'])
        # Guarded like the other derived fields, so a row without a Subject
        # cell is left untyped instead of raising KeyError.
        if 'Subject' in record:
            record['Type'] = courseType(record['Subject'])
    return data

In [588]:
# Reset the running co-op counter so re-running this cell produces the same
# 'CO-OP a/b' labels instead of continuing from the previous run's count.
co_op_counter = 0
nestedData = [getDataFromSectionHeader(_header) for _header in quartersBlock]
# The final term is still in progress, so its records are left out.
flatData = [record for quarter in nestedData[:-1] for record in quarter]
transcript = pd.DataFrame(flatData).sort_values(['Date', 'Type'])

transcript

Unnamed: 0,Course,Credit Hours,Date,Grade,Grade Points,Name,Quarter,Subject,Title,Type,Year
0,121,1.000,2011-09-01,A,4.00,CS 121,Fall Quarter 11-12,CS,Computation Lab I,cs,Freshman
1,164,3.000,2011-09-01,A+,4.00,CS 164,Fall Quarter 11-12,CS,Intro to Computer Science,cs,Freshman
2,101,3.000,2011-09-01,A+,4.00,ENGL 101,Fall Quarter 11-12,ENGL,Expository Writing and Reading,la,Freshman
4,200,1.000,2011-09-01,A,4.00,HNRS 200,Fall Quarter 11-12,HNRS,Introduction to Honors Program,la,Freshman
6,101,3.000,2011-09-01,A-,3.67,PHIL 101,Fall Quarter 11-12,PHIL,Intro to Western Philosophy,la,Freshman
7,E101,1.000,2011-09-01,A,4.00,UNIV E101,Fall Quarter 11-12,UNIV,The Drexel Experience,la,Freshman
3,101,2.000,2011-09-01,A,4.00,ENGR 101,Fall Quarter 11-12,ENGR,Engineering Design Lab I,stem,Freshman
5,121,4.000,2011-09-01,A,4.00,MATH 121,Fall Quarter 11-12,MATH,Calculus I,stem,Freshman
9,122,1.000,2012-01-01,A+,4.00,CS 122,Winter Quarter 11-12,CS,Computation Lab II,cs,Freshman
10,175,3.000,2012-01-01,A+,4.00,CS 175,Winter Quarter 11-12,CS,Computer Programming I-IICourse Taken for Hono...,cs,Freshman


In [589]:
# Write the transcript table to disk as a JSON array with one object per row.
transcript.to_json(path_or_buf='transcript.json', orient='records')