In [50]:
# -*- coding: utf-8 -*- 
from collections import OrderedDict
import xmltodict
import re

input_filename = "courses1718.xml"
# Read raw bytes so the XML parser applies the encoding declared in the
# document itself rather than the platform's default text encoding.
with open(input_filename, "rb") as f:
    doc = xmltodict.parse(f.read())
print("loaded")

loaded


In [52]:
def collapse(the_dict, field1, field2):
    """Replace ``the_dict[field1]`` with the list found at ``the_dict[field1][field2]``.

    The dictionary is modified in place and nothing is returned.

    * If ``the_dict[field1][field2]`` is already a list it is used as is.
    * Any other value (including ``None``) is wrapped in a one-element list.
    * Nothing happens when ``the_dict`` is empty/``None``, when ``field1`` is
      missing, or when ``the_dict[field1]`` is not a dict containing
      ``field2`` (e.g. it is ``None``, a string, or an already-collapsed list),
      so calling the function twice on the same entry is harmless.

    Args:
        the_dict: mapping to modify; may be ``None``.
        field1: key of the wrapper entry in ``the_dict``.
        field2: key inside the wrapper whose value becomes the list.
    """
    if not the_dict or field1 not in the_dict:
        return

    wrapper = the_dict[field1]
    # Only a dict can hold the inner entry. Without this check a string value
    # would turn ``field2 in wrapper`` into a substring test and the lookup
    # below would raise TypeError.
    if not isinstance(wrapper, dict) or field2 not in wrapper:
        return

    inner = wrapper[field2]
    the_dict[field1] = inner if isinstance(inner, list) else [inner]
            
# The parsed XML nests each repeated element under a redundant parent key
# (sections -> section, tags -> tag, ...). Flatten those pairs into plain
# lists so the final object is easier to walk.
for course in doc["xml"]["courses"]["course"]:
    for parent_key, child_key in (
        ("sections", "section"),
        ("attributes", "attribute"),
        ("tags", "tag"),
        ("learningObjectives", "learningObjective"),
    ):
        collapse(course, parent_key, child_key)

    # Nothing further to flatten for a course without sections.
    if not ("sections" in course and course["sections"]):
        continue

    for section in course["sections"]:
        collapse(section, "attributes", "attribute")
        collapse(section, "schedules", "schedule")

        # Instructors live one level deeper, under each schedule.
        if not ("schedules" in section and section["schedules"]):
            continue
        for schedule in section["schedules"]:
            collapse(schedule, "instructors", "instructor")



In [53]:
def _hours_past_midnight(clock):
    """Convert a ``"H:MM:SS AM"`` / ``"H:MM:SS PM"`` string to hours past midnight.

    Args:
        clock: time string made of an ``h:m:s`` part, one space, and a
            meridiem marker; may be ``None`` or empty.

    Returns:
        Fractional hours as a float (seconds are ignored), or ``""`` when
        ``clock`` is missing so the column stays blank in the output.
    """
    if not clock:
        return ""
    num, seg = clock.split(" ")
    h, m, _s = num.split(":")
    # 12 AM is hour 0 and 12 PM is hour 12, so "12" counts as 0 before the
    # PM offset is added.
    hour = 0 if h == '12' else int(h)
    if seg == "PM":
        hour += 12
    return hour + float(m) / 60


def _stripped(value):
    """Strip surrounding whitespace; pass ``None``/empty values through unchanged."""
    return value.strip() if value else value


courses = doc["xml"]["courses"]["course"][:]
print('{} courses total'.format(len(courses)))

# Sections with zero enrollment are dropped; this counts how many *sections*
# (not courses) were skipped for that reason.
zeros = 0
parsed = []  # one flat record per (course, section, schedule)
for course in courses:
    # Courses without a description or without sections carry nothing usable.
    if not course.get('description') or not course.get('sections'):
        continue

    # ---- course-level fields (shared by every record of this course) ----
    curr = OrderedDict()
    curr['description'] = course['description'].strip()
    curr['title'] = course['title'].strip()
    # course['year'] is skipped: redundant with the section's term/termId
    curr['subject'] = course['subject'].strip()
    curr['course_number'] = course['code'].strip()
    # Keep only the digits of the code and truncate to the hundreds: 0, 1, 2, ...
    # (floor division keeps this an integer on both Python 2 and Python 3).
    curr['course_level'] = int(re.sub("[^0-9]", "", course['code'])) // 100
    # We care how many requirements are satisfied, not which ones.
    curr['num_reqs'] = len(course['gers'].split(",")) if course['gers'] else 0
    curr['repeatable'] = course['repeatable'].strip()  # strings "true"/"false"; change to bool?
    curr['grading'] = course['grading'].strip()
    curr['unitsMin'] = float(course['unitsMin'])
    curr['unitsMax'] = float(course['unitsMax'])

    # skip learningObjectives, too few have this info
    # skip attributes - more complete version under sections
    # skip tags, seems irrelevant

    # pull out finalExamFlag, academicCareer (UG vs GR vs ?) from admin
    # (courseId is taken from the sections instead)
    curr['level'] = ""
    curr['final'] = ""
    admin_info = course['administrativeInformation']
    if admin_info:
        curr['level'] = admin_info['academicCareer'].strip()
        curr['final'] = admin_info['finalExamFlag'].strip()

    # ---- section-level fields ----
    for section in course['sections']:
        # lots of 0 enroll for independent study, research etc. since list per prof
        # course may have not happened yet (2018)
        num_enrolled = int(section['numEnrolled'])
        if num_enrolled == 0:
            zeros += 1
            continue

        data = OrderedDict()
        # int() is enough on Python 2 as well: it promotes to long when needed.
        data['courseId'] = int(section['courseId'])
        data['sectionId'] = int(section['classId'])  # unique for mul sections within one courseId
        # termId is skipped: redundant with term
        data['term'] = section['term'].strip()
        data['component'] = section['component'].strip()  # ? not sure what this means
        # enrollStatus was thrown out: unclear whether it still affects
        # numEnrolled once enrollment has closed
        data['numEnrolled'] = num_enrolled
        data['maxEnrolled'] = int(section['maxEnrolled'])
        data['numWaitlist'] = int(section['numWaitlist'])
        data['maxWaitlist'] = int(section['maxWaitlist'])
        data['addConsent'] = _stripped(section['addConsent'])
        data['dropConsent'] = _stripped(section['dropConsent'])
        # section attributes were deliberately thrown out

        # ---- schedule-level fields ----
        # mul offerings within one section of one course; a section without
        # any schedule simply contributes no records.
        for schedule in section.get('schedules') or []:
            d2 = OrderedDict()
            # startTime / endTime become a measure of num hours past midnight
            d2['startTime'] = _hours_past_midnight(schedule.get('startTime'))
            d2['endTime'] = _hours_past_midnight(schedule.get('endTime'))
            d2['location'] = _stripped(schedule.get('location'))
            d2['days'] = schedule.get('days')
            if d2['days']:
                d2['days'] = ",".join(d2['days'].strip().split())  # days of week comma separated

            instructors = []
            for instructor in schedule.get('instructors') or []:
                name_parts = [
                    part
                    for part in (
                        instructor.get('firstName'),
                        instructor.get('middleName'),
                        instructor.get('lastName'),
                    )
                    if part
                ]
                if name_parts:
                    instructors.append(" ".join(name_parts).strip())
            # convert instructor popularity / success to some score?
            d2['instructors'] = ",".join(instructors).strip()  # instructor full names comma separated

            # Only keep schedules that carry at least one piece of information.
            if d2['startTime'] or d2['endTime'] or d2['location'] or d2['days'] or d2['instructors']:
                # list(...) keeps the concatenation valid where items() returns a view.
                parsed.append(OrderedDict(
                    list(curr.items()) + list(data.items()) + list(d2.items())
                ))
print('done')



14840 courses total
done


In [54]:
print(len(parsed))
print(zeros)
# Note: zeros + len(parsed) (with the zero-enrollment `continue` active) does not
# equal len(parsed) with that `continue` commented out, because some of the
# zero-enrollment cases would not have been appended anyway: all of d2 was empty.

# sanity check output: dump the first few records field by field
for record in parsed[:5]:
    for key, value in record.items():
        print("{}: {}".format(key, value))
    print("***")

6003
82876
description: This class introduces the basics of aeronautics and astronautics through applied physics, hands-on activities, and real world examples. The principles of fluid flow, flight, and propulsion for aircraft will be illustrated, including the creation of lift and drag, aerodynamic performance including takeoff, climb, range, and landing. The principles of orbits, maneuvers, space environment, and propulsion for spacecraft will be illustrated. Students will be exposed to the history and challenges of aeronautics and astronautics.
title: Introduction to Aeronautics and Astronautics
subject: AA
course_number: 100
course_level: 1
num_reqs: 3
repeatable: false
grading: Letter or Credit/No Credit
unitsMin: 3.0
unitsMax: 3.0
level: UG
final: Y
courseId: 103093
sectionId: 8528
term: 2017-2018 Autumn
component: LEC
numEnrolled: 38
maxEnrolled: 80
numWaitlist: 0
maxWaitlist: 0
addConsent: N
dropConsent: N
startTime: 13.5
endTime: 14.8333333333
location: 200-030
days: Monday,Wed

In [55]:
import unicodecsv as csv
# unicodecsv encodes every row itself and writes bytes, so the file has to be
# opened in binary mode ('w' breaks on Python 3 and can mangle line endings
# on Windows).
with open('1718new.tsv', 'wb') as tsvfile:
    writer = csv.writer(tsvfile, delimiter='\t', encoding='utf-8')
    if parsed:
        # The header row comes from the first record's keys.
        writer.writerow(list(parsed[0].keys()))
    for record in parsed:
        writer.writerow(list(record.values()))
# No explicit close(): the `with` block has already closed the file.
print("done writing")


done writing
