In [74]:
# -*- coding: utf-8 -*- 
from collections import OrderedDict
import xmltodict

# Path of the XML file this notebook parses.
input_filename = "courses1617.xml"

# Read the file as raw bytes: the XML parser then decodes it using the encoding
# declared in the document itself, instead of whatever default text encoding
# the platform would apply in text mode.
with open(input_filename, "rb") as f:
    doc = xmltodict.parse(f.read())

In [112]:
def collapse(the_dict, field1, field2):
    """Flatten a redundant wrapper entry of ``the_dict`` in place.

    When ``the_dict[field1]`` is a dict that contains ``field2``, replace
    ``the_dict[field1]`` with a list of the ``field2`` entries: the value
    itself if it is already a list, otherwise a one-element list holding it.

    Nothing is modified when ``the_dict`` is not a dict (e.g. ``None``), when
    ``field1`` is missing or empty, or when ``the_dict[field1]`` is not a dict
    (a plain string, or a list produced by an earlier call). This makes the
    function safe to call repeatedly on the same data.

    Args:
        the_dict: dict to modify in place; may be ``None``.
        field1: key of the wrapper entry in ``the_dict``.
        field2: key of the child entry inside the wrapper.

    Returns:
        None. ``the_dict`` is updated in place.
    """
    if not isinstance(the_dict, dict):
        return

    wrapper = the_dict.get(field1)
    # Only a dict wrapper can be indexed by field2. Strings and lists also
    # support ``in`` but would raise TypeError on ``wrapper[field2]``.
    if not isinstance(wrapper, dict) or field2 not in wrapper:
        return

    children = wrapper[field2]
    the_dict[field1] = children if isinstance(children, list) else [children]
            
# The parsed XML nests each repeated element inside a wrapper element
# (wrapper -> child). Flatten every such wrapper into a plain list, in place,
# so the resulting structure is easier to walk.
for course in doc["xml"]["courses"]["course"]:
    for wrapper, child in (
        ("sections", "section"),
        ("attributes", "attribute"),
        ("tags", "tag"),
        ("learningObjectives", "learningObjective"),
    ):
        collapse(course, wrapper, child)

    # Courses without any sections have nothing further to flatten.
    if "sections" not in course or not course["sections"]:
        continue

    for section in course["sections"]:
        collapse(section, "attributes", "attribute")
        collapse(section, "schedules", "schedule")

        if "schedules" not in section or not section["schedules"]:
            continue

        for schedule in section["schedules"]:
            collapse(schedule, "instructors", "instructor")



In [121]:
# Flatten the nested course -> section -> schedule structure into one flat
# record per schedule. Each record carries the course-level fields, then the
# section-level fields, then the schedule-level fields, in that order.

# Shallow copy of the course list; the course dicts are still shared with doc.
courses = doc["xml"]["courses"]["course"][:]
print('{} courses total'.format(len(courses)))

zeros = 0    # number of sections whose numEnrolled is 0
parsed = []  # flat output records, one per schedule that has any data

for course in courses:
    # Courses without a description or without sections are skipped entirely.
    if not course['description'] or not course['sections']:
        continue

    # ---- course-level fields ------------------------------------------
    curr = OrderedDict()
    # curr['year'] = course['year'] # redundant with sections term/termId
    curr['subject'] = course['subject']
    curr['code'] = course['code']
    curr['title'] = course['title']
    curr['description'] = course['description']
    # Empty doesn't necessarily mean missing, just not fulfilling reqs?
    curr['reqs'] = course['gers'] or ""  # clean?
    curr['repeatable'] = course['repeatable']  # change to bool? strings true/false
    curr['grading'] = course['grading']  # clean?
    curr['unitsMin'] = float(course['unitsMin'])
    curr['unitsMax'] = float(course['unitsMax'])

    # skip learningObjectives, too few have this info
    # skip attributes - more complete version under sections
    # skip tags, seems irrelevant

    # Pull finalExamFlag and academicCareer (UG vs GR vs ?) out of the admin
    # info (courseId is taken from the sections instead).
    admin_info = course['administrativeInformation']
    curr['level'] = admin_info['academicCareer'] if admin_info else ""
    curr['final'] = admin_info['finalExamFlag'] if admin_info else ""  # change to bool? strings N/Y

    # ---- section-level fields -----------------------------------------
    for section in course['sections']:
        # Lots of 0 enroll for independent study, research etc. since listed
        # per prof; the course may also not have happened yet (2018).
        # Such sections are counted here but deliberately NOT skipped.
        num_enrolled = int(section['numEnrolled'])
        if num_enrolled == 0:
            zeros += 1

        data = OrderedDict()
        # int() instead of the Python-2-only long(): on Python 2 int() promotes
        # to long automatically when the value is too large, and on Python 3
        # long does not exist.
        data['id'] = int(section['courseId'])
        data['classId'] = int(section['classId'])  # unique for mul sections within one courseId
        data['termId'] = int(section['termId'])
        data['term'] = section['term']
        data['component'] = section['component']  # ? not sure what this means
        # Does this affect numEnrolled - or irrelevant since it may now reflect
        # closed after enrollment finished?
        data['enrollStatus'] = section['enrollStatus']
        data['numEnrolled'] = num_enrolled
        data['maxEnrolled'] = int(section['maxEnrolled'])
        data['numWaitlist'] = int(section['numWaitlist'])
        data['maxWaitlist'] = int(section['maxWaitlist'])
        data['addConsent'] = section['addConsent']  # convert to bool? strings N/Y
        data['dropConsent'] = section['dropConsent']  # convert to bool? strings N/Y

        # Attribute descriptions, semicolon separated because some of the
        # descriptions contain commas. Empty string when there are none.
        data['attributes'] = ";".join(
            attribute['description']
            for attribute in (section['attributes'] or [])
            if attribute['description']
        )

        # ---- schedule-level fields (mul offerings within one section) ----
        # A section with missing/empty schedules simply contributes no records.
        for schedule in section.get('schedules') or []:
            d2 = OrderedDict()
            d2['startTime'] = schedule.get('startTime')
            d2['endTime'] = schedule.get('endTime')
            d2['location'] = schedule.get('location')
            days = schedule.get('days')
            # Days of week, comma separated (source is whitespace separated).
            d2['days'] = ",".join(days.split()) if days else days

            # Instructor full names, comma separated. Each name is built only
            # from the parts that are present, so a missing first name no
            # longer leaves a leading space.
            instructors = []
            for instructor in schedule.get('instructors') or []:
                name_parts = (
                    instructor.get('firstName'),
                    instructor.get('middleName'),
                    instructor.get('lastName'),
                )
                name = " ".join(part for part in name_parts if part)
                if name:
                    instructors.append(name)
            d2['instructors'] = ",".join(instructors)

            # Keep the record only if the schedule carries any information.
            if any(d2.values()):
                # Merge via update() rather than items() + items(), which only
                # works on Python 2 where items() returns lists.
                record = OrderedDict(curr)
                record.update(data)
                record.update(d2)
                parsed.append(record)

    # TODO: come back for attributes within sections (list/none)
    # TODO: come back for tag stuff

print('done')



15114 courses total
done


In [122]:
# First line: number of flattened records; second line: number of sections
# with zero enrollment.
# Note: (zeros + number of records when zero-enrollment sections are skipped)
# does not equal (number of records when they are kept), because some
# zero-enrollment sections also produce no record at all when every schedule
# field is empty.
print("{}\n{}".format(len(parsed), zeros))

#sanity check output
# for i, item in enumerate(parsed):
#     if i < 10:
#         for k, v in item.items():
#             print("{}: {}".format(k, v))
#         print("***")

87692
71549


In [123]:
import unicodecsv as csv

# Destination of the tab-separated export.
output_filename = '1617.tsv'

# unicodecsv encodes each row to bytes itself, so the file has to be opened in
# binary mode ('wb'); text mode fails on Python 3 and mangles line endings on
# Windows. The with-block closes the file, so no explicit close() is needed.
with open(output_filename, 'wb') as tsvfile:
    writer = csv.writer(tsvfile, delimiter='\t', encoding='utf-8')
    if parsed:
        # Header row: column names are the keys of the first record.
        writer.writerow(list(parsed[0].keys()))
    for record in parsed:
        writer.writerow(list(record.values()))
print("done writing")


done writing
