In [1]:
# -*- coding: utf-8 -*- 
from collections import OrderedDict
import xmltodict

# XML export to analyse; a relative path, so it is looked up in the working directory.
input_filename = "courses.xml"
# Heads-up: the whole file is read and parsed in one go, which takes a while.
with open(input_filename) as xml_file:
    doc = xmltodict.parse(xml_file.read())

In [40]:
def collapse(the_dict, field1, field2):
    """Replace ``the_dict[field1]`` by the list of items found under ``field2``.

    Parsed XML such as ``<sections><section/>...</sections>`` shows up as
    ``{"sections": {"section": ...}}``.  This removes the redundant inner
    level in place: ``the_dict[field1]`` becomes ``the_dict[field1][field2]``
    when that value is already a list, otherwise a one-element list wrapping it.

    Nothing is changed when ``the_dict`` is not a dict, when ``field1`` is
    missing or empty, or when its value is not a dict containing ``field2``.
    The dict checks are deliberate: ``in`` on a string is a substring test, so
    a text-only value that happens to contain ``field2`` must not be indexed.

    Args:
        the_dict: dict to modify in place (may be None).
        field1: key of the wrapper entry.
        field2: key of the repeated item inside the wrapper.

    Returns:
        None.
    """
    if not isinstance(the_dict, dict):
        return
    wrapper = the_dict.get(field1)
    # An empty wrapper, None, a string or an already collapsed list all stop here.
    if not isinstance(wrapper, dict) or field2 not in wrapper:
        return
    inner = wrapper[field2]
    the_dict[field1] = inner if isinstance(inner, list) else [inner]
            
# The parsed XML nests every repeated element under a redundant wrapper key.
# Flatten those wrappers at course, section and schedule level so the final
# object is easier to read through.
for course in doc["xml"]["courses"]["course"]:
    for wrapper_key, item_key in (
        ("sections", "section"),
        ("attributes", "attribute"),
        ("tags", "tag"),
        ("learningObjectives", "learningObjective"),
    ):
        collapse(course, wrapper_key, item_key)

    course_sections = course.get("sections")
    if not course_sections:
        # key absent, None or empty: nothing nested to flatten
        continue

    for section in course_sections:
        collapse(section, "attributes", "attribute")
        collapse(section, "schedules", "schedule")
        if "schedules" not in section or not section["schedules"]:
            continue
        for schedule in section["schedules"]:
            collapse(schedule, "instructors", "instructor")


In [115]:
# Shallow copy of the course list: the list object is new, the course dicts
# inside it are the same objects that live in `doc`.
a = doc["xml"]["courses"]["course"][:]
print("{} courses".format(len(a)))

# Top-level schema survey: check that every course has the same keys, in the
# same order, as the first course, and collect the value types seen per key.
# list(...) keeps the comparison order-sensitive on Python 3 as well, where
# .keys() returns a set-like view instead of a list.
h1_keys = list(a[0].keys())
h1_values = {k: set() for k in h1_keys}
for course in a:
    if list(course.keys()) != h1_keys:
        print("diff keys")
        break
    for k, v in course.items():
        h1_values[k].add(type(v))

# One line per top-level key with the set of types observed for it.
for k, v in h1_values.items():
    print("{}: {}".format(k, v))

14840 courses
administrativeInformation: set([<class 'collections.OrderedDict'>])
code: set([<type 'unicode'>])
description: set([<type 'unicode'>, <type 'NoneType'>])
title: set([<type 'unicode'>])
grading: set([<type 'unicode'>])
learningObjectives: set([<type 'list'>, <type 'NoneType'>])
gers: set([<type 'unicode'>, <type 'NoneType'>])
unitsMin: set([<type 'unicode'>])
repeatable: set([<type 'unicode'>])
year: set([<type 'unicode'>])
attributes: set([<type 'list'>, <type 'NoneType'>])
sections: set([<type 'list'>, <type 'NoneType'>])
unitsMax: set([<type 'unicode'>])
tags: set([<type 'list'>, <type 'NoneType'>])
subject: set([<type 'unicode'>])


In [10]:
# analyzing top level fields: dump every key/value pair of the courses at
# indices 75 through 80, with a "***" line after each course

for course in a[75:81]:
    for k, v in course.items():
        # print() call form: same output on Python 2, valid on Python 3
        print("{}: {}".format(k, v))
    print("***")

year: 2017-2018
subject: ACCT
code: 210
title: Financial Accounting
description: Financial accounting is the measurement of economic activity for decision-making. Financial statements are a key product of this measurement process and an important component of firms' financial reporting activities. The objective of this course is not to train you to become an accountant but rather to help you develop into an informed user of financial statement information. While financial statement users face a wide variety of decisions, they are often interested in understanding the implications of financial statement information for the future cash flows and earnings potential of a firm. We will focus on understanding the mapping between underlying economic events and financial statements, and on understanding how this mapping affects inferences about future profitability and liquidity. The following learning objectives will be emphasized: (1) familiarity with the transactions businesses engage in, (

In [13]:
# checking which of the sometimes null fields are worth keeping:
# for each field print "<courses where it is a list>, <all other courses>"
# conclusion: throw out learningObjectives, too many null

for field in ['learningObjectives', 'attributes', 'sections', 'tags']:
    list_count = sum(1 for course in a if isinstance(course[field], list))
    print("{}, {}".format(list_count, len(a) - list_count))

2393, 12447
14327, 513
8624, 6216
12839, 2001


In [19]:
# Lists/dicts at the top level have nested dicts within. For each such field,
# find the first course where the field is a list or a dict and print one
# sample: the key/value pairs followed by the key/value-type pairs.

for j in ['attributes', 'sections', 'tags', 'administrativeInformation']:
    for course in a:
        x = course[j]
        if isinstance(x, list):
            # list-valued field: only its first element is shown
            for k, v in x[0].items():
                print("{}: {}".format(k, v))
            print("-----")
            for k, v in x[0].items():
                print("{}: {}".format(k, type(v)))
            print("****")
            break
        # `dict` also matches OrderedDict (a dict subclass); newer xmltodict
        # releases return plain dicts, which an OrderedDict check would skip.
        if isinstance(x, dict):
            for k, v in x.items():
                print("{}: {}".format(k, v))
            for k, v in x.items():
                print("{}: {}".format(k, type(v)))
            print("****")
            break

name: NQTR
value: NOTTHIS
description: not given this year
catalogPrint: true
schedulePrint: false
-----
name: <type 'unicode'>
value: <type 'unicode'>
description: <type 'unicode'>
catalogPrint: <type 'unicode'>
schedulePrint: <type 'unicode'>
****
classId: 8528
term: 2017-2018 Autumn
termId: 1182
subject: AA
code: 100
units: 3
sectionNumber: 01
component: LEC
numEnrolled: 38
maxEnrolled: 80
numWaitlist: 0
maxWaitlist: 0
enrollStatus: Open
addConsent: N
dropConsent: N
courseId: 103093
schedules: [OrderedDict([(u'startDate', u'Sep 25, 2017'), (u'endDate', u'Dec 8, 2017'), (u'startTime', u'1:30:00 PM'), (u'endTime', u'2:50:00 PM'), (u'location', u'200-030'), (u'days', u'Monday\n\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\tWednesday'), (u'instructors', [OrderedDict([(u'name', u'Kroo, I.'), (u'firstName', u'Ilan'), (u'middleName', u'M'), (u'lastName', u'Kroo'), (u'sunet', u'kroo'), (u'role', u'PI')]), OrderedDict([(u'name', u'Hockman, B.'), (u'firstName', u'Ben'), (u'middleName', None), 

In [30]:
# admin analysis
# keep finalExamFlag and academicCareer

# Expected keys of administrativeInformation, in document order.
admin_keys = [u'courseId', u'effectiveStatus', u'offerNumber', u'academicGroup', u'academicOrganization', u'academicCareer', u'finalExamFlag', u'catalogPrint', u'schedulePrint', u'maxUnitsRepeat', u'maxTimesRepeat']

# Courses at indices 70 through 100: print index, title and admin fields, and
# stop with "!" at the first course whose admin keys differ from admin_keys.
for i, course in enumerate(a[70:101], 70):
    print(i)
    print(course['title'])
    x = course['administrativeInformation']
    if x:  # if dict, then always a dict with keys admin_keys
        # list(...) is required on Python 3, where a keys view never compares
        # equal to a list; on Python 2 it is a plain copy of the key list.
        if list(x.keys()) != admin_keys:
            print("!")
            break
        for k, v in x.items():
            print("{}: {}".format(k, v))
        print("****")

70
Seminar in Guidance, Navigation, and Control
courseId: 103253
effectiveStatus: A
offerNumber: 1
academicGroup: ENGR
academicOrganization: AEROASTRO
academicCareer: GR
finalExamFlag: N
catalogPrint: Y
schedulePrint: Y
maxUnitsRepeat: 999
maxTimesRepeat: 99
****
71
Engineer Thesis
courseId: 103259
effectiveStatus: A
offerNumber: 1
academicGroup: ENGR
academicOrganization: AEROASTRO
academicCareer: GR
finalExamFlag: N
catalogPrint: Y
schedulePrint: Y
maxUnitsRepeat: 999
maxTimesRepeat: 99
****
72
Ph.D. Dissertation
courseId: 103260
effectiveStatus: A
offerNumber: 1
academicGroup: ENGR
academicOrganization: AEROASTRO
academicCareer: GR
finalExamFlag: N
catalogPrint: Y
schedulePrint: Y
maxUnitsRepeat: 999
maxTimesRepeat: 99
****
73
TGR Engineer Thesis
courseId: 103262
effectiveStatus: A
offerNumber: 1
academicGroup: ENGR
academicOrganization: AEROASTRO
academicCareer: GR
finalExamFlag: N
catalogPrint: Y
schedulePrint: Y
maxUnitsRepeat: 999
maxTimesRepeat: 99
****
74
TGR Ph.D. Dissertatio

In [38]:
# attributes analysis

# author's note: a non-empty `attributes` value is a list of dicts with these keys
dict_type_keys = [u'name', u'value', u'description', u'catalogPrint', u'schedulePrint']

# Courses at indices 70 through 100: for every course with attributes print a
# "******" line, preceded by the title and the first two attributes when the
# course has more than one.
for course in a[70:101]:
    attrs = course['attributes']
    if not attrs:
        continue
    if not isinstance(attrs, list):
        print('shouldnt be another type')
        break
    if len(attrs) > 1:
        print(course['title'])
        print(attrs[0])
        print(attrs[1])
    print("******")


******
Engineer Thesis
OrderedDict([(u'name', u'NQTR'), (u'value', u'SPR'), (u'description', u'Spring'), (u'catalogPrint', u'true'), (u'schedulePrint', u'false')])
OrderedDict([(u'name', u'NQTR'), (u'value', u'WIN'), (u'description', u'Winter'), (u'catalogPrint', u'true'), (u'schedulePrint', u'false')])
******
Ph.D. Dissertation
OrderedDict([(u'name', u'NQTR'), (u'value', u'SPR'), (u'description', u'Spring'), (u'catalogPrint', u'true'), (u'schedulePrint', u'false')])
OrderedDict([(u'name', u'NQTR'), (u'value', u'WIN'), (u'description', u'Winter'), (u'catalogPrint', u'true'), (u'schedulePrint', u'false')])
******
TGR Engineer Thesis
OrderedDict([(u'name', u'NQTR'), (u'value', u'SPR'), (u'description', u'Spring'), (u'catalogPrint', u'true'), (u'schedulePrint', u'false')])
OrderedDict([(u'name', u'NQTR'), (u'value', u'WIN'), (u'description', u'Winter'), (u'catalogPrint', u'true'), (u'schedulePrint', u'false')])
******
TGR Ph.D. Dissertation
OrderedDict([(u'name', u'NQTR'), (u'value', u'SP

In [141]:
# sections analysis

# Keys of a section dict (len 23).
dict_type_keys = [
    u'classId', u'term', u'termId', u'subject', u'code', u'units', u'sectionNumber',
    u'component', u'numEnrolled', u'maxEnrolled', u'numWaitlist', u'maxWaitlist',
    u'enrollStatus', u'addConsent', u'dropConsent', u'courseId', u'schedules',
    u'currentClassSize', u'maxClassSize', u'currentWaitlistSize', u'maxWaitlistSize',
    u'notes', u'attributes',
]

# One empty set per section key, positionally aligned with dict_type_keys;
# meant to receive the value types seen for that key.
dict_type_values = [set() for _key in dict_type_keys]
# Recorded result of an earlier run:
# [set([<type 'unicode'>]), set([<type 'unicode'>]), set([<type 'unicode'>]), set([<type 'unicode'>]),
#  set([<type 'unicode'>]), set([<type 'unicode'>, <type 'NoneType'>]), set([<type 'unicode'>]),
#  set([<type 'unicode'>]), set([<type 'unicode'>]), set([<type 'unicode'>]), set([<type 'unicode'>]),
#  set([<type 'unicode'>]), set([<type 'unicode'>]), set([<type 'unicode'>]), set([<type 'unicode'>]),
#  set([<type 'unicode'>]), set([<type 'list'>]), set([<type 'unicode'>]), set([<type 'unicode'>]),
#  set([<type 'unicode'>]), set([<type 'unicode'>]), set([<type 'unicode'>, <type 'NoneType'>]),
#  set([<type 'list'>, <type 'NoneType'>])]

# Keys of a schedule dict (len 7), with a matching list of type accumulators.
schedule_type_keys = [
    u'startDate', u'endDate', u'startTime', u'endTime',
    u'location', u'days', u'instructors',
]
schedule_type_values = [set() for _key in schedule_type_keys]
# Recorded result of an earlier run:
# startDate: set([<type 'unicode'>, <type 'NoneType'>])
# endDate: set([<type 'unicode'>, <type 'NoneType'>])
# startTime: set([<type 'unicode'>, <type 'NoneType'>])
# endTime: set([<type 'unicode'>, <type 'NoneType'>])
# location: set([<type 'unicode'>, <type 'NoneType'>])
# days: set([<type 'unicode'>, <type 'NoneType'>])
# instructors: set([<type 'list'>, <type 'NoneType'>])

# Courses at indices 85 through 95: for every section print the course title
# and, when present, the comma-joined descriptions of the section attributes.
for course in a[85:96]:
    x = course['sections']  # author's note: list of dicts with keys dict_type_keys
    if not x:
        continue
    if not isinstance(x, list):
        print('shouldnt be another type')
        break

    # ------------------------------------------------------------------
    # Archived exploration snippets, kept commented out. They were written
    # to run at this point of the loop body, with `x` being the section
    # list of the current course; re-indent them before reviving.
    # ------------------------------------------------------------------

            #get value types
#             for m in range(len(x)):
#                 for k, v in x[m].items():
#                     n = dict_type_keys.index(k)
#                     dict_type_values[n].add(type(v))


            #shows attributes more complete than notes!
#             for m in range(len(x)):
#                 if x[m]['notes']:
#                     if not x[m]['attributes']:
#                         print("list{}".format(1))
#                         break
#                 if x[m]['attributes']:
#                     if not x[m]['notes']:
#                         print("list{}".format(2))
#                         break

            #viewing example fields
#             print(len(x))
#             for k, v in x[0].items():
#                 print "{}: {}".format(k, v)
#             print("------")
#             for k, v in x[1].items():
#                 print "{}: {}".format(k, v)  

            #parsing schedules
#             for m, section in enumerate(x): 
#                 schedules = section['schedules'] # list
#                 for n, schedule in enumerate(schedules): 
#                     if schedule.keys()!= schedule_type_keys:
#                         print("!")
#                     for k, v in schedule.items():
#                         j = schedule_type_keys.index(k)
#                         schedule_type_values[j].add(type(v))

#                     startTime = schedule.get('startTime')
#                     endTime = schedule.get('endTime')
#                     location = schedule.get('location')
#                     days = schedule.get('days')
#                     if days:
#                         days = ",".join((schedule.get('days')).split())
#                     instructors_list = schedule.get('instructors')
#                     if instructors_list:
#                         instructors = []
#                         for m, instructor in enumerate(instructors_list):
#                             name = ""
#                             first = instructor.get('firstName')
#                             name = name + first if first else name
#                             mid = instructor.get('middleName')
#                             name = name + " " + mid if mid else name
#                             last = instructor.get('lastName')
#                             name = name + " " + last if last else name
#                             if name:
#                                 instructors.append(name)

#                     if startTime or endTime or location or days or instructors:
#                         for k, v in schedule.items():
#                             print "{}: {}".format(k, v) 
#                         print(startTime)
#                         print(endTime)
#                         print(location)
#                         print(days)
#                         print(",".join(instructors))

            #parsing attributes
#             for m, section in enumerate(x): 
#                 if section['attributes'] and len(attributes) > 1:
#                     print(course['title'])
#                     for n, attribute in enumerate(attributes): 
#                         if n > 2:
#                             break
#                         for k, v in attribute.items():
#                             print "{}: {}".format(k, v) 
#                         print("-----")
#                 print("****")
#                 break

    # Active step: one title line per section, followed by the non-empty
    # attribute descriptions of that section joined with commas.
    for section in x:
        attributes = section['attributes']  # list, or None when the section has none
        print(course['title'])
        if not attributes:
            continue
        agg = [attribute['description'] for attribute in attributes if attribute['description']]
        if agg:
            print(",".join(agg))
    print("-------")


# for i in range(len(dict_type_keys)):
#     print("{}: {}".format(dict_type_keys[i], dict_type_values[i]))
# print("--------------")

# for i in range(len(schedule_type_keys)):
#     print("{}: {}".format(schedule_type_keys[i], schedule_type_values[i]))
# print("--------------")
            

Analysis and Valuation for Event-Driven Investing
No Exam,Participation 20% Projects/Papers 60% Other 20%,Enrolled + waitlisted must attend or be dropped.,P/F Allowed with Permission of Instructor,Seminar,Open to MBA and MSx students,Capacity limited to 30 Students,Non-GSB students: See gsb.stanford.edu/NonGSBReg.
-------
Analysis and Valuation of Emerging Market Firms
No Exam,GSB PhD students need instructor permission,Case and Problem Discussion,1 Group Project/Paper,Participation 25% Project/Paper 75%,Mandatory attendance. Absences impact grade.,Open to MBA2 and MSx students,P/F Allowed,Non-GSB students: See gsb.stanford.edu/NonGSBReg.
-------
Board Governance
Participation 100%,No Exam,Open to MBA2 students,GSB PhD students need instructor permission,Capacity limited to 40 students,Enrolled + waitlisted must attend or be dropped.,Seminar,P/F Allowed,Not Open to Non-GSB Students
Board Governance
Participation 100%,No Exam,Open to MBA2 students,GSB PhD students need instructor permis

In [150]:
# tags analysis: for the courses at indices 700 through 800 that have tags,
# print the title, every tag, and a "******" separator

for course in a[700:801]:
    x = course['tags']
    if x:
        if isinstance(x, list):
            print(course['title'])
            # iterate the tags directly; the former index loop reused `i`
            # and thereby overwrote the course index of the outer loop
            for tag in x:
                print(tag)
            print("******")
        else:
            print('shouldnt be another type')
            break




World Heritage in Global Conflict (ANTHRO 247B, ARCHLGY 147B)
OrderedDict([(u'organization', u'ISSTUD'), (u'name', u'ALCW12')])
OrderedDict([(u'organization', u'IR'), (u'name', u'ihc')])
OrderedDict([(u'organization', u'EDUC'), (u'name', u'alluniversityabove100')])
OrderedDict([(u'organization', u'EDUC'), (u'name', u'alluniversity')])
******
Health, Politics, and Culture of Modern China (ANTHRO 248)
OrderedDict([(u'organization', u'CIGH'), (u'name', u'globalhealth')])
OrderedDict([(u'organization', u'EDUC'), (u'name', u'alluniversityabove100')])
OrderedDict([(u'organization', u'EDUC'), (u'name', u'alluniversity')])
******
South Asia: History, People, Politics (ANTHRO 249)
OrderedDict([(u'organization', u'ISSTUD'), (u'name', u'ALCW12')])
OrderedDict([(u'organization', u'URBANST'), (u'name', u'comphist')])
OrderedDict([(u'organization', u'IR'), (u'name', u'east_soasia')])
OrderedDict([(u'organization', u'SGS'), (u'name', u'southasian')])
OrderedDict([(u'organization', u'EDUC'), (u'name',