In [1]:
# INM430 Week09 City Uni courses scraper - helper functions

def jsonifier(courses, toplevel):
    """Serialise a list of course dictionaries as an encoded JSON document.

    Parameters
    ----------
    courses : list of dict
        The course records to serialise (anything ``json.dumps`` accepts).
    toplevel : str
        Name of the single top-level key that wraps the list.

    Returns
    -------
    bytes
        The document ``{"<toplevel>":[...]}`` encoded to bytes, so the return
        type is consistent with xmlifier.
    """
    import json
    # Serialise the `courses` argument itself rather than any notebook-level
    # variable, and let json.dumps quote/escape the key so the document stays
    # valid even when `toplevel` contains quotes or backslashes.
    jsonified = "{" + json.dumps(toplevel) + ":" + json.dumps(courses) + "}"
    # to keep return type consistent with xmlifier
    return jsonified.encode()

def xmlifier(courses, toplevel):
    """Serialise a list of course dictionaries as a single XML document.

    Parameters
    ----------
    courses : list of dict
        One dictionary per course; every key becomes a child element of that
        course's ``<Course>`` element and the value becomes its text.
    toplevel : str
        Tag name of the XML document root.

    Returns
    -------
    bytes
        The UTF-8 encoded XML document. An empty ``courses`` list produces a
        document that contains only the (empty) root element.
    """
    from xml.etree.ElementTree import Element, SubElement, tostring
    # since we want to generate a single XML file, we start with the root
    # before running the loop.
    root = Element(toplevel)
    for course in courses:
        # one <Course> element per scraped record
        courseXML = SubElement(root, 'Course') # hardcoded, TODO abstract
        # Read from the `courses` argument (not a notebook-level variable) and
        # walk each record's own keys, so an empty list or a record with a
        # different key set cannot raise IndexError/KeyError.
        for key, value in course.items():
            detailXML = SubElement(courseXML, key)
            # ElementTree can only serialise text, so coerce other value types;
            # None is left alone and yields an empty element.
            if value is not None and not isinstance(value, str):
                value = str(value)
            detailXML.text = value
    return tostring(root, 'utf-8')
    
def getCityCourseCount(course):
    """Return the number of courses the City website lists for one level.

    The count is scraped from the ``course-finder__results__summary`` div of
    https://www.city.ac.uk/courses?level=<course>.

    Parameters
    ----------
    course : str
        Value for the ``level`` query-string attribute, e.g. "Undergraduate".

    Returns
    -------
    str
        The count as a string of digits (callers convert it with ``int``).
        "0" is returned when the summary is missing or does not start with a
        number.
    """
    from bs4 import BeautifulSoup
    import urllib.request as ur
    from urllib.parse import quote
    # Level names such as "Research Degrees" contain spaces, which are not
    # legal in a URL, so percent-encode the value before requesting it.
    urlToScrape = "https://www.city.ac.uk/courses?level=" + quote(course)
    # the context manager closes the HTTP response even if read() fails
    with ur.urlopen(urlToScrape) as response:
        r = response.read()
    soup = BeautifulSoup(r, "lxml")
    pageCount = soup.find_all('div', attrs={'class': 'course-finder__results__summary'})
    # default for the case where the page has no summary div at all
    mycount = "0"
    for courseListItem in pageCount:
        # we are expecting a span tag like this "<span>67 search results</span>"
        myspan = courseListItem.find('span')
        # once the tags are stripped we have "67 search results"; split it on
        # white space and the zeroth element is the count
        words = myspan.text.split() if myspan is not None else []
        # drop a thousands separator so int() accepts the value
        candidate = words[0].replace(",", "") if words else ""
        # anything that is not a plain number counts as "no results"
        mycount = candidate if candidate.isdigit() else "0"
    return mycount

def getCourses(level, count):
    """Scrape every course of one level from the City course finder.

    Result pages are requested from
    https://www.city.ac.uk/courses?level=<level>&p=<n> with n = 1, 11, 21, ...
    for as long as n <= count.

    Parameters
    ----------
    level : str
        Value for the ``level`` query-string attribute, e.g. "Undergraduate".
    count : int or str
        Number of courses reported for this level; converted with ``int``.

    Returns
    -------
    list of dict
        One dictionary per course with the keys "name", "description",
        "school", "code" and "level". A field that cannot be found on the page
        is stored as "", and items without a name are skipped.
    """
    from bs4 import BeautifulSoup
    import urllib.request as ur
    from urllib.parse import quote, urlencode
    import re # regular expressions
    i = 1
    step = 10
    total = int(count)  # convert once instead of on every loop test
    results = []
    # Let's use a regular expression wildcard so that the items of every level
    # (undergraduate, postgraduate, ...) are matched, not just one of them
    itemClass = re.compile('course-finder__results__item course-finder__results__item--.*')
    # A missing element makes find() return None (AttributeError on the next
    # attribute access) and a missing span raises IndexError. Only those two
    # mean "element is not there"; anything else is a real error and is no
    # longer swallowed.
    notThere = (AttributeError, IndexError)
    while i <= total:
        # we know that the general URL format will be
        # https://www.city.ac.uk/courses?level=(level)&p=(n)
        # where paging is requested in the querystring (text to the right of the
        # question mark) attribute p, where (n) represents the starting number.
        # Paging increments in counts of 10 i.e. 1, 11, 21 and so forth, so for
        # a count of 67 the last page requested starts at 61 and we stop there.
        # Levels such as "Research Degrees" contain spaces, so the values are
        # percent-encoded rather than concatenated into the URL as they are.
        query = urlencode({"level": level, "p": i}, quote_via=quote)
        url = "https://www.city.ac.uk/courses?" + query
        # the context manager closes the HTTP response even if read() fails
        with ur.urlopen(url) as response:
            r = response.read()
        soup = BeautifulSoup(r, "lxml")

        # get all the "div"s that are of the class we are interested in.
        # note that the following line will return us a collection of "div" sections
        courseList = soup.find_all('div', attrs={'class': itemClass})

        for courseListItem in courseList:

            # Here we cover every lookup with try/except constructs to ensure
            # that we are not failing when an element is not there.
            try:
                # first go inside the DIV to access the course name, note that it is under an <a> tag
                courseNameElement = courseListItem.find('div', attrs={'class': "col-sm-24 col-md-18 col-lg-20"})
                courseName = courseNameElement.find('a').text
            except notThere:
                courseName = ""

            try:
                # here, we access the course description, which sits directly under a DIV
                courseDescriptionElement = courseListItem.find('div', attrs={'class': "course-finder__results__item__description"})
                courseDescription = courseDescriptionElement.text
            except notThere:
                courseDescription = ""

            try:
                # now on to scraping the name of the school offering the course, this time under an <a> tag
                courseSchoolElement = courseListItem.find('div', attrs={'class': "course-finder__results__item__md course-finder__results__item__md--school"})
                courseSchool = courseSchoolElement.find('a').text
            except notThere:
                courseSchool = ""

            try:
                # the course code is read from the third <span> (index 2) of the code section
                # NOTE(review): an earlier version of this comment described two
                # spans and index [1]; confirm against the live markup that
                # index 2 is the span holding the code.
                courseCodeElement_1 = courseListItem.find('div', attrs={'class': "course-finder__results__item__md course-finder__results__item__md--code"})
                courseCodeElement_2 = courseCodeElement_1.find_all('span')[2]
                courseCode = courseCodeElement_2.text
            except notThere:
                courseCode = ""
            # let's populate a dictionary for now, and decide how to format it later
            if len(courseName) > 0:
                course = {
                "name" : courseName,
                "description" : courseDescription,
                "school" : courseSchool,
                "code" : courseCode,
                "level" : level,
                }
                results.append(course)
        # move on to the next page of results
        i += step
    return results

In [3]:
import datetime

# Items in the citycourselist list correspond to the city url query string "level" e.g.
# https://www.city.ac.uk/courses?level=Executive Education
citycourselist = ["Undergraduate", "Postgraduate", "Research Degrees", "Foundation", "CPD", "Short Courses", "Executive Education"]
runningtotal = 0  # sum of the per-level counts reported by the website
results = []      # every scraped course dictionary, across all levels

now = datetime.datetime.now()
print ("City Uni scraper started:", now.strftime("%Y-%m-%d %H:%M:%S"))

for course in citycourselist:
    # the count comes back as a string; getCourses uses it to decide how many
    # result pages to request
    mycount = getCityCourseCount(course)
    results.extend(getCourses(course, mycount))
    print("Number of", course, "courses =", mycount)
    runningtotal += int(mycount)
print("Running total =", runningtotal)

# Both helpers return bytes, hence the binary write mode. The with-blocks make
# sure each file is closed even when the write raises.
myxmlfile = xmlifier(results, "Courses")
with open('CityCourses.xml', 'wb') as f:
    f.write(myxmlfile)
print("Wrote city courses xml file to disk.")

myjsonfile = jsonifier(results, "Courses")
with open('CityCourses.json', 'wb') as f:
    f.write(myjsonfile)
print("Wrote city courses json file to disk.")

now = datetime.datetime.now()
print ("City Uni scraper finished:", now.strftime("%Y-%m-%d %H:%M:%S"))

City Uni scraper started: 2018-11-22 12:21:59
Number of Undergraduate courses = 67
Number of Postgraduate courses = 142
Number of Research Degrees courses = 24
Number of Foundation courses = 9
Number of CPD courses = 251
Number of Short Courses courses = 133
Number of Executive Education courses = 15
Running total = 641
Wrote city courses xml file to disk.
Wrote city courses json file to disk.
City Uni scraper finished: 2018-11-22 12:30:20
