### Import Libraries

In [4]:
import os
import pickle
import requests

from bs4 import BeautifulSoup

### Make a Request

In [6]:
def make_request(url="https://economics.gmu.edu/course_sections",
                 headers={'User-Agent': 'Chrome/74.0.3729.169'},
                 params={'term': '201970'},
                 print_info=False,
                 timeout=30):
    '''
    Submit a GET request using the Requests library and return the response.

    Default values point at the George Mason Economics Department course listing.

    Parameters:
        url: Address to request.
        headers: HTTP headers sent with the request. The default dictionary is shared between calls; it is
            only read here, never modified.
        params: Query-string parameters, either a dictionary or an already-encoded string. The default
            dictionary is shared and read-only in the same way as headers.
        print_info: When truthy, print the final URL and the HTTP status code.
        timeout: Seconds to wait for the server before giving up. Pass None to wait indefinitely.

    Returns:
        The requests Response object.

    Raises:
        SystemExit: If requests.get raises a RequestException (the original exception is chained as the cause).
    '''
    try:
        #Without a timeout a stalled server would block the whole scrape forever.
        response = requests.get(url, headers=headers, params=params, timeout=timeout)
    except requests.exceptions.RequestException as e:
        raise SystemExit(e) from e

    if print_info:
        print(response.url, response.status_code)

    return response

### Build List of Semesters Available

In [7]:
def build_semester_list():
    '''
    From the George Mason website, return all semesters listed in the term drop-down menu.

    Returns:
        A list of [value, text] pairs, one per drop-down option, where value is the option's "value"
        attribute (used as the "term" URL parameter) and text is the option's displayed label.

    Raises:
        IndexError: If the page has no <select id="term-select"> element.
    '''
    #Make default request.
    response = make_request()

    #Parse resulting HTML data. The parser is named explicitly so the result does not depend on which
    #optional parser libraries happen to be installed.
    soup = BeautifulSoup(response.text, 'html.parser')

    #Semesters available are those available through the drop-down menu. Search for the drop-down menu.
    term_menu = soup.find('select', id='term-select')

    if term_menu is None:
        raise IndexError('No <select id="term-select"> drop-down menu found at {}'.format(response.url))

    #For each option in the drop-down menu, keep the code for the website url and its text name.
    #[1:] removes the default menu title "Choose a term".
    return [[option['value'], option.text] for option in term_menu.find_all('option')[1:]]

### Download GMU Website HTML

In [21]:
def download_html(semesters, directory, file_name):
    '''
    Download the course listing HTML for each semester and save the results as a Pickle file.

    Parameters:
        semesters: Iterable of (value, text) pairs. The value is sent as the "term" parameter of make_request;
            the text (e.g. "Fall 2019") becomes the dictionary key after spaces are replaced by underscores.
        directory: Directory in which the Pickle file is written. It is created if missing; an empty string
            means the current working directory.
        file_name: Name of the Pickle file inside directory.

    Returns:
        A dictionary keyed by semester name. Each entry is a dictionary with the keys 'value' (the term code),
        'url' (the final request URL) and 'response' (the response returned by make_request).
    '''
    semester_html_data = {}

    #Download GMU Website HTML for each semester, store in a dictionary.
    for sem_value, sem_string in semesters:
        response = make_request(params={'term': sem_value})

        #Replace "Fall 2019" with "Fall_2019", it helps later if one wants to use it as a file name.
        sem_key = sem_string.replace(' ', '_')
        semester_html_data[sem_key] = {
            'value': sem_value,
            'url': response.url,
            'response': response,
        }

    #Create directory if it does not exist. exist_ok avoids the check-then-create race, and the guard
    #skips os.makedirs('') which would raise when the current directory is requested.
    if directory:
        os.makedirs(directory, exist_ok=True)

    #Write dictionary as Pickle file.
    path = os.path.join(directory, file_name)

    with open(path, 'wb') as file:
        pickle.dump(semester_html_data, file)

    return semester_html_data

### Download Syllabi

In [9]:
def download_syllabi(directory, file, url, params):
    '''
    Download the syllabus at the url and save it to the user-defined directory.

    Parameters:
        directory: Directory in which the file is saved. It is created if missing; an empty string means the
            current working directory.
        file: Name of the file written inside directory.
        url: Address of the syllabus, passed to make_request.
        params: Query-string parameters for the request, passed to make_request.
    '''
    #Assign path.
    path = os.path.join(directory, file)

    #Create the directory if it does not exist.
    if directory:
        os.makedirs(directory, exist_ok=True)

    #Download PDFs.
    response_pdf = make_request(url=url, params=params)

    #Write to drive. iter_content defaults to one byte per chunk, so ask for larger chunks to avoid
    #one write call per byte.
    with open(path, "wb") as pdf:
        for chunk in response_pdf.iter_content(chunk_size=8192):
            pdf.write(chunk)

### Compile Course Syllabi

In [60]:
def edit_text(text):
    '''
    Clean up a piece of text, such as a course title taken from a link or header section of the HTML.

    Surrounding whitespace is stripped, every remaining space becomes an underscore, and the characters
    : , ' & . are removed.
    '''
    #One translation table: map the space to an underscore and drop the punctuation characters.
    cleanup_table = str.maketrans(' ', '_', ":,'&.")

    return text.strip().translate(cleanup_table)

In [64]:
def course_title_parser(html):
    '''
    Take a course block and return its cleaned-up course title.

    The title is read from the third line of the block's header text. When that title is
    "Special Topics in Economics", the title is instead taken from the text after the first colon in the
    block's links; if several links contain a colon, the last one wins.
    '''
    #The header text is split into lines; the title sits on the line at index 2.
    header_lines = html.find('header').text.split('\n')
    course_title = edit_text(header_lines[2])

    #Any ordinary course keeps the title parsed from the header.
    if course_title != 'Special_Topics_in_Economics':
        return course_title

    #Special topics courses carry their real title in the section links, formatted as "prefix: title".
    for anchor in html.find_all('a'):
        pieces = anchor.text.split(':')
        if len(pieces) > 1:
            course_title = edit_text(pieces[1])

    return course_title

In [124]:
def course_syllabi_parser(html):
    '''
    Parse the HTML of a course block for its syllabus links.

    Only links whose text is exactly "Section Syllabus" are considered.

    Returns:
        A list with one [file name, url, params] entry per syllabus link. The url is the link's href without
        its query string, the params are everything after the first "?" (an empty string when the href has
        no query string), and the file name is the last path component of the url.
    '''
    course_syllabus_link = []

    for link in html.find_all('a'):
        if link.text != "Section Syllabus":
            continue

        #partition never fails: an href without "?" simply yields an empty params string.
        address, _, pdf_params = link['href'].partition('?')

        #Protocol-relative hrefs ("//host/path") need a scheme; anything else is kept as written.
        if address.startswith('//'):
            pdf_url = 'https:{}'.format(address)
        else:
            pdf_url = address

        pdf_file_name = os.path.basename(pdf_url)
        course_syllabus_link.append([pdf_file_name, pdf_url, pdf_params])

    return course_syllabus_link

In [119]:
def compile_course_syllabi(html_dictionary):
    '''
    Compile course titles, syllabi url details, and file names.

    Parameters:
        html_dictionary: Dictionary keyed by semester name. Each value must hold the downloaded page under
            the key 'response', as an object exposing the HTML through a .text attribute.

    Returns:
        A list with one [semester, course title, syllabi] entry per course block, where syllabi is the list
        returned by course_syllabi_parser for that block.
    '''
    gmu_econ_syllabi = []

    #For each semester and course block, find each course title and course syllabus for each course section.
    #Iterate the argument itself, not a module-level variable, so the function works on any dictionary.
    for semester, course_block in html_dictionary.items():
        #Name the parser explicitly so parsing does not depend on which parser libraries are installed.
        block_soup = BeautifulSoup(course_block['response'].text, 'html.parser')
        sem_course_content = block_soup.find_all('div', class_='course content')

        for course in sem_course_content:
            #Parse the course content for the course title.
            course_title = course_title_parser(course)

            #Parse course content for the url to the course syllabus.
            course_syllabi = course_syllabi_parser(course)

            #Append course title and syllabus details to list.
            gmu_econ_syllabi.append([semester, course_title, course_syllabi])

    return gmu_econ_syllabi

### Driver

In [126]:
#Collect every semester offered in the drop-down menu.
semester_list = build_semester_list()

#Download each semester's course listing and keep a pickled copy under html-data.
gmu_html_dict = download_html(
    semesters=semester_list,
    directory='html-data',
    file_name='gmu-semester-html',
)

#Extract the course titles and syllabus links from the downloaded pages.
gmu_syllabi_list = compile_course_syllabi(gmu_html_dict)

#Save every syllabus under gmu-syllabi/<course title>/<semester>/.
for semester, course_title, syllabi in gmu_syllabi_list:
    out_dir = os.path.join('gmu-syllabi', course_title, semester)
    for pdf_file_name, pdf_url, pdf_params in syllabi:
        download_syllabi(directory=out_dir, file=pdf_file_name, url=pdf_url, params=pdf_params)

{'Fall_2019': {'value': '201970',
  'url': 'https://economics.gmu.edu/course_sections?term=201970',
  'response': <Response [200]>}}