In [1]:
import random
from collections import defaultdict

import requests
from bs4 import BeautifulSoup

In [3]:
# Index page of the GWU bulletin course listings; the department page URLs
# requested later are built by appending a path segment to this value.
BASE_URL = 'https://bulletin.gwu.edu/courses/'

In [4]:
# Download the index page that links to every department's course listing.
# The timeout keeps a stalled connection from hanging the kernel, and
# raise_for_status() stops an HTTP error page from being parsed as if it were
# the real index (the per-department fetch loop checks the status the same way).
index_response = requests.get(BASE_URL, timeout=30)
index_response.raise_for_status()
dept_list = index_response.text

In [5]:
# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed (and emits a warning), so the parsed tree can differ
# from one machine to the next. 'html.parser' ships with Python.
depts = BeautifulSoup(dept_list, 'html.parser')

In [6]:
# Keep only the anchors whose href points into the /courses/ section;
# anchors without an href are treated as having an empty one and are dropped.
dept_links = list(
    filter(
        lambda anchor: anchor.get('href', '').startswith('/courses/'),
        depts.find_all('a'),
    )
)

In [7]:
# Download the HTML of each department's course page.
course_pages = []
# A single Session reuses the underlying connection across the many requests
# to the same host instead of opening a new one per department.
with requests.Session() as session:
    # NOTE(review): the last matching link is deliberately skipped by [:-1];
    # the reason is not visible in this notebook -- confirm it is still wanted.
    for link in dept_links[:-1]:
        # '/courses/<dept>/' -> '<dept>/', which is appended to BASE_URL.
        dept_path = link['href'].split('/', maxsplit=2)[-1]
        # The timeout keeps one stalled request from hanging the whole crawl.
        page = session.get(BASE_URL + dept_path, timeout=30)
        page.raise_for_status()
        course_pages.append(page.text)

In [8]:
# Build {department title: {course title: course description}} from the pages.
courses = defaultdict(dict)
for page in course_pages:
    # Explicit parser so the result does not depend on which optional parser
    # libraries are installed (and to avoid BeautifulSoup's parser warning).
    soup = BeautifulSoup(page, 'html.parser')
    # The page's first <h1> is used as the department name.
    dept_title = soup.find('h1').text
    title_tags = soup.find_all('p', class_='courseblocktitle')
    desc_tags = soup.find_all('p', class_='courseblockdesc')
    # NOTE(review): titles and descriptions are paired purely by position; if a
    # course on a page has a title but no description paragraph, every later
    # pair on that page would be shifted -- verify against the page markup.
    for title, desc in zip(title_tags, desc_tags):
        courses[dept_title][title.text] = desc.text

In [10]:
import spacy
# One-time setup: uncomment the next line to download the model if it is not installed.
#!python -m spacy download en_core_web_sm
# Load the spaCy model `en_core_web_sm`; a later cell runs `nlp.pipe` over the
# course descriptions to tokenize them.
nlp = spacy.load('en_core_web_sm')

In [11]:
# Flatten every course description (department by department, in dict order)
# and run them through the spaCy pipeline. Later cells index `docs` by
# position, so this ordering must match a nested walk over `courses`.
all_descriptions = [
    desc
    for dept_dict in courses.values()
    for desc in dept_dict.values()
]
docs = list(nlp.pipe(all_descriptions))

In [44]:
# Flatten the nested mapping in the same order the descriptions were fed to
# spaCy, so the i-th entry lines up with docs[i].
course_entries = [
    (dept, title, desc)
    for dept, dept_dict in courses.items()
    for title, desc in dept_dict.items()
]

cleaned_courses = []
for i, (dept, title, desc) in enumerate(course_entries):
    # Lower-cased alphabetic tokens only (numbers and punctuation are dropped).
    desc_tokens = [token.text.lower() for token in docs[i] if token.is_alpha]
    if not desc_tokens:
        # Skip courses whose description has no alphabetic tokens at all.
        continue
    cleaned_courses.append({
        'dept': dept,
        # Replace non-breaking spaces with ordinary spaces in the stored text.
        'title': title.replace('\xa0', ' '),
        'desc': desc.replace('\xa0', ' '),
        'tokens': desc_tokens,
    })

In [45]:
import json
# Persist the cleaned course records as one JSON array.
with open('gw_bulletin.json', 'w') as json_file:
    json_file.write(json.dumps(cleaned_courses))

In [13]:
from random import sample

In [20]:
# `cleaned_courses` is a list of per-course dicts, so it has no .keys();
# collect the distinct department names from the records instead
# (dict.fromkeys de-duplicates while keeping first-seen order).
dept_names = list(dict.fromkeys(c['dept'] for c in cleaned_courses))
# Call random.sample through the module: the bare name `sample` is rebound to
# a list by a later cell, which would break this cell when it is re-run.
# min() avoids a ValueError when fewer than 10 departments are available.
sample_keys = random.sample(dept_names, k=min(10, len(dept_names)))

In [21]:
# Display the randomly chosen department names.
sample_keys

['School of Media and Public Affairs (SMPA)',
 'Corcoran Interaction Design (CIXD)',
 'Political Science (PSC)',
 'Computer Science (CSCI)',
 'Professional Studies Public Leadership (PSPL)',
 'Chinese (CHIN)',
 'Regulatory Affairs (RAFF)',
 'Hominid Paleobiology (HOMP)',
 'English (ENGL)',
 'Speech, Language, and Hearing Science (SLHS)']

In [22]:
# Number of courses to keep in the exported sample
# (presumably -- confirm against the round-robin sampling cell).
num_docs = 100

In [56]:
# Restrict the cleaned courses to the sampled departments.
# NOTE: this rebinds `sample`, hiding the `random.sample` function imported earlier.
sample = list(filter(lambda course: course['dept'] in sample_keys, cleaned_courses))

In [58]:
# Number of courses that belong to the sampled departments.
len(sample)

825

In [24]:
from itertools import *

def roundrobin(*iterables):
    """Yield one item from each iterable in turn, dropping iterables as they run out.

    Example: roundrobin('ABC', 'D', 'EF') yields A D E B F C.
    (Classic itertools recipe, credited to George Sakkis.)
    """
    remaining = len(iterables)
    # Endless rotation over the bound ``__next__`` of every input iterator.
    pullers = cycle(iter(source).__next__ for source in iterables)
    while remaining:
        try:
            for pull in pullers:
                yield pull()
        except StopIteration:
            # The iterator just polled is exhausted: rebuild the rotation from
            # the next ``remaining`` pullers, which leaves the finished one out.
            remaining -= 1
            pullers = cycle(islice(pullers, remaining))

In [60]:
# Bucket the selected courses by department. groupby only merges *adjacent*
# records; that holds here because `sample` was filtered from a list that was
# built one department at a time.
groups = [list(g) for k, g in groupby(sample, key=lambda x: x['dept'])]
# Interleave the departments so each one is represented, then keep the first
# `num_docs` courses. This uses the configured sample size rather than a
# literal 100, and islice stops pulling from roundrobin once enough are taken.
sample = list(islice(roundrobin(*groups), num_docs))

In [62]:
def _dept_of(course):
    """Sort key: the department a course record belongs to."""
    return course['dept']


# Stable sort, so courses keep their existing relative order within a department.
sample = sorted(sample, key=_dept_of)

In [63]:
# Column headers for the CSV export, in the order the values are written per row.
fieldnames = ['department', 'course', 'description', 'tokens']

In [65]:
from csv import DictWriter
# Write the sampled courses to CSV, one row per course.
# newline='' is required by the csv module so that it controls line endings
# itself (otherwise rows gain blank lines on Windows); an explicit UTF-8
# encoding keeps non-ASCII characters in the text from depending on the
# platform's default encoding.
with open('gw_bulletin.csv', 'w', newline='', encoding='utf-8') as f:
    writer = DictWriter(f, fieldnames)
    writer.writeheader()
    for course in sample:
        # Values are listed in the same order as `fieldnames`; the token list
        # is flattened into a single pipe-separated cell.
        row_values = (
            course['dept'],
            course['title'],
            course['desc'],
            '|'.join(course['tokens']),
        )
        writer.writerow(dict(zip(fieldnames, row_values)))