In [25]:
import requests
from bs4 import BeautifulSoup
from collections import defaultdict

In [3]:
# Root URL of the GW bulletin course listings; the requests below are built from it.
BASE_URL = "https://bulletin.gwu.edu/courses/"

In [5]:
# Fetch the HTML of the bulletin's course index page.
# The timeout keeps a stalled connection from hanging the notebook forever, and
# raise_for_status() makes an HTTP error fail here instead of silently handing
# an error page to the parser (the per-page download loop already does this).
index_response = requests.get(BASE_URL, timeout=30)
index_response.raise_for_status()
dept_list = index_response.text

In [7]:
# Parse the index HTML. Naming the parser explicitly avoids bs4's
# GuessedAtParserWarning and keeps the parse tree the same on every machine,
# regardless of which optional parsers happen to be installed.
depts = BeautifulSoup(dept_list, 'html.parser')

In [20]:
# Keep only the anchors whose href points under /courses/. An anchor with no
# href is treated as having an empty one, so it is dropped.
dept_links = []
for anchor in depts.find_all('a'):
    href = anchor.get('href', '')
    if href.startswith('/courses/'):
        dept_links.append(anchor)

In [23]:
from urllib.parse import urljoin

# Download the HTML of every linked page into course_pages (one string per page).
# NOTE(review): the [:-1] slice skips the final link - presumably it is not a
# department page; confirm against the index before relying on it.
REQUEST_TIMEOUT = 30  # seconds; without it a stalled request blocks indefinitely
course_pages = []
# One Session reuses the underlying connection across the many requests to the
# same host instead of opening a new connection for each page.
with requests.Session() as session:
    for link in dept_links[:-1]:
        # The hrefs are site-absolute ('/courses/...'), so urljoin resolves them
        # against BASE_URL's host and yields the same URL as stripping the
        # '/courses/' prefix and appending the remainder to BASE_URL.
        page = session.get(urljoin(BASE_URL, link['href']), timeout=REQUEST_TIMEOUT)
        page.raise_for_status()
        course_pages.append(page.text)

In [28]:
import warnings

# Build {department title: {course title: course description}} from the
# downloaded pages. The department title is the text of the page's first <h1>.
courses = defaultdict(dict)
for page in course_pages:
    # Explicit parser: avoids GuessedAtParserWarning and machine-dependent parses.
    soup = BeautifulSoup(page, 'html.parser')
    dept_title = soup.find('h1').text
    title_tags = soup.find_all('p', class_='courseblocktitle')
    desc_tags = soup.find_all('p', class_='courseblockdesc')
    # Titles and descriptions are paired purely by position. If a page has a
    # different number of each, zip() would silently attach descriptions to the
    # wrong courses (and drop the surplus), so surface that instead of hiding it.
    if len(title_tags) != len(desc_tags):
        warnings.warn(
            f'{dept_title}: {len(title_tags)} course titles but '
            f'{len(desc_tags)} descriptions; positional pairing may be misaligned'
        )
    for title, desc in zip(title_tags, desc_tags):
        courses[dept_title][title.text] = desc.text

In [31]:
import spacy
# One-time setup: the en_core_web_sm package has to be installed before
# spacy.load() can find it. Uncomment the next line to download it, run the
# cell once, then comment it out again.
#!python -m spacy download en_core_web_sm
# Load the en_core_web_sm pipeline; `nlp` is used below to tokenize the
# course descriptions.
nlp = spacy.load('en_core_web_sm')

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.8.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [32]:
# Flatten every course description into one list (departments in dict order,
# courses in dict order within each department) and run the whole batch
# through the spaCy pipeline. docs[k] is the Doc for the k-th description.
all_descriptions = []
for dept_courses in courses.values():
    all_descriptions.extend(dept_courses.values())
docs = list(nlp.pipe(all_descriptions))

In [41]:
# Build {department: {course title: [token text, ...]}} from the parsed docs,
# dropping whitespace and punctuation tokens and skipping any course whose
# description has no tokens left.
#
# `docs` was produced by flattening `courses` in iteration order, so walking
# the same nested order while pulling from a single iterator keeps each
# description paired with its own Doc - no hand-maintained index counter that
# could drift out of step or leak into later cells.
cleaned_courses = defaultdict(dict)
doc_iter = iter(docs)
for dept, dept_dict in courses.items():
    for title in dept_dict:
        doc = next(doc_iter)  # consumed for every course, kept or not, to stay aligned
        desc_tokens = [t.text for t in doc if not (t.is_space or t.is_punct)]
        if desc_tokens:
            # Replace non-breaking spaces in the title with ordinary spaces
            # before using it as a key.
            cleaned_courses[dept][title.replace('\xa0', ' ')] = desc_tokens

In [43]:
import json

# Persist the tokenized courses to disk so they can be reloaded later
# without repeating the scrape and the spaCy pass.
with open('gw_bulletin.json', 'w') as out_file:
    out_file.write(json.dumps(cleaned_courses))

In [1]:
import json

# Reload the saved courses; after the JSON round trip this is a plain dict of
# {department: {course title: [tokens]}}.
with open('gw_bulletin.json') as in_file:
    cleaned_courses = json.loads(in_file.read())

In [2]:
from random import sample

In [6]:
import random

# Pick the departments to include in the CSV export.
# A dedicated, seeded Random instance makes the selection repeatable across
# kernel restarts without touching the global random state, and capping k
# avoids the ValueError that sample() raises when fewer departments exist
# than were asked for.
SAMPLE_SEED = 0
N_SAMPLE_DEPTS = 10
all_depts = list(cleaned_courses)
sample_keys = random.Random(SAMPLE_SEED).sample(
    all_depts, k=min(N_SAMPLE_DEPTS, len(all_depts))
)

In [7]:
# Display the sampled department names.
sample_keys

['International Business (IBUS)',
 'Anatomy and Cell Biology (ANAT)',
 'Management (MGT)',
 'Sustainability (SUST)',
 'Health Services Management and Leadership (HSML)',
 'Counseling (CNSL)',
 'Sociology (SOC)',
 'Geology (GEOL)',
 'Speech, Language, and Hearing Science (SLHS)',
 'Business Administration (BADM)']

In [8]:
# Column headers, in order, for the CSV export.
fieldnames = [
    'department',
    'course',
    'description',
]

In [9]:
from csv import DictWriter
from itertools import islice

# Export a small CSV: for each sampled department, the first
# MAX_COURSES_PER_DEPT courses, one row per course, with the description's
# tokens joined by '|'.
MAX_COURSES_PER_DEPT = 10
sampled_depts = set(sample_keys)  # set membership instead of scanning the list per department

# newline='' is required by the csv module so the writer controls line endings
# itself (otherwise some platforms emit blank rows between records); an
# explicit encoding keeps non-ASCII text from depending on the platform's
# default encoding.
with open('gw_bulletin.csv', 'w', newline='', encoding='utf-8') as f:
    writer = DictWriter(f, fieldnames)
    writer.writeheader()
    for dept, course_dict in cleaned_courses.items():
        if dept not in sampled_depts:
            continue
        # islice stops after the cap instead of enumerating every remaining
        # course only to skip it.
        for title, desc in islice(course_dict.items(), MAX_COURSES_PER_DEPT):
            writer.writerow(dict(zip(fieldnames, (dept, title, '|'.join(desc)))))