In [1]:
import requests
from bs4 import BeautifulSoup
import re
from collections import defaultdict
from copy import deepcopy
import json
from datetime import datetime
from scraping_config import config

### Scraping course info from GW Schedule of Classes

In [7]:
# Entry page of the schedule of classes, taken from the local scraping config.
base_url = config['schedule_page_url']
# Query-string parameters reused by the schedule requests in this notebook.
params = dict(campid=config['campus_id'],
              termid=config['term_id'])  # Summer 2023
# Get list of departments
dept_page = requests.get(base_url, params=params)

In [None]:
# Parse the department listing page fetched above.
# NOTE(review): no parser is named here, so Beautiful Soup chooses one from
# whatever is installed — consider naming it explicitly (e.g. 'html.parser')
# so results do not vary between environments; confirm which parser was used.
soup = BeautifulSoup(dept_page.text)

In [None]:
# Collect every anchor whose href contains a '&subjId=' query parameter.
subj_link_pattern = re.compile(r'.+&subjId=.+')
dept_links = soup.find_all("a", href=subj_link_pattern)

In [None]:
# Extract dept. codes from list of departments: the code is the text that
# follows the last '&subjId=' in each link's href.
dept_codes = [link['href'].rsplit('&subjId=', 1)[-1] for link in dept_links]

In [None]:
# Get each page (first page of results for each dept).
# `params` is updated in place, so it keeps the last department's code afterwards.
course_url = config['course_page_url']
course_pages = defaultdict(list)
for code in dept_codes:
    params.update(subjid=code)
    course_pages[code].append(requests.post(course_url, params=params).text)

In [None]:
# Extract course & section numbers from the first page of every department,
# then from any additional result pages that the first page links to.
# NOTE(review): extract_course_info and get_more_results are defined in later
# cells; on a fresh kernel those cells have to be run before this one.
courses = []
for course_code, pages in course_pages.items():
    # extract_course_info reads the loop variable `course_code` as a global,
    # so every generator is fully consumed before the loop advances.
    soup_1 = BeautifulSoup(pages[0])
    courses.extend(extract_course_info(soup_1))
    more_pages = list(get_more_results(soup_1, params, course_code))
    courses.extend(course
                   for page in more_pages
                   for course in extract_course_info(BeautifulSoup(page)))

In [None]:
def extract_course_info(soup, dept_code=None):
    '''Yield one dict per course row found on a schedule results page.

    :param soup: parsed results page; only ``find_all('tr', class_="crseRow1")``
        is called on it
    :param dept_code: value stored under ``'code'`` in every yielded dict.
        When omitted, the notebook-level variable ``course_code`` is used,
        which is what this function always did before the parameter existed.
    :return: generator of dicts with the keys ``code``, ``number``,
        ``section``, ``title`` and ``instructor``
    :raises NameError: if ``dept_code`` is omitted and no global
        ``course_code`` exists
    '''
    if dept_code is None:
        # Backward-compatible fallback to the caller's loop variable.  It is
        # resolved here, at call time, rather than lazily inside the generator,
        # so a generator consumed late cannot pick up a different department.
        dept_code = course_code
    return _iter_course_rows(soup, dept_code)


def _iter_course_rows(soup, dept_code):
    '''Generate the course dicts for one results page and one department code.'''
    for listing in soup.find_all('tr', class_="crseRow1"):
        info = listing.find_all('td')
        yield {
            'code': dept_code,
            # Course number should reside under the 3rd table element, in the <a> tag
            'number': info[2].a.text.strip(),
            # Course section is in the fourth element
            'section': info[3].text.strip(),
            # Title and instructor are in the fifth and seventh elements
            'title': info[4].text.strip(),
            'instructor': info[6].text.strip(),
        }

In [None]:
# Additional results for a given department may be on subsequent pages
# Identify any pages that have links to more results
def get_more_results(soup, params, course_code, timeout=None):
    '''Yield the HTML of every additional results page linked from ``soup``.

    :param soup: parsed first page of results for one department
    :param params: base query-string parameters; this dict is NOT modified
    :param course_code: department code sent as the ``subjid`` parameter
    :param timeout: optional timeout in seconds handed to ``requests.post``;
        the default ``None`` keeps the previous behaviour of waiting indefinitely
    :return: generator of response bodies, in ascending page-label order;
        pages whose response status is not 200 are skipped
    '''
    page_links = soup.find_all('a', href=re.compile('javascript:goToPage'))
    # Page '1' is the page we already have; a set removes repeated links.
    pages = {t.text for t in page_links if t.text != '1'}
    # Build a private copy so the caller's dict is left untouched.
    query = dict(params, subjid=course_code)
    # Sorting by (length, text) puts numeric labels in numeric order ('2' < '10')
    # and makes the request order deterministic, unlike iterating the set.
    for page in sorted(pages, key=lambda label: (len(label), label)):
        r = requests.post(course_url,
                          params=query,
                          headers={'Content-Type': 'application/x-www-form-urlencoded'},
                          data=f"pageNum={page}",
                          timeout=timeout)
        if r.status_code == 200:
            yield r.text

### Retrieving records from the GW Bookstore for each course

In [62]:
# Endpoint for bookstore lookups, taken from the local scraping config.
bkst_base_url = config['bookstore_url']
# Headers sent with every bookstore request: a desktop Firefox User-Agent and
# JSON content negotiation.
bkst_headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/113.0',
    'Accept': 'application/json, text/plain, */*',
    'Content-Type': 'application/json',
}

In [63]:
def create_bkst_payload(course_data, term_id):
    '''Build the JSON body for a bookstore lookup.

    :param course_data: iterable of course dicts with the keys
        ``number``, ``code`` and ``section``
    :param term_id: value sent as ``termId``
    :return: dict with the keys ``bookstoreId`` (read from ``config``),
        ``courses`` and ``termId``
    '''
    store_id = config['bookstore_id']
    course_entries = []
    for entry in course_data:
        course_entries.append({
            'courseDisplayName': entry['number'],
            'departmentDisplayName': entry['code'],
            'sectionDisplayName': entry['section'],
        })
    return {'bookstoreId': store_id, 'courses': course_entries, 'termId': term_id}

In [64]:
# Accumulator for the bookstore lookup loop: one parsed JSON response is
# appended per course that was looked up successfully.
bkst_data = []

In [None]:
# Keep an independent copy of the scraped courses: the bookstore lookup loop
# removes entries from `courses` as it processes them.
courses_list = deepcopy(courses)

In [65]:
# Reload the course list from a previously saved JSON file; this replaces any
# `courses` list built earlier in the notebook.
with open('../data/gw-courses-mc-202302.json') as courses_file:
    courses = json.load(courses_file)

In [107]:
from time import sleep

In [121]:
# Look up the courses one at a time.  A course is removed from `courses` only
# after its response has been stored, so an interrupted run can be resumed by
# re-running this cell.
# NOTE(review): there is no retry limit — if the captcha never clears, this
# loop runs until it is interrupted by hand.
while courses:
    course = courses[0]
    payload = create_bkst_payload([course], params['termid'])
    r = requests.post(bkst_base_url, headers=bkst_headers, json=payload)
    resp_j = r.json()
    if 'blockScript' not in resp_j:
        bkst_data.append(resp_j)
        del courses[0]
        continue
    # A response carrying 'blockScript' is treated as a captcha challenge:
    # wait six minutes, then retry the same course.
    print("Captcha detected")
    sleep(360)

Captcha detected


KeyboardInterrupt: 

In [122]:
# Number of courses still waiting for a bookstore lookup.
len(courses)

1152

In [123]:
# Number of bookstore responses collected so far.
len(bkst_data)

98

In [124]:
# Disabled: writes the scraped course list for the current term.
#with open(f'../data/gw-courses-mc-{params["termid"]}.json', 'w') as f:
#    json.dump(courses_list, f)
# Save the raw bookstore responses under a name stamped with the term and the
# current time.
# NOTE(review): isoformat() puts ':' in the file name, which some filesystems reject.
books_path = f'../data/gw-books-mc-{params["termid"]}-{datetime.now().isoformat()}.json'
with open(books_path, 'w') as f:
    json.dump(bkst_data, f)

### Parsing GW Bookstore data

In [None]:
# Load one specific saved run of raw bookstore responses.  The path has no
# placeholders, so a plain string literal is used.
with open('../data/gw-books-mc-202302-2023-05-30T16:15:13.357706.json') as books_file:
    bkst_data = json.load(books_file)

In [None]:
# Distinct lengths of the top-level list across all results
set(map(len, bkst_data))

In [None]:
# Distinct lengths of the 'courseSectionDTO' list across all results
section_lengths = set()
for result in bkst_data:
    section_lengths.add(len(result[0]['courseSectionDTO']))
section_lengths

In [None]:
# Courses with books have a non-empty 'courseMaterialResultsList' in their
# first course section
with_books = []
for result in bkst_data:
    first_section = result[0]['courseSectionDTO'][0]
    if first_section.get('courseMaterialResultsList'):
        with_books.append(result)

In [None]:
# Inspect the keys available on the course section of one result that has books.
with_books[1][0]['courseSectionDTO'][0].keys()

In [None]:
# This key contains an indication of which books are required vs. recommended;
# collect every key seen under 'materialAdoptions'.
adoption_keys = set()
for book in with_books:
    adoptions = book[0]['courseSectionDTO'][0]['sectionAdoptionDTO']['materialAdoptions']
    adoption_keys.update(adoptions.keys())
adoption_keys

In [None]:
# Keys for item types (e vs print): every material key ending in 'ItemDTOs'
item_type_keys = set()
for book in with_books:
    for r in book[0]['courseSectionDTO'][0]['courseMaterialResultsList']:
        item_type_keys.update(k for k in r.keys() if k.endswith('ItemDTOs'))
item_type_keys

In [None]:
# Keys for item subtypes (new, used, etc.): every key seen under 'printItemDTOs'
print_subtype_keys = set()
for book in with_books:
    for r in book[0]['courseSectionDTO'][0]['courseMaterialResultsList']:
        print_subtype_keys.update(r.get('printItemDTOs', {}))
print_subtype_keys

In [None]:
# Keys for extraction from bookstore JSON.
# Trailing comments give the value type recorded for each key.

# Top level of each result
top_keys = [
    'storeId',                  # int
    'storeNumber',              # str
    'storeDisplayName',         # str
    'currency',                 # str
    'requirementTypeLabelMap',  # dict
    'courseSectionDTO',         # list
]

# Each entry of 'courseSectionDTO'
course_section_keys = [
    'courseSectionStatus',        # dict
    'termId',                     # str
    'termName',                   # str
    'termNumber',                 # str
    'termStatus',                 # str
    'termOpen',                   # bool
    'programId',                  # str
    'programName',                # str
    'campusId',                   # str
    'campusName',                 # str
    'institutionName',            # str
    'department',                 # str
    'course',                     # str
    'section',                    # str
    'courseId',                   # str
    'instructor',                 # str
    'courseMaterialResultsList',  # list of dicts
]

# Each entry of 'courseMaterialResultsList'
course_materials_keys = [
    'title',              # str
    'edition',            # str
    'author',             # str
    'isbn',               # str
    'materialType',       # str
    'requirementType',    # str
    'isPackage',          # bool
    'publisherCode',      # str
    'copyRightYear',      # str
    'publisher',          # str
    'priceRangeDisplay',  # str
    'digitalItemDTOs',    # list
    'printItemDTOs',      # dict
]

# dict
print_item_keys = ['BUY_NEW', 'BUY_USED', 'RENTAL_NEW', 'RENTAL_USED']

# dict
print_item_subkeys = [
    'typeCondition',
    'priceDisplay',
    'inventoryStatusDB',
    'binding',
    'priceNumeric',
    'nonRentalChargesTotal',
    'nonRentalBreakageCharge',
    'nonRentalRestockingFee',
]

# list of dict; 'subscription' is an optional key
digital_item_keys = ['subscription', 'typeCondition', 'priceDisplay', 'priceNumeric']

In [None]:
def clean_course_material(material):
    '''Reduces a dict of info about a specific course material.

    Keeps only the keys listed in ``course_materials_keys`` and reduces each
    nested digital/print item with ``clean_digital_item`` / ``clean_print_item``.

    :param material: dict describing one course material; it is not modified,
        nor are the list/dict stored under its item keys
    :return: a new dict holding the reduced material
    '''
    cleaned_material = {k: v for k, v in material.items() if k in course_materials_keys}
    # Build fresh containers for the nested items instead of assigning into the
    # caller's list/dict, which the filtered dict above still shares.
    digital_items = cleaned_material.get('digitalItemDTOs')
    if digital_items is not None:  # a key present with value None is left as None
        cleaned_material['digitalItemDTOs'] = [clean_digital_item(item)
                                               for item in digital_items]
    print_items = cleaned_material.get('printItemDTOs')
    if print_items is not None:
        cleaned_material['printItemDTOs'] = {k: clean_print_item(v)
                                             for k, v in print_items.items()}
    return cleaned_material
def clean_print_item(item):
    '''Return a copy of one print item limited to the keys in ``print_item_subkeys``.'''
    kept = {}
    for field, value in item.items():
        if field in print_item_subkeys:
            kept[field] = value
    return kept
def clean_digital_item(item):
    '''Return a copy of one digital item limited to the keys in ``digital_item_keys``.'''
    return dict(pair for pair in item.items() if pair[0] in digital_item_keys)

In [None]:
# Reduce every raw bookstore result to the whitelisted keys at each level.
bkst_data_cleaned = []
for d in bkst_data:
    # Top level elements present in all records (only the first list element is used)
    d1 = dict((k, v) for k, v in d[0].items() if k in top_keys)
    sections = []
    for s in d1['courseSectionDTO']:
        # elements present in all course-section blocks
        section = dict((k, v) for k, v in s.items() if k in course_section_keys)
        # The materials list is replaced entry by entry, in place
        materials = section.get('courseMaterialResultsList', [])
        for j in range(len(materials)):
            materials[j] = clean_course_material(materials[j])
        sections.append(section)
    d1['courseSectionDTO'] = sections
    bkst_data_cleaned.append(d1)

In [None]:
# Cleaned records whose first course section lists at least one material
with_books_cleaned = list(filter(
    lambda c: c['courseSectionDTO'][0].get('courseMaterialResultsList'),
    bkst_data_cleaned))

In [None]:
# Persist the reduced records
cleaned_path = '../data/bookstore-data-cleaned.json'
with open(cleaned_path, 'w') as f:
    json.dump(bkst_data_cleaned, f)

In [None]:
# Sanity check: cleaning must not change how many records have course materials.
assert len(with_books) == len(with_books_cleaned)

#### Cleaning the data further

In [None]:
# Start this stage from the reduced records saved to disk
with open('../data/bookstore-data-cleaned.json') as cleaned_file:
    bkst_data = json.load(cleaned_file)

In [None]:
# This inner list is always length 1, so we can reduce it to its inner dict.
# Count the records that would break that assumption:
sum(1 for b in bkst_data if len(b['courseSectionDTO']) > 1)

In [None]:
# Replace each one-element 'courseSectionDTO' list with its only entry, stored
# under 'courseSection'.  The records are modified in place, so bkst_data and
# bkst_data_cleaned hold the same dict objects.
bkst_data_cleaned = []
for b in bkst_data:
    only_section = b['courseSectionDTO'][0]
    del b['courseSectionDTO']
    b['courseSection'] = only_section
    bkst_data_cleaned.append(b)

In [None]:
# Shortening key names for useful items and removing some extraneous keys.
# Keys are moved with dict.pop rather than assign-then-del, so a record that
# lacks 'courseSectionStatus' or has already been renamed (e.g. when this cell
# is run twice) no longer raises KeyError.
for b in bkst_data_cleaned:
    section = b['courseSection']
    section.pop('courseSectionStatus', None)
    if 'courseMaterialResultsList' in section:
        section['courseMaterials'] = section.pop('courseMaterialResultsList')
    # `or []` covers sections with no materials key as well as a None value
    for m in section.get('courseMaterials') or []:
        if 'printItemDTOs' in m:
            m['printItems'] = m.pop('printItemDTOs')
        if 'digitalItemDTOs' in m:
            m['digitalItems'] = m.pop('digitalItemDTOs')

In [None]:
# Save the flattened, renamed records.
# NOTE(review): this overwrites the same file that the earlier cleaning stage
# wrote and that the "Cleaning the data further" section reads, but with a
# different schema ('courseSection' instead of 'courseSectionDTO'), so that
# section cannot be re-run from the file afterwards — consider a separate file name.
with open('../data/bookstore-data-cleaned.json', 'w') as f:
    json.dump(bkst_data_cleaned, f)

#### Creating a simplified, uniform dataset

In [None]:
# Reload the cleaned records as the input for the simplified dataset
with open('../data/bookstore-data-cleaned.json') as cleaned_file:
    bkst_data = json.load(cleaned_file)

In [None]:
# Records whose course section lists at least one course material
with_books = []
for b in bkst_data:
    if b['courseSection'].get('courseMaterials'):
        with_books.append(b)

In [None]:
# Fields copied into the simplified dataset, by level.
# Course-section level:
course_keys = [
    'department',
    'course',
    'section',
    'instructor',
    'termName',
]
# Course-material (book) level:
book_keys = [
    'title',
    'author',
    'edition',
    'isbn',
    'materialType',
    'requirementType',
    'copyRightYear',
    'publisher',
]
# Individual print/digital item level:
item_keys = [
    'typeCondition',
    'priceDisplay',
]


In [None]:
# Build one flat record per course; 'texts' holds one entry per purchasable
# item (print items first, then digital items, for each book).
simplified = []
for course in bkst_data:
    course_section = course['courseSection']
    course_data = {k: course_section.get(k) for k in course_keys}
    books = []
    for book in course_section.get('courseMaterials', []):
        book_data = {k: book.get(k) for k in book_keys}
        # Pair every item with its type so both kinds share one code path
        offerings = [('print', item) for item in book.get('printItems', {}).values()]
        offerings += [('digital', item) for item in book.get('digitalItems', [])]
        for item_type, item in offerings:
            book_item = deepcopy(book_data)
            for k in item_keys:
                book_item[k] = item[k]
            book_item['itemType'] = item_type
            books.append(book_item)
    course_data['texts'] = books
    simplified.append(course_data)

In [None]:
# Convert camel case to snake case
# Matches the empty position before every capital letter except at the start.
case_convert = re.compile(r'(?<!^)(?=[A-Z])')
def camel_to_snake(item):
    '''
    Return a copy of ``item`` whose dictionary keys are snake_case.

    :param item: normally a dictionary. A list is converted element by
        element; anything else is returned unchanged, so lists of plain
        values survive instead of turning into lists of ``None``.
    :return: the converted dict or list, or ``item`` itself for other types

    Recurses into list values (including lists nested in lists). Dict values
    that sit directly under a key are kept as they are.
    '''
    if isinstance(item, list):
        return [camel_to_snake(v) for v in item]
    if not isinstance(item, dict):
        return item
    new_dict = {}
    for key, value in item.items():
        new_key = case_convert.sub('_', key).lower()
        new_dict[new_key] = camel_to_snake(value) if isinstance(value, list) else value
    return new_dict

In [None]:
# Rename every key of the simplified records to snake_case
simplified = list(map(camel_to_snake, simplified))

In [None]:
# Write the final, simplified dataset
with open('../data/bookstore-data-simplified.json', 'w') as simplified_file:
    json.dump(simplified, simplified_file)