In [1]:
import collections
import json
import re

import requests
from bs4 import BeautifulSoup

ModuleNotFoundError: No module named 'requests'

In [2]:
# Get all the class links
url = "https://catalog.upenn.edu/courses/"
header = "https://catalog.upenn.edu"
course_links = []

# A timeout keeps a stalled connection from hanging the notebook indefinitely.
response = requests.get(url, timeout=30)

if response.status_code == 200:
    soup = BeautifulSoup(response.content, 'html.parser')
    course_div = soup.find_all('div', class_='az_sitemap')

    if not course_div:
        # The page layout changed (or an unexpected page was served): report it
        # instead of failing with an IndexError on course_div[0].
        print("Failed to find the course index (div.az_sitemap) on the page.")
    else:
        course_uls = course_div[0].find_all('ul', class_=False)

        # Get and store all the course links
        for course_ul in course_uls:
            for course_li in course_ul.find_all('li', class_=False):
                # Skip list items that carry no link rather than raising on
                # a missing <a> tag or a missing href attribute.
                anchor = course_li.find('a', href=True)
                if anchor is not None:
                    course_links.append(header + anchor['href'])
else:
    print("Failed to retrieve the page. Status code:", response.status_code)

In [11]:
def get_course_number(url, timeout=30):
    """Return the course codes listed on one catalog page.

    Each ``div.courseblock`` on the page contributes one code, built from the
    text of its first ``<p>`` tag: the part before the first non-breaking
    space (the department) plus the first space-delimited token after it
    (the number), e.g. ``"ACCT 1010"``.

    Args:
        url: Address of the catalog page to scan.
        timeout: Seconds to wait for the server before ``requests`` raises.

    Returns:
        A list of course codes in page order. The list is empty when the page
        could not be retrieved (a message is printed in that case).
    """
    response = requests.get(url, timeout=timeout)
    course_numbers = []

    if response.status_code != 200:
        print("Failed to retrieve the page. Status code:", response.status_code)
        return course_numbers

    soup = BeautifulSoup(response.text, 'html.parser')

    for course in soup.find_all('div', class_='courseblock'):
        p_tags = course.find_all('p')
        if not p_tags:
            # A course block without any paragraph has no title to parse.
            continue

        title_parts = p_tags[0].text.split('\xa0')
        if len(title_parts) < 2:
            # No non-breaking space between department and number: the title
            # does not have the expected shape, so skip it instead of raising.
            continue

        dept = title_parts[0]
        course_number = title_parts[1].split(' ')[0]
        course_numbers.append(dept + " " + course_number)

    return course_numbers

In [43]:
# Gather the course codes from every department page, in page order.
all_course_numbers = [
    number
    for link in course_links
    for number in get_course_number(link)
]

# Same codes as a set, used for membership checks on prerequisites.
all_course_numbers_set = set(all_course_numbers)

In [66]:
# Course relations dictionary: every known course code starts with its own
# empty set of prerequisites.
course_relations = {number: set() for number in all_course_numbers}

In [72]:
# Fill in the relationships
def update_relationships(url, timeout=30):
    """Record the prerequisites listed on one catalog page in ``course_relations``.

    A ``div.courseblock`` is considered only when it has at least five ``<p>``
    tags and its second-to-last ``<p>`` starts with ``"Prerequisite: "``.
    The prerequisite text is split on the word ``AND``; from each piece only
    the first alternative of an ``OR`` list is kept, and pieces that are not
    in ``all_course_numbers_set`` are discarded.

    NOTE: a later cell in this notebook defines ``update_relationships``
    again; once that cell has run, its definition replaces this one.

    Args:
        url: Address of the catalog page to scan.
        timeout: Seconds to wait for the server before ``requests`` raises.

    Returns:
        None. ``course_relations`` is updated in place.
    """
    prefix = "Prerequisite: "
    response = requests.get(url, timeout=timeout)

    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        course_listings = soup.find_all('div', class_='courseblock')

        for course in course_listings:
            p_tags = course.find_all('p')

            if len(p_tags) < 5 or not p_tags[-2].text.startswith(prefix):
                continue

            # Build the key exactly as get_course_number does (department
            # before the non-breaking space, first token after it) so the
            # lookup in course_relations uses the same spelling.
            title_parts = p_tags[0].text.split('\xa0')
            if len(title_parts) < 2:
                continue
            cur_course = title_parts[0] + " " + title_parts[1].split(' ')[0]

            # Split the strings by AND. The word boundaries stop the split
            # from cutting through codes that merely contain those letters.
            whole_string = p_tags[-2].text[len(prefix):]
            prerequisites = [
                part.replace("\xa0", " ").strip()
                for part in re.split(r'\bAND\b', whole_string)
            ]

            # Deal with OR cases: keep only the first alternative. A plain
            # substring split on "OR" would break codes such as "PORT 0200".
            modified_prerequisites = []
            for prerequisite in prerequisites:
                first_prerequisite = re.split(r'\bOR\b', prerequisite.strip('()'))[0]
                modified_prerequisites.append(first_prerequisite.strip(' ()'))

            # Miscellaneous words: keep only known course codes.
            known_prerequisites = {
                prerequisite
                for prerequisite in modified_prerequisites
                if prerequisite in all_course_numbers_set
            }

            # setdefault avoids a KeyError for a course that was not present
            # when course_relations was initialised.
            course_relations.setdefault(cur_course, set()).update(known_prerequisites)

    else:
        print("Failed to retrieve the page. Status code:", response.status_code)


In [81]:
# Fill in the relationships
def update_relationships(url, timeout=30):
    """Record the prerequisites listed on one catalog page in ``course_relations``.

    A ``div.courseblock`` is considered only when it has at least five ``<p>``
    tags and its second-to-last ``<p>`` starts with ``"Prerequisite: "``.
    The prerequisite text is split on the word ``AND``; from each clause only
    the first alternative of an ``OR`` list is kept, and clauses that are not
    in ``all_course_numbers_set`` are discarded.

    Args:
        url: Address of the catalog page to scan.
        timeout: Seconds to wait for the server before ``requests`` raises.

    Returns:
        None. ``course_relations`` is updated in place.
    """
    prefix = "Prerequisite: "
    response = requests.get(url, timeout=timeout)

    if response.status_code != 200:
        print("Failed to retrieve the page. Status code:", response.status_code)
        return

    soup = BeautifulSoup(response.text, 'html.parser')

    for course in soup.find_all('div', class_='courseblock'):
        p_tags = course.find_all('p')

        if len(p_tags) < 5 or not p_tags[-2].text.startswith(prefix):
            continue

        # Get the current course, built exactly as get_course_number builds
        # its codes (department before the non-breaking space, first token
        # after it) so the key matches the one stored in course_relations.
        title_parts = p_tags[0].text.split('\xa0')
        if len(title_parts) < 2:
            continue
        cur_course = title_parts[0] + " " + title_parts[1].split(' ')[0]

        # Extract prerequisites and clean them
        whole_string = p_tags[-2].text[len(prefix):].replace("\xa0", " ")

        modified_prerequisites = set()
        # Word boundaries keep the split from cutting through course codes
        # that merely contain the letters (e.g. "PORT 0200", "KORN 0100").
        for clause in re.split(r'\bAND\b', whole_string):
            # Strip spaces and parentheses together: stripping only "()" left
            # the "(" in place whenever the clause began with a space.
            alternatives = re.split(r'\bOR\b', clause.strip(' ()'))
            first_alternative = alternatives[0].strip(' ()')

            # Filter out non-existent prerequisites
            if first_alternative in all_course_numbers_set:
                modified_prerequisites.add(first_alternative)

        # setdefault avoids a KeyError for a course that was not present when
        # course_relations was initialised.
        course_relations.setdefault(cur_course, set()).update(modified_prerequisites)


In [82]:
# Scan every department page and record its prerequisites in course_relations.
for page_url in course_links:
    update_relationships(page_url)

In [None]:
# Sets are not JSON-serialisable, so export a converted copy. course_relations
# itself keeps its sets, which lets update_relationships (it calls .update on
# the values) be re-run after this cell. Sorting gives the file a stable order.
course_relations_json = {
    course: sorted(prerequisites)
    for course, prerequisites in course_relations.items()
}

with open('prerequisites.json', 'w') as f:
    json.dump(course_relations_json, f)

In [83]:
# Display the course -> prerequisites mapping built above.
course_relations

{'ACFD 6000': set(),
 'ACFD 6010': set(),
 'ACFD 6020': set(),
 'ACFD 6030': set(),
 'ACFD 6999': set(),
 'ACCT 1010': set(),
 'ACCT 1020': set(),
 'ACCT 2110': set(),
 'ACCT 2120': {'ACCT 1010'},
 'ACCT 2420': {'ACCT 1010'},
 'ACCT 2430': {'ACCT 1010'},
 'ACCT 2640': {'ACCT 1010', 'BEPP 2500'},
 'ACCT 2700': {'ACCT 1010', 'STAT 1020'},
 'ACCT 2970': {'ACCT 1010', 'FNCE 1010'},
 'ACCT 3990': set(),
 'ACCT 6110': set(),
 'ACCT 6130': set(),
 'ACCT 7060': {'ACCT 6110'},
 'ACCT 7420': {'ACCT 6110'},
 'ACCT 7430': {'ACCT 6110'},
 'ACCT 7470': {'ACCT 6110'},
 'ACCT 7471': {'ACCT 6110'},
 'ACCT 7640': {'ACCT 6110', 'MGEC 6110', 'MGEC 6120'},
 'ACCT 8970': {'ACCT 6110', 'FNCE 6110'},
 'ACCT 9300': set(),
 'ACCT 9400': set(),
 'ACCT 9410': set(),
 'ACCT 9420': set(),
 'ACCT 9430': set(),
 'ACCT 9810': set(),
 'ACCT 9820': set(),
 'AFRC 0008': set(),
 'AFRC 0010': set(),
 'AFRC 0012': set(),
 'AFRC 0013': set(),
 'AFRC 0015': set(),
 'AFRC 0030': set(),
 'AFRC 0081': set(),
 'AFRC 0082': set(),