In [9]:
import requests
from bs4 import BeautifulSoup
from transformers import BertTokenizer, BertModel
import torch
import json

ValueError: Unable to compare versions for numpy>=1.17: need=1.17 found=None. This is unusual. Consider reinstalling numpy.

1. Get all the classes and the course details
2. Generate the embeddings

# Get the classes and course details

In [11]:
# Get all the class links
# Fetches the catalog's A-Z course index and collects one absolute URL per
# listed department page into `course_links`.
url = "https://catalog.upenn.edu/courses/"
header = "https://catalog.upenn.edu"
course_links = []

# A timeout keeps the cell from hanging indefinitely if the server stalls.
response = requests.get(url, timeout=30)

if response.status_code == 200:
    soup = BeautifulSoup(response.content, 'html.parser')
    course_div = soup.find_all('div', class_='az_sitemap')

    if course_div:
        # Only the first matching div is used, as in the original cell.
        # NOTE(review): class_=False is presumably meant to select only
        # elements without a class attribute - confirm against the bs4 docs.
        course_uls = course_div[0].find_all('ul', class_=False)

        # Get and store all the course links
        for course_ul in course_uls:
            course_lis = course_ul.find_all('li', class_=False)
            for course_li in course_lis:
                a_elements = course_li.find_all('a')
                # Skip list items with no anchor or no href instead of
                # crashing with IndexError / TypeError.
                href_value = a_elements[0].get('href') if a_elements else None
                if href_value:
                    course_links.append(header + href_value)
    else:
        print("Failed to find the course index (div.az_sitemap) on the page.")
else:
    print("Failed to retrieve the page. Status code:", response.status_code)

In [15]:
def get_course_content(url, timeout=30):
    """Scrape one course-listing page and return the courses found on it.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A tuple ``(course_name_numbers, course_name_description)`` of two
        lists in matching order. The first holds "<number> <name>" labels,
        the second "<number> <name>. <description>" strings. Both lists are
        empty when the page cannot be retrieved.
    """
    course_name_description, course_name_numbers = [], []

    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as error:
        # Report and move on so that one unreachable page does not abort a
        # scrape that loops over many pages.
        print("Failed to retrieve the page. Error:", error)
        return course_name_numbers, course_name_description

    if response.status_code != 200:
        print("Failed to retrieve the page. Status code:", response.status_code)
        return course_name_numbers, course_name_description

    soup = BeautifulSoup(response.text, 'html.parser')
    course_listings = soup.find_all('div', class_='courseblock')

    for course in course_listings:
        p_tags = course.find_all('p')
        if not p_tags:
            # Nothing to parse in this block.
            continue

        # The first <p> is the title line: the number is the text before the
        # first non-breaking space, the name is the second piece after
        # splitting on the double-space separator.
        course_info = p_tags[0].text
        title_parts = course_info.split('  ')
        if len(title_parts) < 2:
            print("Skipping course block with unexpected title:", repr(course_info))
            continue
        course_number = course_info.split('\xa0')[0]
        course_name = title_parts[1]

        # The second <p>, when present, carries the description; only its
        # first line is kept. Blocks without one get an empty description.
        description = p_tags[1].text.split('\n')[0] if len(p_tags) > 1 else ""

        course_name_numbers.append(course_number + " " + course_name)
        course_name_description.append(course_number + " " + course_name + ". " + description)

    return course_name_numbers, course_name_description


In [22]:
# Scrape every department page and pool the results into two parallel lists:
# the "<number> <name>" labels and the full "<label>. <description>" texts.
all_courses_name_numbers = []
all_courses_name_description = []

for course_link in course_links:
    course_name_numbers, course_name_description = get_course_content(course_link)
    # In-place concatenation keeps page order and course order within a page.
    all_courses_name_numbers += course_name_numbers
    all_courses_name_description += course_name_description

In [59]:
# Load the pretrained tokenizer and encoder from the same checkpoint so that
# their vocabularies match.
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

Downloading (…)solve/main/vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 194MB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 28.0/28.0 [00:00<00:00, 329kB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 570/570 [00:00<00:00, 5.32MB/s]
Downloading model.safetensors: 100%|██████████| 440M/440M [00:24<00:00, 18.0MB/s] 


In [None]:
# Encode the descriptions in batches. Tokenizing and running the whole catalog
# in one forward pass pads every text to the longest one and holds all
# activations in memory at once, which can exhaust RAM.
batch_size = 32
embedding_batches = []

# Inference only: make sure dropout is disabled.
model.eval()

for batch_start in range(0, len(all_courses_name_description), batch_size):
    batch_texts = all_courses_name_description[batch_start:batch_start + batch_size]
    inputs = tokenizer(batch_texts, return_tensors='pt', padding=True, truncation=True)

    # Forward pass through the model to get embeddings
    with torch.no_grad():
        outputs = model(**inputs)

    # Keep the hidden state at sequence position 0 (the [CLS] token) as the
    # fixed-size representation of each text.
    embedding_batches.append(outputs.last_hidden_state[:, 0, :])

if embedding_batches:
    embeddings = torch.cat(embedding_batches, dim=0)
else:
    # Nothing was scraped: keep `embeddings` a valid (0, hidden_size) tensor.
    embeddings = torch.empty((0, model.config.hidden_size))

# Save the embeddings
# Keys are the "<number> <name>" labels; a repeated label keeps only the last
# embedding because dict keys are unique.
mapping = dict(zip(all_courses_name_numbers, embeddings.tolist()))
with open('mapping.json', 'w') as f:
    json.dump(mapping, f)

In [65]:
# Print every embedding vector, numbered from 1 in input order.
for sentence_no, embedding in enumerate(embeddings, start=1):
    print(f"Embeddings for sentence {sentence_no}:")
    print(embedding)

Embeddings for sentence 1:
tensor([-2.5101e-01, -2.7622e-01,  3.0551e-01, -3.0262e-01,  2.5604e-01,
        -4.5524e-01, -4.0104e-02,  2.7705e-01, -5.3633e-02, -6.9081e-01,
        -2.8824e-01,  4.0504e-01,  4.0530e-01,  9.7181e-02,  5.0694e-02,
         2.2388e-01,  5.6847e-01,  5.1658e-01,  5.0710e-02, -6.0547e-01,
        -7.0241e-01, -8.4299e-01,  9.4212e-02,  6.5429e-02,  3.1495e-01,
         4.1597e-01, -5.0355e-02,  3.9351e-02, -9.6031e-02, -4.1622e-01,
        -3.7771e-02, -1.0787e-01,  4.9569e-02, -3.4627e-01,  6.4969e-01,
         2.3960e-01, -1.3478e-01, -3.6807e-01,  8.9050e-02, -6.3756e-02,
        -4.3425e-01,  5.2534e-01,  1.9966e-01, -2.1629e-01, -3.0294e-01,
         2.3776e-01, -4.5784e+00, -5.1075e-01, -2.2224e-01, -1.2951e-01,
         3.3740e-01, -1.9536e-01, -8.3358e-01,  8.9677e-02,  6.3788e-01,
         2.1061e-01, -5.4546e-01, -3.5018e-01,  2.6760e-01, -5.9226e-01,
         2.0088e-01, -6.9501e-02, -1.2708e-01,  3.9667e-01, -3.8212e-02,
         1.1127e-01,  4.