In [None]:
import requests
from bs4 import BeautifulSoup
import json

In [None]:
# Optional: collect every link listed under the "ptList" element of the index
# page and show which of them are marked "(Full-time)".

# new_url = "http://www.drps.ed.ac.uk/23-24/dpt/drps_inf.htm"
# old_url = "http://www.drps.ed.ac.uk/22-23/dpt/drps_inf.htm"

URL = "http://www.drps.ed.ac.uk/23-24/dpt/drps_inf.htm"
base_url_c = "http://www.drps.ed.ac.uk/23-24/dpt/"
page = requests.get(URL)
soup = BeautifulSoup(page.content, "html.parser")
result = soup.find(id="ptList").find_all("a")

# Link text -> absolute URL (hrefs on the page are relative to base_url_c).
url_list = {}

for el in result:
    url_list[el.text.strip()] = base_url_c + el['href']
    print(el.text.strip()+ base_url_c + el['href'])

# Print each full-time entry with its OWN url; reading el['href'] here would
# repeat the url of the last link visited by the loop above.
for c in url_list:
    if "(Full-time)" in c:
        print(c + " : " + url_list[c])

In [None]:
# Root that every scraped page and every relative course link is resolved
# against. Changing the academic year only requires editing this one string.
base_url = "http://www.drps.ed.ac.uk/23-24/dpt/"

# Pages to scrape, expressed as file names under base_url so the list cannot
# drift out of sync with it.
url_list = [
    base_url + page_name
    for page_name in (
        "ptmscadein1f.htm",
        "ptmscadtfc1f.htm",
        "ptmscaintl1f.htm",
        "ptmsccogsc1f.htm",
        "ptmsccmpsi1f.htm",
        "ptmsccsptr1f.htm",
        "ptmscdatsc1f.htm",
        "ptmscdesin1f.htm",
        "ptmschpcmp1f.htm",
        "ptmschpcds1f.htm",
    )
]

# Shared course_code -> details mapping, filled in by add_course().
course_catalog = {}

In [None]:
# Get semester
def get_semester(url, timeout=30):
    """Download a course page and return the text of its "Course Start" field.

    Args:
        url: Address of the course page to download.
        timeout: Seconds to wait for the server before requests gives up,
            so one unresponsive page cannot stall the whole run.

    Returns:
        The text of the ``<td>`` that follows the ``<td>`` whose string is
        exactly "Course Start", or "N/A" when either cell is missing.

    Raises:
        requests.exceptions.RequestException: the download failed or timed out.
    """
    c_page = requests.get(url, timeout=timeout)
    c_soup = BeautifulSoup(c_page.content, "html.parser")

    # Explicit None checks instead of a bare `except:` so that only the
    # "field not on this page" case maps to "N/A"; anything else surfaces.
    label_cell = c_soup.find("td", string="Course Start")
    if label_cell is None:
        return "N/A"

    value_cell = label_cell.find_next_sibling("td")
    if value_cell is None:
        return "N/A"

    return value_cell.text

In [None]:
# Add course details to the catalog
def add_course(course_code, course_name, url, credits):
    """Register a course in ``course_catalog`` the first time its code is seen.

    Args:
        course_code: Key under which the course is stored.
        course_name: Display name of the course.
        url: Address of the course page; also used to look up the semester.
        credits: Credit value stored with the course.

    Returns:
        None. A new entry is added to the module-level ``course_catalog``;
        an existing entry for the same code is left untouched.
    """
    # Known course: keep the existing entry and skip the semester download.
    if course_code in course_catalog:
        return

    blank_note = {
        "message": "",
        "url": ""
    }
    course_catalog[course_code] = {
        "course_name": course_name,
        "url": url,
        "credits": credits,
        # get_semester() fetches the course page, so it runs once per new code.
        "semester": get_semester(url),
        "note": blank_note,
    }


In [None]:
def get_course_info(url, timeout=30):
    """Scrape one page and return its details as a dict ready for JSON output.

    Every course found on the page is also registered in the shared catalog
    through ``add_course`` (which downloads the course page the first time a
    course code is seen).

    Args:
        url: Address of the page to download and parse.
        timeout: Seconds to wait for the server before requests gives up.

    Returns:
        dict with these keys, in this order:
        ``course_name``, ``course_code``, ``year``, ``diet_note``,
        ``compulsory_courses`` (position -> course code),
        ``compulsory_courses_total_credit``,
        ``optional_courses_total_credit``,
        ``optional_courses`` (position -> rule dict holding ``course_type``,
        ``course_note``, ``credit_range`` and ``courses``), and
        ``optional_courses_levls`` (the trailing rule entries that were not
        matched to a course list; the key spelling is part of the output
        format and is kept as is).

    Raises:
        requests.exceptions.RequestException: the download failed or timed out.
        AttributeError, IndexError, KeyError, ValueError: the page does not
            have the layout this parser expects.
    """
    info = {}

    page = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(page.content, "html.parser")

    # ------------------ Basic info from the page header
    header_lines = soup.find(
        "h2", {"class": "page-header"}).text.strip().split('\n')
    # Line 1: everything after the first three words is the name.
    info["course_name"] = " ".join(header_lines[0].strip().split()[3:])
    # Line 2: the code, with any parentheses removed.
    info["course_code"] = header_lines[1].strip().replace(
        '(', '').replace(')', '')

    # Looked up by class only. An id filter would have to live in the same
    # attrs dict; a third positional argument is taken as `recursive`.
    year_heading = soup.find("h3", {"class": "h2"})
    year_lines = [
        line for line in year_heading.text.strip().split('\n') if line]
    info["year"] = year_lines[-2].strip().split()[2].replace(",", "")

    # course note
    info["diet_note"] = soup.find(
        "div", {"data-uoeid": "diet-help"}).text.strip()

    # ------------------ Compulsory courses
    info["compulsory_courses"] = {}
    compulsory_cards = soup.find("div", {"class": "panel panel-primary"}).find_all(
        'div', class_='dpt-course-card dpt-flex__item dpt-flex')
    compulsory_total = 0
    for position, card in enumerate(compulsory_cards, start=1):
        # Card text lines: first = name, second to last = code,
        # last = "<credits> ...".
        card_lines = card.text.strip().split('\n')
        course_code = card_lines[-2]
        course_credits = int(card_lines[-1].split(" ")[0])

        # The first link in the card points at the course page.
        course_href = card.find_all("a")[0]['href']
        add_course(course_code, card_lines[0],
                   base_url + course_href, course_credits)

        info["compulsory_courses"][position] = str(course_code)
        compulsory_total += course_credits

    info["compulsory_courses_total_credit"] = compulsory_total

    # ------------------ Optional course rules
    # Third word on the last line of the block header is the credit total.
    info["optional_courses_total_credit"] = int(soup.find(
        "header", {"class": "panel-heading dpt-block__header"}
    ).text.strip().split('\n')[-1].split()[2])

    # All rule lookups below go through the first "dpt-block__contents" div.
    optional_block = soup.find("div", {"class": "dpt-block__contents"})
    opt_titles = optional_block.find_all("h3", {"h5 dpt-rule__title"})
    opt_credits = optional_block.find_all("span", {"text-nowrap"})

    # Help notes, keyed by the stripped text of the nearest preceding <h3>.
    rule_notes = {}
    for help_box in soup.find_all("div", {"class": "dpt-help dpt-help--rule"}):
        note_lines = help_box.get_text().strip().split('\n')
        # The second line of the help text is used to locate the text node
        # inside the page, and from there the heading it belongs to.
        anchor_text = note_lines[1]
        heading = soup.find(
            string=anchor_text).parent.parent.find_previous('h3').text.strip()
        # old: course_note = el.text.strip().split('\n')[1]
        rule_notes[heading] = ' '.join(note_lines[1:])

    optional_courses = {}
    for position, (title, credit) in enumerate(zip(opt_titles, opt_credits), start=1):
        title_text = title.text.strip()
        # e.g. words[2] and words[4] hold the min and max credits of the rule
        # -- exact wording of the span is assumed, not checked.
        credit_words = credit.text.strip().split('\n')[0].split()
        optional_courses[position] = {
            "course_type": title_text.split('\n')[0],
            # Notes are keyed by stripped heading text, so look them up the
            # same way; rules without a help box get an empty note.
            "course_note": rule_notes.get(title_text, ""),
            "credit_range": {
                "min_credit": int(credit_words[2]),
                "max_credit": int(credit_words[4])
            }}

    # ------------------ Courses listed under each rule
    course_rows = []   # per rule: non-empty text lines of its course list
    course_urls = []   # absolute course urls, flattened across all rules
    rule_footers = optional_block.find_all(
        "div", {"class": "dpt-flex dpt-rule__courses panel-footer"})
    for footer in rule_footers:
        course_urls.extend(base_url + link['href']
                           for link in footer.find_all("a"))
        course_rows.append(
            [line for line in footer.text.strip().split('\n') if line])

    # --------------------- Add optional courses to the catalog ---------------------

    rules_with_courses = {}
    # Walks course_urls in step with the courses; this relies on each course
    # contributing exactly one link and three text lines (name, code, credits).
    url_cursor = 0
    for rule_no, rows in enumerate(course_rows, start=1):
        rule = optional_courses[rule_no]
        rule["courses"] = {}
        rules_with_courses[rule_no] = rule

        for course_no, start in enumerate(range(0, len(rows), 3), start=1):
            course_code = rows[start + 1]
            add_course(course_code, rows[start], course_urls[url_cursor],
                       int(rows[start + 2].split(" ")[0]))
            rule["courses"][course_no] = str(course_code)
            url_cursor += 1

    info["optional_courses"] = rules_with_courses

    # Rules left over after the ones paired with a course list, renumbered
    # from 1.
    leftover_count = len(optional_courses) - len(course_rows)
    level_rules = {}
    for offset in range(leftover_count):
        level_rules[offset + 1] = optional_courses[offset + 1 + len(course_rows)]

    info["optional_courses_levls"] = level_rules

    return info


In [None]:
def write_to_json(filename, data):
    """Serialise ``data`` as 4-space-indented JSON into ``<filename>.json``.

    Args:
        filename: Output path without the ".json" extension.
        data: Any object accepted by ``json.dumps``.

    Returns:
        None. The target file is created or overwritten.
    """
    # Serialise first so that a failure leaves any existing file untouched.
    serialized = json.dumps(data, indent=4)

    target_path = f"{filename}.json"
    with open(target_path, "w") as handle:
        handle.write(serialized)

In [None]:
def write_course_catalog(filename, data):
    """Write the course catalog to ``<filename>.json``.

    Kept as a separate name for readability at the call site; the actual
    writing is delegated to ``write_to_json`` so that both outputs share one
    implementation and one file format.

    Args:
        filename: Output path without the ".json" extension.
        data: The catalog mapping to serialise.

    Returns:
        None. The target file is created or overwritten.
    """
    write_to_json(filename, data)

In [None]:
# Scrape every page in url_list and write one JSON file per page, named after
# the code found on that page.
for url in url_list:
    data = get_course_info(url)
    write_to_json(filename=data["course_code"], data=data)
    # get_course_info() adds courses to course_catalog as a side effect, so
    # the catalog file is rewritten after each page with everything seen so far.
    write_course_catalog(filename="course_catalog", data=course_catalog)