## Gesamtes Vorlesungsverzeichnis laden

Extrahiert Termine, Planpunkte, LV-ID und LV-Leiter/in aus dem Vorlesungsverzeichnis (VVZ) der WU.  
Kombiniert Planpunkte mit ECTS.
Output:  
- vvzKurse.pkl - gepickelte Python-Liste mit den Rohdaten des VVZ (ein Ergebnis-Tupel pro abgefragter LV-ID)
- vvzModel.pkl - pandas dataframe mit Daten erweitert um Planpunkt und ECTS. LP und Greedy greifen auf diesen Dataframe zurück

In [201]:
%pip install beautifulsoup4 pqdm pulp networkx

Collecting networkx
  Downloading networkx-3.4.2-py3-none-any.whl.metadata (6.3 kB)
Downloading networkx-3.4.2-py3-none-any.whl (1.7 MB)
   ---------------------------------------- 0.0/1.7 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.7 MB ? eta -:--:--
    --------------------------------------- 0.0/1.7 MB 660.6 kB/s eta 0:00:03
   ------- -------------------------------- 0.3/1.7 MB 3.3 MB/s eta 0:00:01
   ------------------------ --------------- 1.1/1.7 MB 7.4 MB/s eta 0:00:01
   ---------------------------------------- 1.7/1.7 MB 10.0 MB/s eta 0:00:00
Installing collected packages: networkx
Successfully installed networkx-3.4.2
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: C:\Users\Philipp\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [18]:
import logging
import re
from datetime import datetime

import bs4
import pandas as pd
import requests
from pqdm.threads import pqdm

In [None]:
# URL template for the detail page of a single course in the WU course catalogue (VVZ).
# "$lvid" is a placeholder that fetch_vorlesung() replaces with the course ID.
# NOTE(review): "S=25S" presumably selects the semester (summer term 2025) - confirm
# before reusing this notebook for another term.
url = "https://vvz.wu.ac.at/cgi-bin/vvz.pl?C=S&LANG=DE&U=H&S=25S&LV=3&L2=S&L3=S&T=&L=&I=$lvid&JOIN=AND"


def get_planpunkt_id(planpunkt_url):
    """Return the numeric Planpunkt ID embedded in a VVZ link.

    The ID is the run of digits between "P=" and the following ";". When the
    link contains several such parameters the first one wins.

    Args:
        planpunkt_url: The href of a Planpunkt link.

    Returns:
        The ID as a string of digits, or None when the link carries no
        "P=<digits>;" parameter or is not a string at all (e.g. None).
    """
    try:
        found = re.search(r"P=([0-9]+);", planpunkt_url)
    except Exception:
        # Non-string input (for instance a missing href) counts as "no ID".
        return None

    if found is None:
        return None
    return found.group(1)


def extract_vorlesung(id: any, soup: bs4.BeautifulSoup):
    """Collect the course details from a parsed VVZ course page.

    Every table of the page is scanned for two kinds of content:

    * Label/value cell pairs: the cell to the right of "Planpunkte Bachelor"
      supplies the Planpunkt links, the cell to the right of "LV-Leiter/in"
      supplies the lecturer text.
    * A table whose very first cell reads "Termine": each later row with at
      least five cells becomes one entry in "dates".

    Args:
        id: Course ID; copied unchanged into the result.
        soup: Parsed HTML of the course page.

    Returns:
        A dict with the keys
        "id", "dates" (list of dicts with "start", "end", "room", "info"),
        "lvLeiter" (str, or None if no such cell was found) and
        "planpunkte" (list of dicts with "text", "href", "id").

    Raises:
        ValueError: If the date or time cell of a "Termine" row does not have
            the expected "dd.mm.yyyy" / "HH:MM-HH:MM" form.
    """
    course = {
        "id": id,
        "dates": [],
        "lvLeiter": None,
        "planpunkte": [],
    }

    for table in soup.find_all('table'):
        # Part 1: label/value pairs. Each cell is paired with its right-hand
        # neighbour, so the last cell of a row can never act as a label.
        for tr in table.find_all('tr'):
            tds = tr.find_all('td')
            for label_cell, value_cell in zip(tds, tds[1:]):
                label = label_cell.text.strip()
                if label == "Planpunkte Bachelor":
                    links = []
                    for anchor in value_cell.find_all('a'):
                        href = anchor.get("href")
                        links.append({
                            "text": anchor.text.strip(),
                            "href": href,
                            "id": get_planpunkt_id(href),
                        })
                    course["planpunkte"] = links
                elif label == "LV-Leiter/in":
                    course["lvLeiter"] = value_cell.text.strip()

        # Part 2: the schedule table, recognised by "Termine" in its first cell.
        header_row = table.find('tr')
        if not header_row:
            continue
        header_cell = header_row.find('td')
        if not header_cell or header_cell.text.strip() != "Termine":
            continue

        for tr in table.find_all('tr')[1:]:  # the first row is the header
            tds = tr.find_all('td')
            if len(tds) < 5:
                continue

            day = datetime.strptime(tds[1].text.strip(), "%d.%m.%Y")
            day_prefix = day.strftime('%d.%m.%Y')

            # The time cell looks like "HH:MM-HH:MM Uhr".
            span_text = tds[2].text.strip().replace(" Uhr", "")
            begin_text, finish_text = span_text.split('-')
            begin = datetime.strptime(f"{day_prefix} {begin_text.strip()}", "%d.%m.%Y %H:%M")
            finish = datetime.strptime(f"{day_prefix} {finish_text.strip()}", "%d.%m.%Y %H:%M")

            note = tds[3].text.strip()

            # Only the leading run of letters, digits and dots is kept as room code.
            room_found = re.match(r"([A-Za-z0-9.]+)", tds[4].text.strip())
            if room_found:
                room_code = room_found.group(1)
            else:
                room_code = "Unknown"

            course["dates"].append({
                "start": begin,
                "end": finish,
                "room": room_code,
                "info": note,
            })

    return course


def fetch_vorlesung(id, timeout=30):
    """Download and parse the VVZ detail page of one course ID.

    Args:
        id: Course ID that is substituted for "$lvid" in the module-level
            `url` template.
        timeout: Seconds to wait for the server before the request is
            abandoned and reported as a failure.

    Returns:
        A 3-tuple (status, payload, error):

        * (True, course_dict, None) - page found; course_dict comes from
          extract_vorlesung().
        * (None, id, None) - the server answered that no such course exists.
        * (False, id, None) - the server answered with a non-200 status.
        * (False, id, exception) - the request or the parsing raised.
    """
    current_url = url.replace("$lvid", str(id))
    try:
        # Without a timeout a single stalled connection would block its
        # worker thread indefinitely and the whole crawl would never finish.
        page = requests.get(current_url, timeout=timeout)
        if page.status_code != 200:
            return (False, id, None)

        soup = bs4.BeautifulSoup(page.text, 'html.parser')
        if "Keine Lehrveranstaltungen gefunden" in soup.get_text():
            return (None, id, None)

        return (True, extract_vorlesung(id, soup), None)
    except Exception as e:
        # Deliberately broad: one broken page must not abort the crawl. The
        # exception is handed back to the caller for later inspection.
        return (False, id, e)


# Probe every candidate course ID from 1 to 9999 with 8 parallel jobs (thread-based pqdm).
# Each entry of `result` is the (status, payload, error) tuple returned by fetch_vorlesung.
result = pqdm(range(1, 10_000), fetch_vorlesung, n_jobs=8)

  from .autonotebook import tqdm as notebook_tqdm
QUEUEING TASKS | : 100%|██████████| 9999/9999 [00:00<00:00, 42348.57it/s]
PROCESSING TASKS | : 100%|██████████| 9999/9999 [11:08<00:00, 14.96it/s]  
COLLECTING RESULTS | : 100%|██████████| 9999/9999 [00:00<00:00, 834703.56it/s]


In [6]:
import pickle

# Persist the raw crawl results (the list of fetch_vorlesung tuples) to disk.
with open("../0_daten/vvzKurse.pkl", "wb") as f:
    pickle.dump(result, f)

In [7]:
import pickle

# Reload the stored crawl results into `result`.
# NOTE: pickle.load can execute arbitrary code contained in the file - only load
# a vvzKurse.pkl that was written by this notebook itself.
with open("../0_daten/vvzKurse.pkl", "rb") as f:
    result = pickle.load(f)

In [9]:
# Keep the parsed course dict of every lookup whose status is True; entries for
# missing courses (status None) and failed requests (status False) are dropped.
only_existing_courses = [
    payload
    for status, payload, _error in result
    if status == True
]

In [16]:
def fetch_planpunkt(p_url, timeout=30):
    """Fetch a Planpunkt page and return the ECTS value stated on it.

    The page's <span> elements are searched in document order; the first span
    whose text contains exactly one "<digits> ECTS" occurrence supplies the
    value.

    Args:
        p_url: Path of the Planpunkt page relative to https://vvz.wu.ac.at
            (the href of a Planpunkt link).
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The ECTS value as int, or None when the page states none, the server
        answers with a non-200 status, or the request/parsing fails. Failures
        are logged as warnings; they are never returned as a value, so the
        result can be stored directly as an ECTS entry.

    NOTE(review): the pattern captures whole numbers only; a value written as
    "7,5 ECTS" would be read as 5 - confirm no Planpunkt uses fractional ECTS.
    """
    log = logging.getLogger(__name__)
    try:
        # Without a timeout a stalled connection would hang the whole loop.
        page = requests.get(f"https://vvz.wu.ac.at{p_url}", timeout=timeout)
        if page.status_code != 200:
            log.warning("Planpunkt %s: HTTP status %s", p_url, page.status_code)
            return None

        soup = bs4.BeautifulSoup(page.text, 'html.parser')
        for span in soup.select("span"):
            res = re.findall(r"([0-9]+) ECTS", span.text)
            if len(res) == 1:
                return int(res[0])

        return None
    except Exception:
        # Best effort: one unreachable or malformed page must not stop the
        # crawl, but the failure is logged instead of silently swallowed.
        log.warning("Planpunkt %s could not be fetched", p_url, exc_info=True)
        return None


# Map every distinct Planpunkt ID to the result of fetch_planpunkt for its page.
# NOTE(review): `tqdm` is not imported in any visible cell; this loop presumably
# relies on an earlier `import tqdm` in the kernel session - confirm.
planpunkte = {}

for course in tqdm.tqdm(only_existing_courses):
    for planpunkt in course["planpunkte"]:
        # A Planpunkt can be listed by several courses; fetch its page only
        # the first time its ID is seen.
        if planpunkt["id"] not in planpunkte:
            planpunkte[planpunkt["id"]] = fetch_planpunkt(planpunkt["href"])

100%|██████████| 2154/2154 [00:39<00:00, 54.38it/s] 


In [28]:
def _pick_ects(planpunkt_ids):
    """Return the ECTS of the first Planpunkt with a known integer value, else -1."""
    for planpunkt_id in planpunkt_ids:
        ects = planpunkte.get(str(planpunkt_id))
        # A lookup can be missing, None (page states no ECTS) or a non-numeric
        # error marker; none of these may end up in the "ects" column, so try
        # the next Planpunkt of the course instead.
        if isinstance(ects, int):
            return ects
    return -1


# One row per existing course, built from the dicts of extract_vorlesung.
vvzModel = pd.DataFrame(only_existing_courses)

# Flatten the Planpunkt link dicts to a plain list of IDs per course.
vvzModel["planpunkte_ids"] = vvzModel["planpunkte"].apply(
    lambda x: [p["id"] for p in x] if isinstance(x, list) else []
)
# -1 marks courses for which no ECTS value could be determined.
vvzModel['ects'] = vvzModel['planpunkte_ids'].apply(_pick_ects)

vvzModel.to_pickle("../0_daten/vvzModel.pkl")