## Fetch entire VVZ and get dates

In [1]:
%pip install beautifulsoup4 pqdm





[notice] A new release of pip is available: 24.0 -> 25.0.1
[notice] To update, run: C:\Users\Philipp\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [18]:
import re
import bs4
import tqdm
import requests

import pandas as pd

from datetime import datetime
from pqdm.threads import pqdm

In [None]:
url = "https://vvz.wu.ac.at/cgi-bin/vvz.pl?C=S&LANG=DE&U=H&S=25S&LV=3&L2=S&L3=S&T=&L=&I=$lvid&JOIN=AND"


def get_planpunkt_id(planpunkt_url):
    try:
        return re.findall(r"P=([0-9]+);", planpunkt_url)[0]
    except Exception:
        return None


def extract_vorlesung(id: any, soup: bs4.BeautifulSoup):
    tables = soup.find_all('table')

    vvzInfo = {
        "id": id,
        "dates": [],
        "lvLeiter": None,
        "planpunkte": []
    }

    for table in tables:
        for row in table.find_all('tr'):
            cells = row.find_all('td')
            for idx, cell in enumerate(cells):
                text = cell.text.strip()
                if text == "Planpunkte Bachelor" and idx + 1 < len(cells):
                    planpunkte_links = cells[idx + 1].find_all('a')
                    planpunkte = [
                        {
                            "text": a.text.strip(),
                            "href": a.get("href"),
                            "id": get_planpunkt_id(a.get("href"))
                        }
                        for a in planpunkte_links
                    ]
                    vvzInfo["planpunkte"] = planpunkte
                if text == "LV-Leiter/in" and idx + 1 < len(cells):
                    lv_leiter = cells[idx + 1].text.strip()
                    vvzInfo["lvLeiter"] = lv_leiter

        first_tr = table.find('tr')

        if first_tr:
            first_td = first_tr.find('td')
            if first_td and first_td.text.strip() == "Termine":
                for row in table.find_all('tr')[1:]:  # Skipping the header row
                    cells = row.find_all('td')

                    if len(cells) >= 5:
                        date_str = cells[1].text.strip()
                        date_obj = datetime.strptime(date_str, "%d.%m.%Y")

                        time_str = cells[2].text.strip().replace(" Uhr", "")
                        start_time_str, end_time_str = time_str.split('-')

                        start_time = datetime.strptime(f"{date_obj.strftime('%d.%m.%Y')} {start_time_str.strip()}", "%d.%m.%Y %H:%M")
                        end_time = datetime.strptime(f"{date_obj.strftime('%d.%m.%Y')} {end_time_str.strip()}", "%d.%m.%Y %H:%M")

                        info = cells[3].text.strip()

                        room_str = cells[4].text.strip()
                        room_match = re.match(r"([A-Za-z0-9.]+)", room_str)
                        room = room_match.group(1) if room_match else "Unknown"

                        vvzInfo["dates"].append({
                            "start": start_time,
                            "end": end_time,
                            "room": room,
                            "info": info
                        })
    return vvzInfo


def fetch_vorlesung(id):
    current_url = url.replace("$lvid", str(id))
    try:
        page = requests.get(current_url)
        if page.status_code == 200:
            soup = bs4.BeautifulSoup(page.text, 'html.parser')

            if "Keine Lehrveranstaltungen gefunden" in soup.get_text():
                return (None, id, None)
            else:
                return (True, extract_vorlesung(id, soup), None)
        else:
            return (False, id, None)
    except Exception as e:
        return (False, id, e)


result = pqdm(range(1, 10_000), fetch_vorlesung, n_jobs=8)

  from .autonotebook import tqdm as notebook_tqdm
QUEUEING TASKS | : 100%|██████████| 9999/9999 [00:00<00:00, 42348.57it/s]
PROCESSING TASKS | : 100%|██████████| 9999/9999 [11:08<00:00, 14.96it/s]  
COLLECTING RESULTS | : 100%|██████████| 9999/9999 [00:00<00:00, 834703.56it/s]


In [6]:
import pickle

with open("./vvzKurse.pkl", "wb") as f:
    pickle.dump(result, f)

In [7]:
import pickle

with open("./vvzKurse.pkl", "rb") as f:
    result = pickle.load(f)

In [9]:
only_existing_courses = [v[1] for v in result if v[0] == True]

In [16]:
def fetch_planpunkt(p_url):
    try:
        page = requests.get(f"https://vvz.wu.ac.at{p_url}")
        if page.status_code == 200:
            soup = bs4.BeautifulSoup(page.text, 'html.parser')

            for span in soup.select("span"):
                res = re.findall(r"([0-9]+) ECTS", span.text)
                if len(res) == 1:
                    return int(res[0])
            
            return None
        else:
            return (False, id, "code!=200")
    except Exception as e:
        return (False, id, e)


planpunkte = {}

for course in tqdm.tqdm(only_existing_courses):
    for planpunkt in course["planpunkte"]:
        if planpunkt["id"] in planpunkte:
            continue

        planpunkte[planpunkt["id"]] = fetch_planpunkt(planpunkt["href"])

100%|██████████| 2154/2154 [00:39<00:00, 54.38it/s] 


In [17]:
planpunkte

{'6014': 4,
 '6016': 4,
 '6023': 4,
 '6024': 4,
 '6058': 3,
 '6795': 3,
 '6026': 3,
 '6692': 4,
 '6794': 3,
 '6590': 5,
 '9485': 4,
 '6043': 4,
 '6025': 4,
 '6691': 4,
 '6910': 4,
 '6046': 4,
 '6055': 4,
 '6591': 5,
 '6077': 4,
 '6078': 4,
 '6079': 4,
 '9539': 4,
 '9540': 4,
 '9541': 4,
 '6075': 4,
 '9496': 4,
 '9537': 4,
 '6076': 4,
 '9500': 4,
 '9538': 4,
 '9505': 4,
 '9507': 4,
 '6057': 4,
 '6453': 4,
 '9160': 8,
 '9503': 4,
 '6259': 4,
 '9457': 4,
 '5109': 4,
 '6021': 4,
 '5118': 4,
 '9225': 4,
 '6065': 4,
 '6909': 10,
 '9493': 8,
 '5136': 3,
 '6911': 3,
 '5059': 4,
 '6059': 4,
 '9491': 4,
 '6053': 4,
 '6011': 6,
 '6232': 4,
 '6216': 4,
 '6267': 3,
 '9512': 3,
 '9458': 4,
 '6218': 4,
 '6476': 8,
 '5886': 3,
 '9294': 2,
 '5298': 8,
 '6256': 8,
 '6260': 4,
 '9488': 4,
 '9536': 8,
 '6345': 4,
 '6689': 4,
 '5171': 4,
 '6366': 4,
 '9489': 4,
 '9502': 8,
 '9501': 5,
 '6337': 4,
 '6262': 4,
 '6234': 4,
 '6233': 4,
 '6317': 4,
 '6686': 4,
 '6236': 4,
 '9498': 8,
 '9279': 4,
 '9690': 4,
 '6

In [28]:
vvzModel = pd.DataFrame(only_existing_courses)

In [31]:
vvzModel["planpunkte_ids"] = vvzModel["planpunkte"].apply(
    lambda x: [p["id"] for p in x] if isinstance(x, list) else []
)
vvzModel['ects'] = vvzModel['planpunkte_ids'].apply(
    lambda x: next((planpunkte.get(str(id), -1) for id in x), -1) if x else -1
)

In [37]:
print(vvzModel[:5])

     id                                              dates  \
0  4001  [{'start': 2025-03-14 11:00:00, 'end': 2025-03...   
1  4002  [{'start': 2025-05-07 08:00:00, 'end': 2025-05...   
2  4003  [{'start': 2025-03-10 16:30:00, 'end': 2025-03...   
3  4004  [{'start': 2025-03-12 14:00:00, 'end': 2025-03...   
4  4005  [{'start': 2025-03-10 09:30:00, 'end': 2025-03...   

                                            lvLeiter  \
0                                 Dr. Margit Kastner   
1                                 Dr. Margit Kastner   
2  Univ.Prof. Dr. Gustaf Neumann, Univ.Prof. Dipl...   
3                                   Niklas Hey, MSc.   
4                 Dipl.-Ing. Robert Bajons, MSc (WU)   

                                          planpunkte planpunkte_ids  ects  
0  [{'text': 'Marketing', 'href': '/cgi-bin/vvz.p...         [6014]   4.0  
1  [{'text': 'Marketing', 'href': '/cgi-bin/vvz.p...         [6014]   4.0  
2  [{'text': 'Betriebliche Informationssysteme I'...         [

In [39]:
winf_cbk = [
    "5105", # Jahresabschluss und Unternehmensberichte
    "5107", # Global Business
    "5106", # Grundlagen der Wirtschaftsinformatik
    "5108", # Funktionsübergreifende Betriebswirtschaftslehre - Prozesse und Entscheidungen
    "5056", # Mikroökonomik (6056 ist Angewandte Mikroökonomik)
    "5059", # Makroökonomik (6059 ist Internationale Makroökonomik)
    "5117", # Zukunftsfähiges Wirtschaften: Vertiefung und Anwendung
    "5109", # Wirtschaftsprivatrecht (6021 ist Wirtschaft im rechtlichen Kontext - Wirtschaftsprivatrecht I)
    "6023", # Mathematik
    "6024", # Statistik
    "5136", # Standards wissenschaftlichen Arbeitens und Zitierens (6911 ist Grundlagen wissenschaftlichen Arbeitens)
]

# Vorraussetzungen um Kurse aus dem Hauptstudium zu belegen:
# mind 20 ECTS aus dem CBK

winf_hauptstudium = [
    "6012", # Beschaffung, Logistik, Produktion
    "5155", # Grundlagen und Methoden des Data und Knowledge Engineering
    "9485", # Algorithmisches Denken und Programmierung
    "5158", # Rechnernetzwerke und Datenübermittlung: Grundlagen und Sicherheit
    "5160", # Design von betrieblichen Informationssystemen
    "5161", # Governance und Management von IT-Projekten
    "5162", # Forschungsmethoden der Wirtschaftsinformatik
]

In [None]:
import pandas as pd
from datetime import datetime, timedelta
import ast

def greedy_schedule(df, cbk_ids, hs_ids, excluded_days=["Monday", "Tuesday"]):
    df = df.copy()
    df['dates'] = df['dates'].apply(lambda d: ast.literal_eval(d) if isinstance(d, str) else d)

    def is_valid_day(session):
        return all(s['start'].strftime('%A') not in excluded_days for s in session)

    def get_first_start(session):
        return min(s['start'] for s in session)

    def get_last_end(session):
        return max(s['end'] for s in session)

    # Parse session times
    df['first_start'] = df['dates'].apply(get_first_start)
    df['last_end'] = df['dates'].apply(get_last_end)

    # Filter out invalid days
    df = df[df['dates'].apply(is_valid_day)].sort_values(by='first_start')

    selected_courses = []
    cbk_ects = 0
    last_end_time = None

    for _, row in df.iterrows():
        is_cbk = any(plan_id in cbk_ids for plan_id in row['planpunkte_ids'])
        is_hs = any(plan_id in hs_ids for plan_id in row['planpunkte_ids'])

        # Check if course is allowed
        if is_hs and cbk_ects < 20:
            continue  # Not yet eligible for HS courses

        # Check for overlaps
        conflict = False
        for course in selected_courses:
            for s1 in row['dates']:
                for s2 in course['dates']:
                    latest_start = max(s1['start'], s2['start'])
                    earliest_end = min(s1['end'], s2['end'])
                    overlap = (earliest_end - latest_start).total_seconds() / 60
                    if overlap > 15:
                        conflict = True
                        break
                if conflict:
                    break
        if conflict:
            continue

        # If all checks pass, add course
        selected_courses.append(row)
        if is_cbk:
            cbk_ects += row['ects']
        last_end_time = row['last_end']

    return pd.DataFrame(selected_courses)

# Usage:
final_selection = greedy_schedule(vvzModel[vvzModel], winf_cbk, winf_hauptstudium)

ValueError: min() arg is an empty sequence