In [1]:
!pip install selenium webdriver-manager huggingface-hub datasets

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from datetime import datetime, timedelta
import time
import re
import pandas as pd
from huggingface_hub import login
from datasets import Dataset

# Set up Selenium options
options = Options()
options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")

# Define the KTH Academic Year page URL dynamically
KTH_ACADEMIC_YEAR_URL = f"https://intra.kth.se/en/utbildning/schema-och-lokalbokning/lasarsindelning/lasaret-{datetime.now().year}-{datetime.now().year + 1}"
print(KTH_ACADEMIC_YEAR_URL)



  from .autonotebook import tqdm as notebook_tqdm


https://intra.kth.se/en/utbildning/schema-och-lokalbokning/lasarsindelning/lasaret-2024-2025


In [2]:
HUGGINGFACE_TOKEN = "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"
REPO_NAME = "kth-academic-scraper"

# Login to Hugging Face
login(token=HUGGINGFACE_TOKEN)


In [3]:
# Function to scrape the academic year information
def scrape_academic_year():
    driver = None
    try:
        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
        driver.get(KTH_ACADEMIC_YEAR_URL)
        time.sleep(3)  # Wait for the page to load
        content = driver.find_element(By.TAG_NAME, "body").text
        return content
    finally:
        if driver:
            driver.quit()


In [14]:
import re
import pandas as pd
from datetime import datetime, timedelta

def parse_date_ranges(date_str, year):
    # date_str already includes the correct year now, so just parse directly:
    parts = date_str.split()
    day_part = parts[0]
    month_part = parts[1]
    
    if "–" in day_part:
        start_day_str, end_day_str = day_part.split("–")
        start_day = int(start_day_str)
        end_day = int(end_day_str)
    else:
        start_day = int(day_part)
        end_day = start_day

    month_number = datetime.strptime(month_part, "%B").month
    start_date = datetime(year, month_number, start_day)
    end_date = datetime(year, month_number, end_day)

    return start_date, end_date

def parse_intervals(lines, determine_year_for_date):
    intervals = []
    for line in lines:
        for part in line.split(" and "):
            # Determine the correct year for each date range before parsing
            # Use the custom determine_year_for_date function on the first date in the range
            # to guess the year
            # We'll parse the month from the first date token
            test_part = part.strip().split()[0]  # e.g. "17–18" or "20"
            # We'll guess month from second token
            month_part = part.strip().split()[1]
            year = determine_year_for_date(month_part)
            start_date, end_date = parse_date_ranges(f"{part.strip()} {year}", year)
            intervals.append((start_date, end_date))
    return intervals

def parse_academic_year(content):
    # Modify the period pattern to capture the period number as well
    period_pattern = r"Study period (\d+), (\d{1,2} \w+) [\u2013-] (\d{1,2} \w+)"
    exam_pattern = r"Exam period \d+: ((?:\d{1,2}(?:[\u2013-]\d{1,2})? \w+(?: \d{4})?(?: and )?)+)"
    self_study_pattern = r"Own work: ((?:\d{1,2}(?:[\u2013-]\d{1,2})? \w+(?: \d{4})?(?: and )?)+)"
    re_exam_pattern = r"(?:Own work / re-exams|Re-exam period): ((?:\d{1,2}(?:[\u2013-]\d{1,2})? \w+(?: \d{4})?(?: and )?)+)"

    periods = re.findall(period_pattern, content)
    exams = re.findall(exam_pattern, content)
    self_study_days = re.findall(self_study_pattern, content)
    re_exam_days = re.findall(re_exam_pattern, content)

    print("Raw periods found:", periods)
    print("Raw exams found:", exams)
    print("Raw self-study days found:", self_study_days)
    print("Raw re-exam days found:", re_exam_days)

    # Helper function to determine year based on the month and period number
    # Autumn semester (periods 1 and 2) in 2024: (Aug-Dec)
    # Spring semester (periods 3 and 4) in 2025: (Jan-May)
    def period_year(period_num):
        # If period <= 2 => 2024, else => 2025
        return 2024 if period_num <= 2 else 2025

    # We'll first parse all study periods with correct years
    study_periods = []
    for (pnum_str, sd_str, ed_str) in periods:
        pnum = int(pnum_str)
        start_year = period_year(pnum)
        end_year = period_year(pnum)

        sd_start, sd_end = parse_date_ranges(f"{sd_str} {start_year}", start_year)
        ed_start, ed_end = parse_date_ranges(f"{ed_str} {end_year}", end_year)
        study_periods.append((pnum, sd_start, ed_end))

    # Now we know which periods correspond to which year range.
    # We need a function that given a month name, determines which year it should be.
    # If month is in Aug-Dec => 2024, if Jan-May => 2025 (based on academic year structure).
    def determine_year_for_date(month_str):
        month_number = datetime.strptime(month_str, "%B").month
        # Aug(8)-Dec(12) 2024, Jan(1)-May(5) 2025
        if month_number >= 8: 
            return 2024
        else:
            return 2025

    # Parse intervals using the determine_year_for_date heuristic
    exam_intervals = parse_intervals(exams, determine_year_for_date)
    self_study_intervals = parse_intervals(self_study_days, determine_year_for_date)
    re_exam_intervals = parse_intervals(re_exam_days, determine_year_for_date)

    print("Parsed study periods:", study_periods)
    print("Parsed exam intervals:", exam_intervals)
    print("Parsed self-study intervals:", self_study_intervals)
    print("Parsed re-exam intervals:", re_exam_intervals)

    # Find Study period 1 start date
    period_1_data = [p for p in study_periods if p[0] == 1]
    if not period_1_data:
        # If no period 1 found, default to earliest study period
        earliest_study_start = min(p[1] for p in study_periods)
    else:
        earliest_study_start = period_1_data[0][1]

    # Determine the end date from all intervals
    all_intervals = [(s, e) for _, s, e in study_periods] + exam_intervals + self_study_intervals + re_exam_intervals
    if not all_intervals:
        # If no intervals, just show period 1
        period_1_end = period_1_data[0][2] if period_1_data else datetime(2024, 12, 31)
        year_start = earliest_study_start
        year_end = period_1_end
    else:
        max_date = max(end for start, end in all_intervals)
        year_start = earliest_study_start
        year_end = max_date

    def day_of_week(date):
        return date.isoweekday()

    data = []

    # Convert study_periods to a list of tuples without period number for easy checking
    sp_intervals = [(s, e) for _, s, e in study_periods]

    current_date = year_start
    while current_date <= year_end:
        day_of_year = (current_date - datetime(current_date.year, 1, 1)).days + 1
        day_of_week_num = day_of_week(current_date)

        in_study_period = any(start <= current_date <= end for start, end in sp_intervals)
        in_exam = any(start <= current_date <= end for start, end in exam_intervals)
        in_self_study = any(start <= current_date <= end for start, end in self_study_intervals)
        in_re_exam = any(start <= current_date <= end for start, end in re_exam_intervals)

        events = []
        if in_exam:
            events.append("Exam")
        if in_self_study:
            events.append("Self-study")
        if in_re_exam:
            events.append("Re-exam")

        if not events:
            if in_study_period:
                events.append("Normal")
            else:
                events.append("Holyday")

        days_until_exam = 0
        if "Exam" not in events:
            future_exams = [start for start, end in exam_intervals if start >= current_date]
            if future_exams:
                next_exam = min(future_exams)
                days_until_exam = (next_exam - current_date).days

        data.append({
            "Date": current_date.strftime("%Y-%m-%d"),
            "Year": current_date.year,
            "Day of Year": day_of_year,
            "Day of Week": day_of_week_num,
            "Days Until Exam": days_until_exam,
            "Event": ", ".join(events)
        })

        current_date += timedelta(days=1)

    return pd.DataFrame(data)


In [19]:

# Function to push data to Hugging Face
def push_to_huggingface(data):
    df = pd.DataFrame(data)
    dataset = Dataset.from_pandas(df)
    dataset.push_to_hub(REPO_NAME, token=HUGGINGFACE_TOKEN)
    print(f"Data successfully pushed to Hugging Face: {REPO_NAME}")


In [16]:
content = scrape_academic_year()
print("Scraped Content:")
print(content[:1200])  # Display the first 1000 characters to verify


Scraped Content:
kth.se
Student web
Intranet
Login
Search
Svenska
Menu
Academic year
Academic year 2024 - 2025
Introductory weeks 
Autumn semester
Autumn semester dates: 26 august 2024 - 13 January 2025
Study period 1, 26 August - 11 October
Own work: 14–16 October 2024
Exam period 1: 17–18 October and 21–25 October 2024
Study period 2, 28 October - 13 December
Own work / re-exams: 16 December–19 December
Own work: 20 December 2024–3 January 2025
Exam period 2: 7–11 January 2025 and 13 January 2025
National Holiday: 25–26 December 2024, 1 January 2025 and 6 January 2025
Spring semester
Spring semester dates: 14 January 2025 - 2 June 2025
Study period 3, 14 January - 3 March
Own work: 4–6 March 2025
Exam period 3: 7–8 March and 10–14 March 2025
Study period 4, 17 March - 20 May
Own work / re-exams: 22–25 April 2025
Own work: 2 May 2025 and 21–23 May 2025
Exam period 4: 26 May–28 May and 30 May–31 May and 2 June 2025
National Holiday: 18 April, 21 April 2025, 1 May and 29 May 2025
Re-exa

In [17]:
data = parse_academic_year(content)
print(data)  # This prints the first few rows of your dataframe


Raw periods found: [('1', '26 August', '11 October'), ('2', '28 October', '13 December'), ('3', '14 January', '3 March'), ('4', '17 March', '20 May')]
Raw exams found: ['17–18 October and 21–25 October 2024', '7–11 January 2025 and 13 January 2025', '7–8 March and 10–14 March 2025', '26 May']
Raw self-study days found: ['14–16 October 2024', '20 December 2024', '4–6 March 2025', '2 May 2025 and 21–23 May 2025']
Raw re-exam days found: ['16 December', '22–25 April 2025', '3 June']
Parsed study periods: [(1, datetime.datetime(2024, 8, 26, 0, 0), datetime.datetime(2024, 10, 11, 0, 0)), (2, datetime.datetime(2024, 10, 28, 0, 0), datetime.datetime(2024, 12, 13, 0, 0)), (3, datetime.datetime(2025, 1, 14, 0, 0), datetime.datetime(2025, 3, 3, 0, 0)), (4, datetime.datetime(2025, 3, 17, 0, 0), datetime.datetime(2025, 5, 20, 0, 0))]
Parsed exam intervals: [(datetime.datetime(2024, 10, 17, 0, 0), datetime.datetime(2024, 10, 18, 0, 0)), (datetime.datetime(2024, 10, 21, 0, 0), datetime.datetime(2024

In [20]:
push_to_huggingface(data)




ploading the dataset shards: 100%|███████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.11it/s]

Data successfully pushed to Hugging Face: kth-academic-scraper


In [18]:
data.to_csv("academic_year.csv", index=False)
