In [3]:
!pip install selenium webdriver-manager huggingface-hub datasets

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from datetime import datetime, timedelta
import time
import re
import pandas as pd
from huggingface_hub import login
from datasets import Dataset

# Set up Selenium options
options = Options()
options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")




In [13]:
from datetime import datetime

# Ottieni l'anno e il mese attuali
current_year = datetime.now().year
current_month = datetime.now().month

# Se il mese è entro agosto, usa l'anno accademico corrente
if current_month <= 8:
    academic_year_start = current_year - 1
    academic_year_end = current_year
else:
    academic_year_start = current_year
    academic_year_end = current_year + 1

# Genera l'URL corretto
KTH_ACADEMIC_YEAR_URL = (
    f"https://intra.kth.se/en/utbildning/schema-och-lokalbokning/lasarsindelning/lasaret-{academic_year_start}-{academic_year_end}"
)

print(KTH_ACADEMIC_YEAR_URL)


https://intra.kth.se/en/utbildning/schema-och-lokalbokning/lasarsindelning/lasaret-2024-2025


In [14]:
# Function to scrape the academic year information
def scrape_academic_year():
    driver = None
    try:
        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
        driver.get(KTH_ACADEMIC_YEAR_URL)
        time.sleep(3)  # Wait for the page to load
        content = driver.find_element(By.TAG_NAME, "body").text
        return content
    finally:
        if driver:
            driver.quit()


In [15]:
from datetime import datetime, timedelta
import pandas as pd
import re

def parse_date_ranges(date_str, year):
    parts = date_str.split()
    day_part = parts[0]
    month_part = parts[1]
    
    if "–" in day_part:
        start_day_str, end_day_str = day_part.split("–")
        start_day = int(start_day_str)
        end_day = int(end_day_str)
    else:
        start_day = int(day_part)
        end_day = start_day

    month_number = datetime.strptime(month_part, "%B").month
    start_date = datetime(year, month_number, start_day)
    end_date = datetime(year, month_number, end_day)

    return start_date, end_date

def parse_intervals(lines, determine_year_for_date):
    intervals = []
    for line in lines:
        for part in line.split(" and "):
            test_part = part.strip().split()[0]
            month_part = part.strip().split()[1]
            year = determine_year_for_date(month_part)
            start_date, end_date = parse_date_ranges(f"{part.strip()} {year}", year)
            intervals.append((start_date, end_date))
    return intervals

def parse_academic_year(content):
    current_year = datetime.now().year
    current_month = datetime.now().month
    
    # Academic year determination based on the current date
    if current_month <= 8:
        academic_start_year = current_year - 1
    else:
        academic_start_year = current_year
    academic_end_year = academic_start_year + 1

    period_pattern = r"Period (\d+)\nTeaching, workdays (\d{1,2} \w+)[\u2013-](\d{1,2} \w+ \d{4})"
    exam_pattern = r"Examination period \d+: ((?:\d{1,2}(?:[\u2013-]\d{1,2})? \w+(?: \d{4})?(?: and )?)+)"
    self_study_pattern = r"Own work: ((?:\d{1,2}(?:[\u2013-]\d{1,2})? \w+(?: \d{4})?(?: and )?)+)"
    re_exam_pattern = r"[Rr]e-[xe]amination[^:]*: ((?:\d{1,2}(?:[\u2013-]\d{1,2})? \w+(?: \d{4})?(?: and )?)+)"

    periods = re.findall(period_pattern, content)
    exams = re.findall(exam_pattern, content)
    self_study_days = re.findall(self_study_pattern, content)
    re_exam_days = re.findall(re_exam_pattern, content)

    def period_year(period_num):
        return academic_start_year if period_num <= 2 else academic_end_year

    study_periods = []
    for (pnum_str, sd_str, ed_str) in periods:
        pnum = int(pnum_str)
        start_year = period_year(pnum)
        end_year = period_year(pnum)

        sd_start, sd_end = parse_date_ranges(f"{sd_str} {start_year}", start_year)
        ed_start, ed_end = parse_date_ranges(f"{ed_str} {end_year}", end_year)
        study_periods.append((pnum, sd_start, ed_end))

    def determine_year_for_date(month_str):
        month_number = datetime.strptime(month_str, "%B").month
        if month_number >= 8:
            return academic_start_year
        else:
            return academic_end_year

    exam_intervals = parse_intervals(exams, determine_year_for_date)
    self_study_intervals = parse_intervals(self_study_days, determine_year_for_date)
    re_exam_intervals = parse_intervals(re_exam_days, determine_year_for_date)

    if not study_periods:
        teaching_pattern = r"Teaching, workdays (\d{1,2} \w+)[\u2013-](\d{1,2} \w+ \d{4})"
        teaching_periods = re.findall(teaching_pattern, content)
        if teaching_periods:
            earliest_study_start = parse_date_ranges(teaching_periods[0][0] + f" {academic_start_year}", academic_start_year)[0]
        else:
            earliest_study_start = datetime(academic_start_year, 8, 1)
    else:
        period_1_data = [p for p in study_periods if p[0] == 1]
        if not period_1_data:
            earliest_study_start = min(p[1] for p in study_periods)
        else:
            earliest_study_start = period_1_data[0][1]

    all_intervals = [(s, e) for _, s, e in study_periods] + exam_intervals + self_study_intervals + re_exam_intervals
    if not all_intervals:
        year_end = datetime(academic_end_year, 6, 1)
    else:
        year_end = max(end for start, end in all_intervals)
    
    year_start = earliest_study_start

    def day_of_week(date):
        return date.isoweekday()

    data = []
    sp_intervals = [(s, e) for _, s, e in study_periods]

    current_date = year_start
    while current_date <= year_end:
        day_of_year = (current_date - datetime(current_date.year, 1, 1)).days + 1
        day_of_week_num = day_of_week(current_date)

        in_study_period = any(start <= current_date <= end for start, end in sp_intervals)
        in_exam = any(start <= current_date <= end for start, end in exam_intervals)
        in_self_study = any(start <= current_date <= end for start, end in self_study_intervals)
        in_re_exam = any(start <= current_date <= end for start, end in re_exam_intervals)

        events = []
        if in_exam:
            events.append("Exam")
        if in_self_study:
            events.append("Self-study")
        if in_re_exam:
            events.append("Re-exam")

        if not events:
            if in_study_period:
                events.append("Normal")
            else:
                events.append("Holiday")

        days_until_exam = 0
        if "Exam" not in events:
            future_exams = [start for start, end in exam_intervals if start >= current_date]
            if future_exams:
                next_exam = min(future_exams)
                days_until_exam = (next_exam - current_date).days

        data.append({
            "Date": current_date.strftime("%Y-%m-%d"),
            "Year": current_date.year,
            "Day of Year": day_of_year,
            "Day of Week": day_of_week_num,
            "Days Until Exam": days_until_exam,
            "Event": ", ".join(events)
        })

        current_date += timedelta(days=1)

    return pd.DataFrame(data)


In [16]:
# Function to push data to Hugging Face
def push_to_huggingface(data):
    df = pd.DataFrame(data)
    dataset = Dataset.from_pandas(df)
    dataset.push_to_hub(REPO_NAME, token=HUGGINGFACE_TOKEN)
    print(f"Data successfully pushed to Hugging Face: {REPO_NAME}")


In [17]:
content = scrape_academic_year()
print("Scraped Content:")
print(content[:1200])  # Display the first 1000 characters to verify


Scraped Content:
kth.se
Student web
Intranet
Login
Search
Svenska
Menu
Academic year
Academic year 2024 - 2025
Introductory weeks 
Autumn semester
Autumn semester dates: 26 august 2024 - 13 January 2025
Study period 1, 26 August - 11 October
Own work: 14–16 October 2024
Exam period 1: 17–18 October and 21–25 October 2024
Study period 2, 28 October - 13 December
Own work / re-exams: 16 December–19 December
Own work: 20 December 2024–3 January 2025
Exam period 2: 7–11 January 2025 and 13 January 2025
National Holiday: 25–26 December 2024, 1 January 2025 and 6 January 2025
Spring semester
Spring semester dates: 14 January 2025 - 2 June 2025
Study period 3, 14 January - 3 March
Own work: 4–6 March 2025
Exam period 3: 7–8 March and 10–14 March 2025
Study period 4, 17 March - 20 May
Own work / re-exams: 22–25 April 2025
Own work: 2 May 2025 and 21–23 May 2025
Exam period 4: 26 May–28 May and 30 May–31 May and 2 June 2025
National Holiday: 18 April, 21 April 2025, 1 May and 29 May 2025
Re-exa

In [18]:
data = parse_academic_year(content)
print(data)  # This prints the first few rows of your dataframe


           Date  Year  Day of Year  Day of Week  Days Until Exam       Event
0    2024-08-01  2024          214            4                0     Holiday
1    2024-08-02  2024          215            5                0     Holiday
2    2024-08-03  2024          216            6                0     Holiday
3    2024-08-04  2024          217            7                0     Holiday
4    2024-08-05  2024          218            1                0     Holiday
..          ...   ...          ...          ...              ...         ...
291  2025-05-19  2025          139            1                0     Holiday
292  2025-05-20  2025          140            2                0     Holiday
293  2025-05-21  2025          141            3                0  Self-study
294  2025-05-22  2025          142            4                0  Self-study
295  2025-05-23  2025          143            5                0  Self-study

[296 rows x 6 columns]


In [19]:
import pandas as pd

# Assuming 'df' is your existing DataFrame
data.reset_index(inplace=True)
data.rename(columns={'index': 'id'}, inplace=True)
data

Unnamed: 0,id,Date,Year,Day of Year,Day of Week,Days Until Exam,Event
0,0,2024-08-01,2024,214,4,0,Holiday
1,1,2024-08-02,2024,215,5,0,Holiday
2,2,2024-08-03,2024,216,6,0,Holiday
3,3,2024-08-04,2024,217,7,0,Holiday
4,4,2024-08-05,2024,218,1,0,Holiday
...,...,...,...,...,...,...,...
291,291,2025-05-19,2025,139,1,0,Holiday
292,292,2025-05-20,2025,140,2,0,Holiday
293,293,2025-05-21,2025,141,3,0,Self-study
294,294,2025-05-22,2025,142,4,0,Self-study


In [20]:
if 'Date' in data.columns:
    data['Date'] = pd.to_datetime(data['Date'])
    
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 296 entries, 0 to 295
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   id               296 non-null    int64         
 1   Date             296 non-null    datetime64[ns]
 2   Year             296 non-null    int64         
 3   Day of Year      296 non-null    int64         
 4   Day of Week      296 non-null    int64         
 5   Days Until Exam  296 non-null    int64         
 6   Event            296 non-null    object        
dtypes: datetime64[ns](1), int64(5), object(1)
memory usage: 16.3+ KB


In [21]:
data.columns = data.columns.str.strip().str.lower().str.replace(' ', '_')
data

Unnamed: 0,id,date,year,day_of_year,day_of_week,days_until_exam,event
0,0,2024-08-01,2024,214,4,0,Holiday
1,1,2024-08-02,2024,215,5,0,Holiday
2,2,2024-08-03,2024,216,6,0,Holiday
3,3,2024-08-04,2024,217,7,0,Holiday
4,4,2024-08-05,2024,218,1,0,Holiday
...,...,...,...,...,...,...,...
291,291,2025-05-19,2025,139,1,0,Holiday
292,292,2025-05-20,2025,140,2,0,Holiday
293,293,2025-05-21,2025,141,3,0,Self-study
294,294,2025-05-22,2025,142,4,0,Self-study


In [22]:
import hopsworks
import os

# Hopsworks setup
project = hopsworks.login()
fs = project.get_feature_store()

# Get or create feature group
feature_group = fs.get_or_create_feature_group(
    name="kth_academic_year",
    version=1,
    description="Weather data history",
    primary_key=['id'],
    event_time=['date'],  # Ensure this matches the DataFrame
    online_enabled=True
)

feature_group.insert(data)
print("Dataset successfully uploaded to Hopsworks Feature Store.")


2025-01-07 09:51:14,064 INFO: Initializing external client
2025-01-07 09:51:14,064 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-01-07 09:51:15,389 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1205426


Uploading Dataframe: 100.00% |██████████| Rows 296/296 | Elapsed Time: 00:00 | Remaining Time: 00:00


Launching job: kth_academic_year_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1205426/jobs/named/kth_academic_year_1_offline_fg_materialization/executions
Dataset successfully uploaded to Hopsworks Feature Store.
