In [None]:
!pip install selenium webdriver-manager huggingface-hub datasets

In [11]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from datetime import datetime, timedelta
import time
import re
from datasets import Dataset, load_dataset
import pandas as pd
from huggingface_hub import login
from datasets import Dataset

# Chrome flags for the scraper: run headless, disable the sandbox, and avoid
# relying on /dev/shm.
options = Options()
for _chrome_flag in ("--headless", "--no-sandbox", "--disable-dev-shm-usage"):
    options.add_argument(_chrome_flag)

# Fixed URL of the page that gets scraped (the KTH Library "Opening hours and
# contact" page). The constant keeps its historical name because other cells
# reference it.
KTH_ACADEMIC_YEAR_URL = "https://www.kth.se/en/biblioteket/anvanda-biblioteket/oppettider-kontakt"
print(KTH_ACADEMIC_YEAR_URL)

https://www.kth.se/en/biblioteket/anvanda-biblioteket/oppettider-kontakt


In [12]:
import os
from huggingface_hub import login

# Read the Hugging Face token from the environment (per the original note it
# is expected to be supplied through GitHub secrets).
HUGGINGFACE_TOKEN = os.environ.get("HUGGINGFACE_TOKEN")

# Stop immediately when the token is missing or empty.
if not HUGGINGFACE_TOKEN:
    raise ValueError("HUGGINGFACE_TOKEN non è stato definito. Assicurati che sia passato come variabile d'ambiente.")


# Dataset repository read and updated by the cells below, then authenticate.
repo_name = "davnas/date_kth"
login(token=HUGGINGFACE_TOKEN)

In [13]:
# Fetch every split of the published dataset from the Hugging Face Hub
hf_dataset = load_dataset(repo_name)

# Flatten all splits into a single frame with a fresh 0..n-1 index
df_main = pd.concat(
    (hf_dataset[split_name].to_pandas() for split_name in hf_dataset),
    ignore_index=True,
)

# If the stored data carries an explicit 'index' column, use it as the index
if 'index' in df_main.columns:
    df_main = df_main.set_index('index')

# Show the stored history
print(df_main)


         date        day  opening_hour  closing_hour  is_open
0  2024-12-09     Monday            18            20        1
1  2024-12-10    Tuesday            10            20        1
2  2024-12-11  Wednesday             8            20        1
3  2024-12-12   Thursday             8            20        1
4  2024-12-13     Friday             8            17        1
5  2024-12-14   Saturday             0             0        0
6  2024-12-15     Sunday             0             0        0
7  2024-12-16     Monday            10            20        1
8  2024-12-17    Tuesday            10            20        1
9  2024-12-18  Wednesday             8            20        1
10 2024-12-19   Thursday             8            20        1
11 2024-12-20     Friday             8            17        1
12 2024-12-21   Saturday             0             0        0
13 2024-12-22     Sunday             0             0        0
14 2024-12-23     Monday            10            14        1
15 2024-

In [14]:
import pandas as pd
import re
from typing import Dict, List, Tuple

# '<open><dash><close>' where the dash may be an en dash, em dash or ASCII hyphen.
_HOURS_RANGE_RE = re.compile(r'^(\d+)\s*[–—-]\s*(\d+)$')


def parse_hours(time_str: str) -> Tuple[int, int, int]:
    """
    Parse opening hours string into opening hour, closing hour, and status.

    Surrounding whitespace is ignored, 'Closed' is matched case-insensitively,
    and the range separator may be an en dash ('–'), an em dash ('—') or a
    plain hyphen ('-'), so '10-14' is no longer silently reported as closed.

    Args:
        time_str (str): String containing hours (e.g., '10–14' or 'Closed')

    Returns:
        Tuple of (opening_hour, closing_hour, is_open). Strings that are
        neither 'Closed' nor an hour range yield (0, 0, 0).

    Raises:
        ValueError: If the string contains an en dash but is not a plain
            '<hour>–<hour>' range (e.g. '10:30–14').
    """
    cleaned = time_str.strip()
    if cleaned.lower() == 'closed':
        return (0, 0, 0)

    match = _HOURS_RANGE_RE.match(cleaned)
    if match:
        return (int(match.group(1)), int(match.group(2)), 1)

    # An en dash marks the string as an intended hour range; refuse to guess
    # rather than record the day as closed.
    if '–' in cleaned:
        raise ValueError(f"Unrecognised opening-hours string: {time_str!r}")
    return (0, 0, 0)

# Week label such as '23/12–29/12'; captures the day and month of the first date.
_WEEK_LABEL_RE = re.compile(r'^(\d{1,2})/(\d{1,2})\s*[–—-]\s*\d{1,2}/\d{1,2}$')


def _closest_date(day: int, month: int, reference: pd.Timestamp):
    """Return the day/month date nearest to ``reference`` (None if no valid date exists)."""
    candidates = []
    for year in (reference.year - 1, reference.year, reference.year + 1):
        try:
            candidates.append(pd.Timestamp(year=year, month=month, day=day))
        except ValueError:
            # Not a real calendar date in this year (e.g. 29/02 outside a leap year).
            continue
    if not candidates:
        return None
    return min(candidates, key=lambda candidate: abs(candidate - reference))


def create_main_library_hours_df(text: str, start_date: "str | None" = None,
                                 today: "str | None" = None) -> pd.DataFrame:
    """
    Create a DataFrame with main library hours from the provided text.

    The week's first date is no longer hard-coded to 2024-12-23. It is taken,
    in order of preference, from:

    1. ``start_date`` when given;
    2. the 'DD/MM–DD/MM' week label found inside the "Main Library" section
       (the label has no year, so the year that puts the date closest to
       ``today`` is chosen);
    3. the Monday of the week containing ``today``.

    The weekday names are mapped onto seven consecutive days starting at that
    date, Monday first.

    Args:
        text (str): Raw text containing library opening hours
        start_date: Optional explicit date for the Monday row (anything
            ``pd.to_datetime`` accepts).
        today: Optional reference date used to resolve the year and the
            fallback week; defaults to the current local date.

    Returns:
        pandas DataFrame with columns: date, day, opening_hour, closing_hour, is_open
    """
    # Define days of the week
    weekdays = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]

    if today is None:
        reference = pd.Timestamp.today().normalize()
    else:
        reference = pd.to_datetime(today).normalize()

    # Split text into lines and clean them
    lines = [line.strip() for line in text.split('\n') if line.strip()]

    # Walk the text, collecting weekday -> hours pairs from the main library section
    main_library_data = {}
    week_label = None
    in_main_library = False
    current_day = None

    for line in lines:
        if "Main Library" in line:
            in_main_library = True
            continue
        elif "Södertälje" in line:
            in_main_library = False
            continue

        if not in_main_library:
            continue

        if line in weekdays:
            current_day = line
        elif current_day and (line == "Closed" or '–' in line):
            # Keep the first value seen for a day: a later prose line that
            # mentions "Main Library" re-opens the section and must not
            # overwrite the schedule captured from the real section.
            main_library_data.setdefault(current_day, line)
            current_day = None
        elif week_label is None:
            label_match = _WEEK_LABEL_RE.match(line)
            if label_match:
                week_label = (int(label_match.group(1)), int(label_match.group(2)))

    # Resolve the date of the first (Monday) row
    week_start = None
    if start_date is not None:
        week_start = pd.to_datetime(start_date)
    elif week_label is not None:
        week_start = _closest_date(week_label[0], week_label[1], reference)
    if week_start is None:
        week_start = reference - pd.Timedelta(days=reference.weekday())

    dates_range = pd.date_range(start=week_start, periods=7, freq='D')

    # Create DataFrame entries, one per weekday that was found in the text
    records = []
    for day_index, day in enumerate(weekdays):
        if day not in main_library_data:
            continue
        open_hour, close_hour, status = parse_hours(main_library_data[day])
        records.append({
            'date': dates_range[day_index],
            'day': day,
            'opening_hour': open_hour,
            'closing_hour': close_hour,
            'is_open': status,
        })

    df = pd.DataFrame(
        records,
        columns=['date', 'day', 'opening_hour', 'closing_hour', 'is_open'],
    )

    # Sort by date
    return df.sort_values('date').reset_index(drop=True)

In [15]:
# Scrape the visible text of the page at KTH_ACADEMIC_YEAR_URL
def scrape_academic_year():
    """
    Open KTH_ACADEMIC_YEAR_URL in Chrome and return the text of its <body>.

    The browser is started with the module-level ``options`` and a driver
    obtained through ``ChromeDriverManager``. After loading the page the
    function sleeps for a fixed 10 seconds before reading the text. The
    browser is always closed, even when loading or reading fails.

    Returns:
        The ``text`` of the page's ``body`` element.
    """
    chrome_service = Service(ChromeDriverManager().install())
    browser = webdriver.Chrome(service=chrome_service, options=options)
    try:
        browser.get(KTH_ACADEMIC_YEAR_URL)
        time.sleep(10)  # Fixed pause so the page can finish loading
        page_body = browser.find_element(By.TAG_NAME, "body")
        return page_body.text
    finally:
        browser.quit()

In [16]:
# Scrape the page once; `content` keeps the full text for the parsing cell.
content = scrape_academic_year()
print("Scraped Content:")
# Preview only the first 1000 characters instead of dumping the whole page.
print(content[:1000])


Scraped Content:
kth.se
Student web
Intranet
Login
Search
Svenska
Menu
Opening hours and contact
Opening hours and contact
KTH Library is a public library, open to everyone. Below you will find our contact information, opening hours and directions to the main library and our library in Södertälje.
Please note that the library has changed opening hours during Christmas and New Year's, see more in the weekly schedule below. Telephone service is closed 23 Dec–6 January. Read more about opening hours during Christmas and New Year's
Opening hours
Main Library
Today December 29
Closed
23/12–29/12
Monday
10–14
Tuesday
Closed
Wednesday
Closed
Thursday
Closed
Friday
10–14
Saturday
Closed
Sunday
Closed
Next
Södertälje
Today December 29
Closed
23/12–29/12
Monday
Closed
Tuesday
Closed
Wednesday
Closed
Thursday
Closed
Friday
Closed
Saturday
Closed
Sunday
Closed
Next
For KTH students and employees with access card and valid ID, the Main Library is available from 8 and opens for everyone at 9 (workin

In [17]:
# Parse the scraped text into one row per weekday and show it without the index
df = create_main_library_hours_df(content)
print("\nMain Library Hours DataFrame:", df.to_string(index=False), sep="\n")


Main Library Hours DataFrame:
      date       day  opening_hour  closing_hour  is_open
2024-12-23    Monday            10            14        1
2024-12-24   Tuesday             0             0        0
2024-12-25 Wednesday             0             0        0
2024-12-26  Thursday             0             0        0
2024-12-27    Friday            10            14        1
2024-12-28  Saturday             0             0        0
2024-12-29    Sunday             0             0        0


In [18]:

# Stack the stored history and the freshly scraped week, both keyed by date,
# parse the combined index as datetimes (unparseable values become NaT), keep
# only the last row for each date so the new scrape overrides stored rows, and
# turn 'date' back into a regular column.
df_merged = (
    pd.concat([df_main.set_index("date"), df.set_index("date")], axis=0)
    .pipe(lambda frame: frame.set_axis(pd.to_datetime(frame.index, errors="coerce"), axis=0))
    .loc[lambda frame: ~frame.index.duplicated(keep="last")]
    .reset_index()
)

# Chronological view of the merged data with a clean 0..n-1 index
df_merged_sorted = df_merged.sort_values(by="date", ascending=True).reset_index(drop=True)

# Show the merged, sorted result
print(df_merged_sorted)


         date        day  opening_hour  closing_hour  is_open
0  2024-12-09     Monday            18            20        1
1  2024-12-10    Tuesday            10            20        1
2  2024-12-11  Wednesday             8            20        1
3  2024-12-12   Thursday             8            20        1
4  2024-12-13     Friday             8            17        1
5  2024-12-14   Saturday             0             0        0
6  2024-12-15     Sunday             0             0        0
7  2024-12-16     Monday            10            20        1
8  2024-12-17    Tuesday            10            20        1
9  2024-12-18  Wednesday             8            20        1
10 2024-12-19   Thursday             8            20        1
11 2024-12-20     Friday             8            17        1
12 2024-12-21   Saturday             0             0        0
13 2024-12-22     Sunday             0             0        0
14 2024-12-23     Monday            10            14        1
15 2024-

In [19]:
# Publish the date-sorted frame. `df_merged` keeps refreshed rows at the end
# (the de-duplication keeps the last occurrence), so pushing it would store
# the dates out of chronological order.
hf_dataset = Dataset.from_pandas(df_merged_sorted, preserve_index=False)
hf_dataset.push_to_hub(
    repo_id=repo_name,
    token=HUGGINGFACE_TOKEN,
    private=False,
    commit_message="Update dataset"
)
print(f"Dataset pushed to Hugging Face repository: {repo_name}")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Dataset pushed to Hugging Face repository: davnas/date_kth
