In [1]:
!pip install selenium webdriver-manager huggingface-hub datasets



In [2]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait

from selenium.webdriver.support import expected_conditions as EC

from datetime import datetime, timedelta
import time
import re
from datasets import Dataset, load_dataset
import pandas as pd
from huggingface_hub import login
from datasets import Dataset

# Page the scraper loads (a fixed URL; judging by its path and the parsing
# code further down, this is the KTH Library opening-hours page).
KTH_ACADEMIC_YEAR_URL = "https://www.kth.se/en/biblioteket/anvanda-biblioteket/oppettider-kontakt"
print(KTH_ACADEMIC_YEAR_URL)

# Chrome options shared by every browser session started in this notebook:
# run without a visible window, plus two flags commonly needed when Chrome
# runs inside containers / CI runners.
options = Options()
options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")

https://www.kth.se/en/biblioteket/anvanda-biblioteket/oppettider-kontakt


In [None]:
import os
from huggingface_hub import login

# Read the Hugging Face token from the environment (e.g. injected from GitHub secrets)
HUGGINGFACE_TOKEN = os.environ.get("HUGGINGFACE_TOKEN")

# Fail fast when the token is missing or empty
if HUGGINGFACE_TOKEN is None or HUGGINGFACE_TOKEN == "":
    raise ValueError("HUGGINGFACE_TOKEN non è stato definito. Assicurati che sia passato come variabile d'ambiente.")

# Authenticate this session against the Hugging Face Hub
login(token=HUGGINGFACE_TOKEN)

In [4]:
repo_name = "davnas/date_kth"

# Download every split of the published dataset from the Hugging Face Hub
hf_dataset = load_dataset(repo_name)

# Stack all splits into one frame with a fresh 0..n-1 row index
df_main = pd.concat([split.to_pandas() for split in hf_dataset.values()], ignore_index=True)

# If the dataset carries a stored 'index' column, promote it back to the index
if 'index' in df_main.columns:
    df_main = df_main.set_index('index')

# Show the history that is currently published
print(df_main)


README.md:   0%|          | 0.00/415 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/3.14k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/23 [00:00<?, ? examples/s]

         date        day  opening_hour  closing_hour  is_open
0  2024-12-09     Monday            18            20        1
1  2024-12-10    Tuesday            10            20        1
2  2024-12-11  Wednesday             8            20        1
3  2024-12-12   Thursday             8            20        1
4  2024-12-13     Friday             8            17        1
5  2024-12-14   Saturday             0             0        0
6  2024-12-15     Sunday             0             0        0
7  2024-12-16     Monday            10            20        1
8  2024-12-17    Tuesday            10            20        1
9  2024-12-18  Wednesday             8            20        1
10 2024-12-19   Thursday             8            20        1
11 2024-12-20     Friday             8            17        1
12 2024-12-21   Saturday             0             0        0
13 2024-12-22     Sunday             0             0        0
14 2024-12-23     Monday            10            14        1
15 2024-

In [5]:
import pandas as pd
import re
from datetime import datetime
from typing import Optional, Tuple

def parse_hours(time_str: str) -> Tuple[int, int, int]:
    """Parse an opening-hours string into (opening hour, closing hour, is_open).

    Asterisks (footnote markers) and surrounding whitespace are ignored.

    Args:
        time_str: Either ``'Closed'`` or an hour range such as ``'8–21'``.
            The range separator is normally an en dash; an ASCII hyphen or
            em dash is accepted too when the whole string is
            ``<digits> <dash> <digits>``.

    Returns:
        ``(open_hour, close_hour, 1)`` for a range, ``(0, 0, 0)`` for
        ``'Closed'`` and for any text that is not a recognisable range.

    Raises:
        ValueError: If the string contains an en dash but the parts around it
            are not exactly two integers (e.g. ``'10:30–14'``).
    """
    cleaned = time_str.replace('*', '').strip()
    if cleaned == 'Closed':
        return (0, 0, 0)
    if '–' in cleaned:
        # Primary format used by the page: "<open>–<close>" with an en dash.
        open_hour, close_hour = map(int, cleaned.split('–'))
        return (open_hour, close_hour, 1)
    # Same range typed with a different dash. Require a full match so that
    # ordinary hyphenated words are not mistaken for opening hours.
    alt_range = re.fullmatch(r'(\d+)\s*[-—]\s*(\d+)', cleaned)
    if alt_range:
        return (int(alt_range.group(1)), int(alt_range.group(2)), 1)
    return (0, 0, 0)

def get_base_year(df: Optional[pd.DataFrame]) -> int:
    """Return the year used to anchor day/month values that carry no year.

    When ``df`` has at least one row, the year of the ``'date'`` value in its
    last row is returned; otherwise (``None`` or empty frame) the current
    calendar year is returned.
    """
    if df is None or df.empty:
        return datetime.now().year
    newest_entry = pd.to_datetime(df['date'].iloc[-1])
    return newest_entry.year

def extract_date_range(text: str, base_year: int) -> Optional[pd.DatetimeIndex]:
    """Find the first ``dd/mm–dd/mm`` span in ``text`` and expand it to daily dates.

    Args:
        text: Text to search. The span must use two-digit day and month
            values separated by an en dash, e.g. ``'30/12–05/01'``.
        base_year: Year assigned to the start date.

    Returns:
        A daily ``DatetimeIndex`` from start to end (inclusive), or ``None``
        when no span is found. If the end month is earlier than the start
        month, the span is taken to cross New Year and the end date is
        placed in ``base_year + 1``.

    Raises:
        ValueError: If the matched day/month values do not form a real date.
    """
    date_range_match = re.search(r'(\d{2}/\d{2})–(\d{2}/\d{2})', text)
    if date_range_match is None:
        return None

    start_date_str, end_date_str = date_range_match.groups()

    # Values are day-first: "dd/mm"
    start_day, start_month = map(int, start_date_str.split('/'))
    end_day, end_month = map(int, end_date_str.split('/'))

    # The start always sits in base_year; only the end can roll over.
    start_year = base_year
    end_year = base_year if end_month >= start_month else base_year + 1

    start_date = pd.to_datetime(f'{start_year}-{start_month:02d}-{start_day:02d}')
    end_date = pd.to_datetime(f'{end_year}-{end_month:02d}-{end_day:02d}')

    return pd.date_range(start=start_date, end=end_date)

def _nearest_year_date_range(text: str, base_year: int, reference: pd.Timestamp):
    """Return the date range from ``text`` whose start lies closest to ``reference``.

    The text carries day and month only, so the year is a guess. Around New
    Year ``base_year`` can be off by one, so the neighbouring years are tried
    as well and the range starting nearest to ``reference`` wins.
    """
    best = extract_date_range(text, base_year)
    if best is None or len(best) == 0:
        return best
    for candidate_year in (base_year - 1, base_year + 1):
        try:
            candidate = extract_date_range(text, candidate_year)
        except ValueError:
            # The day/month pair does not exist in that year (e.g. 29/02).
            continue
        if candidate is None or len(candidate) == 0:
            continue
        if abs(candidate[0] - reference) < abs(best[0] - reference):
            best = candidate
    return best


def create_main_library_hours_df(text: str, existing_df: Optional[pd.DataFrame] = None) -> pd.DataFrame:
    """Build a per-day opening-hours table for the main library from page text.

    Expected layout of ``text`` (one item per line):
      * somewhere, a ``dd/mm–dd/mm`` span giving the period covered;
      * a line containing ``"Main Library"`` opening the relevant section,
        which runs until a line containing ``"Södertälje"``;
      * inside the section, a weekday name on its own line followed by a
        line that is either ``Closed`` or an hour range with an en dash.
        Asterisks on the hours line are ignored.

    Args:
        text: Scraped page text.
        existing_df: Optional earlier table. When it has rows, its last
            ``'date'`` is the reference for choosing the year; otherwise the
            current date is the reference.

    Returns:
        A frame with columns ``date``, ``day``, ``opening_hour``,
        ``closing_hour`` and ``is_open``, sorted by date, with one row per
        date in the period whose weekday has hours in the section. An empty,
        column-less frame is returned when the text has no date span.
    """
    # Define days of the week (Monday first, matching Timestamp.weekday())
    weekdays = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]

    # Reference date used to place the day/month span in the right year
    if existing_df is not None and not existing_df.empty:
        reference = pd.Timestamp(pd.to_datetime(existing_df['date'].iloc[-1]))
    else:
        reference = pd.Timestamp(datetime.now())
    if reference.tzinfo is not None:
        # Compare like with like: the parsed dates are timezone-naive.
        reference = reference.tz_localize(None)

    base_year = get_base_year(existing_df)
    dates_range = _nearest_year_date_range(text, base_year, reference)
    if dates_range is None:
        return pd.DataFrame()

    # Split text into lines and clean them
    lines = [line.strip() for line in text.split('\n') if line.strip()]

    # Find main library section and collect the hours text per weekday
    main_library_data = {}
    in_main_library = False
    current_day = None

    for line in lines:
        if "Main Library" in line:
            in_main_library = True
            continue
        elif "Södertälje" in line:
            in_main_library = False
            continue

        if not in_main_library:
            continue

        if line in weekdays:
            current_day = line
            continue

        # Strip footnote asterisks before matching, so "Closed*" counts too.
        hours_text = line.replace('*', '').strip()
        if current_day and (hours_text == "Closed" or '–' in hours_text):
            main_library_data[current_day] = hours_text
            current_day = None

    # One record per date whose weekday has hours in the section
    records = []
    for date in dates_range:
        day = weekdays[date.weekday()]
        if day not in main_library_data:
            continue
        open_hour, close_hour, status = parse_hours(main_library_data[day])
        records.append({
            'date': date,
            'day': day,
            'opening_hour': open_hour,
            'closing_hour': close_hour,
            'is_open': status,
        })

    # Passing columns keeps the schema even when no record was produced
    df = pd.DataFrame(
        records,
        columns=['date', 'day', 'opening_hour', 'closing_hour', 'is_open'],
    )

    return df.sort_values('date').reset_index(drop=True)

In [6]:
# Scrape the page text twice: as first loaded, and after the "next week" click
def scrape_academic_year():
    """Load ``KTH_ACADEMIC_YEAR_URL`` in Chrome and capture its body text twice.

    Returns:
        ``(content_current, content_future)``: the body text after the first
        load, and the body text after clicking the link matched by the
        ``nextweek`` / ``data-libraryname='main'`` XPath (presumably the main
        library's "next week" link).

    If the link cannot be found or clicked, a message is printed and the
    second capture is taken anyway, so both texts may then be the same.
    The browser is closed on every exit path.
    """
    driver = None
    try:
        chrome_service = Service(ChromeDriverManager().install())
        driver = webdriver.Chrome(service=chrome_service, options=options)
        driver.get(KTH_ACADEMIC_YEAR_URL)

        # Fixed pause to give the page time to load
        time.sleep(10)
        content_current = driver.find_element(By.TAG_NAME, "body").text

        # Best effort: a missing link must not abort the scrape
        try:
            next_week_link = driver.find_element(
                By.XPATH, "//div[@class='nextweek']/a[@data-libraryname='main']"
            )
            next_week_link.click()
        except Exception as exc:
            print("Button not found or couldn't be clicked:", exc)

        # Fixed pause to give the page time to load after the click
        time.sleep(10)
        content_future = driver.find_element(By.TAG_NAME, "body").text

        return content_current, content_future
    finally:
        if driver:
            driver.quit()

In [7]:
# Capture the page text before and after the "next week" click.
content_current, content_future = scrape_academic_year()
# Remove asterisks from the next-week text before it is parsed.
content_future_clean = content_future.replace("*", "")
# Note: only this header is printed; the scraped text itself is not shown.
print("Scraped Content:")


Scraped Content:


In [8]:
# Parse the first capture. No existing frame is passed, so the base year
# comes from the current calendar year.
df = create_main_library_hours_df(content_current)
df

Unnamed: 0,date,day,opening_hour,closing_hour,is_open
0,2024-12-30,Monday,10,14,1
1,2024-12-31,Tuesday,0,0,0
2,2025-01-01,Wednesday,0,0,0
3,2025-01-02,Thursday,10,14,1
4,2025-01-03,Friday,10,14,1
5,2025-01-04,Saturday,0,0,0
6,2025-01-05,Sunday,0,0,0


In [9]:
# Parse the second capture. Passing df makes the base year come from the
# last date in df rather than from today's date.
df1 = create_main_library_hours_df(content_future_clean,df)
df1

Unnamed: 0,date,day,opening_hour,closing_hour,is_open
0,2025-01-06,Monday,0,0,0
1,2025-01-07,Tuesday,8,21,1
2,2025-01-08,Wednesday,8,21,1
3,2025-01-09,Thursday,8,21,1
4,2025-01-10,Friday,8,19,1
5,2025-01-11,Saturday,10,16,1
6,2025-01-12,Sunday,0,0,0


In [10]:
# Stack both parsed frames into one frame with a fresh row index.
df_long = pd.concat([df,df1], ignore_index=True)
print(df_long)

         date        day  opening_hour  closing_hour  is_open
0  2024-12-30     Monday            10            14        1
1  2024-12-31    Tuesday             0             0        0
2  2025-01-01  Wednesday             0             0        0
3  2025-01-02   Thursday            10            14        1
4  2025-01-03     Friday            10            14        1
5  2025-01-04   Saturday             0             0        0
6  2025-01-05     Sunday             0             0        0
7  2025-01-06     Monday             0             0        0
8  2025-01-07    Tuesday             8            21        1
9  2025-01-08  Wednesday             8            21        1
10 2025-01-09   Thursday             8            21        1
11 2025-01-10     Friday             8            19        1
12 2025-01-11   Saturday            10            16        1
13 2025-01-12     Sunday             0             0        0


In [11]:
# Show the previously published history again, just before merging.
print(df_main)

         date        day  opening_hour  closing_hour  is_open
0  2024-12-09     Monday            18            20        1
1  2024-12-10    Tuesday            10            20        1
2  2024-12-11  Wednesday             8            20        1
3  2024-12-12   Thursday             8            20        1
4  2024-12-13     Friday             8            17        1
5  2024-12-14   Saturday             0             0        0
6  2024-12-15     Sunday             0             0        0
7  2024-12-16     Monday            10            20        1
8  2024-12-17    Tuesday            10            20        1
9  2024-12-18  Wednesday             8            20        1
10 2024-12-19   Thursday             8            20        1
11 2024-12-20     Friday             8            17        1
12 2024-12-21   Saturday             0             0        0
13 2024-12-22     Sunday             0             0        0
14 2024-12-23     Monday            10            14        1
15 2024-

In [12]:
# Stack the published history (first) and the freshly scraped weeks (last),
# keyed by date, so that "keep last" below prefers the fresh scrape.
df_merged = pd.concat([df_main.set_index("date"), df_long.set_index("date")], axis=0)

# Ensure the index (date) is of datetime type
df_merged.index = pd.to_datetime(df_merged.index)

# Move the date back into a regular column
df_merged = df_merged.reset_index()

# Repair rows dated January 2024 by moving them to January 2025
mask = (df_merged['date'].dt.year == 2024) & (df_merged['date'].dt.month == 1)
df_merged.loc[mask, 'date'] = df_merged.loc[mask, 'date'] + pd.DateOffset(years=1)

# De-duplicate only AFTER the repair: a repaired row can land on a date that
# is also present with its correct year, and only one row per date may stay.
df_merged = df_merged.drop_duplicates(subset='date', keep='last')

# Sort chronologically
df_merged_sorted = df_merged.sort_values('date').reset_index(drop=True)

print(df_merged_sorted)

         date        day  opening_hour  closing_hour  is_open
0  2024-12-09     Monday            18            20        1
1  2024-12-10    Tuesday            10            20        1
2  2024-12-11  Wednesday             8            20        1
3  2024-12-12   Thursday             8            20        1
4  2024-12-13     Friday             8            17        1
5  2024-12-14   Saturday             0             0        0
6  2024-12-15     Sunday             0             0        0
7  2024-12-16     Monday            10            20        1
8  2024-12-17    Tuesday            10            20        1
9  2024-12-18  Wednesday             8            20        1
10 2024-12-19   Thursday             8            20        1
11 2024-12-20     Friday             8            17        1
12 2024-12-21   Saturday             0             0        0
13 2024-12-22     Sunday             0             0        0
14 2024-12-23     Monday            10            14        1
15 2024-

In [13]:
# Publish the merged schedule back to the Hugging Face Hub.
# SECURITY: the token comes from the HUGGINGFACE_TOKEN environment variable
# read earlier in this notebook. A literal access token was previously
# embedded in this call; it must be treated as leaked and revoked.
hf_dataset = Dataset.from_pandas(df_merged_sorted)
hf_dataset.push_to_hub(
    repo_id=repo_name,
    token=HUGGINGFACE_TOKEN,
    private=False,
    commit_message="Update dataset"
)
print(f"Dataset pushed to Hugging Face repository: {repo_name}")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Dataset pushed to Hugging Face repository: davnas/date_kth
