In [1]:
!pip install -q selenium webdriver-manager huggingface-hub datasets

In [3]:
from huggingface_hub import login
from datasets import Dataset, load_dataset
import pandas as pd
import hopsworks
import os

In [9]:
# Hopsworks setup
project = hopsworks.login()
fs = project.get_feature_store()

# Get or create feature group
feature_group = fs.get_or_create_feature_group(
    name="kth_opening_hour",
    version=1,
    description="Feature group containing store opening and closing hours.",
    primary_key=['id'],
    event_time=['date'],
    online_enabled=True
)

# Load existing data
df_main = feature_group.read()
df_main = df_main.sort_values(by='date', ascending=True)
df_main

if 'id' in df_main.columns:
    df_main.set_index('id', inplace=True)
    
# Remove timezone information
df_main['date'] = df_main['date'].dt.tz_localize(None)

df_main.tail()

2025-01-07 09:40:45,181 INFO: Closing external client and cleaning up certificates.
Connection closed.
2025-01-07 09:40:45,188 INFO: Initializing external client
2025-01-07 09:40:45,190 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-01-07 09:40:46,609 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1205426
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.52s) 


Unnamed: 0_level_0,date,day,opening_hour,closing_hour,is_open
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
51,2025-01-29,Friday,8,19,1
52,2025-01-30,Saturday,10,16,1
53,2025-01-31,Sunday,0,0,0
54,2025-02-01,Monday,8,21,1
55,2025-02-02,Tuesday,8,21,1


In [5]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait

from selenium.webdriver.support import expected_conditions as EC

from datetime import datetime, timedelta
import time
import re
from datasets import Dataset, load_dataset
import pandas as pd
from huggingface_hub import login
from datasets import Dataset

# Set up Selenium options
options = Options()
options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")

# Define the KTH Academic Year page URL dynamically
KTH_ACADEMIC_YEAR_URL = "https://www.kth.se/en/biblioteket/anvanda-biblioteket/oppettider-kontakt"
print(KTH_ACADEMIC_YEAR_URL)

https://www.kth.se/en/biblioteket/anvanda-biblioteket/oppettider-kontakt


In [6]:
import pandas as pd
import re
from datetime import datetime
from typing import Optional, Tuple, List
import logging

# Optional: Configure logging for debugging purposes
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def parse_hours(time_str: str) -> Tuple[int, int, int]:
    """
    Parse opening hours string into opening hour, closing hour, and status.
    
    Parameters:
    - time_str (str): Opening hours string (e.g., "10–14" or "Closed").
    
    Returns:
    - Tuple[int, int, int]: (opening_hour, closing_hour, is_open)
    """
    time_str = time_str.replace('*', '').strip()
    if time_str.lower() == 'closed':
        return (0, 0, 0)
    elif '–' in time_str:
        try:
            open_hour, close_hour = map(int, time_str.split('–'))
            return (open_hour, close_hour, 1)
        except ValueError as e:
            logger.error(f"Error parsing hours '{time_str}': {e}")
            return (0, 0, 0)
    return (0, 0, 0)

def get_base_year(df: Optional[pd.DataFrame]) -> int:
    """
    Get the base year from the last entry of the DataFrame, or use the current year.
    
    Parameters:
    - df (Optional[pd.DataFrame]): Existing DataFrame with a 'date' column.
    
    Returns:
    - int: Base year for date parsing.
    """
    if df is not None and not df.empty:
        last_date = pd.to_datetime(df['date'].iloc[-1])
        return last_date.year
    return datetime.now().year

def extract_date_ranges(text: str, base_year: int) -> pd.DatetimeIndex:
    """
    Extract multiple date ranges from text and handle year transitions.
    
    Parameters:
    - text (str): Input text containing date ranges in 'dd/mm–dd/mm' format.
    - base_year (int): The base year to start parsing dates.
    
    Returns:
    - pd.DatetimeIndex: A continuous range of dates covering all extracted ranges.
    """
    date_ranges = re.findall(r'(\d{2}/\d{2})–(\d{2}/\d{2})', text)
    if not date_ranges:
        logger.warning("No date ranges found in the text.")
        return pd.DatetimeIndex([])
    
    all_dates = []
    current_year = base_year
    previous_end_month = None
    previous_end_day = None
    
    for start_str, end_str in date_ranges:
        try:
            start_day, start_month = map(int, start_str.split('/'))
            end_day, end_month = map(int, end_str.split('/'))
        except ValueError as e:
            logger.error(f"Error parsing date strings '{start_str}–{end_str}': {e}")
            continue
        
        # Determine if the year should be incremented based on previous date range
        if previous_end_month is not None:
            if (start_month < previous_end_month) or \
               (start_month == previous_end_month and start_day <= previous_end_day):
                current_year += 1
        
        start_year = current_year
        
        # Determine end_year based on whether the end date is before the start date
        if (end_month < start_month) or \
           (end_month == start_month and end_day < start_day):
            end_year = start_year + 1
        else:
            end_year = start_year
        
        # Create datetime objects with error handling
        try:
            start_date = pd.to_datetime(f'{start_year}-{start_month:02d}-{start_day:02d}')
            end_date = pd.to_datetime(f'{end_year}-{end_month:02d}-{end_day:02d}')
        except ValueError as e:
            logger.error(f"Error creating datetime objects for '{start_str}–{end_str}': {e}")
            continue
        
        # Log the parsed date range
        logger.info(f"Parsed date range: {start_date.date()} to {end_date.date()}")
        
        # Append the date range to all_dates
        all_dates.extend(pd.date_range(start=start_date, end=end_date))
        
        # Update previous_end_month and previous_end_day for next iteration
        previous_end_month = end_month
        previous_end_day = end_day
    
    return pd.DatetimeIndex(all_dates)

def create_main_library_hours_df(text: str, existing_df: Optional[pd.DataFrame] = None) -> pd.DataFrame:
    """
    Create a DataFrame with main library hours from the provided text.
    
    Parameters:
    - text (str): Input text containing library hours information.
    - existing_df (Optional[pd.DataFrame]): Existing DataFrame to determine the base year.
    
    Returns:
    - pd.DataFrame: DataFrame containing date, day, opening_hour, closing_hour, and is_open.
    """
    # Initialize lists to store DataFrame entries
    dates = []
    days = []
    opening_hours = []
    closing_hours = []
    is_open = []
    
    # Define days of the week for reference
    weekdays = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
    
    # Determine the base year from the existing DataFrame or use the current year
    base_year = get_base_year(existing_df)
    logger.info(f"Base year for parsing dates: {base_year}")
    
    # Extract all date ranges from the text
    dates_ranges = extract_date_ranges(text, base_year)
    if dates_ranges.empty:
        logger.warning("No dates extracted. Returning an empty DataFrame.")
        return pd.DataFrame()
    
    # Split text into lines and remove any leading/trailing whitespace
    lines = [line.strip() for line in text.split('\n') if line.strip()]
    
    # Initialize variables to track the current section and day
    main_library_data = {}
    in_main_library = False
    current_day = None
    
    for line in lines:
        # Identify the start of the Main Library section
        if "Main Library" in line:
            in_main_library = True
            logger.debug("Entering Main Library section.")
            continue
        # Identify the end of the Main Library section
        elif "Södertälje" in line:
            in_main_library = False
            logger.debug("Exiting Main Library section.")
            continue
        
        # Parse lines within the Main Library section
        if in_main_library:
            if line in weekdays:
                current_day = line
                logger.debug(f"Current day set to: {current_day}")
            elif current_day and (line.lower() == "closed" or '–' in line):
                main_library_data[current_day] = line
                logger.debug(f"Set hours for {current_day}: {line}")
                current_day = None  # Reset for the next day
    
    # Log the parsed main library data
    logger.info(f"Main Library Hours Data: {main_library_data}")
    
    # Iterate over each date in the extracted date ranges
    for date in dates_ranges:
        day = weekdays[date.weekday()]  # Get day name from date
        if day in main_library_data:
            hours_str = main_library_data[day]
            open_hour, close_hour, status = parse_hours(hours_str)
            
            dates.append(date)
            days.append(day)
            opening_hours.append(open_hour)
            closing_hours.append(close_hour)
            is_open.append(status)
            logger.debug(f"Added entry: {date.date()}, {day}, {open_hour}-{close_hour}, Open: {status}")
        else:
            logger.debug(f"No hours data for {day} on {date.date()}. Skipping.")
    
    # Create the DataFrame from the collected data
    df = pd.DataFrame({
        'date': dates,
        'day': days,
        'opening_hour': opening_hours,
        'closing_hour': closing_hours,
        'is_open': is_open
    })
    
    # Sort the DataFrame by date to ensure linear progression
    df_sorted = df.sort_values('date').reset_index(drop=True)
    
    logger.info("DataFrame creation complete.")
    return df_sorted


In [7]:
# Function to scrape the academic year information
def scrape_academic_year():
    driver = None
    try:
        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
        driver.get(KTH_ACADEMIC_YEAR_URL)
        time.sleep(10)  # Wait for the page to load
        content_current = driver.find_element(By.TAG_NAME, "body").text
        
        try:
            button = driver.find_element(By.XPATH, "//div[@class='nextweek']/a[@data-libraryname='main']")
            button.click()

        except Exception as e:
            print("Button not found or couldn't be clicked:", e)
            
        time.sleep(10)  # Wait for the page to load
        content_future = driver.find_element(By.TAG_NAME, "body").text
        
        return content_current, content_future
    finally:
        if driver:
            driver.quit()

In [8]:
content_current, content_future = scrape_academic_year()
content_future_clean = content_future.replace("*", "")
print("Scraped Content:")


2025-01-07 09:40:06,383 INFO: Get LATEST chromedriver version for google-chrome
2025-01-07 09:40:06,538 INFO: Get LATEST chromedriver version for google-chrome
2025-01-07 09:40:06,565 INFO: Get LATEST chromedriver version for google-chrome
2025-01-07 09:40:06,632 INFO: WebDriver version 131.0.6778.204 selected
2025-01-07 09:40:06,635 INFO: Modern chrome version https://storage.googleapis.com/chrome-for-testing-public/131.0.6778.204/linux64/chromedriver-linux64.zip
2025-01-07 09:40:06,635 INFO: About to download new driver from https://storage.googleapis.com/chrome-for-testing-public/131.0.6778.204/linux64/chromedriver-linux64.zip
2025-01-07 09:40:06,803 INFO: Driver downloading response is 200
2025-01-07 09:40:07,356 INFO: Get LATEST chromedriver version for google-chrome
2025-01-07 09:40:07,522 INFO: Driver has been saved in cache [/home/dave/.wdm/drivers/chromedriver/linux64/131.0.6778.204]
Scraped Content:


In [8]:
df = create_main_library_hours_df(content_current)
df

2025-01-06 21:38:13,156 INFO: Base year for parsing dates: 2025
2025-01-06 21:38:13,161 INFO: Parsed date range: 2025-01-06 to 2025-01-12
2025-01-06 21:38:13,166 INFO: Parsed date range: 2026-01-06 to 2026-01-12
2025-01-06 21:38:13,169 INFO: Main Library Hours Data: {'Monday': 'Closed', 'Tuesday': '8*–21', 'Wednesday': '8*–21', 'Thursday': '8*–21', 'Friday': '8*–19', 'Saturday': '10–16', 'Sunday': 'Closed'}
2025-01-06 21:38:13,172 INFO: DataFrame creation complete.


Unnamed: 0,date,day,opening_hour,closing_hour,is_open
0,2025-01-06,Monday,0,0,0
1,2025-01-07,Tuesday,8,21,1
2,2025-01-08,Wednesday,8,21,1
3,2025-01-09,Thursday,8,21,1
4,2025-01-10,Friday,8,19,1
5,2025-01-11,Saturday,10,16,1
6,2025-01-12,Sunday,0,0,0
7,2026-01-06,Tuesday,8,21,1
8,2026-01-07,Wednesday,8,21,1
9,2026-01-08,Thursday,8,21,1


In [9]:
df1 = create_main_library_hours_df(content_future_clean,df)
df1

2025-01-06 21:38:13,191 INFO: Base year for parsing dates: 2026
2025-01-06 21:38:13,195 INFO: Parsed date range: 2026-01-13 to 2026-01-19
2025-01-06 21:38:13,198 INFO: Parsed date range: 2027-01-06 to 2027-01-12
2025-01-06 21:38:13,200 INFO: Main Library Hours Data: {'Monday': '8–21', 'Tuesday': '8–21', 'Wednesday': '8–21', 'Thursday': '8–21', 'Friday': '8–19', 'Saturday': '10–16', 'Sunday': 'Closed'}
2025-01-06 21:38:13,203 INFO: DataFrame creation complete.


Unnamed: 0,date,day,opening_hour,closing_hour,is_open
0,2026-01-13,Tuesday,8,21,1
1,2026-01-14,Wednesday,8,21,1
2,2026-01-15,Thursday,8,21,1
3,2026-01-16,Friday,8,19,1
4,2026-01-17,Saturday,10,16,1
5,2026-01-18,Sunday,0,0,0
6,2026-01-19,Monday,8,21,1
7,2027-01-06,Wednesday,8,21,1
8,2027-01-07,Thursday,8,21,1
9,2027-01-08,Friday,8,19,1


In [10]:
df_long = pd.concat([df,df1], ignore_index=True)
print(df_long)

         date        day  opening_hour  closing_hour  is_open
0  2025-01-06     Monday             0             0        0
1  2025-01-07    Tuesday             8            21        1
2  2025-01-08  Wednesday             8            21        1
3  2025-01-09   Thursday             8            21        1
4  2025-01-10     Friday             8            19        1
5  2025-01-11   Saturday            10            16        1
6  2025-01-12     Sunday             0             0        0
7  2026-01-06    Tuesday             8            21        1
8  2026-01-07  Wednesday             8            21        1
9  2026-01-08   Thursday             8            21        1
10 2026-01-09     Friday             8            19        1
11 2026-01-10   Saturday            10            16        1
12 2026-01-11     Sunday             0             0        0
13 2026-01-12     Monday             0             0        0
14 2026-01-13    Tuesday             8            21        1
15 2026-

In [11]:
df = df_long.copy()

df['date'] = pd.to_datetime(df['date'])
df['date'] = df['date'].apply(lambda x: x.replace(year=2000))
df

Unnamed: 0,date,day,opening_hour,closing_hour,is_open
0,2000-01-06,Monday,0,0,0
1,2000-01-07,Tuesday,8,21,1
2,2000-01-08,Wednesday,8,21,1
3,2000-01-09,Thursday,8,21,1
4,2000-01-10,Friday,8,19,1
5,2000-01-11,Saturday,10,16,1
6,2000-01-12,Sunday,0,0,0
7,2000-01-06,Tuesday,8,21,1
8,2000-01-07,Wednesday,8,21,1
9,2000-01-08,Thursday,8,21,1


In [12]:
import pandas as pd
from datetime import datetime, timedelta

# Today's date, day, and month
today = datetime.now()
day_month_today = today.strftime("%m-%d")

# Adjusting the DataFrame
def adjust_dates(df, today):
    adjusted_df = df.copy()
    for i, row in adjusted_df.iterrows():
        row_day_month = row['date'].strftime("%m-%d")
        if row_day_month == day_month_today:
            current_year = today.year
            row_year = row['date'].year
            adjusted_year = current_year if row_year == 2000 else row_year + 1
            adjusted_df.loc[i:, 'date'] = pd.date_range(
                start=row['date'].replace(year=adjusted_year),
                periods=len(adjusted_df) - i,
                freq='D'
            )
            adjusted_df.loc[:i-1, 'date'] = pd.date_range(
                end=row['date'].replace(year=adjusted_year) - timedelta(days=1),
                periods=i,
                freq='D'
            )
            break
    return adjusted_df

adjusted_df = adjust_dates(df, today)

In [13]:
df_long = adjusted_df.copy()
df_long

Unnamed: 0,date,day,opening_hour,closing_hour,is_open
0,2025-01-06,Monday,0,0,0
1,2025-01-07,Tuesday,8,21,1
2,2025-01-08,Wednesday,8,21,1
3,2025-01-09,Thursday,8,21,1
4,2025-01-10,Friday,8,19,1
5,2025-01-11,Saturday,10,16,1
6,2025-01-12,Sunday,0,0,0
7,2025-01-13,Tuesday,8,21,1
8,2025-01-14,Wednesday,8,21,1
9,2025-01-15,Thursday,8,21,1


In [14]:
# Merge the dataframes
df_merged = pd.concat([df_main.set_index("date"), df_long.set_index("date")], axis=0)

# Ensure the index (date) is of datetime type
df_merged.index = pd.to_datetime(df_merged.index)

# Remove duplicates keeping the most recent entry
df_merged = df_merged[~df_merged.index.duplicated(keep="last")]

# Reset index
df_merged = df_merged.reset_index()

# Correggi le date di gennaio 2024 in gennaio 2025
mask = (df_merged['date'].dt.year == 2024) & (df_merged['date'].dt.month == 1)
df_merged.loc[mask, 'date'] = df_merged.loc[mask, 'date'] + pd.DateOffset(years=1)

# Ordina cronologicamente
df_merged_sorted = df_merged.sort_values('date').reset_index(drop=True)

print(df_merged_sorted)

         date        day  opening_hour  closing_hour  is_open
0  2024-12-09     Monday            18            21        1
1  2024-12-10    Tuesday            10            21        1
2  2024-12-11  Wednesday             8            21        1
3  2024-12-12   Thursday             8            21        1
4  2024-12-13     Friday             8            18        1
5  2024-12-14   Saturday             0             0        0
6  2024-12-15     Sunday             0             0        0
7  2024-12-16     Monday            10            21        1
8  2024-12-17    Tuesday            10            21        1
9  2024-12-18  Wednesday             8            21        1
10 2024-12-19   Thursday             8            21        1
11 2024-12-20     Friday             8            19        1
12 2024-12-21   Saturday             0             0        0
13 2024-12-22     Sunday             0             0        0
14 2024-12-23     Monday            10            14        1
15 2024-

In [15]:
# Print column names
print(df_merged_sorted.columns)

Index(['date', 'day', 'opening_hour', 'closing_hour', 'is_open'], dtype='object')


In [16]:
import pandas as pd

# Assuming 'df' is your existing DataFrame
df_merged_sorted.reset_index(inplace=True)
df_merged_sorted.rename(columns={'index': 'id'}, inplace=True)
df_merged_sorted

Unnamed: 0,id,date,day,opening_hour,closing_hour,is_open
0,0,2024-12-09,Monday,18,21,1
1,1,2024-12-10,Tuesday,10,21,1
2,2,2024-12-11,Wednesday,8,21,1
3,3,2024-12-12,Thursday,8,21,1
4,4,2024-12-13,Friday,8,18,1
5,5,2024-12-14,Saturday,0,0,0
6,6,2024-12-15,Sunday,0,0,0
7,7,2024-12-16,Monday,10,21,1
8,8,2024-12-17,Tuesday,10,21,1
9,9,2024-12-18,Wednesday,8,21,1


In [17]:
feature_group.insert(df_merged_sorted)
print("Dataset successfully uploaded to Hopsworks Feature Store.")

Uploading Dataframe: 100.00% |██████████| Rows 56/56 | Elapsed Time: 00:01 | Remaining Time: 00:00


Launching job: kth_opening_hour_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1205426/jobs/named/kth_opening_hour_1_offline_fg_materialization/executions
Dataset successfully uploaded to Hopsworks Feature Store.
