In [4]:
!pip install -q huggingface_hub pandas Dataset datasets


## 1. ADDING TOMORROW FORECAST TO THE PREDICTED DATA

In [None]:
import os
from huggingface_hub import hf_hub_download
import joblib

# Get the token from environment variables
HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")

if not HUGGINGFACE_TOKEN:
    raise ValueError("HUGGINGFACE_TOKEN is not set in the environment variables.")



In [5]:
import pandas as pd

# URL of the file in the Hugging Face dataset repository
file_url = "https://huggingface.co/datasets/davnas/library-occupancy/resolve/main/forecast_tomorrow.csv"

# Load the CSV file directly into a Pandas DataFrame
df_tomorrow_forecast = pd.read_csv(file_url)

# Add a "Prediction_date" column with CommitTime day + 1 day, and set the time to midnight
df_tomorrow_forecast['CommitTime'] = pd.to_datetime(df_tomorrow_forecast['CommitTime'])
df_tomorrow_forecast['Prediction_date'] = (df_tomorrow_forecast['CommitTime'] + pd.Timedelta(days=1)).dt.normalize()

# Display the updated DataFrame
df_tomorrow_forecast.tail()


Unnamed: 0,CommitTime,Time,Occupancy_main,Occupancy_southEast,Occupancy_north,Occupancy_south,Occupancy_angdomen,Occupancy_newton,Prediction_date
24,2025-01-01 19:26:26,19:30,0,0,0,0,0,0,2025-01-02
25,2025-01-01 19:26:26,20:00,0,0,0,0,0,0,2025-01-02
26,2025-01-01 19:26:26,20:30,0,0,0,0,0,0,2025-01-02
27,2025-01-01 19:26:26,21:00,0,0,0,0,0,0,2025-01-02
28,2025-01-01 19:26:26,21:30,0,0,0,0,0,0,2025-01-02


In [7]:
from datasets import load_dataset

# Step 2: Load the full forecast
repo_name = "davnas/library-occupancy"
hf_full_forecast = load_dataset(repo_name)
df_full_forecast = pd.DataFrame(hf_full_forecast['train'])

df_full_forecast.tail()

Generating train split:   0%|          | 0/58 [00:00<?, ? examples/s]

Unnamed: 0,CommitTime,Time,Occupancy_main,Occupancy_southEast,Occupancy_north,Occupancy_south,Occupancy_angdomen,Occupancy_newton,Prediction_date
53,2025-01-01 18:16:50,08:30,0,0,0,0,0,0,2025-01-02
54,2025-01-01 18:16:50,08:00,0,0,0,0,0,0,2025-01-02
55,2025-01-01 18:16:50,07:30,0,0,0,0,0,0,2025-01-02
56,2025-01-01 18:16:50,11:00,0,0,0,0,0,0,2025-01-02
57,2025-01-01 18:16:50,21:30,0,0,0,0,0,0,2025-01-02


In [8]:
from datasets import Dataset

# Merge the dataframes, keeping all columns
df_merged = pd.concat([df_full_forecast, df_tomorrow_forecast], ignore_index=True)

# Sort by CommitTime to ensure latest predictions are kept
df_merged = df_merged.sort_values('CommitTime')

# Drop duplicates keeping latest entry for each Time and Prediction_date combination
df_merged = df_merged.drop_duplicates(subset=['Time', 'Prediction_date'], keep='last')

# Reset index
df_merged = df_merged.reset_index(drop=True)

# Remove the column '__index_level_0__' if it already exists (as a safeguard)
if "__index_level_0__" in df_merged.columns:
    df_merged = df_merged.drop(columns=["__index_level_0__"])

df_merged.tail()

Unnamed: 0,CommitTime,Time,Occupancy_main,Occupancy_southEast,Occupancy_north,Occupancy_south,Occupancy_angdomen,Occupancy_newton,Prediction_date
53,2025-01-01 19:26:26,08:30,0,0,0,0,0,0,2025-01-02
54,2025-01-01 19:26:26,08:00,0,0,0,0,0,0,2025-01-02
55,2025-01-01 19:26:26,07:30,0,0,0,0,0,0,2025-01-02
56,2025-01-01 19:26:26,14:00,0,0,0,0,0,0,2025-01-02
57,2025-01-01 19:26:26,21:30,0,0,0,0,0,0,2025-01-02


In [9]:
# Convert the cleaned DataFrame to a Hugging Face dataset
hf_dataset = Dataset.from_pandas(df_merged)

# Push the dataset to the Hugging Face Hub
hf_dataset.push_to_hub(
    repo_id=repo_name,
    token=HUGGINGFACE_TOKEN,
    private=False,
    commit_message="Updated dataset without index column"
)

print("Dataset successfully uploaded without index column!")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Dataset successfully uploaded without index column!
