In [1]:
!pip install -q huggingface_hub pandas Dataset datasets


## 2. Real-Prediction comparison

In [3]:
import os
from huggingface_hub import hf_hub_download
import joblib

# Get the token from environment variables
HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")

if not HUGGINGFACE_TOKEN:
    raise ValueError("HUGGINGFACE_TOKEN is not set in the environment variables.")



### 1. Retrieve Todays Prediction

In [None]:
import pandas as pd
import numpy as np
from datasets import load_dataset
from datasets import Dataset

import matplotlib.pyplot as plt
from datetime import timedelta

# Step 1: Load the seating dataset
repo_name_seating = "davnas/occupancy_perc"
hf_dataset_seating = load_dataset(repo_name_seating)

# Combine all splits into a single DataFrame with appropriate data types
df_seating = pd.concat(
    [split.to_pandas().astype({'KTH Library': int, 'South-East Gallery': int, 'North Gallery': int,
                               'South Gallery': int, 'Ångdomen': int, 'Newton': int})
     for split in hf_dataset_seating.values()],
    ignore_index=True
)
df_seating.set_index('index', inplace=True)
df_seating.index = pd.to_datetime(df_seating.index)  # Ensure the index is datetime

# Resample to 30-minute intervals and fill missing data (while keeping original rounding)
df_seating_resampled = df_seating.resample('30T').mean().interpolate(method='linear').round().astype(int)

# Filter data to only include today and tomorrow
today = pd.Timestamp.today().normalize()  # Today's date without time
tomorrow_midnight = (today + timedelta(days=1)).normalize()  # Tomorrow's midnight
end_of_tomorrow = tomorrow_midnight + timedelta(days=1)  # End of tomorrow, for the full 24 hours

# Get the last available entry for today
last_valid_entry = df_seating_resampled[df_seating_resampled.index.date == today.date()].index[-1]

# Create the datetime range from the last entry until tomorrow midnight
time_range = pd.date_range(start=last_valid_entry + timedelta(minutes=30), end=end_of_tomorrow, freq='30T')

# Create a DataFrame for this time range with NaN values
nan_df = pd.DataFrame(np.nan, index=time_range, columns=df_seating_resampled.columns)

# Concatenate the original data with the NaN data (after the last valid entry until tomorrow midnight)
df_seating_combined = pd.concat([df_seating_resampled, nan_df])

# Ensure the data has the correct format, with NaN entries for after today until tomorrow midnight
df_seating_combined = df_seating_combined.sort_index()

# Filter to keep only data from today and tomorrow
df_today_tomorrow = df_seating_combined[(df_seating_combined.index >= today) & (df_seating_combined.index < end_of_tomorrow)]

# Split into downloaded data (non-NaN) and null data (NaN)
df_today_real = df_today_tomorrow[df_today_tomorrow.notna().all(axis=1)].copy()
df_null = df_today_tomorrow[df_today_tomorrow.isna().any(axis=1)].copy()

# Recombine into the full DataFrame
df_full = pd.concat([df_today_real, df_null]).sort_index()

# Plot the data for today and tomorrow
#plt.figure(figsize=(10, 6))
#plt.plot(df_full.index, df_full['South-East Gallery'], label="South-East Gallery")
#plt.title("South-East Gallery Occupancy (Today's and Tomorrow's Data)")
#plt.xlabel("Time")
#plt.ylabel("Occupancy")
#plt.legend()
#plt.xticks(rotation=45)
#plt.grid(True)
#plt.tight_layout()
#plt.show()

# Optionally, print the split DataFrames to check
df_today_real.tail()




README.md:   0%|          | 0.00/511 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/120k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/10731 [00:00<?, ? examples/s]

Unnamed: 0,KTH Library,South-East Gallery,North Gallery,South Gallery,Ångdomen,Newton
2025-01-01 19:30:00,0.0,0.0,0.0,0.0,0.0,0.0
2025-01-01 20:00:00,0.0,0.0,0.0,0.0,0.0,0.0
2025-01-01 20:30:00,0.0,0.0,0.0,0.0,0.0,0.0
2025-01-01 21:00:00,0.0,0.0,0.0,0.0,0.0,0.0
2025-01-01 21:30:00,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# Check if the last entry's time is before 21:30
last_entry_time = df_today_real.index[-1].time()

# Set the threshold time as 21:30
threshold_time = pd.to_datetime("21:30").time()

if last_entry_time < threshold_time:
    raise ValueError("Not enough data in df_todays_meas")


In [16]:
# Filter data for time between 07:30 and 21:30
start_time = pd.to_datetime("07:30").time()
end_time = pd.to_datetime("21:30").time()

df_filtered = df_today_real.between_time(start_time, end_time)
df_today_real = df_filtered.copy()
df_today_real.tail()

Unnamed: 0,Occupancy_main_real,Occupancy_southEast_real,Occupancy_north_real,Occupancy_south_real,Occupancy_angdomen_real,Occupancy_newton_real,Date,Time
2025-01-01 07:30:00,0.0,0.0,0.0,0.0,0.0,0.0,2025-01-01,07:30:00
2025-01-01 08:00:00,0.0,0.0,0.0,0.0,0.0,0.0,2025-01-01,08:00:00
2025-01-01 08:30:00,0.0,0.0,0.0,0.0,0.0,0.0,2025-01-01,08:30:00
2025-01-01 09:00:00,0.0,0.0,0.0,0.0,0.0,0.0,2025-01-01,09:00:00
2025-01-01 09:30:00,0.0,0.0,0.0,0.0,0.0,0.0,2025-01-01,09:30:00
2025-01-01 10:00:00,0.0,0.0,0.0,0.0,0.0,0.0,2025-01-01,10:00:00
2025-01-01 10:30:00,0.0,0.0,0.0,0.0,0.0,0.0,2025-01-01,10:30:00
2025-01-01 11:00:00,0.0,0.0,0.0,0.0,0.0,0.0,2025-01-01,11:00:00
2025-01-01 11:30:00,0.0,0.0,0.0,0.0,0.0,0.0,2025-01-01,11:30:00
2025-01-01 12:00:00,0.0,0.0,0.0,0.0,0.0,0.0,2025-01-01,12:00:00


### 2. Retrieve past Prediction for today:


In [17]:
# Step 2: Load the full forecast
repo_name = "davnas/library-occupancy"
hf_full_forecast = load_dataset(repo_name)
df_full_forecast = pd.DataFrame(hf_full_forecast['train'])
df_full_forecast.tail()

Unnamed: 0,CommitTime,Time,Occupancy_main,Occupancy_southEast,Occupancy_north,Occupancy_south,Occupancy_angdomen,Occupancy_newton,Prediction_date
53,2025-01-01 21:20:41,08:30,0,0,0,0,0,0,2025-01-02
54,2025-01-01 21:20:41,08:00,0,0,0,0,0,0,2025-01-02
55,2025-01-01 21:20:41,07:30,0,0,0,0,0,0,2025-01-02
56,2025-01-01 21:20:41,14:00,0,0,0,0,0,0,2025-01-02
57,2025-01-01 21:20:41,21:30,0,0,0,0,0,0,2025-01-02


In [18]:
today = pd.Timestamp.now().date()
df_today_predicted = df_full_forecast[df_full_forecast['Prediction_date'] == str(today)]
df_today_predicted = df_today_predicted.sort_values('Time')
df_today_predicted.tail()

Unnamed: 0,CommitTime,Time,Occupancy_main,Occupancy_southEast,Occupancy_north,Occupancy_south,Occupancy_angdomen,Occupancy_newton,Prediction_date
9,2024-12-31 17:23:34,19:30,0,0,0,0,0,0,2025-01-01
10,2024-12-31 17:23:34,20:00,0,0,0,0,0,0,2025-01-01
11,2024-12-31 17:23:34,20:30,0,0,0,0,0,0,2025-01-01
12,2024-12-31 17:23:34,21:00,0,0,0,0,0,0,2025-01-01
7,2024-12-31 17:23:34,21:30,0,0,0,0,0,0,2025-01-01


### 3. Merging the predicted and reality

In [19]:
df_today_real

Unnamed: 0,Occupancy_main_real,Occupancy_southEast_real,Occupancy_north_real,Occupancy_south_real,Occupancy_angdomen_real,Occupancy_newton_real,Date,Time
2025-01-01 07:30:00,0.0,0.0,0.0,0.0,0.0,0.0,2025-01-01,07:30:00
2025-01-01 08:00:00,0.0,0.0,0.0,0.0,0.0,0.0,2025-01-01,08:00:00
2025-01-01 08:30:00,0.0,0.0,0.0,0.0,0.0,0.0,2025-01-01,08:30:00
2025-01-01 09:00:00,0.0,0.0,0.0,0.0,0.0,0.0,2025-01-01,09:00:00
2025-01-01 09:30:00,0.0,0.0,0.0,0.0,0.0,0.0,2025-01-01,09:30:00
2025-01-01 10:00:00,0.0,0.0,0.0,0.0,0.0,0.0,2025-01-01,10:00:00
2025-01-01 10:30:00,0.0,0.0,0.0,0.0,0.0,0.0,2025-01-01,10:30:00
2025-01-01 11:00:00,0.0,0.0,0.0,0.0,0.0,0.0,2025-01-01,11:00:00
2025-01-01 11:30:00,0.0,0.0,0.0,0.0,0.0,0.0,2025-01-01,11:30:00
2025-01-01 12:00:00,0.0,0.0,0.0,0.0,0.0,0.0,2025-01-01,12:00:00


In [20]:
df_today_real.tail()

Unnamed: 0,Occupancy_main_real,Occupancy_southEast_real,Occupancy_north_real,Occupancy_south_real,Occupancy_angdomen_real,Occupancy_newton_real,Date,Time
2025-01-01 19:30:00,0.0,0.0,0.0,0.0,0.0,0.0,2025-01-01,19:30:00
2025-01-01 20:00:00,0.0,0.0,0.0,0.0,0.0,0.0,2025-01-01,20:00:00
2025-01-01 20:30:00,0.0,0.0,0.0,0.0,0.0,0.0,2025-01-01,20:30:00
2025-01-01 21:00:00,0.0,0.0,0.0,0.0,0.0,0.0,2025-01-01,21:00:00
2025-01-01 21:30:00,0.0,0.0,0.0,0.0,0.0,0.0,2025-01-01,21:30:00


In [21]:
df_today_predicted.tail()

Unnamed: 0,CommitTime,Time,Occupancy_main,Occupancy_southEast,Occupancy_north,Occupancy_south,Occupancy_angdomen,Occupancy_newton,Prediction_date
9,2024-12-31 17:23:34,19:30,0,0,0,0,0,0,2025-01-01
10,2024-12-31 17:23:34,20:00,0,0,0,0,0,0,2025-01-01
11,2024-12-31 17:23:34,20:30,0,0,0,0,0,0,2025-01-01
12,2024-12-31 17:23:34,21:00,0,0,0,0,0,0,2025-01-01
7,2024-12-31 17:23:34,21:30,0,0,0,0,0,0,2025-01-01


In [24]:
# Step 1: Format df_today_real (ensure Date, Time, and values are correctly aligned)
df_today_real.rename(columns={
    'KTH Library': 'Occupancy_main_real',
    'South-East Gallery': 'Occupancy_southEast_real',
    'North Gallery': 'Occupancy_north_real',
    'South Gallery': 'Occupancy_south_real',
    'Ångdomen': 'Occupancy_angdomen_real',
    'Newton': 'Occupancy_newton_real'
}, inplace=True)

df_today_real['Date'] = df_today_real.index.date
df_today_real['Time'] = df_today_real.index.strftime('%H:%M')
df_today_real.tail()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_today_real.rename(columns={
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_today_real['Date'] = df_today_real.index.date
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_today_real['Time'] = df_today_real.index.strftime('%H:%M')


Unnamed: 0,Occupancy_main_real,Occupancy_southEast_real,Occupancy_north_real,Occupancy_south_real,Occupancy_angdomen_real,Occupancy_newton_real,Date,Time
2025-01-01 19:30:00,0.0,0.0,0.0,0.0,0.0,0.0,2025-01-01,19:30
2025-01-01 20:00:00,0.0,0.0,0.0,0.0,0.0,0.0,2025-01-01,20:00
2025-01-01 20:30:00,0.0,0.0,0.0,0.0,0.0,0.0,2025-01-01,20:30
2025-01-01 21:00:00,0.0,0.0,0.0,0.0,0.0,0.0,2025-01-01,21:00
2025-01-01 21:30:00,0.0,0.0,0.0,0.0,0.0,0.0,2025-01-01,21:30


In [25]:
# Step 3: Rename the columns in df_today_predicted to match the desired output
df_today_predicted.rename(columns={
    'Occupancy_main': 'Occupancy_main_predicted',
    'Occupancy_southEast': 'Occupancy_southEast_predicted',
    'Occupancy_north': 'Occupancy_north_predicted',
    'Occupancy_south': 'Occupancy_south_predicted',
    'Occupancy_angdomen': 'Occupancy_angdomen_predicted',
    'Occupancy_newton': 'Occupancy_newton_predicted'
}, inplace=True)

# Step 4: Format df_today_predicted (ensure Date, Time, and values are correctly aligned)
df_today_predicted['Date'] = pd.to_datetime(df_today_predicted['Prediction_date']).dt.date
df_today_predicted['Time'] = df_today_predicted['Time']
df_today_predicted.tail()

Unnamed: 0,CommitTime,Time,Occupancy_main_predicted,Occupancy_southEast_predicted,Occupancy_north_predicted,Occupancy_south_predicted,Occupancy_angdomen_predicted,Occupancy_newton_predicted,Prediction_date,Date
9,2024-12-31 17:23:34,19:30,0,0,0,0,0,0,2025-01-01,2025-01-01
10,2024-12-31 17:23:34,20:00,0,0,0,0,0,0,2025-01-01,2025-01-01
11,2024-12-31 17:23:34,20:30,0,0,0,0,0,0,2025-01-01,2025-01-01
12,2024-12-31 17:23:34,21:00,0,0,0,0,0,0,2025-01-01,2025-01-01
7,2024-12-31 17:23:34,21:30,0,0,0,0,0,0,2025-01-01,2025-01-01


In [26]:
df_merged = pd.merge(df_today_real, df_today_predicted, on=['Date', 'Time'], how='outer')


# Step 6: Reorder and finalize the columns
final_columns = ['Date', 'Time', 'Occupancy_main_real', 'Occupancy_main_predicted',
                 'Occupancy_southEast_real', 'Occupancy_southEast_predicted',
                 'Occupancy_north_real', 'Occupancy_north_predicted',
                 'Occupancy_south_real', 'Occupancy_south_predicted',
                 'Occupancy_angdomen_real', 'Occupancy_angdomen_predicted',
                 'Occupancy_newton_real', 'Occupancy_newton_predicted']

df_final = df_merged[final_columns]
df_final.to_csv("df_final.csv", index=False)
df_final.tail()

Unnamed: 0,Date,Time,Occupancy_main_real,Occupancy_main_predicted,Occupancy_southEast_real,Occupancy_southEast_predicted,Occupancy_north_real,Occupancy_north_predicted,Occupancy_south_real,Occupancy_south_predicted,Occupancy_angdomen_real,Occupancy_angdomen_predicted,Occupancy_newton_real,Occupancy_newton_predicted
24,2025-01-01,19:30,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0
25,2025-01-01,20:00,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0
26,2025-01-01,20:30,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0
27,2025-01-01,21:00,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0
28,2025-01-01,21:30,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0


In [None]:
print(df_final.isnull().sum())  # Check for NaN values
#print(df_final.describe())  # Check for zero values or anomalies


### 4. Calculate Metrics

In [28]:
# Calculate metrics
locations = ['main', 'southEast', 'north', 'south', 'angdomen', 'newton']
date = df_final['Date'].iloc[0]

metrics_data = []
for metric in ['RME', 'MAPE']:
    row = {'Date': date, 'Time': metric}
    for loc in locations:
        real_col = f'Occupancy_{loc}_real'
        pred_col = f'Occupancy_{loc}_predicted'
        y_true = df_final[real_col].values
        y_pred = df_final[pred_col].values

        if metric == 'MAPE':
            if np.all(y_true == 0):
                value = 0.0  # Assign 0.0 or NaN if all y_true values are zero
            else:
                y_true_safe = np.where(y_true == 0, 1e-9, y_true)  # Replace zeros with a small number
                value = round(np.mean(np.abs((y_true_safe - y_pred) / y_true_safe)) * 100, 2)
        else:  # RME
            value = round(np.sqrt(np.mean((y_true - y_pred) ** 2)), 2)

        row[real_col] = value
        row[pred_col] = value
    metrics_data.append(row)

# Add metrics rows
df_final = pd.concat([df_final, pd.DataFrame(metrics_data)], ignore_index=True)

# Save CSV
csv_file_path = "occupancy_today.csv"
df_final.to_csv(csv_file_path, index=False)


Metrics added and saved to occupancy_today.csv


### 5. Upload to Hugginface

In [16]:
import pandas as pd
from huggingface_hub import HfApi, HfFolder

# Hugging Face token and repository details
repo_name = "davnas/library-occupancy"  # Replace with your repository name
#csv_file_path = "data.csv"  # Replace with your CSV file path

# Authenticate using the token
HfFolder.save_token(HUGGINGFACE_TOKEN)
api = HfApi()

# Upload the CSV file
api.upload_file(
    path_or_fileobj=csv_file_path,
    path_in_repo="Real_vs_Predicted_Occupancy_Data.csv",  # The name of the file in the repository
    repo_id=repo_name,
    repo_type="dataset",  # Indicates this is a dataset repository
    token=HUGGINGFACE_TOKEN
)


print(f"CSV file successfully uploaded to Hugging Face repository: {repo_name}")



CSV file successfully uploaded to Hugging Face repository: davnas/library-occupancy


In [16]:
'''
Final CSV format
Date,Time,Occupancy_main_real,Occupancy_main_predicted,Occupancy_southEast_real,Occupancy_southEast_predicted,Occupancy_north_real,Occupancy_north_predicted,Occupancy_south_real,Occupancy_south_predicted,Occupancy_angdomen_real,Occupancy_angdomen_predicted,Occupancy_newton_real,Occupancy_newton_predicted
2024-12-26,08:00,0,-2,5,4,3,6,4,1,2,3,0,-1
2024-12-26,08:30,0,-4,6,6,4,-1,3,1,2,-3,5,7
2024-12-26,09:00,28,33,35,32,30,30,25,27,20,22,10,11
2024-12-26,09:30,40,44,45,41,38,34,32,37,28,28,22,19
2024-12-26,10:00,54,50,50,54,48,49,42,47,38,40,35,30
2024-12-26,10:30,64,60,60,65,58,55,52,53,48,49,45,44
2024-12-26,11:00,71,66,68,71,65,64,60,62,55,50,50,54
2024-12-26,11:30,70,69,67,72,64,61,59,57,54,59,49,50
2024-12-26,12:00,51,55,50,47,48,51,45,44,43,47,40,38
2024-12-26,12:30,53,50,52,53,50,49,47,49,45,44,42,40
2024-12-26,13:00,64,63,62,67,60,58,55,59,50,54,48,47
2024-12-26,13:30,68,69,66,64,64,65,59,63,55,57,53,54
2024-12-26,14:00,70,68,68,69,65,67,60,59,55,57,53,55
2024-12-26,14:30,71,70,69,67,67,71,62,64,58,57,56,55
2024-12-26,15:00,71,66,70,74,68,69,63,61,59,55,57,54
2024-12-26,15:30,70,65,69,66,67,66,62,64,58,55,56,53
2024-12-26,16:00,64,61,63,62,60,57,58,56,55,56,53,54
2024-12-26,16:30,55,54,54,58,52,57,50,53,48,51,45,49
2024-12-26,17:00,88,83,85,87,82,79,80,77,78,80,75,78
2024-12-26,17:30,35,36,34,39,32,29,30,33,28,27,25,24
2024-12-26,18:00,29,33,28,30,27,31,25,27,23,21,20,21
2024-12-26,18:30,24,22,23,26,22,20,20,22,18,19,15,18
2024-12-26,19:00,20,24,19,18,18,22,16,20,15,12,12,15
2024-12-26,19:30,15,11,14,17,13,10,12,10,10,11,8,6
2024-12-26,20:00,11,12,10,8,9,8,8,6,7,10,5,6
2024-12-26,20:30,7,5,6,8,5,3,4,7,3,6,2,50
2024-12-26,21:00,0,1,0,0,0,0,0,0,0,0,0,0
2024-12-26,RME,15.24,14.76,15.67,16.24,14.45,13.67,14.56,14.89,13.42,14.11,13.78,13.21
2024-12-26,MAPE,8.54,8.24,9.15,9.67,8.11,8.45,8.78,8.56,7.89,8.12,7.67,7.98
'''


'\nFinal CSV format\nDate,Time,Occupancy_main_real,Occupancy_main_predicted,Occupancy_southEast_real,Occupancy_southEast_predicted,Occupancy_north_real,Occupancy_north_predicted,Occupancy_south_real,Occupancy_south_predicted,Occupancy_angdomen_real,Occupancy_angdomen_predicted,Occupancy_newton_real,Occupancy_newton_predicted\n2024-12-26,08:00,0,-2,5,4,3,6,4,1,2,3,0,-1\n2024-12-26,08:30,0,-4,6,6,4,-1,3,1,2,-3,5,7\n2024-12-26,09:00,28,33,35,32,30,30,25,27,20,22,10,11\n2024-12-26,09:30,40,44,45,41,38,34,32,37,28,28,22,19\n2024-12-26,10:00,54,50,50,54,48,49,42,47,38,40,35,30\n2024-12-26,10:30,64,60,60,65,58,55,52,53,48,49,45,44\n2024-12-26,11:00,71,66,68,71,65,64,60,62,55,50,50,54\n2024-12-26,11:30,70,69,67,72,64,61,59,57,54,59,49,50\n2024-12-26,12:00,51,55,50,47,48,51,45,44,43,47,40,38\n2024-12-26,12:30,53,50,52,53,50,49,47,49,45,44,42,40\n2024-12-26,13:00,64,63,62,67,60,58,55,59,50,54,48,47\n2024-12-26,13:30,68,69,66,64,64,65,59,63,55,57,53,54\n2024-12-26,14:00,70,68,68,69,65,67,60,59,55