## Export Model data

We create a new DataFrame on which the ML and NN models will be trained. We export it to a CSV file so that it does not have to be recomputed later.


### Get Holiday Data


In [3]:
import pandas as pd


# Years for which a holiday CSV file is read
years = ['2018', '2019', '2020', '2021', '2022', '2023', '2024']

# Collects one filtered DataFrame per year
dataframes = []

for year in years:
    # Every year lives in its own CSV file
    file_path = f'../../HolidayData/holidays_{year}.csv'
    df = pd.read_csv(file_path)

    # Keep rows without a region (treated as international holidays)
    # and rows whose region is California ('CA')
    df = df[df['region'].isna() | df['region'].eq('CA')]

    dataframes.append(df)

# Stack the yearly frames (the per-file index labels are kept as they are),
# remove the columns 'locale', 'type', 'notes' and 'region', and normalise the
# dtypes: 'date' becomes datetime.date objects, 'description' becomes str.
holiday_df = (
    pd.concat(dataframes)
    .drop(columns=['locale', 'type', 'notes', 'region'])
    .assign(
        date=lambda frame: pd.to_datetime(frame['date']).dt.date,
        description=lambda frame: frame['description'].astype(str),
    )
)

holiday_df

Unnamed: 0,date,description
0,2018-01-01,New Year's Day
1,2018-01-15,"Birthday of Martin Luther King, Jr."
2,2018-02-19,Washington's Birthday
5,2018-05-28,Memorial Day
6,2018-07-04,Independence Day
...,...,...
9,2024-11-11,Veterans Day
10,2024-11-22,Day after Thanksgiving
11,2024-11-28,Thanksgiving Day
12,2024-12-24,Christmas Eve


### Get Weather Data


In [6]:
import openmeteo_requests

import requests_cache
from retry_requests import retry

# Open-Meteo API client on top of a local request cache ('.cache') with retries.
# NOTE(review): expire_after = -1 presumably means cached responses never
# expire - confirm against the requests-cache documentation.
cache_session = requests_cache.CachedSession('.cache', expire_after = -1)
retry_session = retry(cache_session, retries = 5, backoff_factor = 0.2)
openmeteo = openmeteo_requests.Client(session = retry_session)

# Request definition: hourly temperature and weather code for one coordinate.
# The position of each name in "hourly" determines the Variables(i) index that
# is used further down, so the two must stay in sync.
url = "https://archive-api.open-meteo.com/v1/archive"
params = {
    "latitude": 34.052235,
    "longitude": -118.243683,
    "start_date": "2018-04-25",
    "end_date": "2021-09-14",
    "hourly": ["temperature_2m", "weather_code"],
}
responses = openmeteo.weather_api(url, params=params)

# Only the first response is used here (a loop would be needed for several
# locations or weather models)
response = responses[0]

# Hourly values, indexed in the same order as they were requested
hourly = response.Hourly()
hourly_temperature_2m = hourly.Variables(0).ValuesAsNumpy()
hourly_weather_code = hourly.Variables(1).ValuesAsNumpy()

# One timestamp per interval from Time() (inclusive) to TimeEnd() (exclusive),
# alongside the two value arrays
hourly_data = {
    "time": pd.date_range(
        start = pd.to_datetime(hourly.Time(), unit = "s"),
        end = pd.to_datetime(hourly.TimeEnd(), unit = "s"),
        freq = pd.Timedelta(seconds = hourly.Interval()),
        inclusive = "left"
    ),
    "temperature": hourly_temperature_2m,
    "weather_code": hourly_weather_code,
}

# The timestamps are built timezone-naive; label them as UTC and convert them
# to Los Angeles local time
hourly_dataframe = pd.DataFrame(data = hourly_data).assign(
    time = lambda frame: frame['time'].dt.tz_localize('UTC').dt.tz_convert('America/Los_Angeles')
)

# Lookup table from numeric weather code to the text label used as a feature.
# It is defined once at module level so it is not rebuilt for every row when
# the function is applied to a whole column.
# NOTE(review): codes 51 and 55 share one label, and so do 61 and 65, so these
# pairs cannot be told apart after the conversion - confirm this is intended.
# NOTE(review): Open-Meteo documents its own meaning for these codes (WMO
# weather interpretation codes); verify that the labels below match it.
_WEATHER_CODE_DESCRIPTIONS = {
    0.0: "Cloud development not observed or not observable",
    1.0: "Clouds generally dissolving or becoming less developed",
    2.0: "State of sky on the whole unchanged",
    3.0: "Clouds generally forming or developing",
    51.0: "Drizzle, not freezing, continuous",
    53.0: "Drizzle, not freezing, intermittent moderate",
    55.0: "Drizzle, not freezing, continuous",
    61.0: "Rain, not freezing, continuous",
    63.0: "Rain, not freezing, intermittent moderate",
    65.0: "Rain, not freezing, continuous",
}


def convert_weather_code(code):
    """Translate a numeric weather code into its text description.

    Parameters
    ----------
    code : float or int
        Weather code to look up. Integer values match the float keys of the
        table (e.g. ``3`` and ``3.0`` give the same result).

    Returns
    -------
    str
        The description for ``code``, or ``"Unknown"`` when the code is not
        in the table (this includes NaN).
    """
    return _WEATHER_CODE_DESCRIPTIONS.get(code, "Unknown")

# Swap the numeric 'weather_code' column for its text description: pop() takes
# the code column out of the frame, and the converted values are appended as
# the new last column 'weather_description'.
hourly_dataframe['weather_description'] = hourly_dataframe.pop('weather_code').apply(convert_weather_code)

hourly_dataframe

Unnamed: 0,time,temperature,weather_description
0,2018-04-24 17:00:00-07:00,20.976999,Cloud development not observed or not observable
1,2018-04-24 18:00:00-07:00,20.726999,Cloud development not observed or not observable
2,2018-04-24 19:00:00-07:00,17.876999,Cloud development not observed or not observable
3,2018-04-24 20:00:00-07:00,15.327001,Cloud development not observed or not observable
4,2018-04-24 21:00:00-07:00,13.727000,Cloud development not observed or not observable
...,...,...,...
29731,2021-09-14 12:00:00-07:00,25.876999,Cloud development not observed or not observable
29732,2021-09-14 13:00:00-07:00,28.677000,Cloud development not observed or not observable
29733,2021-09-14 14:00:00-07:00,29.976999,Cloud development not observed or not observable
29734,2021-09-14 15:00:00-07:00,28.876999,Cloud development not observed or not observable


### Create the new Dataframe with data from connections, weather data and holiday data


In [9]:
%run ../data_preparation.ipynb
%store -r merged_df

df = merged_df

# Earliest and latest timestamp of the hourly grid, in Los Angeles local time
start_time = pd.to_datetime("2018-04-25 04:00:00").tz_localize('America/Los_Angeles')
end_time = pd.to_datetime("2021-09-14 07:00:00").tz_localize('America/Los_Angeles')
time_range = pd.date_range(start=start_time, end=end_time, freq='h', tz='America/Los_Angeles')

# The rows belonging to a site do not depend on the timestamp, so the per-site
# subsets are extracted once here instead of re-filtering the whole frame by
# siteID for every hour of the range.
site_sessions = {
    site_id: df.loc[df['siteID'] == site_id, ['connectionTime', 'disconnectTime']]
    for site_id in df['siteID'].unique()
}

results = []
# For every hour and every site, count the sessions that are active at that time
for time in time_range:
    for site_id, sessions in site_sessions.items():
        # Active at `time`: connected at or before it and disconnected strictly after it
        is_active = (sessions['connectionTime'] <= time) & (sessions['disconnectTime'] > time)
        occupied_count = int(is_active.sum())
        results.append({'time': time, 'siteID': site_id, 'occupied_count': occupied_count})

# Store the results in a DataFrame (one row per timestamp and site)
model_df = pd.DataFrame(results)

# Holiday flag, evaluated on the local calendar date because 'time' is still
# in Los Angeles time at this point
model_df['is_holiday'] = model_df['time'].dt.date.isin(holiday_df['date'])

# Attach the hourly weather data; the left join keeps every (time, site) row
model_df = model_df.merge(hourly_dataframe, on='time', how='left')

model_df['Weekday'] = model_df['time'].dt.dayofweek 

# Convert the weekday integers (0 = Monday) into weekday names
weekday_map = {0: 'Monday', 1: 'Tuesday', 2: 'Wednesday', 3: 'Thursday', 4: 'Friday', 5: 'Saturday', 6: 'Sunday'}
model_df['Weekday'] = model_df['Weekday'].map(weekday_map)

# Convert the 'time' column to UTC before exporting (all local-time features
# above have already been derived)
model_df['time'] = model_df['time'].dt.tz_convert('UTC')


# Export to CSV so the data does not have to be recomputed every time
model_df.to_csv('../../model_data.csv', index=False)

model_df

Stored 'merged_df' (DataFrame)


Unnamed: 0,time,siteID,occupied_count,is_holiday,temperature,weather_description,Weekday
0,2018-04-25 11:00:00+00:00,1,0,False,11.327001,Clouds generally forming or developing,Wednesday
1,2018-04-25 11:00:00+00:00,2,0,False,11.327001,Clouds generally forming or developing,Wednesday
2,2018-04-25 12:00:00+00:00,1,0,False,11.277000,Clouds generally forming or developing,Wednesday
3,2018-04-25 12:00:00+00:00,2,1,False,11.277000,Clouds generally forming or developing,Wednesday
4,2018-04-25 13:00:00+00:00,1,0,False,11.527000,Clouds generally forming or developing,Wednesday
...,...,...,...,...,...,...,...
59427,2021-09-14 12:00:00+00:00,2,0,False,14.427000,Clouds generally forming or developing,Tuesday
59428,2021-09-14 13:00:00+00:00,1,1,False,13.777000,Clouds generally forming or developing,Tuesday
59429,2021-09-14 13:00:00+00:00,2,0,False,13.777000,Clouds generally forming or developing,Tuesday
59430,2021-09-14 14:00:00+00:00,1,1,False,13.727000,Clouds generally forming or developing,Tuesday
