In [12]:
import os
import holidays
import pandas as pd
import numpy as np

In [13]:
energy = []
folder_path = "../src/data/raw"

for file_name in sorted(os.listdir(folder_path)):
    if file_name.endswith(".csv"):
        df = pd.read_csv(os.path.join(folder_path, file_name))
        energy.append(df)

energy = pd.concat(energy, axis=0, ignore_index=True)
energy["SETTLEMENTDATE"] = pd.to_datetime(energy["SETTLEMENTDATE"])
energy.drop(columns=["REGION", "PERIODTYPE"], inplace=True)
energy["Weekday"] = energy["SETTLEMENTDATE"].dt.day_name()

# Add public holidays
vic_holidays = holidays.Australia(years=range(2018, 2024), state="VIC")
energy["Holiday"] = energy["SETTLEMENTDATE"].dt.date.map(lambda x: x in vic_holidays)
energy["Holiday"] = energy["Holiday"].fillna(False)

In [14]:
energy

Unnamed: 0,SETTLEMENTDATE,TOTALDEMAND,RRP,Weekday,Holiday
0,2018-01-01 00:30:00,4251.18,92.46,Monday,True
1,2018-01-01 01:00:00,4092.53,87.62,Monday,True
2,2018-01-01 01:30:00,3958.95,73.08,Monday,True
3,2018-01-01 02:00:00,3785.27,70.18,Monday,True
4,2018-01-01 02:30:00,3673.72,67.43,Monday,True
...,...,...,...,...,...
302443,2023-12-31 23:40:00,4070.49,51.94,Sunday,False
302444,2023-12-31 23:45:00,4035.14,52.23,Sunday,False
302445,2023-12-31 23:50:00,4039.52,51.83,Sunday,False
302446,2023-12-31 23:55:00,4036.91,51.43,Sunday,False


In [15]:
tem = energy[65711:]
energy = energy[:65711]

In [16]:
def get_30min_start(timestamp):
    minute = timestamp.minute
    if minute in [5, 10, 15, 20, 25]:
        return timestamp.replace(minute=0, second=0, microsecond=0)
    elif minute in [35, 40, 45, 50, 55]:
        return timestamp.replace(minute=30, second=0, microsecond=0)
    else:
        return timestamp

tem.loc[:,"30min_interval"] = tem["SETTLEMENTDATE"].apply(get_30min_start)

tem = tem.groupby("30min_interval").agg({
    "TOTALDEMAND": "mean",
    "RRP": "mean",
    "Weekday": "first",  # Assuming Weekday remains constant
    "Holiday": "first",  # Assuming Holiday remains constant
}).reset_index()

# Rename 30min_interval to SETTLEMENTDATE
tem = tem.rename(columns={"30min_interval": "SETTLEMENTDATE"})

# round TOTALDEMAND and RRP to 2 decimal places
tem.loc[:,"TOTALDEMAND"] = tem["TOTALDEMAND"].round(2)
tem.loc[:,"RRP"] = tem["RRP"].round(2)

energy = pd.concat([energy, tem], ignore_index=True)
energy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tem.loc[:,"30min_interval"] = tem["SETTLEMENTDATE"].apply(get_30min_start)


Unnamed: 0,SETTLEMENTDATE,TOTALDEMAND,RRP,Weekday,Holiday
0,2018-01-01 00:30:00,4251.18,92.46,Monday,True
1,2018-01-01 01:00:00,4092.53,87.62,Monday,True
2,2018-01-01 01:30:00,3958.95,73.08,Monday,True
3,2018-01-01 02:00:00,3785.27,70.18,Monday,True
4,2018-01-01 02:30:00,3673.72,67.43,Monday,True
...,...,...,...,...,...
105163,2023-12-31 22:00:00,3810.70,44.20,Sunday,False
105164,2023-12-31 22:30:00,3858.55,48.64,Sunday,False
105165,2023-12-31 23:00:00,3974.08,49.99,Sunday,False
105166,2023-12-31 23:30:00,4057.02,51.40,Sunday,False


In [17]:
energy.loc[:, "Hour"] = energy["SETTLEMENTDATE"].dt.hour
energy.loc[:, "Minute"] = energy["SETTLEMENTDATE"].dt.minute
energy.loc[:, "Time"] = energy["Hour"] + energy["Minute"] / 60
energy.loc[:, "Day"] = energy["SETTLEMENTDATE"].dt.day
energy.loc[:, "Month"] = energy["SETTLEMENTDATE"].dt.month
energy

Unnamed: 0,SETTLEMENTDATE,TOTALDEMAND,RRP,Weekday,Holiday,Hour,Minute,Time,Day,Month
0,2018-01-01 00:30:00,4251.18,92.46,Monday,True,0,30,0.5,1,1
1,2018-01-01 01:00:00,4092.53,87.62,Monday,True,1,0,1.0,1,1
2,2018-01-01 01:30:00,3958.95,73.08,Monday,True,1,30,1.5,1,1
3,2018-01-01 02:00:00,3785.27,70.18,Monday,True,2,0,2.0,1,1
4,2018-01-01 02:30:00,3673.72,67.43,Monday,True,2,30,2.5,1,1
...,...,...,...,...,...,...,...,...,...,...
105163,2023-12-31 22:00:00,3810.70,44.20,Sunday,False,22,0,22.0,31,12
105164,2023-12-31 22:30:00,3858.55,48.64,Sunday,False,22,30,22.5,31,12
105165,2023-12-31 23:00:00,3974.08,49.99,Sunday,False,23,0,23.0,31,12
105166,2023-12-31 23:30:00,4057.02,51.40,Sunday,False,23,30,23.5,31,12


In [18]:
output_path = "D:/GitHub_Web/energy/src/data/processed/combined_data.csv"
energy.to_csv(output_path, index=False)