In [87]:
import os
import pandas as pd

# Folder containing the CSV files
folder_path = 'data'

# List to hold individual DataFrames
dfs = []

# os.listdir() returns entries in arbitrary order, so sort the names to make
# the row order of the combined frame reproducible across machines and runs.
for filename in sorted(os.listdir(folder_path)):
    if filename.endswith('.csv'):
        file_path = os.path.join(folder_path, filename)
        df = pd.read_csv(file_path)
        dfs.append(df)

# pd.concat() on an empty list raises an uninformative ValueError; fail with
# a message that names the folder instead (same exception type).
if not dfs:
    raise ValueError(f"No .csv files found in folder {folder_path!r}")

# Concatenate all DataFrames into one frame with a fresh 0..n-1 index
combined_df = pd.concat(dfs, ignore_index=True)


In [88]:
# Show the column labels of the concatenated frame
combined_df.columns

Index(['MTU (CET/CEST)', 'Area', 'Sequence', 'Day-ahead Price (EUR/MWh)',
       'Intraday Period (CET/CEST)', 'Intraday Price (EUR/MWh)'],
      dtype='object')

In [89]:
# The MTU column holds an interval written as "<start> - <end>";
# keep only the part before the separator (the interval start).
start_times = combined_df['MTU (CET/CEST)'].str.split(' - ').str[0]

# Remove any " (CET)" / " (CEST)" suffix and surrounding whitespace so that
# only the date and time remain.
# NOTE(review): if the source files carry these markers, dropping them makes
# the repeated hour at the autumn clock change indistinguishable (two rows
# with the same naive timestamp) — confirm this is acceptable downstream.
start_times_clean = (
    start_times
    .str.replace(r' \(CET\)| \(CEST\)', '', regex=True)
    .str.strip()
)

# Parse with an explicit dd/mm/yyyy HH:MM:SS format rather than dayfirst=True:
# dayfirst is only a hint for format inference, whereas an explicit format is
# deterministic, faster, and raises on any value that does not match.
combined_df['Start DateTime'] = pd.to_datetime(
    start_times_clean, format='%d/%m/%Y %H:%M:%S'
)
combined_df.head()

Unnamed: 0,MTU (CET/CEST),Area,Sequence,Day-ahead Price (EUR/MWh),Intraday Period (CET/CEST),Intraday Price (EUR/MWh),Start DateTime
0,01/01/2021 00:00:00 - 01/01/2021 01:00:00,BZN|ES,Without sequence,50.87,,,2021-01-01 00:00:00
1,01/01/2021 01:00:00 - 01/01/2021 02:00:00,BZN|ES,Without sequence,48.19,,,2021-01-01 01:00:00
2,01/01/2021 02:00:00 - 01/01/2021 03:00:00,BZN|ES,Without sequence,44.68,,,2021-01-01 02:00:00
3,01/01/2021 03:00:00 - 01/01/2021 04:00:00,BZN|ES,Without sequence,38.5,,,2021-01-01 03:00:00
4,01/01/2021 04:00:00 - 01/01/2021 05:00:00,BZN|ES,Without sequence,36.8,,,2021-01-01 04:00:00


In [92]:
# Bounds of the half-open window [start_date, end_date)
start_date = pd.Timestamp('2021-05-21')
end_date = pd.Timestamp('2025-05-01')

# Flag each row by whether its start time falls inside the window.
# This only adds a boolean column; no rows are removed from combined_df.
start_col = combined_df['Start DateTime']
combined_df['range'] = start_col.ge(start_date) & start_col.lt(end_date)
combined_df

Unnamed: 0,MTU (CET/CEST),Area,Sequence,Day-ahead Price (EUR/MWh),Intraday Period (CET/CEST),Intraday Price (EUR/MWh),Start DateTime,range
0,01/01/2021 00:00:00 - 01/01/2021 01:00:00,BZN|ES,Without sequence,50.87,,,2021-01-01 00:00:00,False
1,01/01/2021 01:00:00 - 01/01/2021 02:00:00,BZN|ES,Without sequence,48.19,,,2021-01-01 01:00:00,False
2,01/01/2021 02:00:00 - 01/01/2021 03:00:00,BZN|ES,Without sequence,44.68,,,2021-01-01 02:00:00,False
3,01/01/2021 03:00:00 - 01/01/2021 04:00:00,BZN|ES,Without sequence,38.50,,,2021-01-01 03:00:00,False
4,01/01/2021 04:00:00 - 01/01/2021 05:00:00,BZN|ES,Without sequence,36.80,,,2021-01-01 04:00:00,False
...,...,...,...,...,...,...,...,...
115480,23/05/2025 19:00:00 - 23/05/2025 20:00:00,BZN|PT,Without sequence,4.69,,,2025-05-23 19:00:00,False
115481,23/05/2025 20:00:00 - 23/05/2025 21:00:00,BZN|PT,Without sequence,5.76,,,2025-05-23 20:00:00,False
115482,23/05/2025 21:00:00 - 23/05/2025 22:00:00,BZN|PT,Without sequence,31.00,,,2025-05-23 21:00:00,False
115483,23/05/2025 22:00:00 - 23/05/2025 23:00:00,BZN|PT,Without sequence,17.94,,,2025-05-23 22:00:00,False


In [98]:
# Keys that identify one time series, and the full ordering used below
group_keys = ['Area', 'Sequence']
sort_keys = group_keys + ['Start DateTime']
one_hour = pd.Timedelta(hours=1)

# Order rows chronologically inside each (Area, Sequence) series.
# NOTE(review): despite the name, no rows are dropped here — the 'range'
# flag column is carried along but not applied; confirm this is intended.
filtered_df = combined_df.sort_values(sort_keys).reset_index(drop=True)

# Gap to the previous row of the same series (NaT for the first row of each)
step = filtered_df.groupby(group_keys)['Start DateTime'].diff()
filtered_df['TimeDiff'] = step

# A row is irregular when it has a predecessor and the gap is not one hour
filtered_df['NonHourly'] = step.notna() & step.ne(one_hour)

# Positions of the irregular rows
problem_indices = filtered_df.index[filtered_df['NonHourly']]

# Add the row just before each irregular one so the gap can be read in context
context_indices = problem_indices.union(problem_indices - 1)

# Pull those rows out, keeping the same ordering as the sorted frame
irregular_context = filtered_df.loc[context_indices].sort_values(sort_keys)

# Print only the columns needed to read the gaps
print(irregular_context[['Area', 'Sequence', 'Start DateTime', 'TimeDiff']])



          Area          Sequence      Start DateTime        TimeDiff
2065    BZN|ES  Without sequence 2021-03-28 01:00:00 0 days 01:00:00
2066    BZN|ES  Without sequence 2021-03-28 03:00:00 0 days 02:00:00
7273    BZN|ES  Without sequence 2021-10-31 02:00:00 0 days 01:00:00
7274    BZN|ES  Without sequence 2021-10-31 02:00:00 0 days 00:00:00
10801   BZN|ES  Without sequence 2022-03-27 01:00:00 0 days 01:00:00
10802   BZN|ES  Without sequence 2022-03-27 03:00:00 0 days 02:00:00
16009   BZN|ES  Without sequence 2022-10-30 02:00:00 0 days 01:00:00
16010   BZN|ES  Without sequence 2022-10-30 02:00:00 0 days 00:00:00
19537   BZN|ES  Without sequence 2023-03-26 01:00:00 0 days 01:00:00
19538   BZN|ES  Without sequence 2023-03-26 03:00:00 0 days 02:00:00
24745   BZN|ES  Without sequence 2023-10-29 02:00:00 0 days 01:00:00
24746   BZN|ES  Without sequence 2023-10-29 02:00:00 0 days 00:00:00
28441   BZN|ES  Without sequence 2024-03-31 01:00:00 0 days 01:00:00
28442   BZN|ES  Without sequence 2

In [None]:
# Save the sorted frame (including the helper columns added above) as parquet.
# Create the output folder first: to_parquet() does not create missing parent
# directories, so a fresh checkout without 'parquet_files/' would fail here.
# Uses `os`, imported in the first cell of the notebook.
output_path = 'parquet_files/filtered_data.parquet'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
filtered_df.to_parquet(output_path, index=False)