 # Step 1: Load and Combine CSV Files

 **Goal:** Combine the electricity-price CSV files for the various countries into a single DataFrame.

In [22]:
import os
import pandas as pd

folder_path = 'data'  # Directory containing the CSV files
dfs = []  # List to store each individual DataFrame

# os.listdir() returns entries in arbitrary, platform-dependent order; sorting
# keeps the row order (and index) of combined_df identical across runs and machines.
for filename in sorted(os.listdir(folder_path)):
    if filename.endswith('.csv'):
        file_path = os.path.join(folder_path, filename)
        df = pd.read_csv(file_path)
        dfs.append(df)

# pd.concat() raises an opaque "No objects to concatenate" on an empty list,
# so fail early with a message that names the folder that was searched.
if not dfs:
    raise ValueError(f"No CSV files found in '{folder_path}'")

# Stack all files vertically and renumber the rows 0..N-1.
combined_df = pd.concat(dfs, ignore_index=True)


 # Step 2: Examine Columns and Rows in Combined Data

 **Goal:** Understand the structure of the merged dataset.

In [23]:
# Summarise the combined frame: column labels first, then the row count.
column_names = list(combined_df.columns)
row_count = combined_df.shape[0]
print('The columns are: ', column_names)
print('Number of rows: ', row_count)


The columns are:  ['MTU (CET/CEST)', 'Area', 'Sequence', 'Day-ahead Price (EUR/MWh)', 'Intraday Period (CET/CEST)', 'Intraday Price (EUR/MWh)']
Number of rows:  115485


 # Step 3: Extract and Clean Start Times

 **Goal:** Parse the 'MTU (CET/CEST)' column to derive clean start datetime values.

In [24]:
# Each MTU value is "<start> - <end>"; keep only the part before the separator.
mtu_parts = combined_df['MTU (CET/CEST)'].str.split(' - ')
start_times = mtu_parts.str[0]

# Remove a trailing " (CET)" / " (CEST)" marker so the text is a plain timestamp.
start_times_clean = start_times.str.replace(
    r' \(CET\)| \(CEST\)',
    '',
    regex=True,
)

# dayfirst=True: the source strings are day/month/year.
combined_df['Start DateTime'] = pd.to_datetime(start_times_clean, dayfirst=True)
combined_df.head()


Unnamed: 0,MTU (CET/CEST),Area,Sequence,Day-ahead Price (EUR/MWh),Intraday Period (CET/CEST),Intraday Price (EUR/MWh),Start DateTime
0,01/01/2021 00:00:00 - 01/01/2021 01:00:00,BZN|ES,Without sequence,50.87,,,2021-01-01 00:00:00
1,01/01/2021 01:00:00 - 01/01/2021 02:00:00,BZN|ES,Without sequence,48.19,,,2021-01-01 01:00:00
2,01/01/2021 02:00:00 - 01/01/2021 03:00:00,BZN|ES,Without sequence,44.68,,,2021-01-01 02:00:00
3,01/01/2021 03:00:00 - 01/01/2021 04:00:00,BZN|ES,Without sequence,38.5,,,2021-01-01 03:00:00
4,01/01/2021 04:00:00 - 01/01/2021 05:00:00,BZN|ES,Without sequence,36.8,,,2021-01-01 04:00:00


 # Step 4: Filter Data by Time Interval

 **Goal:** Keep only rows that fall within the desired datetime range.

In [26]:
# Half-open window [start_date, end_date): the lower bound is included, the upper is not.
start_date = pd.to_datetime('2021-05-21')
end_date = pd.to_datetime('2025-05-01')

start_column = combined_df['Start DateTime']
in_window = start_column.ge(start_date) & start_column.lt(end_date)
filtered_df = combined_df.loc[in_window]

# Show both ends of the retained range as a quick sanity check.
kept_starts = filtered_df['Start DateTime']
print(kept_starts.head(5))
print(kept_starts.tail(5))


3359   2021-05-21 00:00:00
3360   2021-05-21 01:00:00
3361   2021-05-21 02:00:00
3362   2021-05-21 03:00:00
3363   2021-05-21 04:00:00
Name: Start DateTime, dtype: datetime64[ns]
114928   2025-04-30 19:00:00
114929   2025-04-30 20:00:00
114930   2025-04-30 21:00:00
114931   2025-04-30 22:00:00
114932   2025-04-30 23:00:00
Name: Start DateTime, dtype: datetime64[ns]


 # Step 5: Detect Irregular Time Intervals

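**Goal:** Identify gaps or anomalies in the hourly sequence within each Area/Sequence group. The irregular timestamps shown below are caused by Daylight Saving Time (DST) transitions: in spring an hour is skipped (a 2-hour step), and in autumn an hour is repeated (a 0-hour step).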

In [28]:
# Order rows so that, within each Area/Sequence group, neighbours are consecutive in time.
group_keys = ['Area', 'Sequence']
ordering = group_keys + ['Start DateTime']
filtered_df = (
    filtered_df
    .sort_values(ordering)
    .reset_index(drop=True)
)

# Step from the previous row of the same group; the first row of each group gets NaT.
filtered_df['TimeDiff'] = filtered_df.groupby(group_keys)['Start DateTime'].diff()

# A row is irregular when it has a predecessor and the step is anything but one hour.
one_hour = pd.Timedelta(hours=1)
has_predecessor = filtered_df['TimeDiff'].notna()
filtered_df['NonHourly'] = has_predecessor & (filtered_df['TimeDiff'] != one_hour)

# Pair every irregular row with the row immediately before it. The index was just
# reset to 0..N-1, so "label - 1" is the preceding row.
problem_indices = filtered_df.index[filtered_df['NonHourly']]
context_indices = problem_indices.union(problem_indices - 1)

preview_columns = ['Area', 'Sequence', 'Start DateTime', 'TimeDiff']
irregular_context = filtered_df.loc[context_indices].sort_values(ordering)
print(irregular_context[preview_columns].head(10))


         Area          Sequence      Start DateTime        TimeDiff
3914   BZN|ES  Without sequence 2021-10-31 02:00:00 0 days 01:00:00
3915   BZN|ES  Without sequence 2021-10-31 02:00:00 0 days 00:00:00
7442   BZN|ES  Without sequence 2022-03-27 01:00:00 0 days 01:00:00
7443   BZN|ES  Without sequence 2022-03-27 03:00:00 0 days 02:00:00
12650  BZN|ES  Without sequence 2022-10-30 02:00:00 0 days 01:00:00
12651  BZN|ES  Without sequence 2022-10-30 02:00:00 0 days 00:00:00
16178  BZN|ES  Without sequence 2023-03-26 01:00:00 0 days 01:00:00
16179  BZN|ES  Without sequence 2023-03-26 03:00:00 0 days 02:00:00
21386  BZN|ES  Without sequence 2023-10-29 02:00:00 0 days 01:00:00
21387  BZN|ES  Without sequence 2023-10-29 02:00:00 0 days 00:00:00


 # Step 6: Export Filtered Data

**Goal:** Save the filtered dataset (including the `TimeDiff` and `NonHourly` helper columns added in Step 5) to a Parquet file for further use.

In [29]:
# to_parquet() does not create missing parent directories, so make sure the
# output folder exists before writing (a no-op when it is already there).
os.makedirs('parquet_files', exist_ok=True)

# Writes every column of filtered_df; index=False leaves the row index out of the file.
filtered_df.to_parquet('parquet_files/filtered_data.parquet', index=False)
