In [1]:
# Load libraries

import pandas as pd
import ipywidgets as widgets
from IPython.display import display, clear_output
import os
import pandas as pd

 # Step 1: Load and Combine CSV Files

 **Goal:** Aggregate multiple CSV files of the electricity prices from various countries into a single DataFrame.

In [2]:
# Directory containing the per-country electricity price CSV files.
folder_path = 'data/spain_france_portugal'

# os.listdir() returns entries in arbitrary order, so sort the names to make
# the row order of the combined frame reproducible across runs and machines.
csv_filenames = sorted(
    name for name in os.listdir(folder_path) if name.endswith('.csv')
)

# Fail with a clear message instead of pd.concat's generic
# "No objects to concatenate" when the folder holds no CSV files.
if not csv_filenames:
    raise ValueError(f"No CSV files found in {folder_path!r}")

# One DataFrame per CSV file, stacked into a single frame with a fresh index.
dfs = [pd.read_csv(os.path.join(folder_path, name)) for name in csv_filenames]
combined_df = pd.concat(dfs, ignore_index=True)


In [None]:
# Dropdown used to pick which dataset the rest of the notebook works on;
# 'europe' is the initial selection.
option_selector = widgets.Dropdown(
    description='Dataset:',
    options=['europe', 'australia'],
    value='europe',
    disabled=False,
)

# Button that triggers loading of the dataset chosen in the dropdown.
# button_style accepts 'success', 'info', 'warning', 'danger' or ''.
button = widgets.Button(
    description="Load Dataset",
    tooltip='Click to load selected dataset',
    button_style='success',
    icon='check',  # optional FontAwesome icon name
)

# --- Define button click handler ---
def on_button_click(b):
    """Load the dataset selected in the dropdown into the global ``combined_df``.

    The cell output is cleared and the widgets are re-displayed, then the
    selected dataset is read from disk and its first rows are printed.
    Loading errors are reported in the output rather than raised, in which
    case ``combined_df`` keeps its previous value.

    Args:
        b: Argument supplied by ``button.on_click`` when the button is
            pressed; not used by this handler.
    """
    global combined_df
    clear_output(wait=True)  # Clears previous output to keep it clean
    display(option_selector, button)  # Re-display widgets after clear

    selected_option = option_selector.value

    if selected_option == 'europe':
        # Re-read the European CSV files instead of copying the current
        # combined_df: after the Australia dataset has been loaded, a plain
        # copy would silently keep the Australian data under a "Europe" label.
        try:
            csv_names = sorted(
                name for name in os.listdir(folder_path) if name.endswith('.csv')
            )
            if not csv_names:
                raise ValueError(f"No CSV files found in {folder_path!r}")
            combined_df = pd.concat(
                [pd.read_csv(os.path.join(folder_path, name)) for name in csv_names],
                ignore_index=True,
            )
            print(f"✅ Europe dataset loaded from {folder_path}.")
            print(combined_df.head())
        except Exception as e:
            print(f"❌ Error loading Europe dataset: {e}")

    elif selected_option == 'australia':
        file_path = 'parquet_files/australia_data.parquet'
        try:
            combined_df = pd.read_parquet(file_path)
            print(f"✅ Australia dataset loaded from {file_path}.")
            print(combined_df.head())
        except Exception as e:
            print(f"❌ Error loading Australia dataset: {e}")

# --- Attach button click handler ---
# Register on_button_click so it runs each time the button is pressed.
button.on_click(on_button_click)

# --- Display UI ---
# Render the dropdown and the button in this cell's output area.
display(option_selector, button)


Dropdown(description='Dataset:', index=1, options=('europe', 'australia'), value='australia')

Button(button_style='success', description='Load Dataset', icon='check', style=ButtonStyle(), tooltip='Click t…

In [4]:
# Preview the first five rows of the currently loaded dataset.
combined_df.head(5)

Unnamed: 0,MTU (CET/CEST),Area,Sequence,Day-ahead Price (EUR/MWh)
0,01/07/2009 00:00:00 - 01/07/2009 00:05:00,nsw,Without sequence,16.941263
1,01/07/2009 00:05:00 - 01/07/2009 00:10:00,nsw,Without sequence,17.709524
2,01/07/2009 00:10:00 - 01/07/2009 00:15:00,nsw,Without sequence,17.678644
3,01/07/2009 00:15:00 - 01/07/2009 00:20:00,nsw,Without sequence,16.736212
4,01/07/2009 00:20:00 - 01/07/2009 00:25:00,nsw,Without sequence,15.63884


 # Step 2: Examine Columns and Rows in Combined Data

 **Goal:** Understand the structure of the merged dataset.

In [5]:
# Report the column labels and the total row count of the combined frame.
column_names = list(combined_df.columns)
print('The columns are: ', column_names)
print('Number of rows: ', combined_df.shape[0])


The columns are:  ['MTU (CET/CEST)', 'Area', 'Sequence', 'Day-ahead Price (EUR/MWh)']
Number of rows:  5083200


 # Step 3: Extract and Clean Start Times

 **Goal:** Parse the 'MTU (CET/CEST)' column to derive clean start datetime values.

In [6]:
# Each 'MTU (CET/CEST)' value has the form "<start> - <end>"; keep the start part.
mtu_intervals = combined_df['MTU (CET/CEST)']
start_times = mtu_intervals.str.split(' - ').str.get(0)

# Strip any " (CET)" / " (CEST)" marker so the remaining text is a plain timestamp.
start_times_clean = start_times.str.replace(
    r' \(CET\)| \(CEST\)', '', regex=True
)

# Dates are written day-first (dd/mm/yyyy), hence dayfirst=True.
combined_df['Start DateTime'] = pd.to_datetime(
    start_times_clean, dayfirst=True
)
combined_df.head()


Unnamed: 0,MTU (CET/CEST),Area,Sequence,Day-ahead Price (EUR/MWh),Start DateTime
0,01/07/2009 00:00:00 - 01/07/2009 00:05:00,nsw,Without sequence,16.941263,2009-07-01 00:00:00
1,01/07/2009 00:05:00 - 01/07/2009 00:10:00,nsw,Without sequence,17.709524,2009-07-01 00:05:00
2,01/07/2009 00:10:00 - 01/07/2009 00:15:00,nsw,Without sequence,17.678644,2009-07-01 00:10:00
3,01/07/2009 00:15:00 - 01/07/2009 00:20:00,nsw,Without sequence,16.736212,2009-07-01 00:15:00
4,01/07/2009 00:20:00 - 01/07/2009 00:25:00,nsw,Without sequence,15.63884,2009-07-01 00:20:00


 # Step 4: Filter Data by Time Interval

 **Goal:** Keep only rows that fall within the desired datetime range. For the Australia dataset no date filter is applied and all rows are kept.

In [7]:
if option_selector.value != 'australia':
    # Keep only rows inside the half-open window [start_date, end_date).
    start_date = pd.to_datetime('2021-05-21')  # Lower bound (inclusive)
    end_date = pd.to_datetime('2025-05-01')    # Upper bound (exclusive)
    in_window = (
        (combined_df['Start DateTime'] >= start_date)
        & (combined_df['Start DateTime'] < end_date)
    )
    filtered_df = combined_df[in_window]
else:
    # The Australia dataset is used in full; work on an independent copy.
    filtered_df = combined_df.copy()

# Show the first and last timestamps of the retained range.
print(filtered_df['Start DateTime'].head(5))
print(filtered_df['Start DateTime'].tail(5))


0   2009-07-01 00:00:00
1   2009-07-01 00:05:00
2   2009-07-01 00:10:00
3   2009-07-01 00:15:00
4   2009-07-01 00:20:00
Name: Start DateTime, dtype: datetime64[ns]
5083195   2019-02-28 23:35:00
5083196   2019-02-28 23:40:00
5083197   2019-02-28 23:45:00
5083198   2019-02-28 23:50:00
5083199   2019-02-28 23:55:00
Name: Start DateTime, dtype: datetime64[ns]


 # Step 5: Detect Irregular Time Intervals

Identify gaps or anomalies in the expected time step (60 minutes for the Europe dataset, 5 minutes for the Australia dataset), with rows grouped by Area and Sequence. Irregular intervals typically appear around Daylight Saving Time (DST) transitions.

In [11]:
# Expected spacing between consecutive rows: 5 minutes for Australia, 60 otherwise.
minutes = 5 if option_selector.value == 'australia' else 60
expected_step = pd.Timedelta(minutes=minutes)

group_keys = ['Area', 'Sequence']
sort_keys = group_keys + ['Start DateTime']

# Order rows chronologically within each (Area, Sequence) series.
filtered_df = filtered_df.sort_values(sort_keys).reset_index(drop=True)

# Gap to the previous row of the same series; NaT for the first row of each series.
filtered_df['TimeDiff'] = filtered_df.groupby(group_keys)['Start DateTime'].diff()

# Flag rows whose gap is known (not NaT) and differs from the expected step.
filtered_df['NonHourly'] = (
    filtered_df['TimeDiff'].notna() & (filtered_df['TimeDiff'] != expected_step)
)

# Collect each flagged row together with the row immediately before it for context.
problem_indices = filtered_df.index[filtered_df['NonHourly']]
context_indices = problem_indices.union(problem_indices - 1)

irregular_context = filtered_df.loc[context_indices].sort_values(sort_keys)
print(irregular_context[['Area', 'Sequence', 'Start DateTime', 'TimeDiff']].head(10))
print(irregular_context['TimeDiff'].unique())


Empty DataFrame
Columns: [Area, Sequence, Start DateTime, TimeDiff]
Index: []
<TimedeltaArray>
[]
Length: 0, dtype: timedelta64[ns]


 # Step 6: Export Filtered Data

Save the cleaned and validated dataset to a Parquet file for further use.

In [12]:
# Choose the output file according to the dataset selected in the dropdown.
if option_selector.value == 'australia':
    output_path = 'parquet_files/filtered_data_australia.parquet'
else:
    output_path = 'parquet_files/filtered_data.parquet'

# to_parquet does not create missing directories, so make sure the target
# folder exists before writing (no-op when it is already there).
os.makedirs(os.path.dirname(output_path), exist_ok=True)
filtered_df.to_parquet(output_path, index=False)