In [1]:
from pathlib import Path

import pandas as pd

# Read the wide dispatch-price table straight from the zipped CSV.
# compression='infer' is the pandas default: the codec is chosen from the
# file extension, so the archive does not need to be unpacked by hand.
file_path = 'data/australia_data/AEMO_PUBLIC_DVD_DISPATCHPRICE_wide.zip'
df = pd.read_csv(filepath_or_buffer=file_path, compression='infer')

In [2]:
# Peek at the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,year,month,day,fivemin,rrp_nsw,rrp_qld,rrp_sa,rrp_tas,rrp_vic
0,2009,7,1,1,16.941263,17.65,16.73028,15.67154,15.5
1,2009,7,1,2,17.709524,18.810089,17.82049,16.057039,15.5
2,2009,7,1,3,17.678644,18.617599,18.123159,15.90246,15.39
3,2009,7,1,4,16.736212,18.6113,17.623659,14.27313,12.81297
4,2009,7,1,5,15.63884,17.65,16.334089,13.24149,11.8


In [3]:


# `fivemin` is treated as a 1-based index of 5-minute slots within the day:
# slot 1 maps to 00:00 and slot 288 to 23:55. Reject anything outside that
# range (NaN included) up front, so a bad slot raises instead of silently
# spilling over into the following day.
if not df['fivemin'].between(1, 288).all():
    raise ValueError("'fivemin' values must lie between 1 and 288")

# Offset of each slot's start from midnight, in minutes.
df['minutes_since_midnight'] = (df['fivemin'] - 1) * 5

# Assemble the timestamp numerically: midnight of the calendar day from the
# year/month/day columns, plus the minute offset. This avoids building and
# re-parsing one formatted string per row.
df['datetime'] = (
    pd.to_datetime(df[['year', 'month', 'day']])
    + pd.to_timedelta(df['minutes_since_midnight'], unit='min')
)

df.head()


Unnamed: 0,year,month,day,fivemin,rrp_nsw,rrp_qld,rrp_sa,rrp_tas,rrp_vic,minutes_since_midnight,datetime
0,2009,7,1,1,16.941263,17.65,16.73028,15.67154,15.5,0,2009-07-01 00:00:00
1,2009,7,1,2,17.709524,18.810089,17.82049,16.057039,15.5,5,2009-07-01 00:05:00
2,2009,7,1,3,17.678644,18.617599,18.123159,15.90246,15.39,10,2009-07-01 00:10:00
3,2009,7,1,4,16.736212,18.6113,17.623659,14.27313,12.81297,15,2009-07-01 00:15:00
4,2009,7,1,5,15.63884,17.65,16.334089,13.24149,11.8,20,2009-07-01 00:20:00


In [4]:

# Label every row with its market time unit as "<start> - <end>", both ends
# formatted dd/mm/YYYY HH:MM:SS and the end lying five minutes after the start.
# No timezone conversion is applied here; the timestamps are written out as-is.
mtu_format = '%d/%m/%Y %H:%M:%S'
interval_start = df['datetime']
interval_end = interval_start + pd.Timedelta(minutes=5)
df['MTU (CET/CEST)'] = (
    interval_start.dt.strftime(mtu_format)
    + ' - '
    + interval_end.dt.strftime(mtu_format)
)
df.head()

Unnamed: 0,year,month,day,fivemin,rrp_nsw,rrp_qld,rrp_sa,rrp_tas,rrp_vic,minutes_since_midnight,datetime,MTU (CET/CEST)
0,2009,7,1,1,16.941263,17.65,16.73028,15.67154,15.5,0,2009-07-01 00:00:00,01/07/2009 00:00:00 - 01/07/2009 00:05:00
1,2009,7,1,2,17.709524,18.810089,17.82049,16.057039,15.5,5,2009-07-01 00:05:00,01/07/2009 00:05:00 - 01/07/2009 00:10:00
2,2009,7,1,3,17.678644,18.617599,18.123159,15.90246,15.39,10,2009-07-01 00:10:00,01/07/2009 00:10:00 - 01/07/2009 00:15:00
3,2009,7,1,4,16.736212,18.6113,17.623659,14.27313,12.81297,15,2009-07-01 00:15:00,01/07/2009 00:15:00 - 01/07/2009 00:20:00
4,2009,7,1,5,15.63884,17.65,16.334089,13.24149,11.8,20,2009-07-01 00:20:00,01/07/2009 00:20:00 - 01/07/2009 00:25:00


In [5]:

# Reshape wide → long: each regional price column becomes its own set of rows,
# keyed by the MTU label, with the source column name stored in 'Area'.
price_columns = ['rrp_nsw', 'rrp_qld', 'rrp_sa', 'rrp_tas', 'rrp_vic']
df_melted = pd.melt(
    df,
    id_vars=['MTU (CET/CEST)'],
    value_vars=price_columns,
    var_name='Area',
    value_name='Day-ahead Price (EUR/MWh)',
)
df_melted.head()

Unnamed: 0,MTU (CET/CEST),Area,Day-ahead Price (EUR/MWh)
0,01/07/2009 00:00:00 - 01/07/2009 00:05:00,rrp_nsw,16.941263
1,01/07/2009 00:05:00 - 01/07/2009 00:10:00,rrp_nsw,17.709524
2,01/07/2009 00:10:00 - 01/07/2009 00:15:00,rrp_nsw,17.678644
3,01/07/2009 00:15:00 - 01/07/2009 00:20:00,rrp_nsw,16.736212
4,01/07/2009 00:20:00 - 01/07/2009 00:25:00,rrp_nsw,15.63884


In [6]:

# Clean 'Area' column → drop the leading 'rrp_' so e.g. 'rrp_nsw' becomes 'nsw'.
# removeprefix only touches the start of the string, whereas a plain
# str.replace would also delete any later occurrence of 'rrp_'.
df_melted['Area'] = df_melted['Area'].str.removeprefix('rrp_')
df_melted.head()

Unnamed: 0,MTU (CET/CEST),Area,Day-ahead Price (EUR/MWh)
0,01/07/2009 00:00:00 - 01/07/2009 00:05:00,nsw,16.941263
1,01/07/2009 00:05:00 - 01/07/2009 00:10:00,nsw,17.709524
2,01/07/2009 00:10:00 - 01/07/2009 00:15:00,nsw,17.678644
3,01/07/2009 00:15:00 - 01/07/2009 00:20:00,nsw,16.736212
4,01/07/2009 00:20:00 - 01/07/2009 00:25:00,nsw,15.63884


In [7]:
# Every row receives the same constant 'Sequence' label.
df_melted['Sequence'] = 'Without sequence'

# Keep only the output columns, in their final order.
target_columns = ['MTU (CET/CEST)', 'Area', 'Sequence', 'Day-ahead Price (EUR/MWh)']
df_target = df_melted[target_columns]

# Print the first ten rows, the last ten rows, and the column names.
for preview in (df_target.head(10), df_target.tail(10), df_target.columns.tolist()):
    print(preview)

                              MTU (CET/CEST) Area          Sequence  \
0  01/07/2009 00:00:00 - 01/07/2009 00:05:00  nsw  Without sequence   
1  01/07/2009 00:05:00 - 01/07/2009 00:10:00  nsw  Without sequence   
2  01/07/2009 00:10:00 - 01/07/2009 00:15:00  nsw  Without sequence   
3  01/07/2009 00:15:00 - 01/07/2009 00:20:00  nsw  Without sequence   
4  01/07/2009 00:20:00 - 01/07/2009 00:25:00  nsw  Without sequence   
5  01/07/2009 00:25:00 - 01/07/2009 00:30:00  nsw  Without sequence   
6  01/07/2009 00:30:00 - 01/07/2009 00:35:00  nsw  Without sequence   
7  01/07/2009 00:35:00 - 01/07/2009 00:40:00  nsw  Without sequence   
8  01/07/2009 00:40:00 - 01/07/2009 00:45:00  nsw  Without sequence   
9  01/07/2009 00:45:00 - 01/07/2009 00:50:00  nsw  Without sequence   

   Day-ahead Price (EUR/MWh)  
0                  16.941263  
1                  17.709524  
2                  17.678644  
3                  16.736212  
4                  15.638840  
5                  13.728767  
6

In [8]:
# Write the long-format table to Parquet, leaving out the row index.
output_path = 'parquet_files/australia_data.parquet'
# Create the destination folder first so the export also succeeds on a fresh
# checkout where it does not exist yet; a no-op when the folder is present.
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
df_target.to_parquet(output_path, index=False)

### Check: number of 5-minute intervals per day and area

In [9]:
# Work on a copy so the helper column added below never reaches df_target.
df = df_target.copy()

# The MTU label begins with the interval's start date (dd/mm/YYYY, 10 characters).
df['date'] = df['MTU (CET/CEST)'].str.slice(0, 10)

# Number of rows for every (date, area) pair.
interval_counts = (
    df.groupby(['date', 'Area'])
    .size()
    .reset_index(name='num_intervals')
)

# Distribution of the per-day interval counts.
print(interval_counts['num_intervals'].value_counts())

# Optional — list the dates on which some area deviates from 288 rows
# (24 hours × 12 five-minute intervals).
has_full_day = interval_counts['num_intervals'].eq(288)
non_288_days = interval_counts[~has_full_day]
print(non_288_days['date'].unique())


num_intervals
288    17070
287      580
Name: count, dtype: int64
['01/01/2010' '01/01/2011' '01/01/2012' '01/01/2013' '01/01/2014'
 '01/01/2015' '01/01/2016' '01/01/2017' '01/01/2018' '01/01/2019'
 '01/02/2010' '01/02/2011' '01/02/2012' '01/02/2013' '01/02/2014'
 '01/02/2015' '01/02/2016' '01/02/2017' '01/02/2018' '01/02/2019'
 '01/03/2010' '01/03/2011' '01/03/2012' '01/03/2013' '01/03/2014'
 '01/03/2015' '01/03/2016' '01/03/2017' '01/03/2018' '01/04/2010'
 '01/04/2011' '01/04/2012' '01/04/2013' '01/04/2014' '01/04/2015'
 '01/04/2016' '01/04/2017' '01/04/2018' '01/05/2010' '01/05/2011'
 '01/05/2012' '01/05/2013' '01/05/2014' '01/05/2015' '01/05/2016'
 '01/05/2017' '01/05/2018' '01/06/2010' '01/06/2011' '01/06/2012'
 '01/06/2013' '01/06/2014' '01/06/2015' '01/06/2016' '01/06/2017'
 '01/06/2018' '01/07/2009' '01/07/2010' '01/07/2011' '01/07/2012'
 '01/07/2013' '01/07/2014' '01/07/2015' '01/07/2016' '01/07/2017'
 '01/07/2018' '01/08/2009' '01/08/2010' '01/08/2011' '01/08/2012'
 '01/08/20