In [None]:
from pathlib import Path

import pandas as pd

# Read the wide-format dispatch price export straight from the zipped CSV
# (pandas infers the compression from the .zip extension).
file_path = 'data/australia_data/AEMO_PUBLIC_DVD_DISPATCHPRICE_wide.zip'
df = pd.read_csv(filepath_or_buffer=file_path)

# `fivemin` is used as a 1-based five-minute slot number within the day
# (slot 1 -> 00:00). A day holds 24 h * 12 slots = 288 slots; anything outside
# that range cannot be mapped to a time of day, so fail loudly.
slot_in_range = (df['fivemin'] >= 1) & (df['fivemin'] <= 288)
if not slot_in_range.all():
    raise ValueError("'fivemin' must be a slot number between 1 and 288")

# Offset of each interval start from midnight, in minutes.
df['minutes_since_midnight'] = (df['fivemin'] - 1) * 5

# Interval start timestamp: pandas assembles the calendar date directly from the
# year/month/day columns, and the minute offset is added as a timedelta. This
# avoids formatting and re-parsing a string for every row.
df['datetime'] = (
    pd.to_datetime(df[['year', 'month', 'day']])
    + pd.to_timedelta(df['minutes_since_midnight'], unit='m')
)

# Human-readable interval label "start - end" (dd/mm/YYYY HH:MM:SS); every
# interval is five minutes long.
# NOTE(review): the column name says CET/CEST, but no timezone conversion is
# applied here — confirm which timezone the source timestamps are in.
interval_start = df['datetime']
interval_end = interval_start + pd.Timedelta(minutes=5)
df['MTU (CET/CEST)'] = (
    interval_start.dt.strftime('%d/%m/%Y %H:%M:%S')
    + ' - '
    + interval_end.dt.strftime('%d/%m/%Y %H:%M:%S')
)

# Reshape wide -> long: each regional price column is unpivoted so that every
# (interval, region) pair gets its own row, keyed by the interval label.
region_price_columns = ['rrp_nsw', 'rrp_qld', 'rrp_sa', 'rrp_tas', 'rrp_vic']
df_melted = (
    df.melt(
        id_vars=['MTU (CET/CEST)'],
        value_vars=region_price_columns,
        var_name='Area',
        value_name='Day-ahead Price (EUR/MWh)',
    )
    .assign(
        # 'Area' initially holds the source column name; strip the literal
        # 'rrp_' so only the region code is left.
        Area=lambda long_df: long_df['Area'].str.replace('rrp_', '', regex=False),
        # Constant marker column, identical for every row.
        Sequence='Without sequence',
    )
)

# Select the output columns in their final order.
target_columns = ['MTU (CET/CEST)', 'Area', 'Sequence', 'Day-ahead Price (EUR/MWh)']
df_target = df_melted[target_columns]

# Quick sanity check: first ten rows, last ten rows, then the column names.
for preview in (df_target.head(10), df_target.tail(10)):
    print(preview)
print(df_target.columns.tolist())

                              MTU (CET/CEST) Area          Sequence  \
0  01/07/2009 00:00:00 - 01/07/2009 00:05:00  nsw  Without sequence   
1  01/07/2009 00:05:00 - 01/07/2009 00:10:00  nsw  Without sequence   
2  01/07/2009 00:10:00 - 01/07/2009 00:15:00  nsw  Without sequence   
3  01/07/2009 00:15:00 - 01/07/2009 00:20:00  nsw  Without sequence   
4  01/07/2009 00:20:00 - 01/07/2009 00:25:00  nsw  Without sequence   
5  01/07/2009 00:25:00 - 01/07/2009 00:30:00  nsw  Without sequence   
6  01/07/2009 00:30:00 - 01/07/2009 00:35:00  nsw  Without sequence   
7  01/07/2009 00:35:00 - 01/07/2009 00:40:00  nsw  Without sequence   
8  01/07/2009 00:40:00 - 01/07/2009 00:45:00  nsw  Without sequence   
9  01/07/2009 00:45:00 - 01/07/2009 00:50:00  nsw  Without sequence   

   Day-ahead Price (EUR/MWh)  
0                  16.941263  
1                  17.709524  
2                  17.678644  
3                  16.736212  
4                  15.638840  
5                  13.728767  
6

In [11]:
# Persist the long-format table as Parquet, leaving the row index out of the file.
output_path = 'parquet_files/australia_data.parquet'
# Writing fails if the target folder is missing (e.g. on a fresh checkout),
# so create it first; this is a no-op when the folder already exists.
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
df_target.to_parquet(output_path, index=False)